1*7c478bd9Sstevel@tonic-gate /*-
2*7c478bd9Sstevel@tonic-gate * See the file LICENSE for redistribution information.
3*7c478bd9Sstevel@tonic-gate *
4*7c478bd9Sstevel@tonic-gate * Copyright (c) 1996, 1997, 1998
5*7c478bd9Sstevel@tonic-gate * Sleepycat Software. All rights reserved.
6*7c478bd9Sstevel@tonic-gate */
7*7c478bd9Sstevel@tonic-gate /*
8*7c478bd9Sstevel@tonic-gate * Copyright (c) 1990, 1993, 1994, 1995, 1996
9*7c478bd9Sstevel@tonic-gate * Keith Bostic. All rights reserved.
10*7c478bd9Sstevel@tonic-gate */
11*7c478bd9Sstevel@tonic-gate /*
12*7c478bd9Sstevel@tonic-gate * Copyright (c) 1990, 1993, 1994, 1995
13*7c478bd9Sstevel@tonic-gate * The Regents of the University of California. All rights reserved.
14*7c478bd9Sstevel@tonic-gate *
15*7c478bd9Sstevel@tonic-gate * Redistribution and use in source and binary forms, with or without
16*7c478bd9Sstevel@tonic-gate * modification, are permitted provided that the following conditions
17*7c478bd9Sstevel@tonic-gate * are met:
18*7c478bd9Sstevel@tonic-gate * 1. Redistributions of source code must retain the above copyright
19*7c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer.
20*7c478bd9Sstevel@tonic-gate * 2. Redistributions in binary form must reproduce the above copyright
21*7c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer in the
22*7c478bd9Sstevel@tonic-gate * documentation and/or other materials provided with the distribution.
23*7c478bd9Sstevel@tonic-gate * 3. All advertising materials mentioning features or use of this software
24*7c478bd9Sstevel@tonic-gate * must display the following acknowledgement:
25*7c478bd9Sstevel@tonic-gate * This product includes software developed by the University of
26*7c478bd9Sstevel@tonic-gate * California, Berkeley and its contributors.
27*7c478bd9Sstevel@tonic-gate * 4. Neither the name of the University nor the names of its contributors
28*7c478bd9Sstevel@tonic-gate * may be used to endorse or promote products derived from this software
29*7c478bd9Sstevel@tonic-gate * without specific prior written permission.
30*7c478bd9Sstevel@tonic-gate *
31*7c478bd9Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32*7c478bd9Sstevel@tonic-gate * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33*7c478bd9Sstevel@tonic-gate * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34*7c478bd9Sstevel@tonic-gate * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35*7c478bd9Sstevel@tonic-gate * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36*7c478bd9Sstevel@tonic-gate * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37*7c478bd9Sstevel@tonic-gate * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38*7c478bd9Sstevel@tonic-gate * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39*7c478bd9Sstevel@tonic-gate * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40*7c478bd9Sstevel@tonic-gate * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41*7c478bd9Sstevel@tonic-gate * SUCH DAMAGE.
42*7c478bd9Sstevel@tonic-gate */
43*7c478bd9Sstevel@tonic-gate
44*7c478bd9Sstevel@tonic-gate #include "config.h"
45*7c478bd9Sstevel@tonic-gate
46*7c478bd9Sstevel@tonic-gate #ifndef lint
47*7c478bd9Sstevel@tonic-gate static const char sccsid[] = "@(#)bt_split.c 10.33 (Sleepycat) 10/13/98";
48*7c478bd9Sstevel@tonic-gate #endif /* not lint */
49*7c478bd9Sstevel@tonic-gate
50*7c478bd9Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES
51*7c478bd9Sstevel@tonic-gate #include <sys/types.h>
52*7c478bd9Sstevel@tonic-gate
53*7c478bd9Sstevel@tonic-gate #include <errno.h>
54*7c478bd9Sstevel@tonic-gate #include <limits.h>
55*7c478bd9Sstevel@tonic-gate #include <string.h>
56*7c478bd9Sstevel@tonic-gate #endif
57*7c478bd9Sstevel@tonic-gate
58*7c478bd9Sstevel@tonic-gate #include "db_int.h"
59*7c478bd9Sstevel@tonic-gate #include "db_page.h"
60*7c478bd9Sstevel@tonic-gate #include "btree.h"
61*7c478bd9Sstevel@tonic-gate
62*7c478bd9Sstevel@tonic-gate static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *));
63*7c478bd9Sstevel@tonic-gate static int __bam_page __P((DBC *, EPG *, EPG *));
64*7c478bd9Sstevel@tonic-gate static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *));
65*7c478bd9Sstevel@tonic-gate static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
66*7c478bd9Sstevel@tonic-gate static int __bam_root __P((DBC *, EPG *));
67*7c478bd9Sstevel@tonic-gate static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
68*7c478bd9Sstevel@tonic-gate
69*7c478bd9Sstevel@tonic-gate /*
70*7c478bd9Sstevel@tonic-gate * __bam_split --
71*7c478bd9Sstevel@tonic-gate * Split a page.
72*7c478bd9Sstevel@tonic-gate *
73*7c478bd9Sstevel@tonic-gate * PUBLIC: int __bam_split __P((DBC *, void *));
74*7c478bd9Sstevel@tonic-gate */
75*7c478bd9Sstevel@tonic-gate int
__bam_split(dbc,arg)76*7c478bd9Sstevel@tonic-gate __bam_split(dbc, arg)
77*7c478bd9Sstevel@tonic-gate DBC *dbc;
78*7c478bd9Sstevel@tonic-gate void *arg;
79*7c478bd9Sstevel@tonic-gate {
80*7c478bd9Sstevel@tonic-gate BTREE *t;
81*7c478bd9Sstevel@tonic-gate CURSOR *cp;
82*7c478bd9Sstevel@tonic-gate DB *dbp;
83*7c478bd9Sstevel@tonic-gate enum { UP, DOWN } dir;
84*7c478bd9Sstevel@tonic-gate int exact, level, ret;
85*7c478bd9Sstevel@tonic-gate
86*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp;
87*7c478bd9Sstevel@tonic-gate cp = dbc->internal;
88*7c478bd9Sstevel@tonic-gate
89*7c478bd9Sstevel@tonic-gate /*
90*7c478bd9Sstevel@tonic-gate * The locking protocol we use to avoid deadlock to acquire locks by
91*7c478bd9Sstevel@tonic-gate * walking down the tree, but we do it as lazily as possible, locking
92*7c478bd9Sstevel@tonic-gate * the root only as a last resort. We expect all stack pages to have
93*7c478bd9Sstevel@tonic-gate * been discarded before we're called; we discard all short-term locks.
94*7c478bd9Sstevel@tonic-gate *
95*7c478bd9Sstevel@tonic-gate * When __bam_split is first called, we know that a leaf page was too
96*7c478bd9Sstevel@tonic-gate * full for an insert. We don't know what leaf page it was, but we
97*7c478bd9Sstevel@tonic-gate * have the key/recno that caused the problem. We call XX_search to
98*7c478bd9Sstevel@tonic-gate * reacquire the leaf page, but this time get both the leaf page and
99*7c478bd9Sstevel@tonic-gate * its parent, locked. We then split the leaf page and see if the new
100*7c478bd9Sstevel@tonic-gate * internal key will fit into the parent page. If it will, we're done.
101*7c478bd9Sstevel@tonic-gate *
102*7c478bd9Sstevel@tonic-gate * If it won't, we discard our current locks and repeat the process,
103*7c478bd9Sstevel@tonic-gate * only this time acquiring the parent page and its parent, locked.
104*7c478bd9Sstevel@tonic-gate * This process repeats until we succeed in the split, splitting the
105*7c478bd9Sstevel@tonic-gate * root page as the final resort. The entire process then repeats,
106*7c478bd9Sstevel@tonic-gate * as necessary, until we split a leaf page.
107*7c478bd9Sstevel@tonic-gate *
108*7c478bd9Sstevel@tonic-gate * XXX
109*7c478bd9Sstevel@tonic-gate * A traditional method of speeding this up is to maintain a stack of
110*7c478bd9Sstevel@tonic-gate * the pages traversed in the original search. You can detect if the
111*7c478bd9Sstevel@tonic-gate * stack is correct by storing the page's LSN when it was searched and
112*7c478bd9Sstevel@tonic-gate * comparing that LSN with the current one when it's locked during the
113*7c478bd9Sstevel@tonic-gate * split. This would be an easy change for this code, but I have no
114*7c478bd9Sstevel@tonic-gate * numbers that indicate it's worthwhile.
115*7c478bd9Sstevel@tonic-gate */
116*7c478bd9Sstevel@tonic-gate t = dbp->internal;
117*7c478bd9Sstevel@tonic-gate for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
118*7c478bd9Sstevel@tonic-gate /*
119*7c478bd9Sstevel@tonic-gate * Acquire a page and its parent, locked.
120*7c478bd9Sstevel@tonic-gate */
121*7c478bd9Sstevel@tonic-gate if ((ret = (dbp->type == DB_BTREE ?
122*7c478bd9Sstevel@tonic-gate __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) :
123*7c478bd9Sstevel@tonic-gate __bam_rsearch(dbc,
124*7c478bd9Sstevel@tonic-gate (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0)
125*7c478bd9Sstevel@tonic-gate return (ret);
126*7c478bd9Sstevel@tonic-gate
127*7c478bd9Sstevel@tonic-gate /*
128*7c478bd9Sstevel@tonic-gate * Split the page if it still needs it (it's possible another
129*7c478bd9Sstevel@tonic-gate * thread of control has already split the page). If we are
130*7c478bd9Sstevel@tonic-gate * guaranteed that two items will fit on the page, the split
131*7c478bd9Sstevel@tonic-gate * is no longer necessary.
132*7c478bd9Sstevel@tonic-gate */
133*7c478bd9Sstevel@tonic-gate if (t->bt_ovflsize * 2 <=
134*7c478bd9Sstevel@tonic-gate (db_indx_t)P_FREESPACE(cp->csp[0].page)) {
135*7c478bd9Sstevel@tonic-gate __bam_stkrel(dbc, 1);
136*7c478bd9Sstevel@tonic-gate return (0);
137*7c478bd9Sstevel@tonic-gate }
138*7c478bd9Sstevel@tonic-gate ret = cp->csp[0].page->pgno == PGNO_ROOT ?
139*7c478bd9Sstevel@tonic-gate __bam_root(dbc, &cp->csp[0]) :
140*7c478bd9Sstevel@tonic-gate __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
141*7c478bd9Sstevel@tonic-gate BT_STK_CLR(cp);
142*7c478bd9Sstevel@tonic-gate
143*7c478bd9Sstevel@tonic-gate switch (ret) {
144*7c478bd9Sstevel@tonic-gate case 0:
145*7c478bd9Sstevel@tonic-gate /* Once we've split the leaf page, we're done. */
146*7c478bd9Sstevel@tonic-gate if (level == LEAFLEVEL)
147*7c478bd9Sstevel@tonic-gate return (0);
148*7c478bd9Sstevel@tonic-gate
149*7c478bd9Sstevel@tonic-gate /* Switch directions. */
150*7c478bd9Sstevel@tonic-gate if (dir == UP)
151*7c478bd9Sstevel@tonic-gate dir = DOWN;
152*7c478bd9Sstevel@tonic-gate break;
153*7c478bd9Sstevel@tonic-gate case DB_NEEDSPLIT:
154*7c478bd9Sstevel@tonic-gate /*
155*7c478bd9Sstevel@tonic-gate * It's possible to fail to split repeatedly, as other
156*7c478bd9Sstevel@tonic-gate * threads may be modifying the tree, or the page usage
157*7c478bd9Sstevel@tonic-gate * is sufficiently bad that we don't get enough space
158*7c478bd9Sstevel@tonic-gate * the first time.
159*7c478bd9Sstevel@tonic-gate */
160*7c478bd9Sstevel@tonic-gate if (dir == DOWN)
161*7c478bd9Sstevel@tonic-gate dir = UP;
162*7c478bd9Sstevel@tonic-gate break;
163*7c478bd9Sstevel@tonic-gate default:
164*7c478bd9Sstevel@tonic-gate return (ret);
165*7c478bd9Sstevel@tonic-gate }
166*7c478bd9Sstevel@tonic-gate }
167*7c478bd9Sstevel@tonic-gate /* NOTREACHED */
168*7c478bd9Sstevel@tonic-gate }
169*7c478bd9Sstevel@tonic-gate
170*7c478bd9Sstevel@tonic-gate /*
171*7c478bd9Sstevel@tonic-gate * __bam_root --
172*7c478bd9Sstevel@tonic-gate * Split the root page of a btree.
173*7c478bd9Sstevel@tonic-gate */
174*7c478bd9Sstevel@tonic-gate static int
__bam_root(dbc,cp)175*7c478bd9Sstevel@tonic-gate __bam_root(dbc, cp)
176*7c478bd9Sstevel@tonic-gate DBC *dbc;
177*7c478bd9Sstevel@tonic-gate EPG *cp;
178*7c478bd9Sstevel@tonic-gate {
179*7c478bd9Sstevel@tonic-gate DB *dbp;
180*7c478bd9Sstevel@tonic-gate PAGE *lp, *rp;
181*7c478bd9Sstevel@tonic-gate db_indx_t split;
182*7c478bd9Sstevel@tonic-gate int ret;
183*7c478bd9Sstevel@tonic-gate
184*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp;
185*7c478bd9Sstevel@tonic-gate
186*7c478bd9Sstevel@tonic-gate /* Yeah, right. */
187*7c478bd9Sstevel@tonic-gate if (cp->page->level >= MAXBTREELEVEL) {
188*7c478bd9Sstevel@tonic-gate ret = ENOSPC;
189*7c478bd9Sstevel@tonic-gate goto err;
190*7c478bd9Sstevel@tonic-gate }
191*7c478bd9Sstevel@tonic-gate
192*7c478bd9Sstevel@tonic-gate /* Create new left and right pages for the split. */
193*7c478bd9Sstevel@tonic-gate lp = rp = NULL;
194*7c478bd9Sstevel@tonic-gate if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 ||
195*7c478bd9Sstevel@tonic-gate (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
196*7c478bd9Sstevel@tonic-gate goto err;
197*7c478bd9Sstevel@tonic-gate P_INIT(lp, dbp->pgsize, lp->pgno,
198*7c478bd9Sstevel@tonic-gate PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
199*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page));
200*7c478bd9Sstevel@tonic-gate P_INIT(rp, dbp->pgsize, rp->pgno,
201*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
202*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page));
203*7c478bd9Sstevel@tonic-gate
204*7c478bd9Sstevel@tonic-gate /* Split the page. */
205*7c478bd9Sstevel@tonic-gate if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
206*7c478bd9Sstevel@tonic-gate goto err;
207*7c478bd9Sstevel@tonic-gate
208*7c478bd9Sstevel@tonic-gate /* Log the change. */
209*7c478bd9Sstevel@tonic-gate if (DB_LOGGING(dbc)) {
210*7c478bd9Sstevel@tonic-gate DBT __a;
211*7c478bd9Sstevel@tonic-gate DB_LSN __lsn;
212*7c478bd9Sstevel@tonic-gate memset(&__a, 0, sizeof(__a));
213*7c478bd9Sstevel@tonic-gate __a.data = cp->page;
214*7c478bd9Sstevel@tonic-gate __a.size = dbp->pgsize;
215*7c478bd9Sstevel@tonic-gate ZERO_LSN(__lsn);
216*7c478bd9Sstevel@tonic-gate if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
217*7c478bd9Sstevel@tonic-gate &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp),
218*7c478bd9Sstevel@tonic-gate PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn,
219*7c478bd9Sstevel@tonic-gate &__a)) != 0)
220*7c478bd9Sstevel@tonic-gate goto err;
221*7c478bd9Sstevel@tonic-gate LSN(lp) = LSN(rp) = LSN(cp->page);
222*7c478bd9Sstevel@tonic-gate }
223*7c478bd9Sstevel@tonic-gate
224*7c478bd9Sstevel@tonic-gate /* Clean up the new root page. */
225*7c478bd9Sstevel@tonic-gate if ((ret = (dbp->type == DB_RECNO ?
226*7c478bd9Sstevel@tonic-gate __ram_root(dbc, cp->page, lp, rp) :
227*7c478bd9Sstevel@tonic-gate __bam_broot(dbc, cp->page, lp, rp))) != 0)
228*7c478bd9Sstevel@tonic-gate goto err;
229*7c478bd9Sstevel@tonic-gate
230*7c478bd9Sstevel@tonic-gate /* Adjust any cursors. Do it last so we don't have to undo it. */
231*7c478bd9Sstevel@tonic-gate __bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
232*7c478bd9Sstevel@tonic-gate
233*7c478bd9Sstevel@tonic-gate /* Success -- write the real pages back to the store. */
234*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
235*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock);
236*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY);
237*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
238*7c478bd9Sstevel@tonic-gate
239*7c478bd9Sstevel@tonic-gate return (0);
240*7c478bd9Sstevel@tonic-gate
241*7c478bd9Sstevel@tonic-gate err: if (lp != NULL)
242*7c478bd9Sstevel@tonic-gate (void)__bam_free(dbc, lp);
243*7c478bd9Sstevel@tonic-gate if (rp != NULL)
244*7c478bd9Sstevel@tonic-gate (void)__bam_free(dbc, rp);
245*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, 0);
246*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock);
247*7c478bd9Sstevel@tonic-gate return (ret);
248*7c478bd9Sstevel@tonic-gate }
249*7c478bd9Sstevel@tonic-gate
250*7c478bd9Sstevel@tonic-gate /*
251*7c478bd9Sstevel@tonic-gate * __bam_page --
252*7c478bd9Sstevel@tonic-gate * Split the non-root page of a btree.
253*7c478bd9Sstevel@tonic-gate */
254*7c478bd9Sstevel@tonic-gate static int
__bam_page(dbc,pp,cp)255*7c478bd9Sstevel@tonic-gate __bam_page(dbc, pp, cp)
256*7c478bd9Sstevel@tonic-gate DBC *dbc;
257*7c478bd9Sstevel@tonic-gate EPG *pp, *cp;
258*7c478bd9Sstevel@tonic-gate {
259*7c478bd9Sstevel@tonic-gate DB *dbp;
260*7c478bd9Sstevel@tonic-gate DB_LOCK tplock;
261*7c478bd9Sstevel@tonic-gate PAGE *lp, *rp, *tp;
262*7c478bd9Sstevel@tonic-gate db_indx_t split;
263*7c478bd9Sstevel@tonic-gate int ret;
264*7c478bd9Sstevel@tonic-gate
265*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp;
266*7c478bd9Sstevel@tonic-gate lp = rp = tp = NULL;
267*7c478bd9Sstevel@tonic-gate ret = -1;
268*7c478bd9Sstevel@tonic-gate
269*7c478bd9Sstevel@tonic-gate /* Create new right page for the split. */
270*7c478bd9Sstevel@tonic-gate if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
271*7c478bd9Sstevel@tonic-gate goto err;
272*7c478bd9Sstevel@tonic-gate P_INIT(rp, dbp->pgsize, rp->pgno,
273*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno,
274*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno,
275*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page));
276*7c478bd9Sstevel@tonic-gate
277*7c478bd9Sstevel@tonic-gate /* Create new left page for the split. */
278*7c478bd9Sstevel@tonic-gate if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0)
279*7c478bd9Sstevel@tonic-gate goto err;
280*7c478bd9Sstevel@tonic-gate P_INIT(lp, dbp->pgsize, cp->page->pgno,
281*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno,
282*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
283*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page));
284*7c478bd9Sstevel@tonic-gate ZERO_LSN(lp->lsn);
285*7c478bd9Sstevel@tonic-gate
286*7c478bd9Sstevel@tonic-gate /*
287*7c478bd9Sstevel@tonic-gate * Split right.
288*7c478bd9Sstevel@tonic-gate *
289*7c478bd9Sstevel@tonic-gate * Only the indices are sorted on the page, i.e., the key/data pairs
290*7c478bd9Sstevel@tonic-gate * aren't, so it's simpler to copy the data from the split page onto
291*7c478bd9Sstevel@tonic-gate * two new pages instead of copying half the data to the right page
292*7c478bd9Sstevel@tonic-gate * and compacting the left page in place. Since the left page can't
293*7c478bd9Sstevel@tonic-gate * change, we swap the original and the allocated left page after the
294*7c478bd9Sstevel@tonic-gate * split.
295*7c478bd9Sstevel@tonic-gate */
296*7c478bd9Sstevel@tonic-gate if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
297*7c478bd9Sstevel@tonic-gate goto err;
298*7c478bd9Sstevel@tonic-gate
299*7c478bd9Sstevel@tonic-gate /*
300*7c478bd9Sstevel@tonic-gate * Fix up the previous pointer of any leaf page following the split
301*7c478bd9Sstevel@tonic-gate * page.
302*7c478bd9Sstevel@tonic-gate *
303*7c478bd9Sstevel@tonic-gate * !!!
304*7c478bd9Sstevel@tonic-gate * There are interesting deadlock situations here as we write-lock a
305*7c478bd9Sstevel@tonic-gate * page that's not in our direct ancestry. Consider a cursor walking
306*7c478bd9Sstevel@tonic-gate * through the leaf pages, that has the previous page read-locked and
307*7c478bd9Sstevel@tonic-gate * is waiting on a lock for the page we just split. It will deadlock
308*7c478bd9Sstevel@tonic-gate * here. If this is a problem, we can fail in the split; it's not a
309*7c478bd9Sstevel@tonic-gate * problem as the split will succeed after the cursor passes through
310*7c478bd9Sstevel@tonic-gate * the page we're splitting.
311*7c478bd9Sstevel@tonic-gate */
312*7c478bd9Sstevel@tonic-gate if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) {
313*7c478bd9Sstevel@tonic-gate if ((ret = __bam_lget(dbc,
314*7c478bd9Sstevel@tonic-gate 0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0)
315*7c478bd9Sstevel@tonic-gate goto err;
316*7c478bd9Sstevel@tonic-gate if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0)
317*7c478bd9Sstevel@tonic-gate goto err;
318*7c478bd9Sstevel@tonic-gate }
319*7c478bd9Sstevel@tonic-gate
320*7c478bd9Sstevel@tonic-gate /* Insert the new pages into the parent page. */
321*7c478bd9Sstevel@tonic-gate if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0)
322*7c478bd9Sstevel@tonic-gate goto err;
323*7c478bd9Sstevel@tonic-gate
324*7c478bd9Sstevel@tonic-gate /* Log the change. */
325*7c478bd9Sstevel@tonic-gate if (DB_LOGGING(dbc)) {
326*7c478bd9Sstevel@tonic-gate DBT __a;
327*7c478bd9Sstevel@tonic-gate DB_LSN __lsn;
328*7c478bd9Sstevel@tonic-gate memset(&__a, 0, sizeof(__a));
329*7c478bd9Sstevel@tonic-gate __a.data = cp->page;
330*7c478bd9Sstevel@tonic-gate __a.size = dbp->pgsize;
331*7c478bd9Sstevel@tonic-gate if (tp == NULL)
332*7c478bd9Sstevel@tonic-gate ZERO_LSN(__lsn);
333*7c478bd9Sstevel@tonic-gate if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
334*7c478bd9Sstevel@tonic-gate &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page),
335*7c478bd9Sstevel@tonic-gate &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
336*7c478bd9Sstevel@tonic-gate tp == NULL ? 0 : PGNO(tp),
337*7c478bd9Sstevel@tonic-gate tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0)
338*7c478bd9Sstevel@tonic-gate goto err;
339*7c478bd9Sstevel@tonic-gate
340*7c478bd9Sstevel@tonic-gate LSN(lp) = LSN(rp) = LSN(cp->page);
341*7c478bd9Sstevel@tonic-gate if (tp != NULL)
342*7c478bd9Sstevel@tonic-gate LSN(tp) = LSN(cp->page);
343*7c478bd9Sstevel@tonic-gate }
344*7c478bd9Sstevel@tonic-gate
345*7c478bd9Sstevel@tonic-gate /* Copy the allocated page into place. */
346*7c478bd9Sstevel@tonic-gate memcpy(cp->page, lp, LOFFSET(lp));
347*7c478bd9Sstevel@tonic-gate memcpy((u_int8_t *)cp->page + HOFFSET(lp),
348*7c478bd9Sstevel@tonic-gate (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
349*7c478bd9Sstevel@tonic-gate __os_free(lp, dbp->pgsize);
350*7c478bd9Sstevel@tonic-gate lp = NULL;
351*7c478bd9Sstevel@tonic-gate
352*7c478bd9Sstevel@tonic-gate /* Finish the next-page link. */
353*7c478bd9Sstevel@tonic-gate if (tp != NULL)
354*7c478bd9Sstevel@tonic-gate tp->prev_pgno = rp->pgno;
355*7c478bd9Sstevel@tonic-gate
356*7c478bd9Sstevel@tonic-gate /* Adjust any cursors. Do so last so we don't have to undo it. */
357*7c478bd9Sstevel@tonic-gate __bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0);
358*7c478bd9Sstevel@tonic-gate
359*7c478bd9Sstevel@tonic-gate /* Success -- write the real pages back to the store. */
360*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY);
361*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, pp->lock);
362*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
363*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock);
364*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
365*7c478bd9Sstevel@tonic-gate if (tp != NULL) {
366*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY);
367*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, tplock);
368*7c478bd9Sstevel@tonic-gate }
369*7c478bd9Sstevel@tonic-gate return (0);
370*7c478bd9Sstevel@tonic-gate
371*7c478bd9Sstevel@tonic-gate err: if (lp != NULL)
372*7c478bd9Sstevel@tonic-gate __os_free(lp, dbp->pgsize);
373*7c478bd9Sstevel@tonic-gate if (rp != NULL)
374*7c478bd9Sstevel@tonic-gate (void)__bam_free(dbc, rp);
375*7c478bd9Sstevel@tonic-gate if (tp != NULL) {
376*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, tp, 0);
377*7c478bd9Sstevel@tonic-gate if (ret == DB_NEEDSPLIT)
378*7c478bd9Sstevel@tonic-gate (void)__BT_LPUT(dbc, tplock);
379*7c478bd9Sstevel@tonic-gate else
380*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, tplock);
381*7c478bd9Sstevel@tonic-gate }
382*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, pp->page, 0);
383*7c478bd9Sstevel@tonic-gate if (ret == DB_NEEDSPLIT)
384*7c478bd9Sstevel@tonic-gate (void)__BT_LPUT(dbc, pp->lock);
385*7c478bd9Sstevel@tonic-gate else
386*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, pp->lock);
387*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, 0);
388*7c478bd9Sstevel@tonic-gate if (ret == DB_NEEDSPLIT)
389*7c478bd9Sstevel@tonic-gate (void)__BT_LPUT(dbc, cp->lock);
390*7c478bd9Sstevel@tonic-gate else
391*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock);
392*7c478bd9Sstevel@tonic-gate return (ret);
393*7c478bd9Sstevel@tonic-gate }
394*7c478bd9Sstevel@tonic-gate
395*7c478bd9Sstevel@tonic-gate /*
396*7c478bd9Sstevel@tonic-gate * __bam_broot --
397*7c478bd9Sstevel@tonic-gate * Fix up the btree root page after it has been split.
398*7c478bd9Sstevel@tonic-gate */
399*7c478bd9Sstevel@tonic-gate static int
__bam_broot(dbc,rootp,lp,rp)400*7c478bd9Sstevel@tonic-gate __bam_broot(dbc, rootp, lp, rp)
401*7c478bd9Sstevel@tonic-gate DBC *dbc;
402*7c478bd9Sstevel@tonic-gate PAGE *rootp, *lp, *rp;
403*7c478bd9Sstevel@tonic-gate {
404*7c478bd9Sstevel@tonic-gate BINTERNAL bi, *child_bi;
405*7c478bd9Sstevel@tonic-gate BKEYDATA *child_bk;
406*7c478bd9Sstevel@tonic-gate DB *dbp;
407*7c478bd9Sstevel@tonic-gate DBT hdr, data;
408*7c478bd9Sstevel@tonic-gate int ret;
409*7c478bd9Sstevel@tonic-gate
410*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp;
411*7c478bd9Sstevel@tonic-gate
412*7c478bd9Sstevel@tonic-gate /*
413*7c478bd9Sstevel@tonic-gate * If the root page was a leaf page, change it into an internal page.
414*7c478bd9Sstevel@tonic-gate * We copy the key we split on (but not the key's data, in the case of
415*7c478bd9Sstevel@tonic-gate * a leaf page) to the new root page.
416*7c478bd9Sstevel@tonic-gate */
417*7c478bd9Sstevel@tonic-gate P_INIT(rootp, dbp->pgsize,
418*7c478bd9Sstevel@tonic-gate PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
419*7c478bd9Sstevel@tonic-gate
420*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data));
421*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr));
422*7c478bd9Sstevel@tonic-gate
423*7c478bd9Sstevel@tonic-gate /*
424*7c478bd9Sstevel@tonic-gate * The btree comparison code guarantees that the left-most key on any
425*7c478bd9Sstevel@tonic-gate * level of the tree is never used, so it doesn't need to be filled in.
426*7c478bd9Sstevel@tonic-gate */
427*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi));
428*7c478bd9Sstevel@tonic-gate bi.len = 0;
429*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, B_KEYDATA, 0);
430*7c478bd9Sstevel@tonic-gate bi.pgno = lp->pgno;
431*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) {
432*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(lp);
433*7c478bd9Sstevel@tonic-gate RE_NREC_SET(rootp, bi.nrecs);
434*7c478bd9Sstevel@tonic-gate }
435*7c478bd9Sstevel@tonic-gate hdr.data = &bi;
436*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data);
437*7c478bd9Sstevel@tonic-gate if ((ret =
438*7c478bd9Sstevel@tonic-gate __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0)
439*7c478bd9Sstevel@tonic-gate return (ret);
440*7c478bd9Sstevel@tonic-gate
441*7c478bd9Sstevel@tonic-gate switch (TYPE(rp)) {
442*7c478bd9Sstevel@tonic-gate case P_IBTREE:
443*7c478bd9Sstevel@tonic-gate /* Copy the first key of the child page onto the root page. */
444*7c478bd9Sstevel@tonic-gate child_bi = GET_BINTERNAL(rp, 0);
445*7c478bd9Sstevel@tonic-gate
446*7c478bd9Sstevel@tonic-gate bi.len = child_bi->len;
447*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bi->type, 0);
448*7c478bd9Sstevel@tonic-gate bi.pgno = rp->pgno;
449*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) {
450*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(rp);
451*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, bi.nrecs);
452*7c478bd9Sstevel@tonic-gate }
453*7c478bd9Sstevel@tonic-gate hdr.data = &bi;
454*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data);
455*7c478bd9Sstevel@tonic-gate data.data = child_bi->data;
456*7c478bd9Sstevel@tonic-gate data.size = child_bi->len;
457*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1,
458*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
459*7c478bd9Sstevel@tonic-gate return (ret);
460*7c478bd9Sstevel@tonic-gate
461*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */
462*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bi->type) == B_OVERFLOW)
463*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc,
464*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
465*7c478bd9Sstevel@tonic-gate return (ret);
466*7c478bd9Sstevel@tonic-gate break;
467*7c478bd9Sstevel@tonic-gate case P_LBTREE:
468*7c478bd9Sstevel@tonic-gate /* Copy the first key of the child page onto the root page. */
469*7c478bd9Sstevel@tonic-gate child_bk = GET_BKEYDATA(rp, 0);
470*7c478bd9Sstevel@tonic-gate switch (B_TYPE(child_bk->type)) {
471*7c478bd9Sstevel@tonic-gate case B_KEYDATA:
472*7c478bd9Sstevel@tonic-gate bi.len = child_bk->len;
473*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0);
474*7c478bd9Sstevel@tonic-gate bi.pgno = rp->pgno;
475*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) {
476*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(rp);
477*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, bi.nrecs);
478*7c478bd9Sstevel@tonic-gate }
479*7c478bd9Sstevel@tonic-gate hdr.data = &bi;
480*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data);
481*7c478bd9Sstevel@tonic-gate data.data = child_bk->data;
482*7c478bd9Sstevel@tonic-gate data.size = child_bk->len;
483*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1,
484*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0)
485*7c478bd9Sstevel@tonic-gate return (ret);
486*7c478bd9Sstevel@tonic-gate break;
487*7c478bd9Sstevel@tonic-gate case B_DUPLICATE:
488*7c478bd9Sstevel@tonic-gate case B_OVERFLOW:
489*7c478bd9Sstevel@tonic-gate bi.len = BOVERFLOW_SIZE;
490*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0);
491*7c478bd9Sstevel@tonic-gate bi.pgno = rp->pgno;
492*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) {
493*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(rp);
494*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, bi.nrecs);
495*7c478bd9Sstevel@tonic-gate }
496*7c478bd9Sstevel@tonic-gate hdr.data = &bi;
497*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data);
498*7c478bd9Sstevel@tonic-gate data.data = child_bk;
499*7c478bd9Sstevel@tonic-gate data.size = BOVERFLOW_SIZE;
500*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1,
501*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
502*7c478bd9Sstevel@tonic-gate return (ret);
503*7c478bd9Sstevel@tonic-gate
504*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */
505*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bk->type) == B_OVERFLOW)
506*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc,
507*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
508*7c478bd9Sstevel@tonic-gate return (ret);
509*7c478bd9Sstevel@tonic-gate break;
510*7c478bd9Sstevel@tonic-gate default:
511*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rp->pgno));
512*7c478bd9Sstevel@tonic-gate }
513*7c478bd9Sstevel@tonic-gate break;
514*7c478bd9Sstevel@tonic-gate default:
515*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rp->pgno));
516*7c478bd9Sstevel@tonic-gate }
517*7c478bd9Sstevel@tonic-gate return (0);
518*7c478bd9Sstevel@tonic-gate }
519*7c478bd9Sstevel@tonic-gate
520*7c478bd9Sstevel@tonic-gate /*
521*7c478bd9Sstevel@tonic-gate * __ram_root --
522*7c478bd9Sstevel@tonic-gate * Fix up the recno root page after it has been split.
523*7c478bd9Sstevel@tonic-gate */
524*7c478bd9Sstevel@tonic-gate static int
__ram_root(dbc,rootp,lp,rp)525*7c478bd9Sstevel@tonic-gate __ram_root(dbc, rootp, lp, rp)
526*7c478bd9Sstevel@tonic-gate DBC *dbc;
527*7c478bd9Sstevel@tonic-gate PAGE *rootp, *lp, *rp;
528*7c478bd9Sstevel@tonic-gate {
529*7c478bd9Sstevel@tonic-gate DB *dbp;
530*7c478bd9Sstevel@tonic-gate DBT hdr;
531*7c478bd9Sstevel@tonic-gate RINTERNAL ri;
532*7c478bd9Sstevel@tonic-gate int ret;
533*7c478bd9Sstevel@tonic-gate
534*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp;
535*7c478bd9Sstevel@tonic-gate
536*7c478bd9Sstevel@tonic-gate /* Initialize the page. */
537*7c478bd9Sstevel@tonic-gate P_INIT(rootp, dbp->pgsize,
538*7c478bd9Sstevel@tonic-gate PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
539*7c478bd9Sstevel@tonic-gate
540*7c478bd9Sstevel@tonic-gate /* Initialize the header. */
541*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr));
542*7c478bd9Sstevel@tonic-gate hdr.data = &ri;
543*7c478bd9Sstevel@tonic-gate hdr.size = RINTERNAL_SIZE;
544*7c478bd9Sstevel@tonic-gate
545*7c478bd9Sstevel@tonic-gate /* Insert the left and right keys, set the header information. */
546*7c478bd9Sstevel@tonic-gate ri.pgno = lp->pgno;
547*7c478bd9Sstevel@tonic-gate ri.nrecs = __bam_total(lp);
548*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
549*7c478bd9Sstevel@tonic-gate return (ret);
550*7c478bd9Sstevel@tonic-gate RE_NREC_SET(rootp, ri.nrecs);
551*7c478bd9Sstevel@tonic-gate ri.pgno = rp->pgno;
552*7c478bd9Sstevel@tonic-gate ri.nrecs = __bam_total(rp);
553*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
554*7c478bd9Sstevel@tonic-gate return (ret);
555*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, ri.nrecs);
556*7c478bd9Sstevel@tonic-gate return (0);
557*7c478bd9Sstevel@tonic-gate }
558*7c478bd9Sstevel@tonic-gate
559*7c478bd9Sstevel@tonic-gate /*
560*7c478bd9Sstevel@tonic-gate * __bam_pinsert --
561*7c478bd9Sstevel@tonic-gate * Insert a new key into a parent page, completing the split.
562*7c478bd9Sstevel@tonic-gate */
563*7c478bd9Sstevel@tonic-gate static int
__bam_pinsert(dbc,parent,lchild,rchild)564*7c478bd9Sstevel@tonic-gate __bam_pinsert(dbc, parent, lchild, rchild)
565*7c478bd9Sstevel@tonic-gate DBC *dbc;
566*7c478bd9Sstevel@tonic-gate EPG *parent;
567*7c478bd9Sstevel@tonic-gate PAGE *lchild, *rchild;
568*7c478bd9Sstevel@tonic-gate {
569*7c478bd9Sstevel@tonic-gate BINTERNAL bi, *child_bi;
570*7c478bd9Sstevel@tonic-gate BKEYDATA *child_bk, *tmp_bk;
571*7c478bd9Sstevel@tonic-gate BTREE *t;
572*7c478bd9Sstevel@tonic-gate DB *dbp;
573*7c478bd9Sstevel@tonic-gate DBT a, b, hdr, data;
574*7c478bd9Sstevel@tonic-gate PAGE *ppage;
575*7c478bd9Sstevel@tonic-gate RINTERNAL ri;
576*7c478bd9Sstevel@tonic-gate db_indx_t off;
577*7c478bd9Sstevel@tonic-gate db_recno_t nrecs;
578*7c478bd9Sstevel@tonic-gate u_int32_t n, nbytes, nksize;
579*7c478bd9Sstevel@tonic-gate int ret;
580*7c478bd9Sstevel@tonic-gate
581*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp;
582*7c478bd9Sstevel@tonic-gate t = dbp->internal;
583*7c478bd9Sstevel@tonic-gate ppage = parent->page;
584*7c478bd9Sstevel@tonic-gate
585*7c478bd9Sstevel@tonic-gate /* If handling record numbers, count records split to the right page. */
586*7c478bd9Sstevel@tonic-gate nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ?
587*7c478bd9Sstevel@tonic-gate __bam_total(rchild) : 0;
588*7c478bd9Sstevel@tonic-gate
589*7c478bd9Sstevel@tonic-gate /*
590*7c478bd9Sstevel@tonic-gate * Now we insert the new page's first key into the parent page, which
591*7c478bd9Sstevel@tonic-gate * completes the split. The parent points to a PAGE and a page index
592*7c478bd9Sstevel@tonic-gate * offset, where the new key goes ONE AFTER the index, because we split
593*7c478bd9Sstevel@tonic-gate * to the right.
594*7c478bd9Sstevel@tonic-gate *
595*7c478bd9Sstevel@tonic-gate * XXX
596*7c478bd9Sstevel@tonic-gate * Some btree algorithms replace the key for the old page as well as
597*7c478bd9Sstevel@tonic-gate * the new page. We don't, as there's no reason to believe that the
598*7c478bd9Sstevel@tonic-gate * first key on the old page is any better than the key we have, and,
599*7c478bd9Sstevel@tonic-gate * in the case of a key being placed at index 0 causing the split, the
600*7c478bd9Sstevel@tonic-gate * key is unavailable.
601*7c478bd9Sstevel@tonic-gate */
602*7c478bd9Sstevel@tonic-gate off = parent->indx + O_INDX;
603*7c478bd9Sstevel@tonic-gate
604*7c478bd9Sstevel@tonic-gate /*
605*7c478bd9Sstevel@tonic-gate * Calculate the space needed on the parent page.
606*7c478bd9Sstevel@tonic-gate *
607*7c478bd9Sstevel@tonic-gate * Prefix trees: space hack used when inserting into BINTERNAL pages.
608*7c478bd9Sstevel@tonic-gate * Retain only what's needed to distinguish between the new entry and
609*7c478bd9Sstevel@tonic-gate * the LAST entry on the page to its left. If the keys compare equal,
610*7c478bd9Sstevel@tonic-gate * retain the entire key. We ignore overflow keys, and the entire key
611*7c478bd9Sstevel@tonic-gate * must be retained for the next-to-leftmost key on the leftmost page
612*7c478bd9Sstevel@tonic-gate * of each level, or the search will fail. Applicable ONLY to internal
613*7c478bd9Sstevel@tonic-gate * pages that have leaf pages as children. Further reduction of the
614*7c478bd9Sstevel@tonic-gate * key between pairs of internal pages loses too much information.
615*7c478bd9Sstevel@tonic-gate */
616*7c478bd9Sstevel@tonic-gate switch (TYPE(rchild)) {
617*7c478bd9Sstevel@tonic-gate case P_IBTREE:
618*7c478bd9Sstevel@tonic-gate child_bi = GET_BINTERNAL(rchild, 0);
619*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_PSIZE(child_bi->len);
620*7c478bd9Sstevel@tonic-gate
621*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes)
622*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT);
623*7c478bd9Sstevel@tonic-gate
624*7c478bd9Sstevel@tonic-gate /* Add a new record for the right page. */
625*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi));
626*7c478bd9Sstevel@tonic-gate bi.len = child_bi->len;
627*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bi->type, 0);
628*7c478bd9Sstevel@tonic-gate bi.pgno = rchild->pgno;
629*7c478bd9Sstevel@tonic-gate bi.nrecs = nrecs;
630*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr));
631*7c478bd9Sstevel@tonic-gate hdr.data = &bi;
632*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data);
633*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data));
634*7c478bd9Sstevel@tonic-gate data.data = child_bi->data;
635*7c478bd9Sstevel@tonic-gate data.size = child_bi->len;
636*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, ppage, off,
637*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
638*7c478bd9Sstevel@tonic-gate return (ret);
639*7c478bd9Sstevel@tonic-gate
640*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */
641*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bi->type) == B_OVERFLOW)
642*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc,
643*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
644*7c478bd9Sstevel@tonic-gate return (ret);
645*7c478bd9Sstevel@tonic-gate break;
646*7c478bd9Sstevel@tonic-gate case P_LBTREE:
647*7c478bd9Sstevel@tonic-gate child_bk = GET_BKEYDATA(rchild, 0);
648*7c478bd9Sstevel@tonic-gate switch (B_TYPE(child_bk->type)) {
649*7c478bd9Sstevel@tonic-gate case B_KEYDATA:
650*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_PSIZE(child_bk->len);
651*7c478bd9Sstevel@tonic-gate nksize = child_bk->len;
652*7c478bd9Sstevel@tonic-gate if (t->bt_prefix == NULL)
653*7c478bd9Sstevel@tonic-gate goto noprefix;
654*7c478bd9Sstevel@tonic-gate if (ppage->prev_pgno == PGNO_INVALID && off <= 1)
655*7c478bd9Sstevel@tonic-gate goto noprefix;
656*7c478bd9Sstevel@tonic-gate tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX);
657*7c478bd9Sstevel@tonic-gate if (B_TYPE(tmp_bk->type) != B_KEYDATA)
658*7c478bd9Sstevel@tonic-gate goto noprefix;
659*7c478bd9Sstevel@tonic-gate memset(&a, 0, sizeof(a));
660*7c478bd9Sstevel@tonic-gate a.size = tmp_bk->len;
661*7c478bd9Sstevel@tonic-gate a.data = tmp_bk->data;
662*7c478bd9Sstevel@tonic-gate memset(&b, 0, sizeof(b));
663*7c478bd9Sstevel@tonic-gate b.size = child_bk->len;
664*7c478bd9Sstevel@tonic-gate b.data = child_bk->data;
665*7c478bd9Sstevel@tonic-gate nksize = t->bt_prefix(&a, &b);
666*7c478bd9Sstevel@tonic-gate if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
667*7c478bd9Sstevel@tonic-gate nbytes = n;
668*7c478bd9Sstevel@tonic-gate else
669*7c478bd9Sstevel@tonic-gate noprefix: nksize = child_bk->len;
670*7c478bd9Sstevel@tonic-gate
671*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes)
672*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT);
673*7c478bd9Sstevel@tonic-gate
674*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi));
675*7c478bd9Sstevel@tonic-gate bi.len = nksize;
676*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0);
677*7c478bd9Sstevel@tonic-gate bi.pgno = rchild->pgno;
678*7c478bd9Sstevel@tonic-gate bi.nrecs = nrecs;
679*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr));
680*7c478bd9Sstevel@tonic-gate hdr.data = &bi;
681*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data);
682*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data));
683*7c478bd9Sstevel@tonic-gate data.data = child_bk->data;
684*7c478bd9Sstevel@tonic-gate data.size = nksize;
685*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, ppage, off,
686*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)
687*7c478bd9Sstevel@tonic-gate return (ret);
688*7c478bd9Sstevel@tonic-gate break;
689*7c478bd9Sstevel@tonic-gate case B_DUPLICATE:
690*7c478bd9Sstevel@tonic-gate case B_OVERFLOW:
691*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
692*7c478bd9Sstevel@tonic-gate
693*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes)
694*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT);
695*7c478bd9Sstevel@tonic-gate
696*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi));
697*7c478bd9Sstevel@tonic-gate bi.len = BOVERFLOW_SIZE;
698*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0);
699*7c478bd9Sstevel@tonic-gate bi.pgno = rchild->pgno;
700*7c478bd9Sstevel@tonic-gate bi.nrecs = nrecs;
701*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr));
702*7c478bd9Sstevel@tonic-gate hdr.data = &bi;
703*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data);
704*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data));
705*7c478bd9Sstevel@tonic-gate data.data = child_bk;
706*7c478bd9Sstevel@tonic-gate data.size = BOVERFLOW_SIZE;
707*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, ppage, off,
708*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
709*7c478bd9Sstevel@tonic-gate return (ret);
710*7c478bd9Sstevel@tonic-gate
711*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */
712*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bk->type) == B_OVERFLOW)
713*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc,
714*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
715*7c478bd9Sstevel@tonic-gate return (ret);
716*7c478bd9Sstevel@tonic-gate break;
717*7c478bd9Sstevel@tonic-gate default:
718*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rchild->pgno));
719*7c478bd9Sstevel@tonic-gate }
720*7c478bd9Sstevel@tonic-gate break;
721*7c478bd9Sstevel@tonic-gate case P_IRECNO:
722*7c478bd9Sstevel@tonic-gate case P_LRECNO:
723*7c478bd9Sstevel@tonic-gate nbytes = RINTERNAL_PSIZE;
724*7c478bd9Sstevel@tonic-gate
725*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes)
726*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT);
727*7c478bd9Sstevel@tonic-gate
728*7c478bd9Sstevel@tonic-gate /* Add a new record for the right page. */
729*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr));
730*7c478bd9Sstevel@tonic-gate hdr.data = &ri;
731*7c478bd9Sstevel@tonic-gate hdr.size = RINTERNAL_SIZE;
732*7c478bd9Sstevel@tonic-gate ri.pgno = rchild->pgno;
733*7c478bd9Sstevel@tonic-gate ri.nrecs = nrecs;
734*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc,
735*7c478bd9Sstevel@tonic-gate ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)
736*7c478bd9Sstevel@tonic-gate return (ret);
737*7c478bd9Sstevel@tonic-gate break;
738*7c478bd9Sstevel@tonic-gate default:
739*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rchild->pgno));
740*7c478bd9Sstevel@tonic-gate }
741*7c478bd9Sstevel@tonic-gate
742*7c478bd9Sstevel@tonic-gate /* Adjust the parent page's left page record count. */
743*7c478bd9Sstevel@tonic-gate if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
744*7c478bd9Sstevel@tonic-gate /* Log the change. */
745*7c478bd9Sstevel@tonic-gate if (DB_LOGGING(dbc) &&
746*7c478bd9Sstevel@tonic-gate (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
747*7c478bd9Sstevel@tonic-gate dbc->txn, &LSN(ppage), 0, dbp->log_fileid,
748*7c478bd9Sstevel@tonic-gate PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx,
749*7c478bd9Sstevel@tonic-gate -(int32_t)nrecs, (int32_t)0)) != 0)
750*7c478bd9Sstevel@tonic-gate return (ret);
751*7c478bd9Sstevel@tonic-gate
752*7c478bd9Sstevel@tonic-gate /* Update the left page count. */
753*7c478bd9Sstevel@tonic-gate if (dbp->type == DB_RECNO)
754*7c478bd9Sstevel@tonic-gate GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
755*7c478bd9Sstevel@tonic-gate else
756*7c478bd9Sstevel@tonic-gate GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
757*7c478bd9Sstevel@tonic-gate }
758*7c478bd9Sstevel@tonic-gate
759*7c478bd9Sstevel@tonic-gate return (0);
760*7c478bd9Sstevel@tonic-gate }
761*7c478bd9Sstevel@tonic-gate
762*7c478bd9Sstevel@tonic-gate /*
763*7c478bd9Sstevel@tonic-gate * __bam_psplit --
764*7c478bd9Sstevel@tonic-gate * Do the real work of splitting the page.
765*7c478bd9Sstevel@tonic-gate */
766*7c478bd9Sstevel@tonic-gate static int
__bam_psplit(dbc,cp,lp,rp,splitret)767*7c478bd9Sstevel@tonic-gate __bam_psplit(dbc, cp, lp, rp, splitret)
768*7c478bd9Sstevel@tonic-gate DBC *dbc;
769*7c478bd9Sstevel@tonic-gate EPG *cp;
770*7c478bd9Sstevel@tonic-gate PAGE *lp, *rp;
771*7c478bd9Sstevel@tonic-gate db_indx_t *splitret;
772*7c478bd9Sstevel@tonic-gate {
773*7c478bd9Sstevel@tonic-gate DB *dbp;
774*7c478bd9Sstevel@tonic-gate PAGE *pp;
775*7c478bd9Sstevel@tonic-gate db_indx_t half, nbytes, off, splitp, top;
776*7c478bd9Sstevel@tonic-gate int adjust, cnt, isbigkey, ret;
777*7c478bd9Sstevel@tonic-gate
778*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp;
779*7c478bd9Sstevel@tonic-gate pp = cp->page;
780*7c478bd9Sstevel@tonic-gate adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
781*7c478bd9Sstevel@tonic-gate
782*7c478bd9Sstevel@tonic-gate /*
783*7c478bd9Sstevel@tonic-gate * If we're splitting the first (last) page on a level because we're
784*7c478bd9Sstevel@tonic-gate * inserting (appending) a key to it, it's likely that the data is
785*7c478bd9Sstevel@tonic-gate * sorted. Moving a single item to the new page is less work and can
786*7c478bd9Sstevel@tonic-gate * push the fill factor higher than normal. If we're wrong it's not
787*7c478bd9Sstevel@tonic-gate * a big deal, we'll just do the split the right way next time.
788*7c478bd9Sstevel@tonic-gate */
789*7c478bd9Sstevel@tonic-gate off = 0;
790*7c478bd9Sstevel@tonic-gate if (NEXT_PGNO(pp) == PGNO_INVALID &&
791*7c478bd9Sstevel@tonic-gate ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) ||
792*7c478bd9Sstevel@tonic-gate (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page))))
793*7c478bd9Sstevel@tonic-gate off = NUM_ENT(cp->page) - adjust;
794*7c478bd9Sstevel@tonic-gate else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
795*7c478bd9Sstevel@tonic-gate off = adjust;
796*7c478bd9Sstevel@tonic-gate
797*7c478bd9Sstevel@tonic-gate if (off != 0)
798*7c478bd9Sstevel@tonic-gate goto sort;
799*7c478bd9Sstevel@tonic-gate
800*7c478bd9Sstevel@tonic-gate /*
801*7c478bd9Sstevel@tonic-gate * Split the data to the left and right pages. Try not to split on
802*7c478bd9Sstevel@tonic-gate * an overflow key. (Overflow keys on internal pages will slow down
803*7c478bd9Sstevel@tonic-gate * searches.) Refuse to split in the middle of a set of duplicates.
804*7c478bd9Sstevel@tonic-gate *
805*7c478bd9Sstevel@tonic-gate * First, find the optimum place to split.
806*7c478bd9Sstevel@tonic-gate *
807*7c478bd9Sstevel@tonic-gate * It's possible to try and split past the last record on the page if
808*7c478bd9Sstevel@tonic-gate * there's a very large record at the end of the page. Make sure this
809*7c478bd9Sstevel@tonic-gate * doesn't happen by bounding the check at the next-to-last entry on
810*7c478bd9Sstevel@tonic-gate * the page.
811*7c478bd9Sstevel@tonic-gate *
812*7c478bd9Sstevel@tonic-gate * Note, we try and split half the data present on the page. This is
813*7c478bd9Sstevel@tonic-gate * because another process may have already split the page and left
814*7c478bd9Sstevel@tonic-gate * it half empty. We don't try and skip the split -- we don't know
815*7c478bd9Sstevel@tonic-gate * how much space we're going to need on the page, and we may need up
816*7c478bd9Sstevel@tonic-gate * to half the page for a big item, so there's no easy test to decide
817*7c478bd9Sstevel@tonic-gate * if we need to split or not. Besides, if two threads are inserting
818*7c478bd9Sstevel@tonic-gate * data into the same place in the database, we're probably going to
819*7c478bd9Sstevel@tonic-gate * need more space soon anyway.
820*7c478bd9Sstevel@tonic-gate */
821*7c478bd9Sstevel@tonic-gate top = NUM_ENT(pp) - adjust;
822*7c478bd9Sstevel@tonic-gate half = (dbp->pgsize - HOFFSET(pp)) / 2;
823*7c478bd9Sstevel@tonic-gate for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
824*7c478bd9Sstevel@tonic-gate switch (TYPE(pp)) {
825*7c478bd9Sstevel@tonic-gate case P_IBTREE:
826*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA)
827*7c478bd9Sstevel@tonic-gate nbytes +=
828*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len);
829*7c478bd9Sstevel@tonic-gate else
830*7c478bd9Sstevel@tonic-gate nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
831*7c478bd9Sstevel@tonic-gate break;
832*7c478bd9Sstevel@tonic-gate case P_LBTREE:
833*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
834*7c478bd9Sstevel@tonic-gate nbytes +=
835*7c478bd9Sstevel@tonic-gate BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
836*7c478bd9Sstevel@tonic-gate else
837*7c478bd9Sstevel@tonic-gate nbytes += BOVERFLOW_SIZE;
838*7c478bd9Sstevel@tonic-gate
839*7c478bd9Sstevel@tonic-gate ++off;
840*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
841*7c478bd9Sstevel@tonic-gate nbytes +=
842*7c478bd9Sstevel@tonic-gate BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
843*7c478bd9Sstevel@tonic-gate else
844*7c478bd9Sstevel@tonic-gate nbytes += BOVERFLOW_SIZE;
845*7c478bd9Sstevel@tonic-gate break;
846*7c478bd9Sstevel@tonic-gate case P_IRECNO:
847*7c478bd9Sstevel@tonic-gate nbytes += RINTERNAL_SIZE;
848*7c478bd9Sstevel@tonic-gate break;
849*7c478bd9Sstevel@tonic-gate case P_LRECNO:
850*7c478bd9Sstevel@tonic-gate nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
851*7c478bd9Sstevel@tonic-gate break;
852*7c478bd9Sstevel@tonic-gate default:
853*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, pp->pgno));
854*7c478bd9Sstevel@tonic-gate }
855*7c478bd9Sstevel@tonic-gate sort: splitp = off;
856*7c478bd9Sstevel@tonic-gate
857*7c478bd9Sstevel@tonic-gate /*
858*7c478bd9Sstevel@tonic-gate * Splitp is either at or just past the optimum split point. If
859*7c478bd9Sstevel@tonic-gate * it's a big key, try and find something close by that's not.
860*7c478bd9Sstevel@tonic-gate */
861*7c478bd9Sstevel@tonic-gate if (TYPE(pp) == P_IBTREE)
862*7c478bd9Sstevel@tonic-gate isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA;
863*7c478bd9Sstevel@tonic-gate else if (TYPE(pp) == P_LBTREE)
864*7c478bd9Sstevel@tonic-gate isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA;
865*7c478bd9Sstevel@tonic-gate else
866*7c478bd9Sstevel@tonic-gate isbigkey = 0;
867*7c478bd9Sstevel@tonic-gate if (isbigkey)
868*7c478bd9Sstevel@tonic-gate for (cnt = 1; cnt <= 3; ++cnt) {
869*7c478bd9Sstevel@tonic-gate off = splitp + cnt * adjust;
870*7c478bd9Sstevel@tonic-gate if (off < (db_indx_t)NUM_ENT(pp) &&
871*7c478bd9Sstevel@tonic-gate ((TYPE(pp) == P_IBTREE &&
872*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) ||
873*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) {
874*7c478bd9Sstevel@tonic-gate splitp = off;
875*7c478bd9Sstevel@tonic-gate break;
876*7c478bd9Sstevel@tonic-gate }
877*7c478bd9Sstevel@tonic-gate if (splitp <= (db_indx_t)(cnt * adjust))
878*7c478bd9Sstevel@tonic-gate continue;
879*7c478bd9Sstevel@tonic-gate off = splitp - cnt * adjust;
880*7c478bd9Sstevel@tonic-gate if (TYPE(pp) == P_IBTREE ?
881*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA :
882*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) {
883*7c478bd9Sstevel@tonic-gate splitp = off;
884*7c478bd9Sstevel@tonic-gate break;
885*7c478bd9Sstevel@tonic-gate }
886*7c478bd9Sstevel@tonic-gate }
887*7c478bd9Sstevel@tonic-gate
888*7c478bd9Sstevel@tonic-gate /*
889*7c478bd9Sstevel@tonic-gate * We can't split in the middle a set of duplicates. We know that
890*7c478bd9Sstevel@tonic-gate * no duplicate set can take up more than about 25% of the page,
891*7c478bd9Sstevel@tonic-gate * because that's the point where we push it off onto a duplicate
892*7c478bd9Sstevel@tonic-gate * page set. So, this loop can't be unbounded.
893*7c478bd9Sstevel@tonic-gate */
894*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE &&
895*7c478bd9Sstevel@tonic-gate pp->inp[splitp] == pp->inp[splitp - adjust])
896*7c478bd9Sstevel@tonic-gate for (cnt = 1;; ++cnt) {
897*7c478bd9Sstevel@tonic-gate off = splitp + cnt * adjust;
898*7c478bd9Sstevel@tonic-gate if (off < NUM_ENT(pp) &&
899*7c478bd9Sstevel@tonic-gate pp->inp[splitp] != pp->inp[off]) {
900*7c478bd9Sstevel@tonic-gate splitp = off;
901*7c478bd9Sstevel@tonic-gate break;
902*7c478bd9Sstevel@tonic-gate }
903*7c478bd9Sstevel@tonic-gate if (splitp <= (db_indx_t)(cnt * adjust))
904*7c478bd9Sstevel@tonic-gate continue;
905*7c478bd9Sstevel@tonic-gate off = splitp - cnt * adjust;
906*7c478bd9Sstevel@tonic-gate if (pp->inp[splitp] != pp->inp[off]) {
907*7c478bd9Sstevel@tonic-gate splitp = off + adjust;
908*7c478bd9Sstevel@tonic-gate break;
909*7c478bd9Sstevel@tonic-gate }
910*7c478bd9Sstevel@tonic-gate }
911*7c478bd9Sstevel@tonic-gate
912*7c478bd9Sstevel@tonic-gate
913*7c478bd9Sstevel@tonic-gate /* We're going to split at splitp. */
914*7c478bd9Sstevel@tonic-gate if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
915*7c478bd9Sstevel@tonic-gate return (ret);
916*7c478bd9Sstevel@tonic-gate if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
917*7c478bd9Sstevel@tonic-gate return (ret);
918*7c478bd9Sstevel@tonic-gate
919*7c478bd9Sstevel@tonic-gate *splitret = splitp;
920*7c478bd9Sstevel@tonic-gate return (0);
921*7c478bd9Sstevel@tonic-gate }
922*7c478bd9Sstevel@tonic-gate
923*7c478bd9Sstevel@tonic-gate /*
924*7c478bd9Sstevel@tonic-gate * __bam_copy --
925*7c478bd9Sstevel@tonic-gate * Copy a set of records from one page to another.
926*7c478bd9Sstevel@tonic-gate *
927*7c478bd9Sstevel@tonic-gate * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
928*7c478bd9Sstevel@tonic-gate */
929*7c478bd9Sstevel@tonic-gate int
__bam_copy(dbp,pp,cp,nxt,stop)930*7c478bd9Sstevel@tonic-gate __bam_copy(dbp, pp, cp, nxt, stop)
931*7c478bd9Sstevel@tonic-gate DB *dbp;
932*7c478bd9Sstevel@tonic-gate PAGE *pp, *cp;
933*7c478bd9Sstevel@tonic-gate u_int32_t nxt, stop;
934*7c478bd9Sstevel@tonic-gate {
935*7c478bd9Sstevel@tonic-gate db_indx_t nbytes, off;
936*7c478bd9Sstevel@tonic-gate
937*7c478bd9Sstevel@tonic-gate /*
938*7c478bd9Sstevel@tonic-gate * Copy the rest of the data to the right page. Nxt is the next
939*7c478bd9Sstevel@tonic-gate * offset placed on the target page.
940*7c478bd9Sstevel@tonic-gate */
941*7c478bd9Sstevel@tonic-gate for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
942*7c478bd9Sstevel@tonic-gate switch (TYPE(pp)) {
943*7c478bd9Sstevel@tonic-gate case P_IBTREE:
944*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA)
945*7c478bd9Sstevel@tonic-gate nbytes =
946*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len);
947*7c478bd9Sstevel@tonic-gate else
948*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
949*7c478bd9Sstevel@tonic-gate break;
950*7c478bd9Sstevel@tonic-gate case P_LBTREE:
951*7c478bd9Sstevel@tonic-gate /*
952*7c478bd9Sstevel@tonic-gate * If we're on a key and it's a duplicate, just copy
953*7c478bd9Sstevel@tonic-gate * the offset.
954*7c478bd9Sstevel@tonic-gate */
955*7c478bd9Sstevel@tonic-gate if (off != 0 && (nxt % P_INDX) == 0 &&
956*7c478bd9Sstevel@tonic-gate pp->inp[nxt] == pp->inp[nxt - P_INDX]) {
957*7c478bd9Sstevel@tonic-gate cp->inp[off] = cp->inp[off - P_INDX];
958*7c478bd9Sstevel@tonic-gate continue;
959*7c478bd9Sstevel@tonic-gate }
960*7c478bd9Sstevel@tonic-gate /* FALLTHROUGH */
961*7c478bd9Sstevel@tonic-gate case P_LRECNO:
962*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA)
963*7c478bd9Sstevel@tonic-gate nbytes =
964*7c478bd9Sstevel@tonic-gate BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len);
965*7c478bd9Sstevel@tonic-gate else
966*7c478bd9Sstevel@tonic-gate nbytes = BOVERFLOW_SIZE;
967*7c478bd9Sstevel@tonic-gate break;
968*7c478bd9Sstevel@tonic-gate case P_IRECNO:
969*7c478bd9Sstevel@tonic-gate nbytes = RINTERNAL_SIZE;
970*7c478bd9Sstevel@tonic-gate break;
971*7c478bd9Sstevel@tonic-gate default:
972*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, pp->pgno));
973*7c478bd9Sstevel@tonic-gate }
974*7c478bd9Sstevel@tonic-gate cp->inp[off] = HOFFSET(cp) -= nbytes;
975*7c478bd9Sstevel@tonic-gate memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes);
976*7c478bd9Sstevel@tonic-gate }
977*7c478bd9Sstevel@tonic-gate return (0);
978*7c478bd9Sstevel@tonic-gate }
979