1 #pragma ident	"%Z%%M%	%I%	%E% SMI"
2 
3 /*-
4  * Copyright (c) 1990, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Mike Olson.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #if defined(LIBC_SCCS) && !defined(lint)
40 static char sccsid[] = "@(#)bt_delete.c	8.13 (Berkeley) 7/28/94";
41 #endif /* LIBC_SCCS and not lint */
42 
43 #include <sys/types.h>
44 
45 #include <errno.h>
46 #include <stdio.h>
47 #include <string.h>
48 
49 #include "db-int.h"
50 #include "btree.h"
51 
52 static int __bt_bdelete __P((BTREE *, const DBT *));
53 static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int));
54 static int __bt_pdelete __P((BTREE *, PAGE *));
55 static int __bt_relink __P((BTREE *, PAGE *));
56 static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *));
57 
58 /*
59  * __bt_delete
60  *	Delete the item(s) referenced by a key.
61  *
62  * Return RET_SPECIAL if the key is not found.
63  */
64 int
65 __bt_delete(dbp, key, flags)
66 	const DB *dbp;
67 	const DBT *key;
68 	u_int flags;
69 {
70 	BTREE *t;
71 	CURSOR *c;
72 	PAGE *h;
73 	int status;
74 
75 	t = dbp->internal;
76 
77 	/* Toss any page pinned across calls. */
78 	if (t->bt_pinned != NULL) {
79 		mpool_put(t->bt_mp, t->bt_pinned, 0);
80 		t->bt_pinned = NULL;
81 	}
82 
83 	/* Check for change to a read-only tree. */
84 	if (F_ISSET(t, B_RDONLY)) {
85 		errno = EPERM;
86 		return (RET_ERROR);
87 	}
88 
89 	switch (flags) {
90 	case 0:
91 		status = __bt_bdelete(t, key);
92 		break;
93 	case R_CURSOR:
94 		/*
95 		 * If flags is R_CURSOR, delete the cursor.  Must already
96 		 * have started a scan and not have already deleted it.
97 		 */
98 		c = &t->bt_cursor;
99 		if (F_ISSET(c, CURS_INIT)) {
100 			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
101 				return (RET_SPECIAL);
102 			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
103 				return (RET_ERROR);
104 
105 			/*
106 			 * If the page is about to be emptied, we'll need to
107 			 * delete it, which means we have to acquire a stack.
108 			 */
109 			if (NEXTINDEX(h) == 1)
110 				if (__bt_stkacq(t, &h, &t->bt_cursor))
111 					return (RET_ERROR);
112 
113 			status = __bt_dleaf(t, NULL, h, c->pg.index);
114 
115 			if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
116 				if (__bt_pdelete(t, h))
117 					return (RET_ERROR);
118 			} else
119 				mpool_put(t->bt_mp,
120 				    h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);
121 			break;
122 		}
123 		/* FALLTHROUGH */
124 	default:
125 		errno = EINVAL;
126 		return (RET_ERROR);
127 	}
128 	if (status == RET_SUCCESS)
129 		F_SET(t, B_MODIFIED);
130 	return (status);
131 }
132 
133 /*
134  * __bt_stkacq --
135  *	Acquire a stack so we can delete a cursor entry.
136  *
137  * Parameters:
138  *	  t:	tree
139  *	 hp:	pointer to current, pinned PAGE pointer
140  *	  c:	pointer to the cursor
141  *
142  * Returns:
143  *	0 on success, 1 on failure
144  */
145 static int
146 __bt_stkacq(t, hp, c)
147 	BTREE *t;
148 	PAGE **hp;
149 	CURSOR *c;
150 {
151 	BINTERNAL *bi;
152 	EPG *e;
153 	EPGNO *parent;
154 	PAGE *h;
155 	indx_t idx;
156 	db_pgno_t pgno;
157 	recno_t nextpg, prevpg;
158 	int exact, level;
159 
160 	/*
161 	 * Find the first occurrence of the key in the tree.  Toss the
162 	 * currently locked page so we don't hit an already-locked page.
163 	 */
164 	h = *hp;
165 	mpool_put(t->bt_mp, h, 0);
166 	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
167 		return (1);
168 	h = e->page;
169 
170 	/* See if we got it in one shot. */
171 	if (h->pgno == c->pg.pgno)
172 		goto ret;
173 
174 	/*
175 	 * Move right, looking for the page.  At each move we have to move
176 	 * up the stack until we don't have to move to the next page.  If
177 	 * we have to change pages at an internal level, we have to fix the
178 	 * stack back up.
179 	 */
180 	while (h->pgno != c->pg.pgno) {
181 		if ((nextpg = h->nextpg) == P_INVALID)
182 			break;
183 		mpool_put(t->bt_mp, h, 0);
184 
185 		/* Move up the stack. */
186 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
187 			/* Get the parent page. */
188 			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
189 				return (1);
190 
191 			/* Move to the next index. */
192 			if (parent->index != NEXTINDEX(h) - 1) {
193 				idx = parent->index + 1;
194 				BT_PUSH(t, h->pgno, idx);
195 				break;
196 			}
197 			mpool_put(t->bt_mp, h, 0);
198 		}
199 
200 		/* Restore the stack. */
201 		while (level--) {
202 			/* Push the next level down onto the stack. */
203 			bi = GETBINTERNAL(h, idx);
204 			pgno = bi->pgno;
205 			BT_PUSH(t, pgno, 0);
206 
207 			/* Lose the currently pinned page. */
208 			mpool_put(t->bt_mp, h, 0);
209 
210 			/* Get the next level down. */
211 			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
212 				return (1);
213 			idx = 0;
214 		}
215 		mpool_put(t->bt_mp, h, 0);
216 		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
217 			return (1);
218 	}
219 
220 	if (h->pgno == c->pg.pgno)
221 		goto ret;
222 
223 	/* Reacquire the original stack. */
224 	mpool_put(t->bt_mp, h, 0);
225 	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
226 		return (1);
227 	h = e->page;
228 
229 	/*
230 	 * Move left, looking for the page.  At each move we have to move
231 	 * up the stack until we don't have to change pages to move to the
232 	 * next page.  If we have to change pages at an internal level, we
233 	 * have to fix the stack back up.
234 	 */
235 	while (h->pgno != c->pg.pgno) {
236 		if ((prevpg = h->prevpg) == P_INVALID)
237 			break;
238 		mpool_put(t->bt_mp, h, 0);
239 
240 		/* Move up the stack. */
241 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
242 			/* Get the parent page. */
243 			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
244 				return (1);
245 
246 			/* Move to the next index. */
247 			if (parent->index != 0) {
248 				idx = parent->index - 1;
249 				BT_PUSH(t, h->pgno, idx);
250 				break;
251 			}
252 			mpool_put(t->bt_mp, h, 0);
253 		}
254 
255 		/* Restore the stack. */
256 		while (level--) {
257 			/* Push the next level down onto the stack. */
258 			bi = GETBINTERNAL(h, idx);
259 			pgno = bi->pgno;
260 
261 			/* Lose the currently pinned page. */
262 			mpool_put(t->bt_mp, h, 0);
263 
264 			/* Get the next level down. */
265 			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
266 				return (1);
267 
268 			idx = NEXTINDEX(h) - 1;
269 			BT_PUSH(t, pgno, idx);
270 		}
271 		mpool_put(t->bt_mp, h, 0);
272 		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
273 			return (1);
274 	}
275 
276 
277 ret:	mpool_put(t->bt_mp, h, 0);
278 	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
279 }
280 
281 /*
282  * __bt_bdelete --
283  *	Delete all key/data pairs matching the specified key.
284  *
285  * Parameters:
286  *	  t:	tree
287  *	key:	key to delete
288  *
289  * Returns:
290  *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
291  */
292 static int
293 __bt_bdelete(t, key)
294 	BTREE *t;
295 	const DBT *key;
296 {
297 	EPG *e;
298 	PAGE *h;
299 	int deleted, exact, redo;
300 
301 	deleted = 0;
302 
303 	/* Find any matching record; __bt_search pins the page. */
304 loop:	if ((e = __bt_search(t, key, &exact)) == NULL)
305 		return (deleted ? RET_SUCCESS : RET_ERROR);
306 	if (!exact) {
307 		mpool_put(t->bt_mp, e->page, 0);
308 		return (deleted ? RET_SUCCESS : RET_SPECIAL);
309 	}
310 
311 	/*
312 	 * Delete forward, then delete backward, from the found key.  If
313 	 * there are duplicates and we reach either side of the page, do
314 	 * the key search again, so that we get them all.
315 	 */
316 	redo = 0;
317 	h = e->page;
318 	do {
319 		if (__bt_dleaf(t, key, h, e->index)) {
320 			mpool_put(t->bt_mp, h, 0);
321 			return (RET_ERROR);
322 		}
323 		if (F_ISSET(t, B_NODUPS)) {
324 			if (NEXTINDEX(h) == 0) {
325 				if (__bt_pdelete(t, h))
326 					return (RET_ERROR);
327 			} else
328 				mpool_put(t->bt_mp, h, MPOOL_DIRTY);
329 			return (RET_SUCCESS);
330 		}
331 		deleted = 1;
332 	} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
333 
334 	/* Check for right-hand edge of the page. */
335 	if (e->index == NEXTINDEX(h))
336 		redo = 1;
337 
338 	/* Delete from the key to the beginning of the page. */
339 	while (e->index-- > 0) {
340 		if (__bt_cmp(t, key, e) != 0)
341 			break;
342 		if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
343 			mpool_put(t->bt_mp, h, 0);
344 			return (RET_ERROR);
345 		}
346 		if (e->index == 0)
347 			redo = 1;
348 	}
349 
350 	/* Check for an empty page. */
351 	if (NEXTINDEX(h) == 0) {
352 		if (__bt_pdelete(t, h))
353 			return (RET_ERROR);
354 		goto loop;
355 	}
356 
357 	/* Put the page. */
358 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
359 
360 	if (redo)
361 		goto loop;
362 	return (RET_SUCCESS);
363 }
364 
365 /*
366  * __bt_pdelete --
367  *	Delete a single page from the tree.
368  *
369  * Parameters:
370  *	t:	tree
371  *	h:	leaf page
372  *
373  * Returns:
374  *	RET_SUCCESS, RET_ERROR.
375  *
376  * Side-effects:
377  *	mpool_put's the page
378  */
379 static int
380 __bt_pdelete(t, h)
381 	BTREE *t;
382 	PAGE *h;
383 {
384 	BINTERNAL *bi;
385 	PAGE *pg;
386 	EPGNO *parent;
387 	indx_t cnt, idx, *ip, offset;
388 	u_int32_t nksize;
389 	char *from;
390 
391 	/*
392 	 * Walk the parent page stack -- a LIFO stack of the pages that were
393 	 * traversed when we searched for the page where the delete occurred.
394 	 * Each stack entry is a page number and a page index offset.  The
395 	 * offset is for the page traversed on the search.  We've just deleted
396 	 * a page, so we have to delete the key from the parent page.
397 	 *
398 	 * If the delete from the parent page makes it empty, this process may
399 	 * continue all the way up the tree.  We stop if we reach the root page
400 	 * (which is never deleted, it's just not worth the effort) or if the
401 	 * delete does not empty the page.
402 	 */
403 	while ((parent = BT_POP(t)) != NULL) {
404 		/* Get the parent page. */
405 		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
406 			return (RET_ERROR);
407 
408 		idx = parent->index;
409 		bi = GETBINTERNAL(pg, idx);
410 
411 		/* Free any overflow pages. */
412 		if (bi->flags & P_BIGKEY &&
413 		    __ovfl_delete(t, bi->bytes) == RET_ERROR) {
414 			mpool_put(t->bt_mp, pg, 0);
415 			return (RET_ERROR);
416 		}
417 
418 		/*
419 		 * Free the parent if it has only the one key and it's not the
420 		 * root page. If it's the rootpage, turn it back into an empty
421 		 * leaf page.
422 		 */
423 		if (NEXTINDEX(pg) == 1)
424 			if (pg->pgno == P_ROOT) {
425 				pg->lower = BTDATAOFF;
426 				pg->upper = t->bt_psize;
427 				pg->flags = P_BLEAF;
428 			} else {
429 				if (__bt_relink(t, pg) || __bt_free(t, pg))
430 					return (RET_ERROR);
431 				continue;
432 			}
433 		else {
434 			/* Pack remaining key items at the end of the page. */
435 			nksize = NBINTERNAL(bi->ksize);
436 			from = (char *)pg + pg->upper;
437 			memmove(from + nksize, from, (char *)bi - from);
438 			pg->upper += nksize;
439 
440 			/* Adjust indices' offsets, shift the indices down. */
441 			offset = pg->linp[idx];
442 			for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip)
443 				if (ip[0] < offset)
444 					ip[0] += nksize;
445 			for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip)
446 				ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
447 			pg->lower -= sizeof(indx_t);
448 		}
449 
450 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
451 		break;
452 	}
453 
454 	/* Free the leaf page, as long as it wasn't the root. */
455 	if (h->pgno == P_ROOT) {
456 		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
457 		return (RET_SUCCESS);
458 	}
459 	return (__bt_relink(t, h) || __bt_free(t, h));
460 }
461 
462 /*
463  * __bt_dleaf --
464  *	Delete a single record from a leaf page.
465  *
466  * Parameters:
467  *	t:	tree
468  *    key:	referenced key
469  *	h:	page
470  *	idx:	index on page to delete
471  *
472  * Returns:
473  *	RET_SUCCESS, RET_ERROR.
474  */
475 int
476 __bt_dleaf(t, key, h, idx)
477 	BTREE *t;
478 	const DBT *key;
479 	PAGE *h;
480 	u_int idx;
481 {
482 	BLEAF *bl;
483 	indx_t cnt, *ip, offset;
484 	u_int32_t nbytes;
485 	void *to;
486 	char *from;
487 
488 	/* If this record is referenced by the cursor, delete the cursor. */
489 	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
490 	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
491 	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx &&
492 	    __bt_curdel(t, key, h, idx))
493 		return (RET_ERROR);
494 
495 	/* If the entry uses overflow pages, make them available for reuse. */
496 	to = bl = GETBLEAF(h, idx);
497 	if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
498 		return (RET_ERROR);
499 	if (bl->flags & P_BIGDATA &&
500 	    __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
501 		return (RET_ERROR);
502 
503 	/* Pack the remaining key/data items at the end of the page. */
504 	nbytes = NBLEAF(bl);
505 	from = (char *)h + h->upper;
506 	memmove(from + nbytes, from, (char *)to - from);
507 	h->upper += nbytes;
508 
509 	/* Adjust the indices' offsets, shift the indices down. */
510 	offset = h->linp[idx];
511 	for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip)
512 		if (ip[0] < offset)
513 			ip[0] += nbytes;
514 	for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip)
515 		ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
516 	h->lower -= sizeof(indx_t);
517 
518 	/* If the cursor is on this page, adjust it as necessary. */
519 	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
520 	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
521 	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx)
522 		--t->bt_cursor.pg.index;
523 
524 	return (RET_SUCCESS);
525 }
526 
527 /*
528  * __bt_curdel --
529  *	Delete the cursor.
530  *
531  * Parameters:
532  *	t:	tree
533  *    key:	referenced key (or NULL)
534  *	h:	page
535  *  idx:	idx on page to delete
536  *
537  * Returns:
538  *	RET_SUCCESS, RET_ERROR.
539  */
540 static int
541 __bt_curdel(t, key, h, idx)
542 	BTREE *t;
543 	const DBT *key;
544 	PAGE *h;
545 	u_int idx;
546 {
547 	CURSOR *c;
548 	EPG e;
549 	PAGE *pg;
550 	int curcopy, status;
551 
552 	/*
553 	 * If there are duplicates, move forward or backward to one.
554 	 * Otherwise, copy the key into the cursor area.
555 	 */
556 	c = &t->bt_cursor;
557 	F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);
558 
559 	curcopy = 0;
560 	if (!F_ISSET(t, B_NODUPS)) {
561 		/*
562 		 * We're going to have to do comparisons.  If we weren't
563 		 * provided a copy of the key, i.e. the user is deleting
564 		 * the current cursor position, get one.
565 		 */
566 		if (key == NULL) {
567 			e.page = h;
568 			e.index = idx;
569 			if ((status = __bt_ret(t, &e,
570 			    &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
571 				return (status);
572 			curcopy = 1;
573 			key = &c->key;
574 		}
575 		/* Check previous key, if not at the beginning of the page. */
576 		if (idx > 0) {
577 			e.page = h;
578 			e.index = idx - 1;
579 			if (__bt_cmp(t, key, &e) == 0) {
580 				F_SET(c, CURS_BEFORE);
581 				goto dup2;
582 			}
583 		}
584 		/* Check next key, if not at the end of the page. */
585 		if (idx < NEXTINDEX(h) - 1) {
586 			e.page = h;
587 			e.index = idx + 1;
588 			if (__bt_cmp(t, key, &e) == 0) {
589 				F_SET(c, CURS_AFTER);
590 				goto dup2;
591 			}
592 		}
593 		/* Check previous key if at the beginning of the page. */
594 		if (idx == 0 && h->prevpg != P_INVALID) {
595 			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
596 				return (RET_ERROR);
597 			e.page = pg;
598 			e.index = NEXTINDEX(pg) - 1;
599 			if (__bt_cmp(t, key, &e) == 0) {
600 				F_SET(c, CURS_BEFORE);
601 				goto dup1;
602 			}
603 			mpool_put(t->bt_mp, pg, 0);
604 		}
605 		/* Check next key if at the end of the page. */
606 		if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
607 			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
608 				return (RET_ERROR);
609 			e.page = pg;
610 			e.index = 0;
611 			if (__bt_cmp(t, key, &e) == 0) {
612 				F_SET(c, CURS_AFTER);
613 dup1:				mpool_put(t->bt_mp, pg, 0);
614 dup2:				c->pg.pgno = e.page->pgno;
615 				c->pg.index = e.index;
616 				return (RET_SUCCESS);
617 			}
618 			mpool_put(t->bt_mp, pg, 0);
619 		}
620 	}
621 	e.page = h;
622 	e.index = idx;
623 	if (curcopy || (status =
624 	    __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
625 		F_SET(c, CURS_ACQUIRE);
626 		return (RET_SUCCESS);
627 	}
628 	return (status);
629 }
630 
631 /*
632  * __bt_relink --
633  *	Link around a deleted page.
634  *
635  * Parameters:
636  *	t:	tree
637  *	h:	page to be deleted
638  */
639 static int
640 __bt_relink(t, h)
641 	BTREE *t;
642 	PAGE *h;
643 {
644 	PAGE *pg;
645 
646 	if (h->nextpg != P_INVALID) {
647 		if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
648 			return (RET_ERROR);
649 		pg->prevpg = h->prevpg;
650 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
651 	}
652 	if (h->prevpg != P_INVALID) {
653 		if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
654 			return (RET_ERROR);
655 		pg->nextpg = h->nextpg;
656 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
657 	}
658 	return (0);
659 }
660