xref: /illumos-gate/usr/src/lib/libsqlite/src/btree.c (revision 1da57d55)
1 /*
2  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7 ** 2001 September 15
8 **
9 ** The author disclaims copyright to this source code.  In place of
10 ** a legal notice, here is a blessing:
11 **
12 **    May you do good and not evil.
13 **    May you find forgiveness for yourself and forgive others.
14 **    May you share freely, never taking more than you give.
15 **
16 *************************************************************************
17 ** $Id: btree.c,v 1.103 2004/03/10 13:42:38 drh Exp $
18 **
19 ** This file implements a external (disk-based) database using BTrees.
20 ** For a detailed discussion of BTrees, refer to
21 **
22 **     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
23 **     "Sorting And Searching", pages 473-480. Addison-Wesley
24 **     Publishing Company, Reading, Massachusetts.
25 **
26 ** The basic idea is that each page of the file contains N database
27 ** entries and N+1 pointers to subpages.
28 **
29 **   ----------------------------------------------------------------
30 **   |  Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
31 **   ----------------------------------------------------------------
32 **
33 ** All of the keys on the page that Ptr(0) points to have values less
34 ** than Key(0).  All of the keys on page Ptr(1) and its subpages have
35 ** values greater than Key(0) and less than Key(1).  All of the keys
36 ** on Ptr(N+1) and its subpages have values greater than Key(N).  And
37 ** so forth.
38 **
39 ** Finding a particular key requires reading O(log(M)) pages from the
40 ** disk where M is the number of entries in the tree.
41 **
42 ** In this implementation, a single file can hold one or more separate
43 ** BTrees.  Each BTree is identified by the index of its root page.  The
44 ** key and data for any entry are combined to form the "payload".  Up to
45 ** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the
46 ** database page.  If the payload is larger than MX_LOCAL_PAYLOAD bytes
47 ** then surplus bytes are stored on overflow pages.  The payload for an
48 ** entry and the preceding pointer are combined to form a "Cell".  Each
49 ** page has a small header which contains the Ptr(N+1) pointer.
50 **
51 ** The first page of the file contains a magic string used to verify that
52 ** the file really is a valid BTree database, a pointer to a list of unused
53 ** pages in the file, and some meta information.  The root of the first
54 ** BTree begins on page 2 of the file.  (Pages are numbered beginning with
55 ** 1, not 0.)  Thus a minimum database contains 2 pages.
56 */
57 #include "sqliteInt.h"
58 #include "pager.h"
59 #include "btree.h"
60 #include <assert.h>
61 
62 /* Forward declarations */
63 static BtOps sqliteBtreeOps;
64 static BtCursorOps sqliteBtreeCursorOps;
65 
66 /*
67 ** Macros used for byteswapping.  B is a pointer to the Btree
68 ** structure.  This is needed to access the Btree.needSwab boolean
69 ** in order to tell if byte swapping is needed or not.
70 ** X is an unsigned integer.  SWAB16 byte swaps a 16-bit integer.
71 ** SWAB32 byteswaps a 32-bit integer.
72 */
73 #define SWAB16(B,X)   ((B)->needSwab? swab16((u16)X) : ((u16)X))
74 #define SWAB32(B,X)   ((B)->needSwab? swab32(X) : (X))
75 #define SWAB_ADD(B,X,A) \
76    if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); }
77 
78 /*
79 ** The following global variable - available only if SQLITE_TEST is
80 ** defined - is used to determine whether new databases are created in
81 ** native byte order or in non-native byte order.  Non-native byte order
82 ** databases are created for testing purposes only.  Under normal operation,
83 ** only native byte-order databases should be created, but we should be
84 ** able to read or write existing databases regardless of the byteorder.
85 */
86 #ifdef SQLITE_TEST
87 int btree_native_byte_order = 1;
88 #else
89 # define btree_native_byte_order 1
90 #endif
91 
92 /*
93 ** Forward declarations of structures used only in this file.
94 */
95 typedef struct PageOne PageOne;
96 typedef struct MemPage MemPage;
97 typedef struct PageHdr PageHdr;
98 typedef struct Cell Cell;
99 typedef struct CellHdr CellHdr;
100 typedef struct FreeBlk FreeBlk;
101 typedef struct OverflowPage OverflowPage;
102 typedef struct FreelistInfo FreelistInfo;
103 
104 /*
105 ** All structures on a database page are aligned to 4-byte boundries.
106 ** This routine rounds up a number of bytes to the next multiple of 4.
107 **
108 ** This might need to change for computer architectures that require
109 ** and 8-byte alignment boundry for structures.
110 */
111 #define ROUNDUP(X)  ((X+3) & ~3)
112 
113 /*
114 ** This is a magic string that appears at the beginning of every
115 ** SQLite database in order to identify the file as a real database.
116 */
117 static const char zMagicHeader[] =
118    "** This file contains an SQLite 2.1 database **";
119 #define MAGIC_SIZE (sizeof(zMagicHeader))
120 
121 /*
122 ** This is a magic integer also used to test the integrity of the database
123 ** file.  This integer is used in addition to the string above so that
124 ** if the file is written on a little-endian architecture and read
125 ** on a big-endian architectures (or vice versa) we can detect the
126 ** problem.
127 **
128 ** The number used was obtained at random and has no special
129 ** significance other than the fact that it represents a different
130 ** integer on little-endian and big-endian machines.
131 */
132 #define MAGIC 0xdae37528
133 
134 /*
135 ** The first page of the database file contains a magic header string
136 ** to identify the file as an SQLite database file.  It also contains
137 ** a pointer to the first free page of the file.  Page 2 contains the
138 ** root of the principle BTree.  The file might contain other BTrees
139 ** rooted on pages above 2.
140 **
141 ** The first page also contains SQLITE_N_BTREE_META integers that
142 ** can be used by higher-level routines.
143 **
144 ** Remember that pages are numbered beginning with 1.  (See pager.c
145 ** for additional information.)  Page 0 does not exist and a page
146 ** number of 0 is used to mean "no such page".
147 */
148 struct PageOne {
149   char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */
150   int iMagic;              /* Integer to verify correct byte order */
151   Pgno freeList;           /* First free page in a list of all free pages */
152   int nFree;               /* Number of pages on the free list */
153   int aMeta[SQLITE_N_BTREE_META-1];  /* User defined integers */
154 };
155 
156 /*
157 ** Each database page has a header that is an instance of this
158 ** structure.
159 **
160 ** PageHdr.firstFree is 0 if there is no free space on this page.
161 ** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a
162 ** FreeBlk structure that describes the first block of free space.
163 ** All free space is defined by a linked list of FreeBlk structures.
164 **
165 ** Data is stored in a linked list of Cell structures.  PageHdr.firstCell
166 ** is the index into MemPage.u.aDisk[] of the first cell on the page.  The
167 ** Cells are kept in sorted order.
168 **
169 ** A Cell contains all information about a database entry and a pointer
170 ** to a child page that contains other entries less than itself.  In
171 ** other words, the i-th Cell contains both Ptr(i) and Key(i).  The
172 ** right-most pointer of the page is contained in PageHdr.rightChild.
173 */
174 struct PageHdr {
175   Pgno rightChild;  /* Child page that comes after all cells on this page */
176   u16 firstCell;    /* Index in MemPage.u.aDisk[] of the first cell */
177   u16 firstFree;    /* Index in MemPage.u.aDisk[] of the first free block */
178 };
179 
180 /*
181 ** Entries on a page of the database are called "Cells".  Each Cell
182 ** has a header and data.  This structure defines the header.  The
183 ** key and data (collectively the "payload") follow this header on
184 ** the database page.
185 **
186 ** A definition of the complete Cell structure is given below.  The
187 ** header for the cell must be defined first in order to do some
188 ** of the sizing #defines that follow.
189 */
190 struct CellHdr {
191   Pgno leftChild; /* Child page that comes before this cell */
192   u16 nKey;       /* Number of bytes in the key */
193   u16 iNext;      /* Index in MemPage.u.aDisk[] of next cell in sorted order */
194   u8 nKeyHi;      /* Upper 8 bits of key size for keys larger than 64K bytes */
195   u8 nDataHi;     /* Upper 8 bits of data size when the size is more than 64K */
196   u16 nData;      /* Number of bytes of data */
197 };
198 
199 /*
200 ** The key and data size are split into a lower 16-bit segment and an
201 ** upper 8-bit segment in order to pack them together into a smaller
202 ** space.  The following macros reassembly a key or data size back
203 ** into an integer.
204 */
205 #define NKEY(b,h)  (SWAB16(b,h.nKey) + h.nKeyHi*65536)
206 #define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536)
207 
208 /*
209 ** The minimum size of a complete Cell.  The Cell must contain a header
210 ** and at least 4 bytes of payload.
211 */
212 #define MIN_CELL_SIZE  (sizeof(CellHdr)+4)
213 
214 /*
215 ** The maximum number of database entries that can be held in a single
216 ** page of the database.
217 */
218 #define MX_CELL ((SQLITE_USABLE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE)
219 
220 /*
221 ** The amount of usable space on a single page of the BTree.  This is the
222 ** page size minus the overhead of the page header.
223 */
224 #define USABLE_SPACE  (SQLITE_USABLE_SIZE - sizeof(PageHdr))
225 
226 /*
227 ** The maximum amount of payload (in bytes) that can be stored locally for
228 ** a database entry.  If the entry contains more data than this, the
229 ** extra goes onto overflow pages.
230 **
231 ** This number is chosen so that at least 4 cells will fit on every page.
232 */
233 #define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3)
234 
235 /*
236 ** Data on a database page is stored as a linked list of Cell structures.
237 ** Both the key and the data are stored in aPayload[].  The key always comes
238 ** first.  The aPayload[] field grows as necessary to hold the key and data,
239 ** up to a maximum of MX_LOCAL_PAYLOAD bytes.  If the size of the key and
240 ** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the
241 ** page number of the first overflow page.
242 **
243 ** Though this structure is fixed in size, the Cell on the database
244 ** page varies in size.  Every cell has a CellHdr and at least 4 bytes
245 ** of payload space.  Additional payload bytes (up to the maximum of
246 ** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as
247 ** needed.
248 */
249 struct Cell {
250   CellHdr h;                        /* The cell header */
251   char aPayload[MX_LOCAL_PAYLOAD];  /* Key and data */
252   Pgno ovfl;                        /* The first overflow page */
253 };
254 
255 /*
256 ** Free space on a page is remembered using a linked list of the FreeBlk
257 ** structures.  Space on a database page is allocated in increments of
258 ** at least 4 bytes and is always aligned to a 4-byte boundry.  The
259 ** linked list of FreeBlks is always kept in order by address.
260 */
261 struct FreeBlk {
262   u16 iSize;      /* Number of bytes in this block of free space */
263   u16 iNext;      /* Index in MemPage.u.aDisk[] of the next free block */
264 };
265 
266 /*
267 ** The number of bytes of payload that will fit on a single overflow page.
268 */
269 #define OVERFLOW_SIZE (SQLITE_USABLE_SIZE-sizeof(Pgno))
270 
271 /*
272 ** When the key and data for a single entry in the BTree will not fit in
273 ** the MX_LOCAL_PAYLOAD bytes of space available on the database page,
274 ** then all extra bytes are written to a linked list of overflow pages.
275 ** Each overflow page is an instance of the following structure.
276 **
277 ** Unused pages in the database are also represented by instances of
278 ** the OverflowPage structure.  The PageOne.freeList field is the
279 ** page number of the first page in a linked list of unused database
280 ** pages.
281 */
282 struct OverflowPage {
283   Pgno iNext;
284   char aPayload[OVERFLOW_SIZE];
285 };
286 
287 /*
288 ** The PageOne.freeList field points to a linked list of overflow pages
289 ** hold information about free pages.  The aPayload section of each
290 ** overflow page contains an instance of the following structure.  The
291 ** aFree[] array holds the page number of nFree unused pages in the disk
292 ** file.
293 */
294 struct FreelistInfo {
295   int nFree;
296   Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)];
297 };
298 
299 /*
300 ** For every page in the database file, an instance of the following structure
301 ** is stored in memory.  The u.aDisk[] array contains the raw bits read from
302 ** the disk.  The rest is auxiliary information held in memory only. The
303 ** auxiliary info is only valid for regular database pages - it is not
304 ** used for overflow pages and pages on the freelist.
305 **
306 ** Of particular interest in the auxiliary info is the apCell[] entry.  Each
307 ** apCell[] entry is a pointer to a Cell structure in u.aDisk[].  The cells are
308 ** put in this array so that they can be accessed in constant time, rather
309 ** than in linear time which would be needed if we had to walk the linked
310 ** list on every access.
311 **
312 ** Note that apCell[] contains enough space to hold up to two more Cells
313 ** than can possibly fit on one page.  In the steady state, every apCell[]
314 ** points to memory inside u.aDisk[].  But in the middle of an insert
315 ** operation, some apCell[] entries may temporarily point to data space
316 ** outside of u.aDisk[].  This is a transient situation that is quickly
317 ** resolved.  But while it is happening, it is possible for a database
318 ** page to hold as many as two more cells than it might otherwise hold.
319 ** The extra two entries in apCell[] are an allowance for this situation.
320 **
321 ** The pParent field points back to the parent page.  This allows us to
322 ** walk up the BTree from any leaf to the root.  Care must be taken to
323 ** unref() the parent page pointer when this page is no longer referenced.
324 ** The pageDestructor() routine handles that chore.
325 */
326 struct MemPage {
327   union u_page_data {
328     char aDisk[SQLITE_PAGE_SIZE];  /* Page data stored on disk */
329     PageHdr hdr;                   /* Overlay page header */
330   } u;
331   u8 isInit;                     /* True if auxiliary data is initialized */
332   u8 idxShift;                   /* True if apCell[] indices have changed */
333   u8 isOverfull;                 /* Some apCell[] points outside u.aDisk[] */
334   MemPage *pParent;              /* The parent of this page.  NULL for root */
335   int idxParent;                 /* Index in pParent->apCell[] of this node */
336   int nFree;                     /* Number of free bytes in u.aDisk[] */
337   int nCell;                     /* Number of entries on this page */
338   Cell *apCell[MX_CELL+2];       /* All data entires in sorted order */
339 };
340 
341 /*
342 ** The in-memory image of a disk page has the auxiliary information appended
343 ** to the end.  EXTRA_SIZE is the number of bytes of space needed to hold
344 ** that extra information.
345 */
346 #define EXTRA_SIZE (sizeof(MemPage)-sizeof(union u_page_data))
347 
348 /*
349 ** Everything we need to know about an open database
350 */
351 struct Btree {
352   BtOps *pOps;          /* Function table */
353   Pager *pPager;        /* The page cache */
354   BtCursor *pCursor;    /* A list of all open cursors */
355   PageOne *page1;       /* First page of the database */
356   u8 inTrans;           /* True if a transaction is in progress */
357   u8 inCkpt;            /* True if there is a checkpoint on the transaction */
358   u8 readOnly;          /* True if the underlying file is readonly */
359   u8 needSwab;          /* Need to byte-swapping */
360 };
361 typedef Btree Bt;
362 
363 /*
364 ** A cursor is a pointer to a particular entry in the BTree.
365 ** The entry is identified by its MemPage and the index in
366 ** MemPage.apCell[] of the entry.
367 */
368 struct BtCursor {
369   BtCursorOps *pOps;        /* Function table */
370   Btree *pBt;               /* The Btree to which this cursor belongs */
371   BtCursor *pNext, *pPrev;  /* Forms a linked list of all cursors */
372   BtCursor *pShared;        /* Loop of cursors with the same root page */
373   Pgno pgnoRoot;            /* The root page of this tree */
374   MemPage *pPage;           /* Page that contains the entry */
375   int idx;                  /* Index of the entry in pPage->apCell[] */
376   u8 wrFlag;                /* True if writable */
377   u8 eSkip;                 /* Determines if next step operation is a no-op */
378   u8 iMatch;                /* compare result from last sqliteBtreeMoveto() */
379 };
380 
381 /*
382 ** Legal values for BtCursor.eSkip.
383 */
384 #define SKIP_NONE     0   /* Always step the cursor */
385 #define SKIP_NEXT     1   /* The next sqliteBtreeNext() is a no-op */
386 #define SKIP_PREV     2   /* The next sqliteBtreePrevious() is a no-op */
387 #define SKIP_INVALID  3   /* Calls to Next() and Previous() are invalid */
388 
389 /* Forward declarations */
390 static int fileBtreeCloseCursor(BtCursor *pCur);
391 
392 /*
393 ** Routines for byte swapping.
394 */
swab16(u16 x)395 u16 swab16(u16 x){
396   return ((x & 0xff)<<8) | ((x>>8)&0xff);
397 }
swab32(u32 x)398 u32 swab32(u32 x){
399   return ((x & 0xff)<<24) | ((x & 0xff00)<<8) |
400          ((x>>8) & 0xff00) | ((x>>24)&0xff);
401 }
402 
403 /*
404 ** Compute the total number of bytes that a Cell needs on the main
405 ** database page.  The number returned includes the Cell header,
406 ** local payload storage, and the pointer to overflow pages (if
407 ** applicable).  Additional space allocated on overflow pages
408 ** is NOT included in the value returned from this routine.
409 */
cellSize(Btree * pBt,Cell * pCell)410 static int cellSize(Btree *pBt, Cell *pCell){
411   int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
412   if( n>MX_LOCAL_PAYLOAD ){
413     n = MX_LOCAL_PAYLOAD + sizeof(Pgno);
414   }else{
415     n = ROUNDUP(n);
416   }
417   n += sizeof(CellHdr);
418   return n;
419 }
420 
421 /*
422 ** Defragment the page given.  All Cells are moved to the
423 ** beginning of the page and all free space is collected
424 ** into one big FreeBlk at the end of the page.
425 */
defragmentPage(Btree * pBt,MemPage * pPage)426 static void defragmentPage(Btree *pBt, MemPage *pPage){
427   int pc, i, n;
428   FreeBlk *pFBlk;
429   char newPage[SQLITE_USABLE_SIZE];
430 
431   assert( sqlitepager_iswriteable(pPage) );
432   assert( pPage->isInit );
433   pc = sizeof(PageHdr);
434   pPage->u.hdr.firstCell = SWAB16(pBt, pc);
435   memcpy(newPage, pPage->u.aDisk, pc);
436   for(i=0; i<pPage->nCell; i++){
437     Cell *pCell = pPage->apCell[i];
438 
439     /* This routine should never be called on an overfull page.  The
440     ** following asserts verify that constraint. */
441     assert( Addr(pCell) > Addr(pPage) );
442     assert( Addr(pCell) < Addr(pPage) + SQLITE_USABLE_SIZE );
443 
444     n = cellSize(pBt, pCell);
445     pCell->h.iNext = SWAB16(pBt, pc + n);
446     memcpy(&newPage[pc], pCell, n);
447     pPage->apCell[i] = (Cell*)&pPage->u.aDisk[pc];
448     pc += n;
449   }
450   assert( pPage->nFree==SQLITE_USABLE_SIZE-pc );
451   memcpy(pPage->u.aDisk, newPage, pc);
452   if( pPage->nCell>0 ){
453     pPage->apCell[pPage->nCell-1]->h.iNext = 0;
454   }
455   pFBlk = (FreeBlk*)&pPage->u.aDisk[pc];
456   pFBlk->iSize = SWAB16(pBt, SQLITE_USABLE_SIZE - pc);
457   pFBlk->iNext = 0;
458   pPage->u.hdr.firstFree = SWAB16(pBt, pc);
459   memset(&pFBlk[1], 0, SQLITE_USABLE_SIZE - pc - sizeof(FreeBlk));
460 }
461 
462 /*
463 ** Allocate nByte bytes of space on a page.  nByte must be a
464 ** multiple of 4.
465 **
466 ** Return the index into pPage->u.aDisk[] of the first byte of
467 ** the new allocation. Or return 0 if there is not enough free
468 ** space on the page to satisfy the allocation request.
469 **
470 ** If the page contains nBytes of free space but does not contain
471 ** nBytes of contiguous free space, then this routine automatically
472 ** calls defragementPage() to consolidate all free space before
473 ** allocating the new chunk.
474 */
allocateSpace(Btree * pBt,MemPage * pPage,int nByte)475 static int allocateSpace(Btree *pBt, MemPage *pPage, int nByte){
476   FreeBlk *p;
477   u16 *pIdx;
478   int start;
479   int iSize;
480 #ifndef NDEBUG
481   int cnt = 0;
482 #endif
483 
484   assert( sqlitepager_iswriteable(pPage) );
485   assert( nByte==ROUNDUP(nByte) );
486   assert( pPage->isInit );
487   if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
488   pIdx = &pPage->u.hdr.firstFree;
489   p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
490   while( (iSize = SWAB16(pBt, p->iSize))<nByte ){
491     assert( cnt++ < SQLITE_USABLE_SIZE/4 );
492     if( p->iNext==0 ){
493       defragmentPage(pBt, pPage);
494       pIdx = &pPage->u.hdr.firstFree;
495     }else{
496       pIdx = &p->iNext;
497     }
498     p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
499   }
500   if( iSize==nByte ){
501     start = SWAB16(pBt, *pIdx);
502     *pIdx = p->iNext;
503   }else{
504     FreeBlk *pNew;
505     start = SWAB16(pBt, *pIdx);
506     pNew = (FreeBlk*)&pPage->u.aDisk[start + nByte];
507     pNew->iNext = p->iNext;
508     pNew->iSize = SWAB16(pBt, iSize - nByte);
509     *pIdx = SWAB16(pBt, start + nByte);
510   }
511   pPage->nFree -= nByte;
512   return start;
513 }
514 
515 /*
516 ** Return a section of the MemPage.u.aDisk[] to the freelist.
517 ** The first byte of the new free block is pPage->u.aDisk[start]
518 ** and the size of the block is "size" bytes.  Size must be
519 ** a multiple of 4.
520 **
521 ** Most of the effort here is involved in coalesing adjacent
522 ** free blocks into a single big free block.
523 */
freeSpace(Btree * pBt,MemPage * pPage,int start,int size)524 static void freeSpace(Btree *pBt, MemPage *pPage, int start, int size){
525   int end = start + size;
526   u16 *pIdx, idx;
527   FreeBlk *pFBlk;
528   FreeBlk *pNew;
529   FreeBlk *pNext;
530   int iSize;
531 
532   assert( sqlitepager_iswriteable(pPage) );
533   assert( size == ROUNDUP(size) );
534   assert( start == ROUNDUP(start) );
535   assert( pPage->isInit );
536   pIdx = &pPage->u.hdr.firstFree;
537   idx = SWAB16(pBt, *pIdx);
538   while( idx!=0 && idx<start ){
539     pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
540     iSize = SWAB16(pBt, pFBlk->iSize);
541     if( idx + iSize == start ){
542       pFBlk->iSize = SWAB16(pBt, iSize + size);
543       if( idx + iSize + size == SWAB16(pBt, pFBlk->iNext) ){
544         pNext = (FreeBlk*)&pPage->u.aDisk[idx + iSize + size];
545         if( pBt->needSwab ){
546           pFBlk->iSize = swab16((u16)swab16(pNext->iSize)+iSize+size);
547         }else{
548           pFBlk->iSize += pNext->iSize;
549         }
550         pFBlk->iNext = pNext->iNext;
551       }
552       pPage->nFree += size;
553       return;
554     }
555     pIdx = &pFBlk->iNext;
556     idx = SWAB16(pBt, *pIdx);
557   }
558   pNew = (FreeBlk*)&pPage->u.aDisk[start];
559   if( idx != end ){
560     pNew->iSize = SWAB16(pBt, size);
561     pNew->iNext = SWAB16(pBt, idx);
562   }else{
563     pNext = (FreeBlk*)&pPage->u.aDisk[idx];
564     pNew->iSize = SWAB16(pBt, size + SWAB16(pBt, pNext->iSize));
565     pNew->iNext = pNext->iNext;
566   }
567   *pIdx = SWAB16(pBt, start);
568   pPage->nFree += size;
569 }
570 
571 /*
572 ** Initialize the auxiliary information for a disk block.
573 **
574 ** The pParent parameter must be a pointer to the MemPage which
575 ** is the parent of the page being initialized.  The root of the
576 ** BTree (usually page 2) has no parent and so for that page,
577 ** pParent==NULL.
578 **
579 ** Return SQLITE_OK on success.  If we see that the page does
580 ** not contain a well-formed database page, then return
581 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
582 ** guarantee that the page is well-formed.  It only shows that
583 ** we failed to detect any corruption.
584 */
initPage(Bt * pBt,MemPage * pPage,Pgno pgnoThis,MemPage * pParent)585 static int initPage(Bt *pBt, MemPage *pPage, Pgno pgnoThis, MemPage *pParent){
586   int idx;           /* An index into pPage->u.aDisk[] */
587   Cell *pCell;       /* A pointer to a Cell in pPage->u.aDisk[] */
588   FreeBlk *pFBlk;    /* A pointer to a free block in pPage->u.aDisk[] */
589   int sz;            /* The size of a Cell in bytes */
590   int freeSpace;     /* Amount of free space on the page */
591 
592   if( pPage->pParent ){
593     assert( pPage->pParent==pParent );
594     return SQLITE_OK;
595   }
596   if( pParent ){
597     pPage->pParent = pParent;
598     sqlitepager_ref(pParent);
599   }
600   if( pPage->isInit ) return SQLITE_OK;
601   pPage->isInit = 1;
602   pPage->nCell = 0;
603   freeSpace = USABLE_SPACE;
604   idx = SWAB16(pBt, pPage->u.hdr.firstCell);
605   while( idx!=0 ){
606     if( idx>SQLITE_USABLE_SIZE-MIN_CELL_SIZE ) goto page_format_error;
607     if( idx<sizeof(PageHdr) ) goto page_format_error;
608     if( idx!=ROUNDUP(idx) ) goto page_format_error;
609     pCell = (Cell*)&pPage->u.aDisk[idx];
610     sz = cellSize(pBt, pCell);
611     if( idx+sz > SQLITE_USABLE_SIZE ) goto page_format_error;
612     freeSpace -= sz;
613     pPage->apCell[pPage->nCell++] = pCell;
614     idx = SWAB16(pBt, pCell->h.iNext);
615   }
616   pPage->nFree = 0;
617   idx = SWAB16(pBt, pPage->u.hdr.firstFree);
618   while( idx!=0 ){
619     int iNext;
620     if( idx>SQLITE_USABLE_SIZE-sizeof(FreeBlk) ) goto page_format_error;
621     if( idx<sizeof(PageHdr) ) goto page_format_error;
622     pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
623     pPage->nFree += SWAB16(pBt, pFBlk->iSize);
624     iNext = SWAB16(pBt, pFBlk->iNext);
625     if( iNext>0 && iNext <= idx ) goto page_format_error;
626     idx = iNext;
627   }
628   if( pPage->nCell==0 && pPage->nFree==0 ){
629     /* As a special case, an uninitialized root page appears to be
630     ** an empty database */
631     return SQLITE_OK;
632   }
633   if( pPage->nFree!=freeSpace ) goto page_format_error;
634   return SQLITE_OK;
635 
636 page_format_error:
637   return SQLITE_CORRUPT;
638 }
639 
640 /*
641 ** Set up a raw page so that it looks like a database page holding
642 ** no entries.
643 */
zeroPage(Btree * pBt,MemPage * pPage)644 static void zeroPage(Btree *pBt, MemPage *pPage){
645   PageHdr *pHdr;
646   FreeBlk *pFBlk;
647   assert( sqlitepager_iswriteable(pPage) );
648   memset(pPage, 0, SQLITE_USABLE_SIZE);
649   pHdr = &pPage->u.hdr;
650   pHdr->firstCell = 0;
651   pHdr->firstFree = SWAB16(pBt, sizeof(*pHdr));
652   pFBlk = (FreeBlk*)&pHdr[1];
653   pFBlk->iNext = 0;
654   pPage->nFree = SQLITE_USABLE_SIZE - sizeof(*pHdr);
655   pFBlk->iSize = SWAB16(pBt, pPage->nFree);
656   pPage->nCell = 0;
657   pPage->isOverfull = 0;
658 }
659 
660 /*
661 ** This routine is called when the reference count for a page
662 ** reaches zero.  We need to unref the pParent pointer when that
663 ** happens.
664 */
pageDestructor(void * pData)665 static void pageDestructor(void *pData){
666   MemPage *pPage = (MemPage*)pData;
667   if( pPage->pParent ){
668     MemPage *pParent = pPage->pParent;
669     pPage->pParent = 0;
670     sqlitepager_unref(pParent);
671   }
672 }
673 
674 /*
675 ** Open a new database.
676 **
677 ** Actually, this routine just sets up the internal data structures
678 ** for accessing the database.  We do not open the database file
679 ** until the first page is loaded.
680 **
681 ** zFilename is the name of the database file.  If zFilename is NULL
682 ** a new database with a random name is created.  This randomly named
683 ** database file will be deleted when sqliteBtreeClose() is called.
684 */
sqliteBtreeOpen(const char * zFilename,int omitJournal,int nCache,Btree ** ppBtree)685 int sqliteBtreeOpen(
686   const char *zFilename,    /* Name of the file containing the BTree database */
687   int omitJournal,          /* if TRUE then do not journal this file */
688   int nCache,               /* How many pages in the page cache */
689   Btree **ppBtree           /* Pointer to new Btree object written here */
690 ){
691   Btree *pBt;
692   int rc;
693 
694   /*
695   ** The following asserts make sure that structures used by the btree are
696   ** the right size.  This is to guard against size changes that result
697   ** when compiling on a different architecture.
698   */
699   assert( sizeof(u32)==4 );
700   assert( sizeof(u16)==2 );
701   assert( sizeof(Pgno)==4 );
702   assert( sizeof(PageHdr)==8 );
703   assert( sizeof(CellHdr)==12 );
704   assert( sizeof(FreeBlk)==4 );
705   assert( sizeof(OverflowPage)==SQLITE_USABLE_SIZE );
706   assert( sizeof(FreelistInfo)==OVERFLOW_SIZE );
707   assert( sizeof(ptr)==sizeof(char*) );
708   assert( sizeof(uptr)==sizeof(ptr) );
709 
710   pBt = sqliteMalloc( sizeof(*pBt) );
711   if( pBt==0 ){
712     *ppBtree = 0;
713     return SQLITE_NOMEM;
714   }
715   if( nCache<10 ) nCache = 10;
716   rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
717                         !omitJournal);
718   if( rc!=SQLITE_OK ){
719     if( pBt->pPager ) sqlitepager_close(pBt->pPager);
720     sqliteFree(pBt);
721     *ppBtree = 0;
722     return rc;
723   }
724   sqlitepager_set_destructor(pBt->pPager, pageDestructor);
725   pBt->pCursor = 0;
726   pBt->page1 = 0;
727   pBt->readOnly = sqlitepager_isreadonly(pBt->pPager);
728   pBt->pOps = &sqliteBtreeOps;
729   *ppBtree = pBt;
730   return SQLITE_OK;
731 }
732 
733 /*
734 ** Close an open database and invalidate all cursors.
735 */
fileBtreeClose(Btree * pBt)736 static int fileBtreeClose(Btree *pBt){
737   while( pBt->pCursor ){
738     fileBtreeCloseCursor(pBt->pCursor);
739   }
740   sqlitepager_close(pBt->pPager);
741   sqliteFree(pBt);
742   return SQLITE_OK;
743 }
744 
745 /*
746 ** Change the limit on the number of pages allowed in the cache.
747 **
748 ** The maximum number of cache pages is set to the absolute
749 ** value of mxPage.  If mxPage is negative, the pager will
750 ** operate asynchronously - it will not stop to do fsync()s
751 ** to insure data is written to the disk surface before
752 ** continuing.  Transactions still work if synchronous is off,
753 ** and the database cannot be corrupted if this program
754 ** crashes.  But if the operating system crashes or there is
755 ** an abrupt power failure when synchronous is off, the database
756 ** could be left in an inconsistent and unrecoverable state.
757 ** Synchronous is on by default so database corruption is not
758 ** normally a worry.
759 */
fileBtreeSetCacheSize(Btree * pBt,int mxPage)760 static int fileBtreeSetCacheSize(Btree *pBt, int mxPage){
761   sqlitepager_set_cachesize(pBt->pPager, mxPage);
762   return SQLITE_OK;
763 }
764 
765 /*
766 ** Change the way data is synced to disk in order to increase or decrease
767 ** how well the database resists damage due to OS crashes and power
768 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
769 ** there is a high probability of damage)  Level 2 is the default.  There
770 ** is a very low but non-zero probability of damage.  Level 3 reduces the
771 ** probability of damage to near zero but with a write performance reduction.
772 */
fileBtreeSetSafetyLevel(Btree * pBt,int level)773 static int fileBtreeSetSafetyLevel(Btree *pBt, int level){
774   sqlitepager_set_safety_level(pBt->pPager, level);
775   return SQLITE_OK;
776 }
777 
778 /*
779 ** Get a reference to page1 of the database file.  This will
780 ** also acquire a readlock on that file.
781 **
782 ** SQLITE_OK is returned on success.  If the file is not a
783 ** well-formed database file, then SQLITE_CORRUPT is returned.
784 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
785 ** is returned if we run out of memory.  SQLITE_PROTOCOL is returned
786 ** if there is a locking protocol violation.
787 */
lockBtree(Btree * pBt)788 static int lockBtree(Btree *pBt){
789   int rc;
790   if( pBt->page1 ) return SQLITE_OK;
791   rc = sqlitepager_get(pBt->pPager, 1, (void**)&pBt->page1);
792   if( rc!=SQLITE_OK ) return rc;
793 
794   /* Do some checking to help insure the file we opened really is
795   ** a valid database file.
796   */
797   if( sqlitepager_pagecount(pBt->pPager)>0 ){
798     PageOne *pP1 = pBt->page1;
799     if( strcmp(pP1->zMagic,zMagicHeader)!=0 ||
800           (pP1->iMagic!=MAGIC && swab32(pP1->iMagic)!=MAGIC) ){
801       rc = SQLITE_NOTADB;
802       goto page1_init_failed;
803     }
804     pBt->needSwab = pP1->iMagic!=MAGIC;
805   }
806   return rc;
807 
808 page1_init_failed:
809   sqlitepager_unref(pBt->page1);
810   pBt->page1 = 0;
811   return rc;
812 }
813 
814 /*
815 ** If there are no outstanding cursors and we are not in the middle
816 ** of a transaction but there is a read lock on the database, then
817 ** this routine unrefs the first page of the database file which
818 ** has the effect of releasing the read lock.
819 **
820 ** If there are any outstanding cursors, this routine is a no-op.
821 **
822 ** If there is a transaction in progress, this routine is a no-op.
823 */
unlockBtreeIfUnused(Btree * pBt)824 static void unlockBtreeIfUnused(Btree *pBt){
825   if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->page1!=0 ){
826     sqlitepager_unref(pBt->page1);
827     pBt->page1 = 0;
828     pBt->inTrans = 0;
829     pBt->inCkpt = 0;
830   }
831 }
832 
833 /*
834 ** Create a new database by initializing the first two pages of the
835 ** file.
836 */
newDatabase(Btree * pBt)837 static int newDatabase(Btree *pBt){
838   MemPage *pRoot;
839   PageOne *pP1;
840   int rc;
841   if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK;
842   pP1 = pBt->page1;
843   rc = sqlitepager_write(pBt->page1);
844   if( rc ) return rc;
845   rc = sqlitepager_get(pBt->pPager, 2, (void**)&pRoot);
846   if( rc ) return rc;
847   rc = sqlitepager_write(pRoot);
848   if( rc ){
849     sqlitepager_unref(pRoot);
850     return rc;
851   }
852   strcpy(pP1->zMagic, zMagicHeader);
853   if( btree_native_byte_order ){
854     pP1->iMagic = MAGIC;
855     pBt->needSwab = 0;
856   }else{
857     pP1->iMagic = swab32(MAGIC);
858     pBt->needSwab = 1;
859   }
860   zeroPage(pBt, pRoot);
861   sqlitepager_unref(pRoot);
862   return SQLITE_OK;
863 }
864 
865 /*
866 ** Attempt to start a new transaction.
867 **
868 ** A transaction must be started before attempting any changes
869 ** to the database.  None of the following routines will work
870 ** unless a transaction is started first:
871 **
872 **      sqliteBtreeCreateTable()
873 **      sqliteBtreeCreateIndex()
874 **      sqliteBtreeClearTable()
875 **      sqliteBtreeDropTable()
876 **      sqliteBtreeInsert()
877 **      sqliteBtreeDelete()
878 **      sqliteBtreeUpdateMeta()
879 */
fileBtreeBeginTrans(Btree * pBt)880 static int fileBtreeBeginTrans(Btree *pBt){
881   int rc;
882   if( pBt->inTrans ) return SQLITE_ERROR;
883   if( pBt->readOnly ) return SQLITE_READONLY;
884   if( pBt->page1==0 ){
885     rc = lockBtree(pBt);
886     if( rc!=SQLITE_OK ){
887       return rc;
888     }
889   }
890   rc = sqlitepager_begin(pBt->page1);
891   if( rc==SQLITE_OK ){
892     rc = newDatabase(pBt);
893   }
894   if( rc==SQLITE_OK ){
895     pBt->inTrans = 1;
896     pBt->inCkpt = 0;
897   }else{
898     unlockBtreeIfUnused(pBt);
899   }
900   return rc;
901 }
902 
903 /*
904 ** Commit the transaction currently in progress.
905 **
906 ** This will release the write lock on the database file.  If there
907 ** are no active cursors, it also releases the read lock.
908 */
fileBtreeCommit(Btree * pBt)909 static int fileBtreeCommit(Btree *pBt){
910   int rc;
911   rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager);
912   pBt->inTrans = 0;
913   pBt->inCkpt = 0;
914   unlockBtreeIfUnused(pBt);
915   return rc;
916 }
917 
918 /*
919 ** Rollback the transaction in progress.  All cursors will be
920 ** invalided by this operation.  Any attempt to use a cursor
921 ** that was open at the beginning of this operation will result
922 ** in an error.
923 **
924 ** This will release the write lock on the database file.  If there
925 ** are no active cursors, it also releases the read lock.
926 */
fileBtreeRollback(Btree * pBt)927 static int fileBtreeRollback(Btree *pBt){
928   int rc;
929   BtCursor *pCur;
930   if( pBt->inTrans==0 ) return SQLITE_OK;
931   pBt->inTrans = 0;
932   pBt->inCkpt = 0;
933   rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager);
934   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
935     if( pCur->pPage && pCur->pPage->isInit==0 ){
936       sqlitepager_unref(pCur->pPage);
937       pCur->pPage = 0;
938     }
939   }
940   unlockBtreeIfUnused(pBt);
941   return rc;
942 }
943 
944 /*
945 ** Set the checkpoint for the current transaction.  The checkpoint serves
946 ** as a sub-transaction that can be rolled back independently of the
947 ** main transaction.  You must start a transaction before starting a
948 ** checkpoint.  The checkpoint is ended automatically if the transaction
949 ** commits or rolls back.
950 **
951 ** Only one checkpoint may be active at a time.  It is an error to try
952 ** to start a new checkpoint if another checkpoint is already active.
953 */
fileBtreeBeginCkpt(Btree * pBt)954 static int fileBtreeBeginCkpt(Btree *pBt){
955   int rc;
956   if( !pBt->inTrans || pBt->inCkpt ){
957     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
958   }
959   rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager);
960   pBt->inCkpt = 1;
961   return rc;
962 }
963 
964 
965 /*
966 ** Commit a checkpoint to transaction currently in progress.  If no
967 ** checkpoint is active, this is a no-op.
968 */
fileBtreeCommitCkpt(Btree * pBt)969 static int fileBtreeCommitCkpt(Btree *pBt){
970   int rc;
971   if( pBt->inCkpt && !pBt->readOnly ){
972     rc = sqlitepager_ckpt_commit(pBt->pPager);
973   }else{
974     rc = SQLITE_OK;
975   }
976   pBt->inCkpt = 0;
977   return rc;
978 }
979 
980 /*
981 ** Rollback the checkpoint to the current transaction.  If there
982 ** is no active checkpoint or transaction, this routine is a no-op.
983 **
984 ** All cursors will be invalided by this operation.  Any attempt
985 ** to use a cursor that was open at the beginning of this operation
986 ** will result in an error.
987 */
fileBtreeRollbackCkpt(Btree * pBt)988 static int fileBtreeRollbackCkpt(Btree *pBt){
989   int rc;
990   BtCursor *pCur;
991   if( pBt->inCkpt==0 || pBt->readOnly ) return SQLITE_OK;
992   rc = sqlitepager_ckpt_rollback(pBt->pPager);
993   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
994     if( pCur->pPage && pCur->pPage->isInit==0 ){
995       sqlitepager_unref(pCur->pPage);
996       pCur->pPage = 0;
997     }
998   }
999   pBt->inCkpt = 0;
1000   return rc;
1001 }
1002 
1003 /*
1004 ** Create a new cursor for the BTree whose root is on the page
1005 ** iTable.  The act of acquiring a cursor gets a read lock on
1006 ** the database file.
1007 **
1008 ** If wrFlag==0, then the cursor can only be used for reading.
1009 ** If wrFlag==1, then the cursor can be used for reading or for
1010 ** writing if other conditions for writing are also met.  These
1011 ** are the conditions that must be met in order for writing to
1012 ** be allowed:
1013 **
1014 ** 1:  The cursor must have been opened with wrFlag==1
1015 **
1016 ** 2:  No other cursors may be open with wrFlag==0 on the same table
1017 **
1018 ** 3:  The database must be writable (not on read-only media)
1019 **
1020 ** 4:  There must be an active transaction.
1021 **
1022 ** Condition 2 warrants further discussion.  If any cursor is opened
1023 ** on a table with wrFlag==0, that prevents all other cursors from
1024 ** writing to that table.  This is a kind of "read-lock".  When a cursor
1025 ** is opened with wrFlag==0 it is guaranteed that the table will not
1026 ** change as long as the cursor is open.  This allows the cursor to
1027 ** do a sequential scan of the table without having to worry about
1028 ** entries being inserted or deleted during the scan.  Cursors should
1029 ** be opened with wrFlag==0 only if this read-lock property is needed.
1030 ** That is to say, cursors should be opened with wrFlag==0 only if they
1031 ** intend to use the sqliteBtreeNext() system call.  All other cursors
1032 ** should be opened with wrFlag==1 even if they never really intend
1033 ** to write.
1034 **
1035 ** No checking is done to make sure that page iTable really is the
1036 ** root page of a b-tree.  If it is not, then the cursor acquired
1037 ** will not work correctly.
1038 */
1039 static
fileBtreeCursor(Btree * pBt,int iTable,int wrFlag,BtCursor ** ppCur)1040 int fileBtreeCursor(Btree *pBt, int iTable, int wrFlag, BtCursor **ppCur){
1041   int rc;
1042   BtCursor *pCur, *pRing;
1043 
1044   if( pBt->readOnly && wrFlag ){
1045     *ppCur = 0;
1046     return SQLITE_READONLY;
1047   }
1048   if( pBt->page1==0 ){
1049     rc = lockBtree(pBt);
1050     if( rc!=SQLITE_OK ){
1051       *ppCur = 0;
1052       return rc;
1053     }
1054   }
1055   pCur = sqliteMalloc( sizeof(*pCur) );
1056   if( pCur==0 ){
1057     rc = SQLITE_NOMEM;
1058     goto create_cursor_exception;
1059   }
1060   pCur->pgnoRoot = (Pgno)iTable;
1061   rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pCur->pPage);
1062   if( rc!=SQLITE_OK ){
1063     goto create_cursor_exception;
1064   }
1065   rc = initPage(pBt, pCur->pPage, pCur->pgnoRoot, 0);
1066   if( rc!=SQLITE_OK ){
1067     goto create_cursor_exception;
1068   }
1069   pCur->pOps = &sqliteBtreeCursorOps;
1070   pCur->pBt = pBt;
1071   pCur->wrFlag = wrFlag;
1072   pCur->idx = 0;
1073   pCur->eSkip = SKIP_INVALID;
1074   pCur->pNext = pBt->pCursor;
1075   if( pCur->pNext ){
1076     pCur->pNext->pPrev = pCur;
1077   }
1078   pCur->pPrev = 0;
1079   pRing = pBt->pCursor;
1080   while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1081   if( pRing ){
1082     pCur->pShared = pRing->pShared;
1083     pRing->pShared = pCur;
1084   }else{
1085     pCur->pShared = pCur;
1086   }
1087   pBt->pCursor = pCur;
1088   *ppCur = pCur;
1089   return SQLITE_OK;
1090 
1091 create_cursor_exception:
1092   *ppCur = 0;
1093   if( pCur ){
1094     if( pCur->pPage ) sqlitepager_unref(pCur->pPage);
1095     sqliteFree(pCur);
1096   }
1097   unlockBtreeIfUnused(pBt);
1098   return rc;
1099 }
1100 
1101 /*
1102 ** Close a cursor.  The read lock on the database file is released
1103 ** when the last cursor is closed.
1104 */
fileBtreeCloseCursor(BtCursor * pCur)1105 static int fileBtreeCloseCursor(BtCursor *pCur){
1106   Btree *pBt = pCur->pBt;
1107   if( pCur->pPrev ){
1108     pCur->pPrev->pNext = pCur->pNext;
1109   }else{
1110     pBt->pCursor = pCur->pNext;
1111   }
1112   if( pCur->pNext ){
1113     pCur->pNext->pPrev = pCur->pPrev;
1114   }
1115   if( pCur->pPage ){
1116     sqlitepager_unref(pCur->pPage);
1117   }
1118   if( pCur->pShared!=pCur ){
1119     BtCursor *pRing = pCur->pShared;
1120     while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1121     pRing->pShared = pCur->pShared;
1122   }
1123   unlockBtreeIfUnused(pBt);
1124   sqliteFree(pCur);
1125   return SQLITE_OK;
1126 }
1127 
1128 /*
1129 ** Make a temporary cursor by filling in the fields of pTempCur.
1130 ** The temporary cursor is not on the cursor list for the Btree.
1131 */
getTempCursor(BtCursor * pCur,BtCursor * pTempCur)1132 static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
1133   memcpy(pTempCur, pCur, sizeof(*pCur));
1134   pTempCur->pNext = 0;
1135   pTempCur->pPrev = 0;
1136   if( pTempCur->pPage ){
1137     sqlitepager_ref(pTempCur->pPage);
1138   }
1139 }
1140 
1141 /*
1142 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
1143 ** function above.
1144 */
releaseTempCursor(BtCursor * pCur)1145 static void releaseTempCursor(BtCursor *pCur){
1146   if( pCur->pPage ){
1147     sqlitepager_unref(pCur->pPage);
1148   }
1149 }
1150 
1151 /*
1152 ** Set *pSize to the number of bytes of key in the entry the
1153 ** cursor currently points to.  Always return SQLITE_OK.
1154 ** Failure is not possible.  If the cursor is not currently
1155 ** pointing to an entry (which can happen, for example, if
1156 ** the database is empty) then *pSize is set to 0.
1157 */
fileBtreeKeySize(BtCursor * pCur,int * pSize)1158 static int fileBtreeKeySize(BtCursor *pCur, int *pSize){
1159   Cell *pCell;
1160   MemPage *pPage;
1161 
1162   pPage = pCur->pPage;
1163   assert( pPage!=0 );
1164   if( pCur->idx >= pPage->nCell ){
1165     *pSize = 0;
1166   }else{
1167     pCell = pPage->apCell[pCur->idx];
1168     *pSize = NKEY(pCur->pBt, pCell->h);
1169   }
1170   return SQLITE_OK;
1171 }
1172 
1173 /*
1174 ** Read payload information from the entry that the pCur cursor is
1175 ** pointing to.  Begin reading the payload at "offset" and read
1176 ** a total of "amt" bytes.  Put the result in zBuf.
1177 **
1178 ** This routine does not make a distinction between key and data.
1179 ** It just reads bytes from the payload area.
1180 */
getPayload(BtCursor * pCur,int offset,int amt,char * zBuf)1181 static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){
1182   char *aPayload;
1183   Pgno nextPage;
1184   int rc;
1185   Btree *pBt = pCur->pBt;
1186   assert( pCur!=0 && pCur->pPage!=0 );
1187   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1188   aPayload = pCur->pPage->apCell[pCur->idx]->aPayload;
1189   if( offset<MX_LOCAL_PAYLOAD ){
1190     int a = amt;
1191     if( a+offset>MX_LOCAL_PAYLOAD ){
1192       a = MX_LOCAL_PAYLOAD - offset;
1193     }
1194     memcpy(zBuf, &aPayload[offset], a);
1195     if( a==amt ){
1196       return SQLITE_OK;
1197     }
1198     offset = 0;
1199     zBuf += a;
1200     amt -= a;
1201   }else{
1202     offset -= MX_LOCAL_PAYLOAD;
1203   }
1204   if( amt>0 ){
1205     nextPage = SWAB32(pBt, pCur->pPage->apCell[pCur->idx]->ovfl);
1206   }
1207   while( amt>0 && nextPage ){
1208     OverflowPage *pOvfl;
1209     rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
1210     if( rc!=0 ){
1211       return rc;
1212     }
1213     nextPage = SWAB32(pBt, pOvfl->iNext);
1214     if( offset<OVERFLOW_SIZE ){
1215       int a = amt;
1216       if( a + offset > OVERFLOW_SIZE ){
1217         a = OVERFLOW_SIZE - offset;
1218       }
1219       memcpy(zBuf, &pOvfl->aPayload[offset], a);
1220       offset = 0;
1221       amt -= a;
1222       zBuf += a;
1223     }else{
1224       offset -= OVERFLOW_SIZE;
1225     }
1226     sqlitepager_unref(pOvfl);
1227   }
1228   if( amt>0 ){
1229     return SQLITE_CORRUPT;
1230   }
1231   return SQLITE_OK;
1232 }
1233 
1234 /*
1235 ** Read part of the key associated with cursor pCur.  A maximum
1236 ** of "amt" bytes will be transfered into zBuf[].  The transfer
1237 ** begins at "offset".  The number of bytes actually read is
1238 ** returned.
1239 **
1240 ** Change:  It used to be that the amount returned will be smaller
1241 ** than the amount requested if there are not enough bytes in the key
1242 ** to satisfy the request.  But now, it must be the case that there
1243 ** is enough data available to satisfy the request.  If not, an exception
1244 ** is raised.  The change was made in an effort to boost performance
1245 ** by eliminating unneeded tests.
1246 */
fileBtreeKey(BtCursor * pCur,int offset,int amt,char * zBuf)1247 static int fileBtreeKey(BtCursor *pCur, int offset, int amt, char *zBuf){
1248   MemPage *pPage;
1249 
1250   assert( amt>=0 );
1251   assert( offset>=0 );
1252   assert( pCur->pPage!=0 );
1253   pPage = pCur->pPage;
1254   if( pCur->idx >= pPage->nCell ){
1255     return 0;
1256   }
1257   assert( amt+offset <= NKEY(pCur->pBt, pPage->apCell[pCur->idx]->h) );
1258   getPayload(pCur, offset, amt, zBuf);
1259   return amt;
1260 }
1261 
1262 /*
1263 ** Set *pSize to the number of bytes of data in the entry the
1264 ** cursor currently points to.  Always return SQLITE_OK.
1265 ** Failure is not possible.  If the cursor is not currently
1266 ** pointing to an entry (which can happen, for example, if
1267 ** the database is empty) then *pSize is set to 0.
1268 */
fileBtreeDataSize(BtCursor * pCur,int * pSize)1269 static int fileBtreeDataSize(BtCursor *pCur, int *pSize){
1270   Cell *pCell;
1271   MemPage *pPage;
1272 
1273   pPage = pCur->pPage;
1274   assert( pPage!=0 );
1275   if( pCur->idx >= pPage->nCell ){
1276     *pSize = 0;
1277   }else{
1278     pCell = pPage->apCell[pCur->idx];
1279     *pSize = NDATA(pCur->pBt, pCell->h);
1280   }
1281   return SQLITE_OK;
1282 }
1283 
1284 /*
1285 ** Read part of the data associated with cursor pCur.  A maximum
1286 ** of "amt" bytes will be transfered into zBuf[].  The transfer
1287 ** begins at "offset".  The number of bytes actually read is
1288 ** returned.  The amount returned will be smaller than the
1289 ** amount requested if there are not enough bytes in the data
1290 ** to satisfy the request.
1291 */
fileBtreeData(BtCursor * pCur,int offset,int amt,char * zBuf)1292 static int fileBtreeData(BtCursor *pCur, int offset, int amt, char *zBuf){
1293   Cell *pCell;
1294   MemPage *pPage;
1295 
1296   assert( amt>=0 );
1297   assert( offset>=0 );
1298   assert( pCur->pPage!=0 );
1299   pPage = pCur->pPage;
1300   if( pCur->idx >= pPage->nCell ){
1301     return 0;
1302   }
1303   pCell = pPage->apCell[pCur->idx];
1304   assert( amt+offset <= NDATA(pCur->pBt, pCell->h) );
1305   getPayload(pCur, offset + NKEY(pCur->pBt, pCell->h), amt, zBuf);
1306   return amt;
1307 }
1308 
1309 /*
1310 ** Compare an external key against the key on the entry that pCur points to.
1311 **
1312 ** The external key is pKey and is nKey bytes long.  The last nIgnore bytes
1313 ** of the key associated with pCur are ignored, as if they do not exist.
1314 ** (The normal case is for nIgnore to be zero in which case the entire
1315 ** internal key is used in the comparison.)
1316 **
1317 ** The comparison result is written to *pRes as follows:
1318 **
1319 **    *pRes<0    This means pCur<pKey
1320 **
1321 **    *pRes==0   This means pCur==pKey for all nKey bytes
1322 **
1323 **    *pRes>0    This means pCur>pKey
1324 **
1325 ** When one key is an exact prefix of the other, the shorter key is
1326 ** considered less than the longer one.  In order to be equal the
1327 ** keys must be exactly the same length. (The length of the pCur key
1328 ** is the actual key length minus nIgnore bytes.)
1329 */
fileBtreeKeyCompare(BtCursor * pCur,const void * pKey,int nKey,int nIgnore,int * pResult)1330 static int fileBtreeKeyCompare(
1331   BtCursor *pCur,       /* Pointer to entry to compare against */
1332   const void *pKey,     /* Key to compare against entry that pCur points to */
1333   int nKey,             /* Number of bytes in pKey */
1334   int nIgnore,          /* Ignore this many bytes at the end of pCur */
1335   int *pResult          /* Write the result here */
1336 ){
1337   Pgno nextPage;
1338   int n, c, rc, nLocal;
1339   Cell *pCell;
1340   Btree *pBt = pCur->pBt;
1341   const char *zKey  = (const char*)pKey;
1342 
1343   assert( pCur->pPage );
1344   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1345   pCell = pCur->pPage->apCell[pCur->idx];
1346   nLocal = NKEY(pBt, pCell->h) - nIgnore;
1347   if( nLocal<0 ) nLocal = 0;
1348   n = nKey<nLocal ? nKey : nLocal;
1349   if( n>MX_LOCAL_PAYLOAD ){
1350     n = MX_LOCAL_PAYLOAD;
1351   }
1352   c = memcmp(pCell->aPayload, zKey, n);
1353   if( c!=0 ){
1354     *pResult = c;
1355     return SQLITE_OK;
1356   }
1357   zKey += n;
1358   nKey -= n;
1359   nLocal -= n;
1360   nextPage = SWAB32(pBt, pCell->ovfl);
1361   while( nKey>0 && nLocal>0 ){
1362     OverflowPage *pOvfl;
1363     if( nextPage==0 ){
1364       return SQLITE_CORRUPT;
1365     }
1366     rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
1367     if( rc ){
1368       return rc;
1369     }
1370     nextPage = SWAB32(pBt, pOvfl->iNext);
1371     n = nKey<nLocal ? nKey : nLocal;
1372     if( n>OVERFLOW_SIZE ){
1373       n = OVERFLOW_SIZE;
1374     }
1375     c = memcmp(pOvfl->aPayload, zKey, n);
1376     sqlitepager_unref(pOvfl);
1377     if( c!=0 ){
1378       *pResult = c;
1379       return SQLITE_OK;
1380     }
1381     nKey -= n;
1382     nLocal -= n;
1383     zKey += n;
1384   }
1385   if( c==0 ){
1386     c = nLocal - nKey;
1387   }
1388   *pResult = c;
1389   return SQLITE_OK;
1390 }
1391 
1392 /*
1393 ** Move the cursor down to a new child page.  The newPgno argument is the
1394 ** page number of the child page in the byte order of the disk image.
1395 */
moveToChild(BtCursor * pCur,int newPgno)1396 static int moveToChild(BtCursor *pCur, int newPgno){
1397   int rc;
1398   MemPage *pNewPage;
1399   Btree *pBt = pCur->pBt;
1400 
1401   newPgno = SWAB32(pBt, newPgno);
1402   rc = sqlitepager_get(pBt->pPager, newPgno, (void**)&pNewPage);
1403   if( rc ) return rc;
1404   rc = initPage(pBt, pNewPage, newPgno, pCur->pPage);
1405   if( rc ) return rc;
1406   assert( pCur->idx>=pCur->pPage->nCell
1407           || pCur->pPage->apCell[pCur->idx]->h.leftChild==SWAB32(pBt,newPgno) );
1408   assert( pCur->idx<pCur->pPage->nCell
1409           || pCur->pPage->u.hdr.rightChild==SWAB32(pBt,newPgno) );
1410   pNewPage->idxParent = pCur->idx;
1411   pCur->pPage->idxShift = 0;
1412   sqlitepager_unref(pCur->pPage);
1413   pCur->pPage = pNewPage;
1414   pCur->idx = 0;
1415   if( pNewPage->nCell<1 ){
1416     return SQLITE_CORRUPT;
1417   }
1418   return SQLITE_OK;
1419 }
1420 
1421 /*
1422 ** Move the cursor up to the parent page.
1423 **
1424 ** pCur->idx is set to the cell index that contains the pointer
1425 ** to the page we are coming from.  If we are coming from the
1426 ** right-most child page then pCur->idx is set to one more than
1427 ** the largest cell index.
1428 */
moveToParent(BtCursor * pCur)1429 static void moveToParent(BtCursor *pCur){
1430   Pgno oldPgno;
1431   MemPage *pParent;
1432   MemPage *pPage;
1433   int idxParent;
1434   pPage = pCur->pPage;
1435   assert( pPage!=0 );
1436   pParent = pPage->pParent;
1437   assert( pParent!=0 );
1438   idxParent = pPage->idxParent;
1439   sqlitepager_ref(pParent);
1440   sqlitepager_unref(pPage);
1441   pCur->pPage = pParent;
1442   assert( pParent->idxShift==0 );
1443   if( pParent->idxShift==0 ){
1444     pCur->idx = idxParent;
1445 #ifndef NDEBUG
1446     /* Verify that pCur->idx is the correct index to point back to the child
1447     ** page we just came from
1448     */
1449     oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
1450     if( pCur->idx<pParent->nCell ){
1451       assert( pParent->apCell[idxParent]->h.leftChild==oldPgno );
1452     }else{
1453       assert( pParent->u.hdr.rightChild==oldPgno );
1454     }
1455 #endif
1456   }else{
1457     /* The MemPage.idxShift flag indicates that cell indices might have
1458     ** changed since idxParent was set and hence idxParent might be out
1459     ** of date.  So recompute the parent cell index by scanning all cells
1460     ** and locating the one that points to the child we just came from.
1461     */
1462     int i;
1463     pCur->idx = pParent->nCell;
1464     oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
1465     for(i=0; i<pParent->nCell; i++){
1466       if( pParent->apCell[i]->h.leftChild==oldPgno ){
1467         pCur->idx = i;
1468         break;
1469       }
1470     }
1471   }
1472 }
1473 
1474 /*
1475 ** Move the cursor to the root page
1476 */
moveToRoot(BtCursor * pCur)1477 static int moveToRoot(BtCursor *pCur){
1478   MemPage *pNew;
1479   int rc;
1480   Btree *pBt = pCur->pBt;
1481 
1482   rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pNew);
1483   if( rc ) return rc;
1484   rc = initPage(pBt, pNew, pCur->pgnoRoot, 0);
1485   if( rc ) return rc;
1486   sqlitepager_unref(pCur->pPage);
1487   pCur->pPage = pNew;
1488   pCur->idx = 0;
1489   return SQLITE_OK;
1490 }
1491 
1492 /*
1493 ** Move the cursor down to the left-most leaf entry beneath the
1494 ** entry to which it is currently pointing.
1495 */
moveToLeftmost(BtCursor * pCur)1496 static int moveToLeftmost(BtCursor *pCur){
1497   Pgno pgno;
1498   int rc;
1499 
1500   while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
1501     rc = moveToChild(pCur, pgno);
1502     if( rc ) return rc;
1503   }
1504   return SQLITE_OK;
1505 }
1506 
1507 /*
1508 ** Move the cursor down to the right-most leaf entry beneath the
1509 ** page to which it is currently pointing.  Notice the difference
1510 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
1511 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1512 ** finds the right-most entry beneath the *page*.
1513 */
moveToRightmost(BtCursor * pCur)1514 static int moveToRightmost(BtCursor *pCur){
1515   Pgno pgno;
1516   int rc;
1517 
1518   while( (pgno = pCur->pPage->u.hdr.rightChild)!=0 ){
1519     pCur->idx = pCur->pPage->nCell;
1520     rc = moveToChild(pCur, pgno);
1521     if( rc ) return rc;
1522   }
1523   pCur->idx = pCur->pPage->nCell - 1;
1524   return SQLITE_OK;
1525 }
1526 
1527 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
1528 ** on success.  Set *pRes to 0 if the cursor actually points to something
1529 ** or set *pRes to 1 if the table is empty.
1530 */
fileBtreeFirst(BtCursor * pCur,int * pRes)1531 static int fileBtreeFirst(BtCursor *pCur, int *pRes){
1532   int rc;
1533   if( pCur->pPage==0 ) return SQLITE_ABORT;
1534   rc = moveToRoot(pCur);
1535   if( rc ) return rc;
1536   if( pCur->pPage->nCell==0 ){
1537     *pRes = 1;
1538     return SQLITE_OK;
1539   }
1540   *pRes = 0;
1541   rc = moveToLeftmost(pCur);
1542   pCur->eSkip = SKIP_NONE;
1543   return rc;
1544 }
1545 
1546 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
1547 ** on success.  Set *pRes to 0 if the cursor actually points to something
1548 ** or set *pRes to 1 if the table is empty.
1549 */
fileBtreeLast(BtCursor * pCur,int * pRes)1550 static int fileBtreeLast(BtCursor *pCur, int *pRes){
1551   int rc;
1552   if( pCur->pPage==0 ) return SQLITE_ABORT;
1553   rc = moveToRoot(pCur);
1554   if( rc ) return rc;
1555   assert( pCur->pPage->isInit );
1556   if( pCur->pPage->nCell==0 ){
1557     *pRes = 1;
1558     return SQLITE_OK;
1559   }
1560   *pRes = 0;
1561   rc = moveToRightmost(pCur);
1562   pCur->eSkip = SKIP_NONE;
1563   return rc;
1564 }
1565 
1566 /* Move the cursor so that it points to an entry near pKey.
1567 ** Return a success code.
1568 **
1569 ** If an exact match is not found, then the cursor is always
1570 ** left pointing at a leaf page which would hold the entry if it
1571 ** were present.  The cursor might point to an entry that comes
1572 ** before or after the key.
1573 **
1574 ** The result of comparing the key with the entry to which the
1575 ** cursor is left pointing is stored in pCur->iMatch.  The same
1576 ** value is also written to *pRes if pRes!=NULL.  The meaning of
1577 ** this value is as follows:
1578 **
1579 **     *pRes<0      The cursor is left pointing at an entry that
1580 **                  is smaller than pKey or if the table is empty
1581 **                  and the cursor is therefore left point to nothing.
1582 **
1583 **     *pRes==0     The cursor is left pointing at an entry that
1584 **                  exactly matches pKey.
1585 **
1586 **     *pRes>0      The cursor is left pointing at an entry that
1587 **                  is larger than pKey.
1588 */
1589 static
fileBtreeMoveto(BtCursor * pCur,const void * pKey,int nKey,int * pRes)1590 int fileBtreeMoveto(BtCursor *pCur, const void *pKey, int nKey, int *pRes){
1591   int rc;
1592   if( pCur->pPage==0 ) return SQLITE_ABORT;
1593   pCur->eSkip = SKIP_NONE;
1594   rc = moveToRoot(pCur);
1595   if( rc ) return rc;
1596   for(;;){
1597     int lwr, upr;
1598     Pgno chldPg;
1599     MemPage *pPage = pCur->pPage;
1600     int c = -1;  /* pRes return if table is empty must be -1 */
1601     lwr = 0;
1602     upr = pPage->nCell-1;
1603     while( lwr<=upr ){
1604       pCur->idx = (lwr+upr)/2;
1605       rc = fileBtreeKeyCompare(pCur, pKey, nKey, 0, &c);
1606       if( rc ) return rc;
1607       if( c==0 ){
1608         pCur->iMatch = c;
1609         if( pRes ) *pRes = 0;
1610         return SQLITE_OK;
1611       }
1612       if( c<0 ){
1613         lwr = pCur->idx+1;
1614       }else{
1615         upr = pCur->idx-1;
1616       }
1617     }
1618     assert( lwr==upr+1 );
1619     assert( pPage->isInit );
1620     if( lwr>=pPage->nCell ){
1621       chldPg = pPage->u.hdr.rightChild;
1622     }else{
1623       chldPg = pPage->apCell[lwr]->h.leftChild;
1624     }
1625     if( chldPg==0 ){
1626       pCur->iMatch = c;
1627       if( pRes ) *pRes = c;
1628       return SQLITE_OK;
1629     }
1630     pCur->idx = lwr;
1631     rc = moveToChild(pCur, chldPg);
1632     if( rc ) return rc;
1633   }
1634   /* NOT REACHED */
1635 }
1636 
1637 /*
1638 ** Advance the cursor to the next entry in the database.  If
1639 ** successful then set *pRes=0.  If the cursor
1640 ** was already pointing to the last entry in the database before
1641 ** this routine was called, then set *pRes=1.
1642 */
fileBtreeNext(BtCursor * pCur,int * pRes)1643 static int fileBtreeNext(BtCursor *pCur, int *pRes){
1644   int rc;
1645   MemPage *pPage = pCur->pPage;
1646   assert( pRes!=0 );
1647   if( pPage==0 ){
1648     *pRes = 1;
1649     return SQLITE_ABORT;
1650   }
1651   assert( pPage->isInit );
1652   assert( pCur->eSkip!=SKIP_INVALID );
1653   if( pPage->nCell==0 ){
1654     *pRes = 1;
1655     return SQLITE_OK;
1656   }
1657   assert( pCur->idx<pPage->nCell );
1658   if( pCur->eSkip==SKIP_NEXT ){
1659     pCur->eSkip = SKIP_NONE;
1660     *pRes = 0;
1661     return SQLITE_OK;
1662   }
1663   pCur->eSkip = SKIP_NONE;
1664   pCur->idx++;
1665   if( pCur->idx>=pPage->nCell ){
1666     if( pPage->u.hdr.rightChild ){
1667       rc = moveToChild(pCur, pPage->u.hdr.rightChild);
1668       if( rc ) return rc;
1669       rc = moveToLeftmost(pCur);
1670       *pRes = 0;
1671       return rc;
1672     }
1673     do{
1674       if( pPage->pParent==0 ){
1675         *pRes = 1;
1676         return SQLITE_OK;
1677       }
1678       moveToParent(pCur);
1679       pPage = pCur->pPage;
1680     }while( pCur->idx>=pPage->nCell );
1681     *pRes = 0;
1682     return SQLITE_OK;
1683   }
1684   *pRes = 0;
1685   if( pPage->u.hdr.rightChild==0 ){
1686     return SQLITE_OK;
1687   }
1688   rc = moveToLeftmost(pCur);
1689   return rc;
1690 }
1691 
1692 /*
1693 ** Step the cursor to the back to the previous entry in the database.  If
1694 ** successful then set *pRes=0.  If the cursor
1695 ** was already pointing to the first entry in the database before
1696 ** this routine was called, then set *pRes=1.
1697 */
fileBtreePrevious(BtCursor * pCur,int * pRes)1698 static int fileBtreePrevious(BtCursor *pCur, int *pRes){
1699   int rc;
1700   Pgno pgno;
1701   MemPage *pPage;
1702   pPage = pCur->pPage;
1703   if( pPage==0 ){
1704     *pRes = 1;
1705     return SQLITE_ABORT;
1706   }
1707   assert( pPage->isInit );
1708   assert( pCur->eSkip!=SKIP_INVALID );
1709   if( pPage->nCell==0 ){
1710     *pRes = 1;
1711     return SQLITE_OK;
1712   }
1713   if( pCur->eSkip==SKIP_PREV ){
1714     pCur->eSkip = SKIP_NONE;
1715     *pRes = 0;
1716     return SQLITE_OK;
1717   }
1718   pCur->eSkip = SKIP_NONE;
1719   assert( pCur->idx>=0 );
1720   if( (pgno = pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
1721     rc = moveToChild(pCur, pgno);
1722     if( rc ) return rc;
1723     rc = moveToRightmost(pCur);
1724   }else{
1725     while( pCur->idx==0 ){
1726       if( pPage->pParent==0 ){
1727         if( pRes ) *pRes = 1;
1728         return SQLITE_OK;
1729       }
1730       moveToParent(pCur);
1731       pPage = pCur->pPage;
1732     }
1733     pCur->idx--;
1734     rc = SQLITE_OK;
1735   }
1736   *pRes = 0;
1737   return rc;
1738 }
1739 
1740 /*
1741 ** Allocate a new page from the database file.
1742 **
1743 ** The new page is marked as dirty.  (In other words, sqlitepager_write()
1744 ** has already been called on the new page.)  The new page has also
1745 ** been referenced and the calling routine is responsible for calling
1746 ** sqlitepager_unref() on the new page when it is done.
1747 **
1748 ** SQLITE_OK is returned on success.  Any other return value indicates
1749 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
1750 ** Do not invoke sqlitepager_unref() on *ppPage if an error is returned.
1751 **
1752 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1753 ** locate a page close to the page number "nearby".  This can be used in an
1754 ** attempt to keep related pages close to each other in the database file,
1755 ** which in turn can make database access faster.
1756 */
allocatePage(Btree * pBt,MemPage ** ppPage,Pgno * pPgno,Pgno nearby)1757 static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
1758   PageOne *pPage1 = pBt->page1;
1759   int rc;
1760   if( pPage1->freeList ){
1761     OverflowPage *pOvfl;
1762     FreelistInfo *pInfo;
1763 
1764     rc = sqlitepager_write(pPage1);
1765     if( rc ) return rc;
1766     SWAB_ADD(pBt, pPage1->nFree, -1);
1767     rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1768                         (void**)&pOvfl);
1769     if( rc ) return rc;
1770     rc = sqlitepager_write(pOvfl);
1771     if( rc ){
1772       sqlitepager_unref(pOvfl);
1773       return rc;
1774     }
1775     pInfo = (FreelistInfo*)pOvfl->aPayload;
1776     if( pInfo->nFree==0 ){
1777       *pPgno = SWAB32(pBt, pPage1->freeList);
1778       pPage1->freeList = pOvfl->iNext;
1779       *ppPage = (MemPage*)pOvfl;
1780     }else{
1781       int closest, n;
1782       n = SWAB32(pBt, pInfo->nFree);
1783       if( n>1 && nearby>0 ){
1784         int i, dist;
1785         closest = 0;
1786         dist = SWAB32(pBt, pInfo->aFree[0]) - nearby;
1787         if( dist<0 ) dist = -dist;
1788         for(i=1; i<n; i++){
1789           int d2 = SWAB32(pBt, pInfo->aFree[i]) - nearby;
1790           if( d2<0 ) d2 = -d2;
1791           if( d2<dist ) closest = i;
1792         }
1793       }else{
1794         closest = 0;
1795       }
1796       SWAB_ADD(pBt, pInfo->nFree, -1);
1797       *pPgno = SWAB32(pBt, pInfo->aFree[closest]);
1798       pInfo->aFree[closest] = pInfo->aFree[n-1];
1799       rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
1800       sqlitepager_unref(pOvfl);
1801       if( rc==SQLITE_OK ){
1802         sqlitepager_dont_rollback(*ppPage);
1803         rc = sqlitepager_write(*ppPage);
1804       }
1805     }
1806   }else{
1807     *pPgno = sqlitepager_pagecount(pBt->pPager) + 1;
1808     rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
1809     if( rc ) return rc;
1810     rc = sqlitepager_write(*ppPage);
1811   }
1812   return rc;
1813 }
1814 
1815 /*
1816 ** Add a page of the database file to the freelist.  Either pgno or
1817 ** pPage but not both may be 0.
1818 **
1819 ** sqlitepager_unref() is NOT called for pPage.
1820 */
freePage(Btree * pBt,void * pPage,Pgno pgno)1821 static int freePage(Btree *pBt, void *pPage, Pgno pgno){
1822   PageOne *pPage1 = pBt->page1;
1823   OverflowPage *pOvfl = (OverflowPage*)pPage;
1824   int rc;
1825   int needUnref = 0;
1826   MemPage *pMemPage;
1827 
1828   if( pgno==0 ){
1829     assert( pOvfl!=0 );
1830     pgno = sqlitepager_pagenumber(pOvfl);
1831   }
1832   assert( pgno>2 );
1833   assert( sqlitepager_pagenumber(pOvfl)==pgno );
1834   pMemPage = (MemPage*)pPage;
1835   pMemPage->isInit = 0;
1836   if( pMemPage->pParent ){
1837     sqlitepager_unref(pMemPage->pParent);
1838     pMemPage->pParent = 0;
1839   }
1840   rc = sqlitepager_write(pPage1);
1841   if( rc ){
1842     return rc;
1843   }
1844   SWAB_ADD(pBt, pPage1->nFree, 1);
1845   if( pPage1->nFree!=0 && pPage1->freeList!=0 ){
1846     OverflowPage *pFreeIdx;
1847     rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1848                         (void**)&pFreeIdx);
1849     if( rc==SQLITE_OK ){
1850       FreelistInfo *pInfo = (FreelistInfo*)pFreeIdx->aPayload;
1851       int n = SWAB32(pBt, pInfo->nFree);
1852       if( n<(sizeof(pInfo->aFree)/sizeof(pInfo->aFree[0])) ){
1853         rc = sqlitepager_write(pFreeIdx);
1854         if( rc==SQLITE_OK ){
1855           pInfo->aFree[n] = SWAB32(pBt, pgno);
1856           SWAB_ADD(pBt, pInfo->nFree, 1);
1857           sqlitepager_unref(pFreeIdx);
1858           sqlitepager_dont_write(pBt->pPager, pgno);
1859           return rc;
1860         }
1861       }
1862       sqlitepager_unref(pFreeIdx);
1863     }
1864   }
1865   if( pOvfl==0 ){
1866     assert( pgno>0 );
1867     rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pOvfl);
1868     if( rc ) return rc;
1869     needUnref = 1;
1870   }
1871   rc = sqlitepager_write(pOvfl);
1872   if( rc ){
1873     if( needUnref ) sqlitepager_unref(pOvfl);
1874     return rc;
1875   }
1876   pOvfl->iNext = pPage1->freeList;
1877   pPage1->freeList = SWAB32(pBt, pgno);
1878   memset(pOvfl->aPayload, 0, OVERFLOW_SIZE);
1879   if( needUnref ) rc = sqlitepager_unref(pOvfl);
1880   return rc;
1881 }
1882 
1883 /*
1884 ** Erase all the data out of a cell.  This involves returning overflow
1885 ** pages back the freelist.
1886 */
clearCell(Btree * pBt,Cell * pCell)1887 static int clearCell(Btree *pBt, Cell *pCell){
1888   Pager *pPager = pBt->pPager;
1889   OverflowPage *pOvfl;
1890   Pgno ovfl, nextOvfl;
1891   int rc;
1892 
1893   if( NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h) <= MX_LOCAL_PAYLOAD ){
1894     return SQLITE_OK;
1895   }
1896   ovfl = SWAB32(pBt, pCell->ovfl);
1897   pCell->ovfl = 0;
1898   while( ovfl ){
1899     rc = sqlitepager_get(pPager, ovfl, (void**)&pOvfl);
1900     if( rc ) return rc;
1901     nextOvfl = SWAB32(pBt, pOvfl->iNext);
1902     rc = freePage(pBt, pOvfl, ovfl);
1903     if( rc ) return rc;
1904     sqlitepager_unref(pOvfl);
1905     ovfl = nextOvfl;
1906   }
1907   return SQLITE_OK;
1908 }
1909 
1910 /*
1911 ** Create a new cell from key and data.  Overflow pages are allocated as
1912 ** necessary and linked to this cell.
1913 */
fillInCell(Btree * pBt,Cell * pCell,const void * pKey,int nKey,const void * pData,int nData)1914 static int fillInCell(
1915   Btree *pBt,              /* The whole Btree.  Needed to allocate pages */
1916   Cell *pCell,             /* Populate this Cell structure */
1917   const void *pKey, int nKey,    /* The key */
1918   const void *pData,int nData    /* The data */
1919 ){
1920   OverflowPage *pOvfl, *pPrior;
1921   Pgno *pNext;
1922   int spaceLeft;
1923   int n, rc;
1924   int nPayload;
1925   const char *pPayload;
1926   char *pSpace;
1927   Pgno nearby = 0;
1928 
1929   pCell->h.leftChild = 0;
1930   pCell->h.nKey = SWAB16(pBt, nKey & 0xffff);
1931   pCell->h.nKeyHi = nKey >> 16;
1932   pCell->h.nData = SWAB16(pBt, nData & 0xffff);
1933   pCell->h.nDataHi = nData >> 16;
1934   pCell->h.iNext = 0;
1935 
1936   pNext = &pCell->ovfl;
1937   pSpace = pCell->aPayload;
1938   spaceLeft = MX_LOCAL_PAYLOAD;
1939   pPayload = pKey;
1940   pKey = 0;
1941   nPayload = nKey;
1942   pPrior = 0;
1943   while( nPayload>0 ){
1944     if( spaceLeft==0 ){
1945       rc = allocatePage(pBt, (MemPage**)&pOvfl, pNext, nearby);
1946       if( rc ){
1947         *pNext = 0;
1948       }else{
1949         nearby = *pNext;
1950       }
1951       if( pPrior ) sqlitepager_unref(pPrior);
1952       if( rc ){
1953         clearCell(pBt, pCell);
1954         return rc;
1955       }
1956       if( pBt->needSwab ) *pNext = swab32(*pNext);
1957       pPrior = pOvfl;
1958       spaceLeft = OVERFLOW_SIZE;
1959       pSpace = pOvfl->aPayload;
1960       pNext = &pOvfl->iNext;
1961     }
1962     n = nPayload;
1963     if( n>spaceLeft ) n = spaceLeft;
1964     memcpy(pSpace, pPayload, n);
1965     nPayload -= n;
1966     if( nPayload==0 && pData ){
1967       pPayload = pData;
1968       nPayload = nData;
1969       pData = 0;
1970     }else{
1971       pPayload += n;
1972     }
1973     spaceLeft -= n;
1974     pSpace += n;
1975   }
1976   *pNext = 0;
1977   if( pPrior ){
1978     sqlitepager_unref(pPrior);
1979   }
1980   return SQLITE_OK;
1981 }
1982 
1983 /*
1984 ** Change the MemPage.pParent pointer on the page whose number is
1985 ** given in the second argument so that MemPage.pParent holds the
1986 ** pointer in the third argument.
1987 */
reparentPage(Pager * pPager,Pgno pgno,MemPage * pNewParent,int idx)1988 static void reparentPage(Pager *pPager, Pgno pgno, MemPage *pNewParent,int idx){
1989   MemPage *pThis;
1990 
1991   if( pgno==0 ) return;
1992   assert( pPager!=0 );
1993   pThis = sqlitepager_lookup(pPager, pgno);
1994   if( pThis && pThis->isInit ){
1995     if( pThis->pParent!=pNewParent ){
1996       if( pThis->pParent ) sqlitepager_unref(pThis->pParent);
1997       pThis->pParent = pNewParent;
1998       if( pNewParent ) sqlitepager_ref(pNewParent);
1999     }
2000     pThis->idxParent = idx;
2001     sqlitepager_unref(pThis);
2002   }
2003 }
2004 
2005 /*
2006 ** Reparent all children of the given page to be the given page.
2007 ** In other words, for every child of pPage, invoke reparentPage()
2008 ** to make sure that each child knows that pPage is its parent.
2009 **
2010 ** This routine gets called after you memcpy() one page into
2011 ** another.
2012 */
reparentChildPages(Btree * pBt,MemPage * pPage)2013 static void reparentChildPages(Btree *pBt, MemPage *pPage){
2014   int i;
2015   Pager *pPager = pBt->pPager;
2016   for(i=0; i<pPage->nCell; i++){
2017     reparentPage(pPager, SWAB32(pBt, pPage->apCell[i]->h.leftChild), pPage, i);
2018   }
2019   reparentPage(pPager, SWAB32(pBt, pPage->u.hdr.rightChild), pPage, i);
2020   pPage->idxShift = 0;
2021 }
2022 
2023 /*
2024 ** Remove the i-th cell from pPage.  This routine effects pPage only.
2025 ** The cell content is not freed or deallocated.  It is assumed that
2026 ** the cell content has been copied someplace else.  This routine just
2027 ** removes the reference to the cell from pPage.
2028 **
2029 ** "sz" must be the number of bytes in the cell.
2030 **
2031 ** Do not bother maintaining the integrity of the linked list of Cells.
2032 ** Only the pPage->apCell[] array is important.  The relinkCellList()
2033 ** routine will be called soon after this routine in order to rebuild
2034 ** the linked list.
2035 */
dropCell(Btree * pBt,MemPage * pPage,int idx,int sz)2036 static void dropCell(Btree *pBt, MemPage *pPage, int idx, int sz){
2037   int j;
2038   assert( idx>=0 && idx<pPage->nCell );
2039   assert( sz==cellSize(pBt, pPage->apCell[idx]) );
2040   assert( sqlitepager_iswriteable(pPage) );
2041   freeSpace(pBt, pPage, Addr(pPage->apCell[idx]) - Addr(pPage), sz);
2042   for(j=idx; j<pPage->nCell-1; j++){
2043     pPage->apCell[j] = pPage->apCell[j+1];
2044   }
2045   pPage->nCell--;
2046   pPage->idxShift = 1;
2047 }
2048 
2049 /*
2050 ** Insert a new cell on pPage at cell index "i".  pCell points to the
2051 ** content of the cell.
2052 **
2053 ** If the cell content will fit on the page, then put it there.  If it
2054 ** will not fit, then just make pPage->apCell[i] point to the content
2055 ** and set pPage->isOverfull.
2056 **
2057 ** Do not bother maintaining the integrity of the linked list of Cells.
2058 ** Only the pPage->apCell[] array is important.  The relinkCellList()
2059 ** routine will be called soon after this routine in order to rebuild
2060 ** the linked list.
2061 */
insertCell(Btree * pBt,MemPage * pPage,int i,Cell * pCell,int sz)2062 static void insertCell(Btree *pBt, MemPage *pPage, int i, Cell *pCell, int sz){
2063   int idx, j;
2064   assert( i>=0 && i<=pPage->nCell );
2065   assert( sz==cellSize(pBt, pCell) );
2066   assert( sqlitepager_iswriteable(pPage) );
2067   idx = allocateSpace(pBt, pPage, sz);
2068   for(j=pPage->nCell; j>i; j--){
2069     pPage->apCell[j] = pPage->apCell[j-1];
2070   }
2071   pPage->nCell++;
2072   if( idx<=0 ){
2073     pPage->isOverfull = 1;
2074     pPage->apCell[i] = pCell;
2075   }else{
2076     memcpy(&pPage->u.aDisk[idx], pCell, sz);
2077     pPage->apCell[i] = (Cell*)&pPage->u.aDisk[idx];
2078   }
2079   pPage->idxShift = 1;
2080 }
2081 
2082 /*
2083 ** Rebuild the linked list of cells on a page so that the cells
2084 ** occur in the order specified by the pPage->apCell[] array.
2085 ** Invoke this routine once to repair damage after one or more
2086 ** invocations of either insertCell() or dropCell().
2087 */
relinkCellList(Btree * pBt,MemPage * pPage)2088 static void relinkCellList(Btree *pBt, MemPage *pPage){
2089   int i;
2090   u16 *pIdx;
2091   assert( sqlitepager_iswriteable(pPage) );
2092   pIdx = &pPage->u.hdr.firstCell;
2093   for(i=0; i<pPage->nCell; i++){
2094     int idx = Addr(pPage->apCell[i]) - Addr(pPage);
2095     assert( idx>0 && idx<SQLITE_USABLE_SIZE );
2096     *pIdx = SWAB16(pBt, idx);
2097     pIdx = &pPage->apCell[i]->h.iNext;
2098   }
2099   *pIdx = 0;
2100 }
2101 
2102 /*
2103 ** Make a copy of the contents of pFrom into pTo.  The pFrom->apCell[]
2104 ** pointers that point into pFrom->u.aDisk[] must be adjusted to point
2105 ** into pTo->u.aDisk[] instead.  But some pFrom->apCell[] entries might
2106 ** not point to pFrom->u.aDisk[].  Those are unchanged.
2107 */
copyPage(MemPage * pTo,MemPage * pFrom)2108 static void copyPage(MemPage *pTo, MemPage *pFrom){
2109   uptr from, to;
2110   int i;
2111   memcpy(pTo->u.aDisk, pFrom->u.aDisk, SQLITE_USABLE_SIZE);
2112   pTo->pParent = 0;
2113   pTo->isInit = 1;
2114   pTo->nCell = pFrom->nCell;
2115   pTo->nFree = pFrom->nFree;
2116   pTo->isOverfull = pFrom->isOverfull;
2117   to = Addr(pTo);
2118   from = Addr(pFrom);
2119   for(i=0; i<pTo->nCell; i++){
2120     uptr x = Addr(pFrom->apCell[i]);
2121     if( x>from && x<from+SQLITE_USABLE_SIZE ){
2122       *((uptr*)&pTo->apCell[i]) = x + to - from;
2123     }else{
2124       pTo->apCell[i] = pFrom->apCell[i];
2125     }
2126   }
2127 }
2128 
2129 /*
2130 ** The following parameters determine how many adjacent pages get involved
2131 ** in a balancing operation.  NN is the number of neighbors on either side
2132 ** of the page that participate in the balancing operation.  NB is the
2133 ** total number of pages that participate, including the target page and
2134 ** NN neighbors on either side.
2135 **
2136 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
2137 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2138 ** in exchange for a larger degradation in INSERT and UPDATE performance.
2139 ** The value of NN appears to give the best results overall.
2140 */
2141 #define NN 1             /* Number of neighbors on either side of pPage */
2142 #define NB (NN*2+1)      /* Total pages involved in the balance */
2143 
2144 /*
2145 ** This routine redistributes Cells on pPage and up to two siblings
2146 ** of pPage so that all pages have about the same amount of free space.
2147 ** Usually one sibling on either side of pPage is used in the balancing,
2148 ** though both siblings might come from one side if pPage is the first
2149 ** or last child of its parent.  If pPage has fewer than two siblings
2150 ** (something which can only happen if pPage is the root page or a
2151 ** child of root) then all available siblings participate in the balancing.
2152 **
2153 ** The number of siblings of pPage might be increased or decreased by
2154 ** one in an effort to keep pages between 66% and 100% full. The root page
2155 ** is special and is allowed to be less than 66% full. If pPage is
2156 ** the root page, then the depth of the tree might be increased
2157 ** or decreased by one, as necessary, to keep the root page from being
2158 ** overfull or empty.
2159 **
2160 ** This routine calls relinkCellList() on its input page regardless of
2161 ** whether or not it does any real balancing.  Client routines will typically
2162 ** invoke insertCell() or dropCell() before calling this routine, so we
2163 ** need to call relinkCellList() to clean up the mess that those other
2164 ** routines left behind.
2165 **
2166 ** pCur is left pointing to the same cell as when this routine was called
2167 ** even if that cell gets moved to a different page.  pCur may be NULL.
2168 ** Set the pCur parameter to NULL if you do not care about keeping track
2169 ** of a cell as that will save this routine the work of keeping track of it.
2170 **
2171 ** Note that when this routine is called, some of the Cells on pPage
2172 ** might not actually be stored in pPage->u.aDisk[].  This can happen
2173 ** if the page is overfull.  Part of the job of this routine is to
2174 ** make sure all Cells for pPage once again fit in pPage->u.aDisk[].
2175 **
2176 ** In the course of balancing the siblings of pPage, the parent of pPage
2177 ** might become overfull or underfull.  If that happens, then this routine
2178 ** is called recursively on the parent.
2179 **
2180 ** If this routine fails for any reason, it might leave the database
2181 ** in a corrupted state.  So if this routine fails, the database should
2182 ** be rolled back.
2183 */
balance(Btree * pBt,MemPage * pPage,BtCursor * pCur)2184 static int balance(Btree *pBt, MemPage *pPage, BtCursor *pCur){
2185   MemPage *pParent;            /* The parent of pPage */
2186   int nCell;                   /* Number of cells in apCell[] */
2187   int nOld;                    /* Number of pages in apOld[] */
2188   int nNew;                    /* Number of pages in apNew[] */
2189   int nDiv;                    /* Number of cells in apDiv[] */
2190   int i, j, k;                 /* Loop counters */
2191   int idx;                     /* Index of pPage in pParent->apCell[] */
2192   int nxDiv;                   /* Next divider slot in pParent->apCell[] */
2193   int rc;                      /* The return code */
2194   int iCur;                    /* apCell[iCur] is the cell of the cursor */
2195   MemPage *pOldCurPage;        /* The cursor originally points to this page */
2196   int subtotal;                /* Subtotal of bytes in cells on one page */
2197   MemPage *extraUnref = 0;     /* A page that needs to be unref-ed */
2198   MemPage *apOld[NB];          /* pPage and up to two siblings */
2199   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
2200   MemPage *apNew[NB+1];        /* pPage and up to NB siblings after balancing */
2201   Pgno pgnoNew[NB+1];          /* Page numbers for each page in apNew[] */
2202   int idxDiv[NB];              /* Indices of divider cells in pParent */
2203   Cell *apDiv[NB];             /* Divider cells in pParent */
2204   Cell aTemp[NB];              /* Temporary holding area for apDiv[] */
2205   int cntNew[NB+1];            /* Index in apCell[] of cell after i-th page */
2206   int szNew[NB+1];             /* Combined size of cells place on i-th page */
2207   MemPage aOld[NB];            /* Temporary copies of pPage and its siblings */
2208   Cell *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
2209   int szCell[(MX_CELL+2)*NB];  /* Local size of all cells */
2210 
2211   /*
2212   ** Return without doing any work if pPage is neither overfull nor
2213   ** underfull.
2214   */
2215   assert( sqlitepager_iswriteable(pPage) );
2216   if( !pPage->isOverfull && pPage->nFree<SQLITE_USABLE_SIZE/2
2217         && pPage->nCell>=2){
2218     relinkCellList(pBt, pPage);
2219     return SQLITE_OK;
2220   }
2221 
2222   /*
2223   ** Find the parent of the page to be balanceed.
2224   ** If there is no parent, it means this page is the root page and
2225   ** special rules apply.
2226   */
2227   pParent = pPage->pParent;
2228   if( pParent==0 ){
2229     Pgno pgnoChild;
2230     MemPage *pChild;
2231     assert( pPage->isInit );
2232     if( pPage->nCell==0 ){
2233       if( pPage->u.hdr.rightChild ){
2234         /*
2235         ** The root page is empty.  Copy the one child page
2236         ** into the root page and return.  This reduces the depth
2237         ** of the BTree by one.
2238         */
2239         pgnoChild = SWAB32(pBt, pPage->u.hdr.rightChild);
2240         rc = sqlitepager_get(pBt->pPager, pgnoChild, (void**)&pChild);
2241         if( rc ) return rc;
2242         memcpy(pPage, pChild, SQLITE_USABLE_SIZE);
2243         pPage->isInit = 0;
2244         rc = initPage(pBt, pPage, sqlitepager_pagenumber(pPage), 0);
2245         assert( rc==SQLITE_OK );
2246         reparentChildPages(pBt, pPage);
2247         if( pCur && pCur->pPage==pChild ){
2248           sqlitepager_unref(pChild);
2249           pCur->pPage = pPage;
2250           sqlitepager_ref(pPage);
2251         }
2252         freePage(pBt, pChild, pgnoChild);
2253         sqlitepager_unref(pChild);
2254       }else{
2255         relinkCellList(pBt, pPage);
2256       }
2257       return SQLITE_OK;
2258     }
2259     if( !pPage->isOverfull ){
2260       /* It is OK for the root page to be less than half full.
2261       */
2262       relinkCellList(pBt, pPage);
2263       return SQLITE_OK;
2264     }
2265     /*
2266     ** If we get to here, it means the root page is overfull.
2267     ** When this happens, Create a new child page and copy the
2268     ** contents of the root into the child.  Then make the root
2269     ** page an empty page with rightChild pointing to the new
2270     ** child.  Then fall thru to the code below which will cause
2271     ** the overfull child page to be split.
2272     */
2273     rc = sqlitepager_write(pPage);
2274     if( rc ) return rc;
2275     rc = allocatePage(pBt, &pChild, &pgnoChild, sqlitepager_pagenumber(pPage));
2276     if( rc ) return rc;
2277     assert( sqlitepager_iswriteable(pChild) );
2278     copyPage(pChild, pPage);
2279     pChild->pParent = pPage;
2280     pChild->idxParent = 0;
2281     sqlitepager_ref(pPage);
2282     pChild->isOverfull = 1;
2283     if( pCur && pCur->pPage==pPage ){
2284       sqlitepager_unref(pPage);
2285       pCur->pPage = pChild;
2286     }else{
2287       extraUnref = pChild;
2288     }
2289     zeroPage(pBt, pPage);
2290     pPage->u.hdr.rightChild = SWAB32(pBt, pgnoChild);
2291     pParent = pPage;
2292     pPage = pChild;
2293   }
2294   rc = sqlitepager_write(pParent);
2295   if( rc ) return rc;
2296   assert( pParent->isInit );
2297 
2298   /*
2299   ** Find the Cell in the parent page whose h.leftChild points back
2300   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
2301   ** is the rightmost child of pParent then set idx to pParent->nCell
2302   */
2303   if( pParent->idxShift ){
2304     Pgno pgno, swabPgno;
2305     pgno = sqlitepager_pagenumber(pPage);
2306     swabPgno = SWAB32(pBt, pgno);
2307     for(idx=0; idx<pParent->nCell; idx++){
2308       if( pParent->apCell[idx]->h.leftChild==swabPgno ){
2309         break;
2310       }
2311     }
2312     assert( idx<pParent->nCell || pParent->u.hdr.rightChild==swabPgno );
2313   }else{
2314     idx = pPage->idxParent;
2315   }
2316 
2317   /*
2318   ** Initialize variables so that it will be safe to jump
2319   ** directly to balance_cleanup at any moment.
2320   */
2321   nOld = nNew = 0;
2322   sqlitepager_ref(pParent);
2323 
2324   /*
2325   ** Find sibling pages to pPage and the Cells in pParent that divide
2326   ** the siblings.  An attempt is made to find NN siblings on either
2327   ** side of pPage.  More siblings are taken from one side, however, if
2328   ** pPage there are fewer than NN siblings on the other side.  If pParent
2329   ** has NB or fewer children then all children of pParent are taken.
2330   */
2331   nxDiv = idx - NN;
2332   if( nxDiv + NB > pParent->nCell ){
2333     nxDiv = pParent->nCell - NB + 1;
2334   }
2335   if( nxDiv<0 ){
2336     nxDiv = 0;
2337   }
2338   nDiv = 0;
2339   for(i=0, k=nxDiv; i<NB; i++, k++){
2340     if( k<pParent->nCell ){
2341       idxDiv[i] = k;
2342       apDiv[i] = pParent->apCell[k];
2343       nDiv++;
2344       pgnoOld[i] = SWAB32(pBt, apDiv[i]->h.leftChild);
2345     }else if( k==pParent->nCell ){
2346       pgnoOld[i] = SWAB32(pBt, pParent->u.hdr.rightChild);
2347     }else{
2348       break;
2349     }
2350     rc = sqlitepager_get(pBt->pPager, pgnoOld[i], (void**)&apOld[i]);
2351     if( rc ) goto balance_cleanup;
2352     rc = initPage(pBt, apOld[i], pgnoOld[i], pParent);
2353     if( rc ) goto balance_cleanup;
2354     apOld[i]->idxParent = k;
2355     nOld++;
2356   }
2357 
2358   /*
2359   ** Set iCur to be the index in apCell[] of the cell that the cursor
2360   ** is pointing to.  We will need this later on in order to keep the
2361   ** cursor pointing at the same cell.  If pCur points to a page that
2362   ** has no involvement with this rebalancing, then set iCur to a large
2363   ** number so that the iCur==j tests always fail in the main cell
2364   ** distribution loop below.
2365   */
2366   if( pCur ){
2367     iCur = 0;
2368     for(i=0; i<nOld; i++){
2369       if( pCur->pPage==apOld[i] ){
2370         iCur += pCur->idx;
2371         break;
2372       }
2373       iCur += apOld[i]->nCell;
2374       if( i<nOld-1 && pCur->pPage==pParent && pCur->idx==idxDiv[i] ){
2375         break;
2376       }
2377       iCur++;
2378     }
2379     pOldCurPage = pCur->pPage;
2380   }
2381 
2382   /*
2383   ** Make copies of the content of pPage and its siblings into aOld[].
2384   ** The rest of this function will use data from the copies rather
2385   ** that the original pages since the original pages will be in the
2386   ** process of being overwritten.
2387   */
2388   for(i=0; i<nOld; i++){
2389     copyPage(&aOld[i], apOld[i]);
2390   }
2391 
2392   /*
2393   ** Load pointers to all cells on sibling pages and the divider cells
2394   ** into the local apCell[] array.  Make copies of the divider cells
2395   ** into aTemp[] and remove the the divider Cells from pParent.
2396   */
2397   nCell = 0;
2398   for(i=0; i<nOld; i++){
2399     MemPage *pOld = &aOld[i];
2400     for(j=0; j<pOld->nCell; j++){
2401       apCell[nCell] = pOld->apCell[j];
2402       szCell[nCell] = cellSize(pBt, apCell[nCell]);
2403       nCell++;
2404     }
2405     if( i<nOld-1 ){
2406       szCell[nCell] = cellSize(pBt, apDiv[i]);
2407       memcpy(&aTemp[i], apDiv[i], szCell[nCell]);
2408       apCell[nCell] = &aTemp[i];
2409       dropCell(pBt, pParent, nxDiv, szCell[nCell]);
2410       assert( SWAB32(pBt, apCell[nCell]->h.leftChild)==pgnoOld[i] );
2411       apCell[nCell]->h.leftChild = pOld->u.hdr.rightChild;
2412       nCell++;
2413     }
2414   }
2415 
2416   /*
2417   ** Figure out the number of pages needed to hold all nCell cells.
2418   ** Store this number in "k".  Also compute szNew[] which is the total
2419   ** size of all cells on the i-th page and cntNew[] which is the index
2420   ** in apCell[] of the cell that divides path i from path i+1.
2421   ** cntNew[k] should equal nCell.
2422   **
2423   ** This little patch of code is critical for keeping the tree
2424   ** balanced.
2425   */
2426   for(subtotal=k=i=0; i<nCell; i++){
2427     subtotal += szCell[i];
2428     if( subtotal > USABLE_SPACE ){
2429       szNew[k] = subtotal - szCell[i];
2430       cntNew[k] = i;
2431       subtotal = 0;
2432       k++;
2433     }
2434   }
2435   szNew[k] = subtotal;
2436   cntNew[k] = nCell;
2437   k++;
2438   for(i=k-1; i>0; i--){
2439     while( szNew[i]<USABLE_SPACE/2 ){
2440       cntNew[i-1]--;
2441       assert( cntNew[i-1]>0 );
2442       szNew[i] += szCell[cntNew[i-1]];
2443       szNew[i-1] -= szCell[cntNew[i-1]-1];
2444     }
2445   }
2446   assert( cntNew[0]>0 );
2447 
2448   /*
2449   ** Allocate k new pages.  Reuse old pages where possible.
2450   */
2451   for(i=0; i<k; i++){
2452     if( i<nOld ){
2453       apNew[i] = apOld[i];
2454       pgnoNew[i] = pgnoOld[i];
2455       apOld[i] = 0;
2456       rc = sqlitepager_write(apNew[i]);
2457       if( rc ) goto balance_cleanup;
2458     }else{
2459       rc = allocatePage(pBt, &apNew[i], &pgnoNew[i], pgnoNew[i-1]);
2460       if( rc ) goto balance_cleanup;
2461     }
2462     nNew++;
2463     zeroPage(pBt, apNew[i]);
2464     apNew[i]->isInit = 1;
2465   }
2466 
2467   /* Free any old pages that were not reused as new pages.
2468   */
2469   while( i<nOld ){
2470     rc = freePage(pBt, apOld[i], pgnoOld[i]);
2471     if( rc ) goto balance_cleanup;
2472     sqlitepager_unref(apOld[i]);
2473     apOld[i] = 0;
2474     i++;
2475   }
2476 
2477   /*
2478   ** Put the new pages in accending order.  This helps to
2479   ** keep entries in the disk file in order so that a scan
2480   ** of the table is a linear scan through the file.  That
2481   ** in turn helps the operating system to deliver pages
2482   ** from the disk more rapidly.
2483   **
2484   ** An O(n^2) insertion sort algorithm is used, but since
2485   ** n is never more than NB (a small constant), that should
2486   ** not be a problem.
2487   **
2488   ** When NB==3, this one optimization makes the database
2489   ** about 25% faster for large insertions and deletions.
2490   */
2491   for(i=0; i<k-1; i++){
2492     int minV = pgnoNew[i];
2493     int minI = i;
2494     for(j=i+1; j<k; j++){
2495       if( pgnoNew[j]<(unsigned)minV ){
2496         minI = j;
2497         minV = pgnoNew[j];
2498       }
2499     }
2500     if( minI>i ){
2501       int t;
2502       MemPage *pT;
2503       t = pgnoNew[i];
2504       pT = apNew[i];
2505       pgnoNew[i] = pgnoNew[minI];
2506       apNew[i] = apNew[minI];
2507       pgnoNew[minI] = t;
2508       apNew[minI] = pT;
2509     }
2510   }
2511 
2512   /*
2513   ** Evenly distribute the data in apCell[] across the new pages.
2514   ** Insert divider cells into pParent as necessary.
2515   */
2516   j = 0;
2517   for(i=0; i<nNew; i++){
2518     MemPage *pNew = apNew[i];
2519     while( j<cntNew[i] ){
2520       assert( pNew->nFree>=szCell[j] );
2521       if( pCur && iCur==j ){ pCur->pPage = pNew; pCur->idx = pNew->nCell; }
2522       insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]);
2523       j++;
2524     }
2525     assert( pNew->nCell>0 );
2526     assert( !pNew->isOverfull );
2527     relinkCellList(pBt, pNew);
2528     if( i<nNew-1 && j<nCell ){
2529       pNew->u.hdr.rightChild = apCell[j]->h.leftChild;
2530       apCell[j]->h.leftChild = SWAB32(pBt, pgnoNew[i]);
2531       if( pCur && iCur==j ){ pCur->pPage = pParent; pCur->idx = nxDiv; }
2532       insertCell(pBt, pParent, nxDiv, apCell[j], szCell[j]);
2533       j++;
2534       nxDiv++;
2535     }
2536   }
2537   assert( j==nCell );
2538   apNew[nNew-1]->u.hdr.rightChild = aOld[nOld-1].u.hdr.rightChild;
2539   if( nxDiv==pParent->nCell ){
2540     pParent->u.hdr.rightChild = SWAB32(pBt, pgnoNew[nNew-1]);
2541   }else{
2542     pParent->apCell[nxDiv]->h.leftChild = SWAB32(pBt, pgnoNew[nNew-1]);
2543   }
2544   if( pCur ){
2545     if( j<=iCur && pCur->pPage==pParent && pCur->idx>idxDiv[nOld-1] ){
2546       assert( pCur->pPage==pOldCurPage );
2547       pCur->idx += nNew - nOld;
2548     }else{
2549       assert( pOldCurPage!=0 );
2550       sqlitepager_ref(pCur->pPage);
2551       sqlitepager_unref(pOldCurPage);
2552     }
2553   }
2554 
2555   /*
2556   ** Reparent children of all cells.
2557   */
2558   for(i=0; i<nNew; i++){
2559     reparentChildPages(pBt, apNew[i]);
2560   }
2561   reparentChildPages(pBt, pParent);
2562 
2563   /*
2564   ** balance the parent page.
2565   */
2566   rc = balance(pBt, pParent, pCur);
2567 
2568   /*
2569   ** Cleanup before returning.
2570   */
2571 balance_cleanup:
2572   if( extraUnref ){
2573     sqlitepager_unref(extraUnref);
2574   }
2575   for(i=0; i<nOld; i++){
2576     if( apOld[i]!=0 && apOld[i]!=&aOld[i] ) sqlitepager_unref(apOld[i]);
2577   }
2578   for(i=0; i<nNew; i++){
2579     sqlitepager_unref(apNew[i]);
2580   }
2581   if( pCur && pCur->pPage==0 ){
2582     pCur->pPage = pParent;
2583     pCur->idx = 0;
2584   }else{
2585     sqlitepager_unref(pParent);
2586   }
2587   return rc;
2588 }
2589 
2590 /*
2591 ** This routine checks all cursors that point to the same table
2592 ** as pCur points to.  If any of those cursors were opened with
2593 ** wrFlag==0 then this routine returns SQLITE_LOCKED.  If all
2594 ** cursors point to the same table were opened with wrFlag==1
2595 ** then this routine returns SQLITE_OK.
2596 **
2597 ** In addition to checking for read-locks (where a read-lock
2598 ** means a cursor opened with wrFlag==0) this routine also moves
2599 ** all cursors other than pCur so that they are pointing to the
2600 ** first Cell on root page.  This is necessary because an insert
2601 ** or delete might change the number of cells on a page or delete
2602 ** a page entirely and we do not want to leave any cursors
2603 ** pointing to non-existant pages or cells.
2604 */
checkReadLocks(BtCursor * pCur)2605 static int checkReadLocks(BtCursor *pCur){
2606   BtCursor *p;
2607   assert( pCur->wrFlag );
2608   for(p=pCur->pShared; p!=pCur; p=p->pShared){
2609     assert( p );
2610     assert( p->pgnoRoot==pCur->pgnoRoot );
2611     if( p->wrFlag==0 ) return SQLITE_LOCKED;
2612     if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){
2613       moveToRoot(p);
2614     }
2615   }
2616   return SQLITE_OK;
2617 }
2618 
2619 /*
2620 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
2621 ** and the data is given by (pData,nData).  The cursor is used only to
2622 ** define what database the record should be inserted into.  The cursor
2623 ** is left pointing at the new record.
2624 */
fileBtreeInsert(BtCursor * pCur,const void * pKey,int nKey,const void * pData,int nData)2625 static int fileBtreeInsert(
2626   BtCursor *pCur,                /* Insert data into the table of this cursor */
2627   const void *pKey, int nKey,    /* The key of the new record */
2628   const void *pData, int nData   /* The data of the new record */
2629 ){
2630   Cell newCell;
2631   int rc;
2632   int loc;
2633   int szNew;
2634   MemPage *pPage;
2635   Btree *pBt = pCur->pBt;
2636 
2637   if( pCur->pPage==0 ){
2638     return SQLITE_ABORT;  /* A rollback destroyed this cursor */
2639   }
2640   if( !pBt->inTrans || nKey+nData==0 ){
2641     /* Must start a transaction before doing an insert */
2642     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2643   }
2644   assert( !pBt->readOnly );
2645   if( !pCur->wrFlag ){
2646     return SQLITE_PERM;   /* Cursor not open for writing */
2647   }
2648   if( checkReadLocks(pCur) ){
2649     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2650   }
2651   rc = fileBtreeMoveto(pCur, pKey, nKey, &loc);
2652   if( rc ) return rc;
2653   pPage = pCur->pPage;
2654   assert( pPage->isInit );
2655   rc = sqlitepager_write(pPage);
2656   if( rc ) return rc;
2657   rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData);
2658   if( rc ) return rc;
2659   szNew = cellSize(pBt, &newCell);
2660   if( loc==0 ){
2661     newCell.h.leftChild = pPage->apCell[pCur->idx]->h.leftChild;
2662     rc = clearCell(pBt, pPage->apCell[pCur->idx]);
2663     if( rc ) return rc;
2664     dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pPage->apCell[pCur->idx]));
2665   }else if( loc<0 && pPage->nCell>0 ){
2666     assert( pPage->u.hdr.rightChild==0 );  /* Must be a leaf page */
2667     pCur->idx++;
2668   }else{
2669     assert( pPage->u.hdr.rightChild==0 );  /* Must be a leaf page */
2670   }
2671   insertCell(pBt, pPage, pCur->idx, &newCell, szNew);
2672   rc = balance(pCur->pBt, pPage, pCur);
2673   /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
2674   /* fflush(stdout); */
2675   pCur->eSkip = SKIP_INVALID;
2676   return rc;
2677 }
2678 
2679 /*
2680 ** Delete the entry that the cursor is pointing to.
2681 **
2682 ** The cursor is left pointing at either the next or the previous
2683 ** entry.  If the cursor is left pointing to the next entry, then
2684 ** the pCur->eSkip flag is set to SKIP_NEXT which forces the next call to
2685 ** sqliteBtreeNext() to be a no-op.  That way, you can always call
2686 ** sqliteBtreeNext() after a delete and the cursor will be left
2687 ** pointing to the first entry after the deleted entry.  Similarly,
2688 ** pCur->eSkip is set to SKIP_PREV is the cursor is left pointing to
2689 ** the entry prior to the deleted entry so that a subsequent call to
2690 ** sqliteBtreePrevious() will always leave the cursor pointing at the
2691 ** entry immediately before the one that was deleted.
2692 */
fileBtreeDelete(BtCursor * pCur)2693 static int fileBtreeDelete(BtCursor *pCur){
2694   MemPage *pPage = pCur->pPage;
2695   Cell *pCell;
2696   int rc;
2697   Pgno pgnoChild;
2698   Btree *pBt = pCur->pBt;
2699 
2700   assert( pPage->isInit );
2701   if( pCur->pPage==0 ){
2702     return SQLITE_ABORT;  /* A rollback destroyed this cursor */
2703   }
2704   if( !pBt->inTrans ){
2705     /* Must start a transaction before doing a delete */
2706     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2707   }
2708   assert( !pBt->readOnly );
2709   if( pCur->idx >= pPage->nCell ){
2710     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
2711   }
2712   if( !pCur->wrFlag ){
2713     return SQLITE_PERM;   /* Did not open this cursor for writing */
2714   }
2715   if( checkReadLocks(pCur) ){
2716     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2717   }
2718   rc = sqlitepager_write(pPage);
2719   if( rc ) return rc;
2720   pCell = pPage->apCell[pCur->idx];
2721   pgnoChild = SWAB32(pBt, pCell->h.leftChild);
2722   rc = clearCell(pBt, pCell);
2723   if( rc ) return rc;
2724   if( pgnoChild ){
2725     /*
2726     ** The entry we are about to delete is not a leaf so if we do not
2727     ** do something we will leave a hole on an internal page.
2728     ** We have to fill the hole by moving in a cell from a leaf.  The
2729     ** next Cell after the one to be deleted is guaranteed to exist and
2730     ** to be a leaf so we can use it.
2731     */
2732     BtCursor leafCur;
2733     Cell *pNext;
2734     int szNext;
2735     int notUsed;
2736     getTempCursor(pCur, &leafCur);
2737     rc = fileBtreeNext(&leafCur, &notUsed);
2738     if( rc!=SQLITE_OK ){
2739       if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
2740       return rc;
2741     }
2742     rc = sqlitepager_write(leafCur.pPage);
2743     if( rc ) return rc;
2744     dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
2745     pNext = leafCur.pPage->apCell[leafCur.idx];
2746     szNext = cellSize(pBt, pNext);
2747     pNext->h.leftChild = SWAB32(pBt, pgnoChild);
2748     insertCell(pBt, pPage, pCur->idx, pNext, szNext);
2749     rc = balance(pBt, pPage, pCur);
2750     if( rc ) return rc;
2751     pCur->eSkip = SKIP_NEXT;
2752     dropCell(pBt, leafCur.pPage, leafCur.idx, szNext);
2753     rc = balance(pBt, leafCur.pPage, pCur);
2754     releaseTempCursor(&leafCur);
2755   }else{
2756     dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
2757     if( pCur->idx>=pPage->nCell ){
2758       pCur->idx = pPage->nCell-1;
2759       if( pCur->idx<0 ){
2760         pCur->idx = 0;
2761         pCur->eSkip = SKIP_NEXT;
2762       }else{
2763         pCur->eSkip = SKIP_PREV;
2764       }
2765     }else{
2766       pCur->eSkip = SKIP_NEXT;
2767     }
2768     rc = balance(pBt, pPage, pCur);
2769   }
2770   return rc;
2771 }
2772 
2773 /*
2774 ** Create a new BTree table.  Write into *piTable the page
2775 ** number for the root page of the new table.
2776 **
2777 ** In the current implementation, BTree tables and BTree indices are the
2778 ** the same.  In the future, we may change this so that BTree tables
2779 ** are restricted to having a 4-byte integer key and arbitrary data and
2780 ** BTree indices are restricted to having an arbitrary key and no data.
2781 ** But for now, this routine also serves to create indices.
2782 */
fileBtreeCreateTable(Btree * pBt,int * piTable)2783 static int fileBtreeCreateTable(Btree *pBt, int *piTable){
2784   MemPage *pRoot;
2785   Pgno pgnoRoot;
2786   int rc;
2787   if( !pBt->inTrans ){
2788     /* Must start a transaction first */
2789     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2790   }
2791   if( pBt->readOnly ){
2792     return SQLITE_READONLY;
2793   }
2794   rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0);
2795   if( rc ) return rc;
2796   assert( sqlitepager_iswriteable(pRoot) );
2797   zeroPage(pBt, pRoot);
2798   sqlitepager_unref(pRoot);
2799   *piTable = (int)pgnoRoot;
2800   return SQLITE_OK;
2801 }
2802 
2803 /*
2804 ** Erase the given database page and all its children.  Return
2805 ** the page to the freelist.
2806 */
clearDatabasePage(Btree * pBt,Pgno pgno,int freePageFlag)2807 static int clearDatabasePage(Btree *pBt, Pgno pgno, int freePageFlag){
2808   MemPage *pPage;
2809   int rc;
2810   Cell *pCell;
2811   int idx;
2812 
2813   rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pPage);
2814   if( rc ) return rc;
2815   rc = sqlitepager_write(pPage);
2816   if( rc ) return rc;
2817   rc = initPage(pBt, pPage, pgno, 0);
2818   if( rc ) return rc;
2819   idx = SWAB16(pBt, pPage->u.hdr.firstCell);
2820   while( idx>0 ){
2821     pCell = (Cell*)&pPage->u.aDisk[idx];
2822     idx = SWAB16(pBt, pCell->h.iNext);
2823     if( pCell->h.leftChild ){
2824       rc = clearDatabasePage(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
2825       if( rc ) return rc;
2826     }
2827     rc = clearCell(pBt, pCell);
2828     if( rc ) return rc;
2829   }
2830   if( pPage->u.hdr.rightChild ){
2831     rc = clearDatabasePage(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
2832     if( rc ) return rc;
2833   }
2834   if( freePageFlag ){
2835     rc = freePage(pBt, pPage, pgno);
2836   }else{
2837     zeroPage(pBt, pPage);
2838   }
2839   sqlitepager_unref(pPage);
2840   return rc;
2841 }
2842 
2843 /*
2844 ** Delete all information from a single table in the database.
2845 */
fileBtreeClearTable(Btree * pBt,int iTable)2846 static int fileBtreeClearTable(Btree *pBt, int iTable){
2847   int rc;
2848   BtCursor *pCur;
2849   if( !pBt->inTrans ){
2850     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2851   }
2852   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2853     if( pCur->pgnoRoot==(Pgno)iTable ){
2854       if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
2855       moveToRoot(pCur);
2856     }
2857   }
2858   rc = clearDatabasePage(pBt, (Pgno)iTable, 0);
2859   if( rc ){
2860     fileBtreeRollback(pBt);
2861   }
2862   return rc;
2863 }
2864 
2865 /*
2866 ** Erase all information in a table and add the root of the table to
2867 ** the freelist.  Except, the root of the principle table (the one on
2868 ** page 2) is never added to the freelist.
2869 */
fileBtreeDropTable(Btree * pBt,int iTable)2870 static int fileBtreeDropTable(Btree *pBt, int iTable){
2871   int rc;
2872   MemPage *pPage;
2873   BtCursor *pCur;
2874   if( !pBt->inTrans ){
2875     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2876   }
2877   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2878     if( pCur->pgnoRoot==(Pgno)iTable ){
2879       return SQLITE_LOCKED;  /* Cannot drop a table that has a cursor */
2880     }
2881   }
2882   rc = sqlitepager_get(pBt->pPager, (Pgno)iTable, (void**)&pPage);
2883   if( rc ) return rc;
2884   rc = fileBtreeClearTable(pBt, iTable);
2885   if( rc ) return rc;
2886   if( iTable>2 ){
2887     rc = freePage(pBt, pPage, iTable);
2888   }else{
2889     zeroPage(pBt, pPage);
2890   }
2891   sqlitepager_unref(pPage);
2892   return rc;
2893 }
2894 
2895 #if 0 /* UNTESTED */
2896 /*
2897 ** Copy all cell data from one database file into another.
2898 ** pages back the freelist.
2899 */
2900 static int copyCell(Btree *pBtFrom, BTree *pBtTo, Cell *pCell){
2901   Pager *pFromPager = pBtFrom->pPager;
2902   OverflowPage *pOvfl;
2903   Pgno ovfl, nextOvfl;
2904   Pgno *pPrev;
2905   int rc = SQLITE_OK;
2906   MemPage *pNew, *pPrevPg;
2907   Pgno new;
2908 
2909   if( NKEY(pBtTo, pCell->h) + NDATA(pBtTo, pCell->h) <= MX_LOCAL_PAYLOAD ){
2910     return SQLITE_OK;
2911   }
2912   pPrev = &pCell->ovfl;
2913   pPrevPg = 0;
2914   ovfl = SWAB32(pBtTo, pCell->ovfl);
2915   while( ovfl && rc==SQLITE_OK ){
2916     rc = sqlitepager_get(pFromPager, ovfl, (void**)&pOvfl);
2917     if( rc ) return rc;
2918     nextOvfl = SWAB32(pBtFrom, pOvfl->iNext);
2919     rc = allocatePage(pBtTo, &pNew, &new, 0);
2920     if( rc==SQLITE_OK ){
2921       rc = sqlitepager_write(pNew);
2922       if( rc==SQLITE_OK ){
2923         memcpy(pNew, pOvfl, SQLITE_USABLE_SIZE);
2924         *pPrev = SWAB32(pBtTo, new);
2925         if( pPrevPg ){
2926           sqlitepager_unref(pPrevPg);
2927         }
2928         pPrev = &pOvfl->iNext;
2929         pPrevPg = pNew;
2930       }
2931     }
2932     sqlitepager_unref(pOvfl);
2933     ovfl = nextOvfl;
2934   }
2935   if( pPrevPg ){
2936     sqlitepager_unref(pPrevPg);
2937   }
2938   return rc;
2939 }
2940 #endif
2941 
2942 
2943 #if 0 /* UNTESTED */
2944 /*
2945 ** Copy a page of data from one database over to another.
2946 */
2947 static int copyDatabasePage(
2948   Btree *pBtFrom,
2949   Pgno pgnoFrom,
2950   Btree *pBtTo,
2951   Pgno *pTo
2952 ){
2953   MemPage *pPageFrom, *pPage;
2954   Pgno to;
2955   int rc;
2956   Cell *pCell;
2957   int idx;
2958 
2959   rc = sqlitepager_get(pBtFrom->pPager, pgno, (void**)&pPageFrom);
2960   if( rc ) return rc;
2961   rc = allocatePage(pBt, &pPage, pTo, 0);
2962   if( rc==SQLITE_OK ){
2963     rc = sqlitepager_write(pPage);
2964   }
2965   if( rc==SQLITE_OK ){
2966     memcpy(pPage, pPageFrom, SQLITE_USABLE_SIZE);
2967     idx = SWAB16(pBt, pPage->u.hdr.firstCell);
2968     while( idx>0 ){
2969       pCell = (Cell*)&pPage->u.aDisk[idx];
2970       idx = SWAB16(pBt, pCell->h.iNext);
2971       if( pCell->h.leftChild ){
2972         Pgno newChld;
2973         rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pCell->h.leftChild),
2974                               pBtTo, &newChld);
2975         if( rc ) return rc;
2976         pCell->h.leftChild = SWAB32(pBtFrom, newChld);
2977       }
2978       rc = copyCell(pBtFrom, pBtTo, pCell);
2979       if( rc ) return rc;
2980     }
2981     if( pPage->u.hdr.rightChild ){
2982       Pgno newChld;
2983       rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pPage->u.hdr.rightChild),
2984                             pBtTo, &newChld);
2985       if( rc ) return rc;
2986       pPage->u.hdr.rightChild = SWAB32(pBtTo, newChild);
2987     }
2988   }
2989   sqlitepager_unref(pPage);
2990   return rc;
2991 }
2992 #endif
2993 
2994 /*
2995 ** Read the meta-information out of a database file.
2996 */
fileBtreeGetMeta(Btree * pBt,int * aMeta)2997 static int fileBtreeGetMeta(Btree *pBt, int *aMeta){
2998   PageOne *pP1;
2999   int rc;
3000   int i;
3001 
3002   rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1);
3003   if( rc ) return rc;
3004   aMeta[0] = SWAB32(pBt, pP1->nFree);
3005   for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
3006     aMeta[i+1] = SWAB32(pBt, pP1->aMeta[i]);
3007   }
3008   sqlitepager_unref(pP1);
3009   return SQLITE_OK;
3010 }
3011 
3012 /*
3013 ** Write meta-information back into the database.
3014 */
fileBtreeUpdateMeta(Btree * pBt,int * aMeta)3015 static int fileBtreeUpdateMeta(Btree *pBt, int *aMeta){
3016   PageOne *pP1;
3017   int rc, i;
3018   if( !pBt->inTrans ){
3019     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3020   }
3021   pP1 = pBt->page1;
3022   rc = sqlitepager_write(pP1);
3023   if( rc ) return rc;
3024   for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
3025     pP1->aMeta[i] = SWAB32(pBt, aMeta[i+1]);
3026   }
3027   return SQLITE_OK;
3028 }
3029 
3030 /******************************************************************************
3031 ** The complete implementation of the BTree subsystem is above this line.
3032 ** All the code the follows is for testing and troubleshooting the BTree
3033 ** subsystem.  None of the code that follows is used during normal operation.
3034 ******************************************************************************/
3035 
3036 /*
3037 ** Print a disassembly of the given page on standard output.  This routine
3038 ** is used for debugging and testing only.
3039 */
3040 #ifdef SQLITE_TEST
fileBtreePageDump(Btree * pBt,int pgno,int recursive)3041 static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){
3042   int rc;
3043   MemPage *pPage;
3044   int i, j;
3045   int nFree;
3046   u16 idx;
3047   char range[20];
3048   unsigned char payload[20];
3049   rc = sqlitepager_get(pBt->pPager, (Pgno)pgno, (void**)&pPage);
3050   if( rc ){
3051     return rc;
3052   }
3053   if( recursive ) printf("PAGE %d:\n", pgno);
3054   i = 0;
3055   idx = SWAB16(pBt, pPage->u.hdr.firstCell);
3056   while( idx>0 && idx<=SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){
3057     Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
3058     int sz = cellSize(pBt, pCell);
3059     sprintf(range,"%d..%d", idx, idx+sz-1);
3060     sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
3061     if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
3062     memcpy(payload, pCell->aPayload, sz);
3063     for(j=0; j<sz; j++){
3064       if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3065     }
3066     payload[sz] = 0;
3067     printf(
3068       "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n",
3069       i, range, (int)pCell->h.leftChild,
3070       NKEY(pBt, pCell->h), NDATA(pBt, pCell->h),
3071       payload
3072     );
3073     if( pPage->isInit && pPage->apCell[i]!=pCell ){
3074       printf("**** apCell[%d] does not match on prior entry ****\n", i);
3075     }
3076     i++;
3077     idx = SWAB16(pBt, pCell->h.iNext);
3078   }
3079   if( idx!=0 ){
3080     printf("ERROR: next cell index out of range: %d\n", idx);
3081   }
3082   printf("right_child: %d\n", SWAB32(pBt, pPage->u.hdr.rightChild));
3083   nFree = 0;
3084   i = 0;
3085   idx = SWAB16(pBt, pPage->u.hdr.firstFree);
3086   while( idx>0 && idx<SQLITE_USABLE_SIZE ){
3087     FreeBlk *p = (FreeBlk*)&pPage->u.aDisk[idx];
3088     sprintf(range,"%d..%d", idx, idx+p->iSize-1);
3089     nFree += SWAB16(pBt, p->iSize);
3090     printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
3091        i, range, SWAB16(pBt, p->iSize), nFree);
3092     idx = SWAB16(pBt, p->iNext);
3093     i++;
3094   }
3095   if( idx!=0 ){
3096     printf("ERROR: next freeblock index out of range: %d\n", idx);
3097   }
3098   if( recursive && pPage->u.hdr.rightChild!=0 ){
3099     idx = SWAB16(pBt, pPage->u.hdr.firstCell);
3100     while( idx>0 && idx<SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){
3101       Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
3102       fileBtreePageDump(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
3103       idx = SWAB16(pBt, pCell->h.iNext);
3104     }
3105     fileBtreePageDump(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
3106   }
3107   sqlitepager_unref(pPage);
3108   return SQLITE_OK;
3109 }
3110 #endif
3111 
3112 #ifdef SQLITE_TEST
3113 /*
3114 ** Fill aResult[] with information about the entry and page that the
3115 ** cursor is pointing to.
3116 **
3117 **   aResult[0] =  The page number
3118 **   aResult[1] =  The entry number
3119 **   aResult[2] =  Total number of entries on this page
3120 **   aResult[3] =  Size of this entry
3121 **   aResult[4] =  Number of free bytes on this page
3122 **   aResult[5] =  Number of free blocks on the page
3123 **   aResult[6] =  Page number of the left child of this entry
3124 **   aResult[7] =  Page number of the right child for the whole page
3125 **
3126 ** This routine is used for testing and debugging only.
3127 */
fileBtreeCursorDump(BtCursor * pCur,int * aResult)3128 static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){
3129   int cnt, idx;
3130   MemPage *pPage = pCur->pPage;
3131   Btree *pBt = pCur->pBt;
3132   aResult[0] = sqlitepager_pagenumber(pPage);
3133   aResult[1] = pCur->idx;
3134   aResult[2] = pPage->nCell;
3135   if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
3136     aResult[3] = cellSize(pBt, pPage->apCell[pCur->idx]);
3137     aResult[6] = SWAB32(pBt, pPage->apCell[pCur->idx]->h.leftChild);
3138   }else{
3139     aResult[3] = 0;
3140     aResult[6] = 0;
3141   }
3142   aResult[4] = pPage->nFree;
3143   cnt = 0;
3144   idx = SWAB16(pBt, pPage->u.hdr.firstFree);
3145   while( idx>0 && idx<SQLITE_USABLE_SIZE ){
3146     cnt++;
3147     idx = SWAB16(pBt, ((FreeBlk*)&pPage->u.aDisk[idx])->iNext);
3148   }
3149   aResult[5] = cnt;
3150   aResult[7] = SWAB32(pBt, pPage->u.hdr.rightChild);
3151   return SQLITE_OK;
3152 }
3153 #endif
3154 
3155 /*
3156 ** Return the pager associated with a BTree.  This routine is used for
3157 ** testing and debugging only.
3158 */
fileBtreePager(Btree * pBt)3159 static Pager *fileBtreePager(Btree *pBt){
3160   return pBt->pPager;
3161 }
3162 
3163 /*
3164 ** This structure is passed around through all the sanity checking routines
3165 ** in order to keep track of some global state information.
3166 */
3167 typedef struct IntegrityCk IntegrityCk;
3168 struct IntegrityCk {
3169   Btree *pBt;    /* The tree being checked out */
3170   Pager *pPager; /* The associated pager.  Also accessible by pBt->pPager */
3171   int nPage;     /* Number of pages in the database */
3172   int *anRef;    /* Number of times each page is referenced */
3173   char *zErrMsg; /* An error message.  NULL of no errors seen. */
3174 };
3175 
3176 /*
3177 ** Append a message to the error message string.
3178 */
checkAppendMsg(IntegrityCk * pCheck,char * zMsg1,char * zMsg2)3179 static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
3180   if( pCheck->zErrMsg ){
3181     char *zOld = pCheck->zErrMsg;
3182     pCheck->zErrMsg = 0;
3183     sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
3184     sqliteFree(zOld);
3185   }else{
3186     sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
3187   }
3188 }
3189 
3190 /*
3191 ** Add 1 to the reference count for page iPage.  If this is the second
3192 ** reference to the page, add an error message to pCheck->zErrMsg.
3193 ** Return 1 if there are 2 ore more references to the page and 0 if
3194 ** if this is the first reference to the page.
3195 **
3196 ** Also check that the page number is in bounds.
3197 */
checkRef(IntegrityCk * pCheck,int iPage,char * zContext)3198 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
3199   if( iPage==0 ) return 1;
3200   if( iPage>pCheck->nPage || iPage<0 ){
3201     char zBuf[100];
3202     sprintf(zBuf, "invalid page number %d", iPage);
3203     checkAppendMsg(pCheck, zContext, zBuf);
3204     return 1;
3205   }
3206   if( pCheck->anRef[iPage]==1 ){
3207     char zBuf[100];
3208     sprintf(zBuf, "2nd reference to page %d", iPage);
3209     checkAppendMsg(pCheck, zContext, zBuf);
3210     return 1;
3211   }
3212   return  (pCheck->anRef[iPage]++)>1;
3213 }
3214 
3215 /*
3216 ** Check the integrity of the freelist or of an overflow page list.
3217 ** Verify that the number of pages on the list is N.
3218 */
checkList(IntegrityCk * pCheck,int isFreeList,int iPage,int N,char * zContext)3219 static void checkList(
3220   IntegrityCk *pCheck,  /* Integrity checking context */
3221   int isFreeList,       /* True for a freelist.  False for overflow page list */
3222   int iPage,            /* Page number for first page in the list */
3223   int N,                /* Expected number of pages in the list */
3224   char *zContext        /* Context for error messages */
3225 ){
3226   int i;
3227   char zMsg[100];
3228   while( N-- > 0 ){
3229     OverflowPage *pOvfl;
3230     if( iPage<1 ){
3231       sprintf(zMsg, "%d pages missing from overflow list", N+1);
3232       checkAppendMsg(pCheck, zContext, zMsg);
3233       break;
3234     }
3235     if( checkRef(pCheck, iPage, zContext) ) break;
3236     if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
3237       sprintf(zMsg, "failed to get page %d", iPage);
3238       checkAppendMsg(pCheck, zContext, zMsg);
3239       break;
3240     }
3241     if( isFreeList ){
3242       FreelistInfo *pInfo = (FreelistInfo*)pOvfl->aPayload;
3243       int n = SWAB32(pCheck->pBt, pInfo->nFree);
3244       for(i=0; i<n; i++){
3245         checkRef(pCheck, SWAB32(pCheck->pBt, pInfo->aFree[i]), zContext);
3246       }
3247       N -= n;
3248     }
3249     iPage = SWAB32(pCheck->pBt, pOvfl->iNext);
3250     sqlitepager_unref(pOvfl);
3251   }
3252 }
3253 
3254 /*
3255 ** Return negative if zKey1<zKey2.
3256 ** Return zero if zKey1==zKey2.
3257 ** Return positive if zKey1>zKey2.
3258 */
keyCompare(const char * zKey1,int nKey1,const char * zKey2,int nKey2)3259 static int keyCompare(
3260   const char *zKey1, int nKey1,
3261   const char *zKey2, int nKey2
3262 ){
3263   int min = nKey1>nKey2 ? nKey2 : nKey1;
3264   int c = memcmp(zKey1, zKey2, min);
3265   if( c==0 ){
3266     c = nKey1 - nKey2;
3267   }
3268   return c;
3269 }
3270 
3271 /*
3272 ** Do various sanity checks on a single page of a tree.  Return
3273 ** the tree depth.  Root pages return 0.  Parents of root pages
3274 ** return 1, and so forth.
3275 **
3276 ** These checks are done:
3277 **
3278 **      1.  Make sure that cells and freeblocks do not overlap
3279 **          but combine to completely cover the page.
3280 **      2.  Make sure cell keys are in order.
3281 **      3.  Make sure no key is less than or equal to zLowerBound.
3282 **      4.  Make sure no key is greater than or equal to zUpperBound.
3283 **      5.  Check the integrity of overflow pages.
3284 **      6.  Recursively call checkTreePage on all children.
3285 **      7.  Verify that the depth of all children is the same.
3286 **      8.  Make sure this page is at least 33% full or else it is
3287 **          the root of the tree.
3288 */
checkTreePage(IntegrityCk * pCheck,int iPage,MemPage * pParent,char * zParentContext,char * zLowerBound,int nLower,char * zUpperBound,int nUpper)3289 static int checkTreePage(
3290   IntegrityCk *pCheck,  /* Context for the sanity check */
3291   int iPage,            /* Page number of the page to check */
3292   MemPage *pParent,     /* Parent page */
3293   char *zParentContext, /* Parent context */
3294   char *zLowerBound,    /* All keys should be greater than this, if not NULL */
3295   int nLower,           /* Number of characters in zLowerBound */
3296   char *zUpperBound,    /* All keys should be less than this, if not NULL */
3297   int nUpper            /* Number of characters in zUpperBound */
3298 ){
3299   MemPage *pPage;
3300   int i, rc, depth, d2, pgno;
3301   char *zKey1, *zKey2;
3302   int nKey1, nKey2;
3303   BtCursor cur;
3304   Btree *pBt;
3305   char zMsg[100];
3306   char zContext[100];
3307   char hit[SQLITE_USABLE_SIZE];
3308 
3309   /* Check that the page exists
3310   */
3311   cur.pBt = pBt = pCheck->pBt;
3312   if( iPage==0 ) return 0;
3313   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
3314   sprintf(zContext, "On tree page %d: ", iPage);
3315   if( (rc = sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pPage))!=0 ){
3316     sprintf(zMsg, "unable to get the page. error code=%d", rc);
3317     checkAppendMsg(pCheck, zContext, zMsg);
3318     return 0;
3319   }
3320   if( (rc = initPage(pBt, pPage, (Pgno)iPage, pParent))!=0 ){
3321     sprintf(zMsg, "initPage() returns error code %d", rc);
3322     checkAppendMsg(pCheck, zContext, zMsg);
3323     sqlitepager_unref(pPage);
3324     return 0;
3325   }
3326 
3327   /* Check out all the cells.
3328   */
3329   depth = 0;
3330   if( zLowerBound ){
3331     zKey1 = sqliteMalloc( nLower+1 );
3332     memcpy(zKey1, zLowerBound, nLower);
3333     zKey1[nLower] = 0;
3334   }else{
3335     zKey1 = 0;
3336   }
3337   nKey1 = nLower;
3338   cur.pPage = pPage;
3339   for(i=0; i<pPage->nCell; i++){
3340     Cell *pCell = pPage->apCell[i];
3341     int sz;
3342 
3343     /* Check payload overflow pages
3344     */
3345     nKey2 = NKEY(pBt, pCell->h);
3346     sz = nKey2 + NDATA(pBt, pCell->h);
3347     sprintf(zContext, "On page %d cell %d: ", iPage, i);
3348     if( sz>MX_LOCAL_PAYLOAD ){
3349       int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE;
3350       checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext);
3351     }
3352 
3353     /* Check that keys are in the right order
3354     */
3355     cur.idx = i;
3356     zKey2 = sqliteMallocRaw( nKey2+1 );
3357     getPayload(&cur, 0, nKey2, zKey2);
3358     if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){
3359       checkAppendMsg(pCheck, zContext, "Key is out of order");
3360     }
3361 
3362     /* Check sanity of left child page.
3363     */
3364     pgno = SWAB32(pBt, pCell->h.leftChild);
3365     d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2);
3366     if( i>0 && d2!=depth ){
3367       checkAppendMsg(pCheck, zContext, "Child page depth differs");
3368     }
3369     depth = d2;
3370     sqliteFree(zKey1);
3371     zKey1 = zKey2;
3372     nKey1 = nKey2;
3373   }
3374   pgno = SWAB32(pBt, pPage->u.hdr.rightChild);
3375   sprintf(zContext, "On page %d at right child: ", iPage);
3376   checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper);
3377   sqliteFree(zKey1);
3378 
3379   /* Check for complete coverage of the page
3380   */
3381   memset(hit, 0, sizeof(hit));
3382   memset(hit, 1, sizeof(PageHdr));
3383   for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_USABLE_SIZE; ){
3384     Cell *pCell = (Cell*)&pPage->u.aDisk[i];
3385     int j;
3386     for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++;
3387     i = SWAB16(pBt, pCell->h.iNext);
3388   }
3389   for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_USABLE_SIZE; ){
3390     FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i];
3391     int j;
3392     for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++;
3393     i = SWAB16(pBt,pFBlk->iNext);
3394   }
3395   for(i=0; i<SQLITE_USABLE_SIZE; i++){
3396     if( hit[i]==0 ){
3397       sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage);
3398       checkAppendMsg(pCheck, zMsg, 0);
3399       break;
3400     }else if( hit[i]>1 ){
3401       sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3402       checkAppendMsg(pCheck, zMsg, 0);
3403       break;
3404     }
3405   }
3406 
3407   /* Check that free space is kept to a minimum
3408   */
3409 #if 0
3410   if( pParent && pParent->nCell>2 && pPage->nFree>3*SQLITE_USABLE_SIZE/4 ){
3411     sprintf(zMsg, "free space (%d) greater than max (%d)", pPage->nFree,
3412        SQLITE_USABLE_SIZE/3);
3413     checkAppendMsg(pCheck, zContext, zMsg);
3414   }
3415 #endif
3416 
3417   sqlitepager_unref(pPage);
3418   return depth;
3419 }
3420 
3421 /*
3422 ** This routine does a complete check of the given BTree file.  aRoot[] is
3423 ** an array of pages numbers were each page number is the root page of
3424 ** a table.  nRoot is the number of entries in aRoot.
3425 **
3426 ** If everything checks out, this routine returns NULL.  If something is
3427 ** amiss, an error message is written into memory obtained from malloc()
3428 ** and a pointer to that error message is returned.  The calling function
3429 ** is responsible for freeing the error message when it is done.
3430 */
fileBtreeIntegrityCheck(Btree * pBt,int * aRoot,int nRoot)3431 char *fileBtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
3432   int i;
3433   int nRef;
3434   IntegrityCk sCheck;
3435 
3436   nRef = *sqlitepager_stats(pBt->pPager);
3437   if( lockBtree(pBt)!=SQLITE_OK ){
3438     return sqliteStrDup("Unable to acquire a read lock on the database");
3439   }
3440   sCheck.pBt = pBt;
3441   sCheck.pPager = pBt->pPager;
3442   sCheck.nPage = sqlitepager_pagecount(sCheck.pPager);
3443   if( sCheck.nPage==0 ){
3444     unlockBtreeIfUnused(pBt);
3445     return 0;
3446   }
3447   sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
3448   sCheck.anRef[1] = 1;
3449   for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
3450   sCheck.zErrMsg = 0;
3451 
3452   /* Check the integrity of the freelist
3453   */
3454   checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList),
3455             SWAB32(pBt, pBt->page1->nFree), "Main freelist: ");
3456 
3457   /* Check all the tables.
3458   */
3459   for(i=0; i<nRoot; i++){
3460     if( aRoot[i]==0 ) continue;
3461     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
3462   }
3463 
3464   /* Make sure every page in the file is referenced
3465   */
3466   for(i=1; i<=sCheck.nPage; i++){
3467     if( sCheck.anRef[i]==0 ){
3468       char zBuf[100];
3469       sprintf(zBuf, "Page %d is never used", i);
3470       checkAppendMsg(&sCheck, zBuf, 0);
3471     }
3472   }
3473 
3474   /* Make sure this analysis did not leave any unref() pages
3475   */
3476   unlockBtreeIfUnused(pBt);
3477   if( nRef != *sqlitepager_stats(pBt->pPager) ){
3478     char zBuf[100];
3479     sprintf(zBuf,
3480       "Outstanding page count goes from %d to %d during this analysis",
3481       nRef, *sqlitepager_stats(pBt->pPager)
3482     );
3483     checkAppendMsg(&sCheck, zBuf, 0);
3484   }
3485 
3486   /* Clean  up and report errors.
3487   */
3488   sqliteFree(sCheck.anRef);
3489   return sCheck.zErrMsg;
3490 }
3491 
3492 /*
3493 ** Return the full pathname of the underlying database file.
3494 */
fileBtreeGetFilename(Btree * pBt)3495 static const char *fileBtreeGetFilename(Btree *pBt){
3496   assert( pBt->pPager!=0 );
3497   return sqlitepager_filename(pBt->pPager);
3498 }
3499 
3500 /*
3501 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
3502 ** must be active for both files.
3503 **
3504 ** The size of file pBtFrom may be reduced by this operation.
3505 ** If anything goes wrong, the transaction on pBtFrom is rolled back.
3506 */
fileBtreeCopyFile(Btree * pBtTo,Btree * pBtFrom)3507 static int fileBtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
3508   int rc = SQLITE_OK;
3509   Pgno i, nPage, nToPage;
3510 
3511   if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
3512   if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR;
3513   if( pBtTo->pCursor ) return SQLITE_BUSY;
3514   memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_USABLE_SIZE);
3515   rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1);
3516   nToPage = sqlitepager_pagecount(pBtTo->pPager);
3517   nPage = sqlitepager_pagecount(pBtFrom->pPager);
3518   for(i=2; rc==SQLITE_OK && i<=nPage; i++){
3519     void *pPage;
3520     rc = sqlitepager_get(pBtFrom->pPager, i, &pPage);
3521     if( rc ) break;
3522     rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage);
3523     if( rc ) break;
3524     sqlitepager_unref(pPage);
3525   }
3526   for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3527     void *pPage;
3528     rc = sqlitepager_get(pBtTo->pPager, i, &pPage);
3529     if( rc ) break;
3530     rc = sqlitepager_write(pPage);
3531     sqlitepager_unref(pPage);
3532     sqlitepager_dont_write(pBtTo->pPager, i);
3533   }
3534   if( !rc && nPage<nToPage ){
3535     rc = sqlitepager_truncate(pBtTo->pPager, nPage);
3536   }
3537   if( rc ){
3538     fileBtreeRollback(pBtTo);
3539   }
3540   return rc;
3541 }
3542 
3543 /*
3544 ** The following tables contain pointers to all of the interface
3545 ** routines for this implementation of the B*Tree backend.  To
3546 ** substitute a different implemention of the backend, one has merely
3547 ** to provide pointers to alternative functions in similar tables.
3548 */
3549 static BtOps sqliteBtreeOps = {
3550     fileBtreeClose,
3551     fileBtreeSetCacheSize,
3552     fileBtreeSetSafetyLevel,
3553     fileBtreeBeginTrans,
3554     fileBtreeCommit,
3555     fileBtreeRollback,
3556     fileBtreeBeginCkpt,
3557     fileBtreeCommitCkpt,
3558     fileBtreeRollbackCkpt,
3559     fileBtreeCreateTable,
3560     fileBtreeCreateTable,  /* Really sqliteBtreeCreateIndex() */
3561     fileBtreeDropTable,
3562     fileBtreeClearTable,
3563     fileBtreeCursor,
3564     fileBtreeGetMeta,
3565     fileBtreeUpdateMeta,
3566     fileBtreeIntegrityCheck,
3567     fileBtreeGetFilename,
3568     fileBtreeCopyFile,
3569     fileBtreePager,
3570 #ifdef SQLITE_TEST
3571     fileBtreePageDump,
3572 #endif
3573 };
3574 static BtCursorOps sqliteBtreeCursorOps = {
3575     fileBtreeMoveto,
3576     fileBtreeDelete,
3577     fileBtreeInsert,
3578     fileBtreeFirst,
3579     fileBtreeLast,
3580     fileBtreeNext,
3581     fileBtreePrevious,
3582     fileBtreeKeySize,
3583     fileBtreeKey,
3584     fileBtreeKeyCompare,
3585     fileBtreeDataSize,
3586     fileBtreeData,
3587     fileBtreeCloseCursor,
3588 #ifdef SQLITE_TEST
3589     fileBtreeCursorDump,
3590 #endif
3591 };
3592