btree.c

Go to the documentation of this file.
00001 /*
00002 ** 2004 April 6
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 *************************************************************************
00012 ** $Id: btree.c,v 1.533 2008/11/12 08:49:52 danielk1977 Exp $
00013 **
00014 ** This file implements an external (disk-based) database using BTrees.
00015 ** See the header comment on "btreeInt.h" for additional information,
00016 ** including a description of file format and an overview of operation.
00017 */
00018 #include "btreeInt.h"
00019 
00020 /*
00021 ** The header string that appears at the beginning of every
00022 ** SQLite database.
00023 */
00024 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
00025 
/*
** Debug tracing.  Change the "#if 0" below to "#if 1" to compile in the
** sqlite3BtreeTrace flag together with a TRACE() macro that printf()s its
** (parenthesized) argument list and flushes stdout whenever the flag is
** non-zero.  In the default build TRACE() expands to nothing.
**
** The enabled form is wrapped in do{ }while(0) so that "TRACE((...));"
** is always exactly one statement.  A bare "if(...){...}" expansion would
** capture a following "else" and leave a stray empty statement behind.
*/
#if 0
int sqlite3BtreeTrace=0;  /* True to enable tracing */
# define TRACE(X)  do{ if(sqlite3BtreeTrace){printf X;fflush(stdout);} }while(0)
#else
# define TRACE(X)
#endif
00036 
00037 /*
00038 ** Sometimes we need a small amount of code such as a variable initialization
00039 ** to setup for a later assert() statement.  We do not want this code to
00040 ** appear when assert() is disabled.  The following macro is therefore
00041 ** used to contain that setup code.  The "VVA" acronym stands for
00042 ** "Verification, Validation, and Accreditation".  In other words, the
00043 ** code within VVA_ONLY() will only run during verification processes.
00044 */
00045 #ifndef NDEBUG
00046 # define VVA_ONLY(X)  X
00047 #else
00048 # define VVA_ONLY(X)
00049 #endif
00050 
00051 
00052 
00053 #ifndef SQLITE_OMIT_SHARED_CACHE
00054 /*
00055 ** A list of BtShared objects that are eligible for participation
00056 ** in shared cache.  This variable has file scope during normal builds,
00057 ** but the test harness needs to access it so we make it global for 
00058 ** test builds.
00059 */
00060 #ifdef SQLITE_TEST
00061 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
00062 #else
00063 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
00064 #endif
00065 #endif /* SQLITE_OMIT_SHARED_CACHE */
00066 
00067 #ifndef SQLITE_OMIT_SHARED_CACHE
00068 /*
00069 ** Enable or disable the shared pager and schema features.
00070 **
00071 ** This routine has no effect on existing database connections.
00072 ** The shared cache setting effects only future calls to
00073 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
00074 */
00075 int sqlite3_enable_shared_cache(int enable){
00076   sqlite3GlobalConfig.sharedCacheEnabled = enable;
00077   return SQLITE_OK;
00078 }
00079 #endif
00080 
00081 
00082 /*
00083 ** Forward declaration
00084 */
00085 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
00086 
00087 
00088 #ifdef SQLITE_OMIT_SHARED_CACHE
00089   /*
00090   ** The functions queryTableLock(), lockTable() and unlockAllTables()
00091   ** manipulate entries in the BtShared.pLock linked list used to store
00092   ** shared-cache table level locks. If the library is compiled with the
00093   ** shared-cache feature disabled, then there is only ever one user
00094   ** of each BtShared structure and so this locking is not necessary. 
00095   ** So define the lock related functions as no-ops.
00096   */
00097   #define queryTableLock(a,b,c) SQLITE_OK
00098   #define lockTable(a,b,c) SQLITE_OK
00099   #define unlockAllTables(a)
00100 #endif
00101 
00102 #ifndef SQLITE_OMIT_SHARED_CACHE
00103 /*
00104 ** Query to see if btree handle p may obtain a lock of type eLock 
00105 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
00106 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
00107 ** SQLITE_LOCKED if not.
00108 */
00109 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
00110   BtShared *pBt = p->pBt;
00111   BtLock *pIter;
00112 
00113   assert( sqlite3BtreeHoldsMutex(p) );
00114   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
00115   assert( p->db!=0 );
00116   
00117   /* This is a no-op if the shared-cache is not enabled */
00118   if( !p->sharable ){
00119     return SQLITE_OK;
00120   }
00121 
00122   /* If some other connection is holding an exclusive lock, the
00123   ** requested lock may not be obtained.
00124   */
00125   if( pBt->pExclusive && pBt->pExclusive!=p ){
00126     return SQLITE_LOCKED;
00127   }
00128 
00129   /* This (along with lockTable()) is where the ReadUncommitted flag is
00130   ** dealt with. If the caller is querying for a read-lock and the flag is
00131   ** set, it is unconditionally granted - even if there are write-locks
00132   ** on the table. If a write-lock is requested, the ReadUncommitted flag
00133   ** is not considered.
00134   **
00135   ** In function lockTable(), if a read-lock is demanded and the 
00136   ** ReadUncommitted flag is set, no entry is added to the locks list 
00137   ** (BtShared.pLock).
00138   **
00139   ** To summarize: If the ReadUncommitted flag is set, then read cursors do
00140   ** not create or respect table locks. The locking procedure for a 
00141   ** write-cursor does not change.
00142   */
00143   if( 
00144     0==(p->db->flags&SQLITE_ReadUncommitted) || 
00145     eLock==WRITE_LOCK ||
00146     iTab==MASTER_ROOT
00147   ){
00148     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
00149       if( pIter->pBtree!=p && pIter->iTable==iTab && 
00150           (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
00151         return SQLITE_LOCKED;
00152       }
00153     }
00154   }
00155   return SQLITE_OK;
00156 }
00157 #endif /* !SQLITE_OMIT_SHARED_CACHE */
00158 
00159 #ifndef SQLITE_OMIT_SHARED_CACHE
00160 /*
00161 ** Add a lock on the table with root-page iTable to the shared-btree used
00162 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
00163 ** WRITE_LOCK.
00164 **
00165 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
00166 ** SQLITE_NOMEM may also be returned.
00167 */
00168 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
00169   BtShared *pBt = p->pBt;
00170   BtLock *pLock = 0;
00171   BtLock *pIter;
00172 
00173   assert( sqlite3BtreeHoldsMutex(p) );
00174   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
00175   assert( p->db!=0 );
00176 
00177   /* This is a no-op if the shared-cache is not enabled */
00178   if( !p->sharable ){
00179     return SQLITE_OK;
00180   }
00181 
00182   assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
00183 
00184   /* If the read-uncommitted flag is set and a read-lock is requested,
00185   ** return early without adding an entry to the BtShared.pLock list. See
00186   ** comment in function queryTableLock() for more info on handling 
00187   ** the ReadUncommitted flag.
00188   */
00189   if( 
00190     (p->db->flags&SQLITE_ReadUncommitted) && 
00191     (eLock==READ_LOCK) &&
00192     iTable!=MASTER_ROOT
00193   ){
00194     return SQLITE_OK;
00195   }
00196 
00197   /* First search the list for an existing lock on this table. */
00198   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
00199     if( pIter->iTable==iTable && pIter->pBtree==p ){
00200       pLock = pIter;
00201       break;
00202     }
00203   }
00204 
00205   /* If the above search did not find a BtLock struct associating Btree p
00206   ** with table iTable, allocate one and link it into the list.
00207   */
00208   if( !pLock ){
00209     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
00210     if( !pLock ){
00211       return SQLITE_NOMEM;
00212     }
00213     pLock->iTable = iTable;
00214     pLock->pBtree = p;
00215     pLock->pNext = pBt->pLock;
00216     pBt->pLock = pLock;
00217   }
00218 
00219   /* Set the BtLock.eLock variable to the maximum of the current lock
00220   ** and the requested lock. This means if a write-lock was already held
00221   ** and a read-lock requested, we don't incorrectly downgrade the lock.
00222   */
00223   assert( WRITE_LOCK>READ_LOCK );
00224   if( eLock>pLock->eLock ){
00225     pLock->eLock = eLock;
00226   }
00227 
00228   return SQLITE_OK;
00229 }
00230 #endif /* !SQLITE_OMIT_SHARED_CACHE */
00231 
00232 #ifndef SQLITE_OMIT_SHARED_CACHE
00233 /*
00234 ** Release all the table locks (locks obtained via calls to the lockTable()
00235 ** procedure) held by Btree handle p.
00236 */
00237 static void unlockAllTables(Btree *p){
00238   BtShared *pBt = p->pBt;
00239   BtLock **ppIter = &pBt->pLock;
00240 
00241   assert( sqlite3BtreeHoldsMutex(p) );
00242   assert( p->sharable || 0==*ppIter );
00243 
00244   while( *ppIter ){
00245     BtLock *pLock = *ppIter;
00246     assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
00247     if( pLock->pBtree==p ){
00248       *ppIter = pLock->pNext;
00249       sqlite3_free(pLock);
00250     }else{
00251       ppIter = &pLock->pNext;
00252     }
00253   }
00254 
00255   if( pBt->pExclusive==p ){
00256     pBt->pExclusive = 0;
00257   }
00258 }
00259 #endif /* SQLITE_OMIT_SHARED_CACHE */
00260 
00261 static void releasePage(MemPage *pPage);  /* Forward reference */
00262 
00263 /*
00264 ** Verify that the cursor holds a mutex on the BtShared
00265 */
00266 #ifndef NDEBUG
00267 static int cursorHoldsMutex(BtCursor *p){
00268   return sqlite3_mutex_held(p->pBt->mutex);
00269 }
00270 #endif
00271 
00272 
00273 #ifndef SQLITE_OMIT_INCRBLOB
00274 /*
00275 ** Invalidate the overflow page-list cache for cursor pCur, if any.
00276 */
00277 static void invalidateOverflowCache(BtCursor *pCur){
00278   assert( cursorHoldsMutex(pCur) );
00279   sqlite3_free(pCur->aOverflow);
00280   pCur->aOverflow = 0;
00281 }
00282 
00283 /*
00284 ** Invalidate the overflow page-list cache for all cursors opened
00285 ** on the shared btree structure pBt.
00286 */
00287 static void invalidateAllOverflowCache(BtShared *pBt){
00288   BtCursor *p;
00289   assert( sqlite3_mutex_held(pBt->mutex) );
00290   for(p=pBt->pCursor; p; p=p->pNext){
00291     invalidateOverflowCache(p);
00292   }
00293 }
00294 #else
00295   #define invalidateOverflowCache(x)
00296   #define invalidateAllOverflowCache(x)
00297 #endif
00298 
00299 /*
00300 ** Save the current cursor position in the variables BtCursor.nKey 
00301 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
00302 */
00303 static int saveCursorPosition(BtCursor *pCur){
00304   int rc;
00305 
00306   assert( CURSOR_VALID==pCur->eState );
00307   assert( 0==pCur->pKey );
00308   assert( cursorHoldsMutex(pCur) );
00309 
00310   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
00311 
00312   /* If this is an intKey table, then the above call to BtreeKeySize()
00313   ** stores the integer key in pCur->nKey. In this case this value is
00314   ** all that is required. Otherwise, if pCur is not open on an intKey
00315   ** table, then malloc space for and store the pCur->nKey bytes of key 
00316   ** data.
00317   */
00318   if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
00319     void *pKey = sqlite3Malloc(pCur->nKey);
00320     if( pKey ){
00321       rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
00322       if( rc==SQLITE_OK ){
00323         pCur->pKey = pKey;
00324       }else{
00325         sqlite3_free(pKey);
00326       }
00327     }else{
00328       rc = SQLITE_NOMEM;
00329     }
00330   }
00331   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
00332 
00333   if( rc==SQLITE_OK ){
00334     int i;
00335     for(i=0; i<=pCur->iPage; i++){
00336       releasePage(pCur->apPage[i]);
00337       pCur->apPage[i] = 0;
00338     }
00339     pCur->iPage = -1;
00340     pCur->eState = CURSOR_REQUIRESEEK;
00341   }
00342 
00343   invalidateOverflowCache(pCur);
00344   return rc;
00345 }
00346 
00347 /*
00348 ** Save the positions of all cursors except pExcept open on the table 
00349 ** with root-page iRoot. Usually, this is called just before cursor
00350 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
00351 */
00352 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
00353   BtCursor *p;
00354   assert( sqlite3_mutex_held(pBt->mutex) );
00355   assert( pExcept==0 || pExcept->pBt==pBt );
00356   for(p=pBt->pCursor; p; p=p->pNext){
00357     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 
00358         p->eState==CURSOR_VALID ){
00359       int rc = saveCursorPosition(p);
00360       if( SQLITE_OK!=rc ){
00361         return rc;
00362       }
00363     }
00364   }
00365   return SQLITE_OK;
00366 }
00367 
00368 /*
00369 ** Clear the current cursor position.
00370 */
00371 void sqlite3BtreeClearCursor(BtCursor *pCur){
00372   assert( cursorHoldsMutex(pCur) );
00373   sqlite3_free(pCur->pKey);
00374   pCur->pKey = 0;
00375   pCur->eState = CURSOR_INVALID;
00376 }
00377 
00378 /*
00379 ** Restore the cursor to the position it was in (or as close to as possible)
00380 ** when saveCursorPosition() was called. Note that this call deletes the 
00381 ** saved position info stored by saveCursorPosition(), so there can be
00382 ** at most one effective restoreCursorPosition() call after each 
00383 ** saveCursorPosition().
00384 */
00385 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
00386   int rc;
00387   assert( cursorHoldsMutex(pCur) );
00388   assert( pCur->eState>=CURSOR_REQUIRESEEK );
00389   if( pCur->eState==CURSOR_FAULT ){
00390     return pCur->skip;
00391   }
00392   pCur->eState = CURSOR_INVALID;
00393   rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
00394   if( rc==SQLITE_OK ){
00395     sqlite3_free(pCur->pKey);
00396     pCur->pKey = 0;
00397     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
00398   }
00399   return rc;
00400 }
00401 
/*
** Restore the saved position of cursor p, but only if its state says one
** is pending (eState>=CURSOR_REQUIRESEEK).  Evaluates to the result of
** sqlite3BtreeRestoreCursorPosition() in that case and to SQLITE_OK
** otherwise.
**
** The argument is parenthesized so that any pointer-valued expression can
** be passed.  It is evaluated more than once, so it must not have side
** effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         sqlite3BtreeRestoreCursorPosition(p) : \
         SQLITE_OK)
00406 
00407 /*
00408 ** Determine whether or not a cursor has moved from the position it
00409 ** was last placed at.  Cursors can move when the row they are pointing
00410 ** at is deleted out from under them.
00411 **
00412 ** This routine returns an error code if something goes wrong.  The
00413 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
00414 */
00415 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
00416   int rc;
00417 
00418   rc = restoreCursorPosition(pCur);
00419   if( rc ){
00420     *pHasMoved = 1;
00421     return rc;
00422   }
00423   if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
00424     *pHasMoved = 1;
00425   }else{
00426     *pHasMoved = 0;
00427   }
00428   return SQLITE_OK;
00429 }
00430 
00431 #ifndef SQLITE_OMIT_AUTOVACUUM
00432 /*
00433 ** Given a page number of a regular database page, return the page
00434 ** number for the pointer-map page that contains the entry for the
00435 ** input page number.
00436 */
00437 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
00438   int nPagesPerMapPage, iPtrMap, ret;
00439   assert( sqlite3_mutex_held(pBt->mutex) );
00440   nPagesPerMapPage = (pBt->usableSize/5)+1;
00441   iPtrMap = (pgno-2)/nPagesPerMapPage;
00442   ret = (iPtrMap*nPagesPerMapPage) + 2; 
00443   if( ret==PENDING_BYTE_PAGE(pBt) ){
00444     ret++;
00445   }
00446   return ret;
00447 }
00448 
00449 /*
00450 ** Write an entry into the pointer map.
00451 **
00452 ** This routine updates the pointer map entry for page number 'key'
00453 ** so that it maps to type 'eType' and parent page number 'pgno'.
00454 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
00455 */
00456 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
00457   DbPage *pDbPage;  /* The pointer map page */
00458   u8 *pPtrmap;      /* The pointer map data */
00459   Pgno iPtrmap;     /* The pointer map page number */
00460   int offset;       /* Offset in pointer map page */
00461   int rc;
00462 
00463   assert( sqlite3_mutex_held(pBt->mutex) );
00464   /* The master-journal page number must never be used as a pointer map page */
00465   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
00466 
00467   assert( pBt->autoVacuum );
00468   if( key==0 ){
00469     return SQLITE_CORRUPT_BKPT;
00470   }
00471   iPtrmap = PTRMAP_PAGENO(pBt, key);
00472   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
00473   if( rc!=SQLITE_OK ){
00474     return rc;
00475   }
00476   offset = PTRMAP_PTROFFSET(iPtrmap, key);
00477   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
00478 
00479   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
00480     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
00481     rc = sqlite3PagerWrite(pDbPage);
00482     if( rc==SQLITE_OK ){
00483       pPtrmap[offset] = eType;
00484       put4byte(&pPtrmap[offset+1], parent);
00485     }
00486   }
00487 
00488   sqlite3PagerUnref(pDbPage);
00489   return rc;
00490 }
00491 
00492 /*
00493 ** Read an entry from the pointer map.
00494 **
00495 ** This routine retrieves the pointer map entry for page 'key', writing
00496 ** the type and parent page number to *pEType and *pPgno respectively.
00497 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
00498 */
00499 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
00500   DbPage *pDbPage;   /* The pointer map page */
00501   int iPtrmap;       /* Pointer map page index */
00502   u8 *pPtrmap;       /* Pointer map page data */
00503   int offset;        /* Offset of entry in pointer map */
00504   int rc;
00505 
00506   assert( sqlite3_mutex_held(pBt->mutex) );
00507 
00508   iPtrmap = PTRMAP_PAGENO(pBt, key);
00509   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
00510   if( rc!=0 ){
00511     return rc;
00512   }
00513   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
00514 
00515   offset = PTRMAP_PTROFFSET(iPtrmap, key);
00516   assert( pEType!=0 );
00517   *pEType = pPtrmap[offset];
00518   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
00519 
00520   sqlite3PagerUnref(pDbPage);
00521   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
00522   return SQLITE_OK;
00523 }
00524 
00525 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
00526   #define ptrmapPut(w,x,y,z) SQLITE_OK
00527   #define ptrmapGet(w,x,y,z) SQLITE_OK
00528   #define ptrmapPutOvfl(y,z) SQLITE_OK
00529 #endif
00530 
00531 /*
00532 ** Given a btree page and a cell index (0 means the first cell on
00533 ** the page, 1 means the second cell, and so forth) return a pointer
00534 ** to the cell content.
00535 **
00536 ** This routine works only for pages that do not contain overflow cells.
00537 */
00538 #define findCell(P,I) \
00539   ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
00540 
00541 /*
00542 ** This a more complex version of findCell() that works for
00543 ** pages that do contain overflow cells.  See insert
00544 */
00545 static u8 *findOverflowCell(MemPage *pPage, int iCell){
00546   int i;
00547   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00548   for(i=pPage->nOverflow-1; i>=0; i--){
00549     int k;
00550     struct _OvflCell *pOvfl;
00551     pOvfl = &pPage->aOvfl[i];
00552     k = pOvfl->idx;
00553     if( k<=iCell ){
00554       if( k==iCell ){
00555         return pOvfl->pCell;
00556       }
00557       iCell--;
00558     }
00559   }
00560   return findCell(pPage, iCell);
00561 }
00562 
00563 /*
00564 ** Parse a cell content block and fill in the CellInfo structure.  There
00565 ** are two versions of this function.  sqlite3BtreeParseCell() takes a 
00566 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 
00567 ** takes a pointer to the body of the cell as its second argument.
00568 **
00569 ** Within this file, the parseCell() macro can be called instead of
00570 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
00571 */
00572 void sqlite3BtreeParseCellPtr(
00573   MemPage *pPage,         /* Page containing the cell */
00574   u8 *pCell,              /* Pointer to the cell text. */
00575   CellInfo *pInfo         /* Fill in this structure */
00576 ){
00577   int n;                  /* Number bytes in cell content header */
00578   u32 nPayload;           /* Number of bytes of cell payload */
00579 
00580   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00581 
00582   pInfo->pCell = pCell;
00583   assert( pPage->leaf==0 || pPage->leaf==1 );
00584   n = pPage->childPtrSize;
00585   assert( n==4-4*pPage->leaf );
00586   if( pPage->intKey ){
00587     if( pPage->hasData ){
00588       n += getVarint32(&pCell[n], nPayload);
00589     }else{
00590       nPayload = 0;
00591     }
00592     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
00593     pInfo->nData = nPayload;
00594   }else{
00595     pInfo->nData = 0;
00596     n += getVarint32(&pCell[n], nPayload);
00597     pInfo->nKey = nPayload;
00598   }
00599   pInfo->nPayload = nPayload;
00600   pInfo->nHeader = n;
00601   if( likely(nPayload<=pPage->maxLocal) ){
00602     /* This is the (easy) common case where the entire payload fits
00603     ** on the local page.  No overflow is required.
00604     */
00605     int nSize;          /* Total size of cell content in bytes */
00606     nSize = nPayload + n;
00607     pInfo->nLocal = nPayload;
00608     pInfo->iOverflow = 0;
00609     if( (nSize & ~3)==0 ){
00610       nSize = 4;        /* Minimum cell size is 4 */
00611     }
00612     pInfo->nSize = nSize;
00613   }else{
00614     /* If the payload will not fit completely on the local page, we have
00615     ** to decide how much to store locally and how much to spill onto
00616     ** overflow pages.  The strategy is to minimize the amount of unused
00617     ** space on overflow pages while keeping the amount of local storage
00618     ** in between minLocal and maxLocal.
00619     **
00620     ** Warning:  changing the way overflow payload is distributed in any
00621     ** way will result in an incompatible file format.
00622     */
00623     int minLocal;  /* Minimum amount of payload held locally */
00624     int maxLocal;  /* Maximum amount of payload held locally */
00625     int surplus;   /* Overflow payload available for local storage */
00626 
00627     minLocal = pPage->minLocal;
00628     maxLocal = pPage->maxLocal;
00629     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
00630     if( surplus <= maxLocal ){
00631       pInfo->nLocal = surplus;
00632     }else{
00633       pInfo->nLocal = minLocal;
00634     }
00635     pInfo->iOverflow = pInfo->nLocal + n;
00636     pInfo->nSize = pInfo->iOverflow + 4;
00637   }
00638 }
00639 #define parseCell(pPage, iCell, pInfo) \
00640   sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
00641 void sqlite3BtreeParseCell(
00642   MemPage *pPage,         /* Page containing the cell */
00643   int iCell,              /* The cell index.  First cell is 0 */
00644   CellInfo *pInfo         /* Fill in this structure */
00645 ){
00646   parseCell(pPage, iCell, pInfo);
00647 }
00648 
00649 /*
00650 ** Compute the total number of bytes that a Cell needs in the cell
00651 ** data area of the btree-page.  The return number includes the cell
00652 ** data header and the local payload, but not any overflow page or
00653 ** the space used by the cell pointer.
00654 */
00655 #ifndef NDEBUG
00656 static u16 cellSize(MemPage *pPage, int iCell){
00657   CellInfo info;
00658   sqlite3BtreeParseCell(pPage, iCell, &info);
00659   return info.nSize;
00660 }
00661 #endif
00662 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
00663   CellInfo info;
00664   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
00665   return info.nSize;
00666 }
00667 
00668 #ifndef SQLITE_OMIT_AUTOVACUUM
00669 /*
00670 ** If the cell pCell, part of page pPage contains a pointer
00671 ** to an overflow page, insert an entry into the pointer-map
00672 ** for the overflow page.
00673 */
00674 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
00675   CellInfo info;
00676   assert( pCell!=0 );
00677   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
00678   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
00679   if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
00680     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
00681     return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
00682   }
00683   return SQLITE_OK;
00684 }
00685 /*
00686 ** If the cell with index iCell on page pPage contains a pointer
00687 ** to an overflow page, insert an entry into the pointer-map
00688 ** for the overflow page.
00689 */
00690 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
00691   u8 *pCell;
00692   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00693   pCell = findOverflowCell(pPage, iCell);
00694   return ptrmapPutOvflPtr(pPage, pCell);
00695 }
00696 #endif
00697 
00698 
00699 /*
00700 ** Defragment the page given.  All Cells are moved to the
00701 ** end of the page and all free space is collected into one
00702 ** big FreeBlk that occurs in between the header and cell
00703 ** pointer array and the cell content area.
00704 */
00705 static int defragmentPage(MemPage *pPage){
00706   int i;                     /* Loop counter */
00707   int pc;                    /* Address of a i-th cell */
00708   int addr;                  /* Offset of first byte after cell pointer array */
00709   int hdr;                   /* Offset to the page header */
00710   int size;                  /* Size of a cell */
00711   int usableSize;            /* Number of usable bytes on a page */
00712   int cellOffset;            /* Offset to the cell pointer array */
00713   int cbrk;                  /* Offset to the cell content area */
00714   int nCell;                 /* Number of cells on the page */
00715   unsigned char *data;       /* The page data */
00716   unsigned char *temp;       /* Temp area for cell content */
00717 
00718   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
00719   assert( pPage->pBt!=0 );
00720   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
00721   assert( pPage->nOverflow==0 );
00722   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00723   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
00724   data = pPage->aData;
00725   hdr = pPage->hdrOffset;
00726   cellOffset = pPage->cellOffset;
00727   nCell = pPage->nCell;
00728   assert( nCell==get2byte(&data[hdr+3]) );
00729   usableSize = pPage->pBt->usableSize;
00730   cbrk = get2byte(&data[hdr+5]);
00731   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
00732   cbrk = usableSize;
00733   for(i=0; i<nCell; i++){
00734     u8 *pAddr;     /* The i-th cell pointer */
00735     pAddr = &data[cellOffset + i*2];
00736     pc = get2byte(pAddr);
00737     if (pc >= pPage->pBt->usableSize) {
00738       return SQLITE_CORRUPT_BKPT;
00739     }
00740     size = cellSizePtr(pPage, &temp[pc]);
00741     cbrk -= size;
00742     if ((cbrk < cellOffset+2*nCell) || (cbrk+size>pPage->pBt->usableSize)) {
00743       return SQLITE_CORRUPT_BKPT;
00744     }
00745     memcpy(&data[cbrk], &temp[pc], size);
00746     put2byte(pAddr, cbrk);
00747   }
00748   assert( cbrk>=cellOffset+2*nCell );
00749   put2byte(&data[hdr+5], cbrk);
00750   data[hdr+1] = 0;
00751   data[hdr+2] = 0;
00752   data[hdr+7] = 0;
00753   addr = cellOffset+2*nCell;
00754   memset(&data[addr], 0, cbrk-addr);
00755   if( cbrk-addr!=pPage->nFree ){
00756     return SQLITE_CORRUPT_BKPT;
00757   }
00758   return SQLITE_OK;
00759 }
00760 
00761 /*
00762 ** Allocate nByte bytes of space on a page.
00763 **
00764 ** Return the index into pPage->aData[] of the first byte of
00765 ** the new allocation.  The caller guarantees that there is enough
00766 ** space.  This routine will never fail.
00767 **
00768 ** If the page contains nBytes of free space but does not contain
00769 ** nBytes of contiguous free space, then this routine automatically
00770 ** calls defragementPage() to consolidate all free space before 
00771 ** allocating the new chunk.
00772 */
00773 static int allocateSpace(MemPage *pPage, int nByte){
00774   int addr, pc, hdr;
00775   int size;
00776   int nFrag;
00777   int top;
00778   int nCell;
00779   int cellOffset;
00780   unsigned char *data;
00781   
00782   data = pPage->aData;
00783   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
00784   assert( pPage->pBt );
00785   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00786   assert( nByte>=0 );  /* Minimum cell size is 4 */
00787   assert( pPage->nFree>=nByte );
00788   assert( pPage->nOverflow==0 );
00789   pPage->nFree -= nByte;
00790   hdr = pPage->hdrOffset;
00791 
00792   nFrag = data[hdr+7];
00793   if( nFrag<60 ){
00794     /* Search the freelist looking for a slot big enough to satisfy the
00795     ** space request. */
00796     addr = hdr+1;
00797     while( (pc = get2byte(&data[addr]))>0 ){
00798       size = get2byte(&data[pc+2]);
00799       if( size>=nByte ){
00800         if( size<nByte+4 ){
00801           memcpy(&data[addr], &data[pc], 2);
00802           data[hdr+7] = nFrag + size - nByte;
00803           return pc;
00804         }else{
00805           put2byte(&data[pc+2], size-nByte);
00806           return pc + size - nByte;
00807         }
00808       }
00809       addr = pc;
00810     }
00811   }
00812 
00813   /* Allocate memory from the gap in between the cell pointer array
00814   ** and the cell content area.
00815   */
00816   top = get2byte(&data[hdr+5]);
00817   nCell = get2byte(&data[hdr+3]);
00818   cellOffset = pPage->cellOffset;
00819   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
00820     defragmentPage(pPage);
00821     top = get2byte(&data[hdr+5]);
00822   }
00823   top -= nByte;
00824   assert( cellOffset + 2*nCell <= top );
00825   put2byte(&data[hdr+5], top);
00826   return top;
00827 }
00828 
00829 /*
00830 ** Return a section of the pPage->aData to the freelist.
00831 ** The first byte of the new free block is pPage->aDisk[start]
00832 ** and the size of the block is "size" bytes.
00833 **
00834 ** Most of the effort here is involved in coalesing adjacent
00835 ** free blocks into a single big free block.
00836 */
00837 static void freeSpace(MemPage *pPage, int start, int size){
00838   int addr, pbegin, hdr;
00839   unsigned char *data = pPage->aData;
00840 
00841   assert( pPage->pBt!=0 );
00842   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
00843   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
00844   assert( (start + size)<=pPage->pBt->usableSize );
00845   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00846   assert( size>=0 );   /* Minimum cell size is 4 */
00847 
00848 #ifdef SQLITE_SECURE_DELETE
00849   /* Overwrite deleted information with zeros when the SECURE_DELETE 
00850   ** option is enabled at compile-time */
00851   memset(&data[start], 0, size);
00852 #endif
00853 
00854   /* Add the space back into the linked list of freeblocks */
00855   hdr = pPage->hdrOffset;
00856   addr = hdr + 1;
00857   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
00858     assert( pbegin<=pPage->pBt->usableSize-4 );
00859     assert( pbegin>addr );
00860     addr = pbegin;
00861   }
00862   assert( pbegin<=pPage->pBt->usableSize-4 );
00863   assert( pbegin>addr || pbegin==0 );
00864   put2byte(&data[addr], start);
00865   put2byte(&data[start], pbegin);
00866   put2byte(&data[start+2], size);
00867   pPage->nFree += size;
00868 
00869   /* Coalesce adjacent free blocks */
00870   addr = pPage->hdrOffset + 1;
00871   while( (pbegin = get2byte(&data[addr]))>0 ){
00872     int pnext, psize;
00873     assert( pbegin>addr );
00874     assert( pbegin<=pPage->pBt->usableSize-4 );
00875     pnext = get2byte(&data[pbegin]);
00876     psize = get2byte(&data[pbegin+2]);
00877     if( pbegin + psize + 3 >= pnext && pnext>0 ){
00878       int frag = pnext - (pbegin+psize);
00879       assert( frag<=data[pPage->hdrOffset+7] );
00880       data[pPage->hdrOffset+7] -= frag;
00881       put2byte(&data[pbegin], get2byte(&data[pnext]));
00882       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
00883     }else{
00884       addr = pbegin;
00885     }
00886   }
00887 
00888   /* If the cell content area begins with a freeblock, remove it. */
00889   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
00890     int top;
00891     pbegin = get2byte(&data[hdr+1]);
00892     memcpy(&data[hdr+1], &data[pbegin], 2);
00893     top = get2byte(&data[hdr+5]);
00894     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
00895   }
00896 }
00897 
00898 /*
00899 ** Decode the flags byte (the first byte of the header) for a page
00900 ** and initialize fields of the MemPage structure accordingly.
00901 **
00902 ** Only the following combinations are supported.  Anything different
00903 ** indicates a corrupt database files:
00904 **
00905 **         PTF_ZERODATA
00906 **         PTF_ZERODATA | PTF_LEAF
00907 **         PTF_LEAFDATA | PTF_INTKEY
00908 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
00909 */
00910 static int decodeFlags(MemPage *pPage, int flagByte){
00911   BtShared *pBt;     /* A copy of pPage->pBt */
00912 
00913   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
00914   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00915   pPage->leaf = flagByte>>3;  assert( PTF_LEAF == 1<<3 );
00916   flagByte &= ~PTF_LEAF;
00917   pPage->childPtrSize = 4-4*pPage->leaf;
00918   pBt = pPage->pBt;
00919   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
00920     pPage->intKey = 1;
00921     pPage->hasData = pPage->leaf;
00922     pPage->maxLocal = pBt->maxLeaf;
00923     pPage->minLocal = pBt->minLeaf;
00924   }else if( flagByte==PTF_ZERODATA ){
00925     pPage->intKey = 0;
00926     pPage->hasData = 0;
00927     pPage->maxLocal = pBt->maxLocal;
00928     pPage->minLocal = pBt->minLocal;
00929   }else{
00930     return SQLITE_CORRUPT_BKPT;
00931   }
00932   return SQLITE_OK;
00933 }
00934 
00935 /*
00936 ** Initialize the auxiliary information for a disk block.
00937 **
00938 ** Return SQLITE_OK on success.  If we see that the page does
00939 ** not contain a well-formed database page, then return 
00940 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
00941 ** guarantee that the page is well-formed.  It only shows that
00942 ** we failed to detect any corruption.
00943 */
00944 int sqlite3BtreeInitPage(MemPage *pPage){
00945 
00946   assert( pPage->pBt!=0 );
00947   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
00948   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
00949   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
00950   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
00951 
00952   if( !pPage->isInit ){
00953     int pc;            /* Address of a freeblock within pPage->aData[] */
00954     int hdr;           /* Offset to beginning of page header */
00955     u8 *data;          /* Equal to pPage->aData */
00956     BtShared *pBt;        /* The main btree structure */
00957     int usableSize;    /* Amount of usable space on each page */
00958     int cellOffset;    /* Offset from start of page to first cell pointer */
00959     int nFree;         /* Number of unused bytes on the page */
00960     int top;           /* First byte of the cell content area */
00961 
00962     pBt = pPage->pBt;
00963 
00964     hdr = pPage->hdrOffset;
00965     data = pPage->aData;
00966     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
00967     assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
00968     pPage->maskPage = pBt->pageSize - 1;
00969     pPage->nOverflow = 0;
00970     usableSize = pBt->usableSize;
00971     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
00972     top = get2byte(&data[hdr+5]);
00973     pPage->nCell = get2byte(&data[hdr+3]);
00974     if( pPage->nCell>MX_CELL(pBt) ){
00975       /* To many cells for a single page.  The page must be corrupt */
00976       return SQLITE_CORRUPT_BKPT;
00977     }
00978   
00979     /* Compute the total free space on the page */
00980     pc = get2byte(&data[hdr+1]);
00981     nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
00982     while( pc>0 ){
00983       int next, size;
00984       if( pc>usableSize-4 ){
00985         /* Free block is off the page */
00986         return SQLITE_CORRUPT_BKPT; 
00987       }
00988       next = get2byte(&data[pc]);
00989       size = get2byte(&data[pc+2]);
00990       if( next>0 && next<=pc+size+3 ){
00991         /* Free blocks must be in accending order */
00992         return SQLITE_CORRUPT_BKPT; 
00993       }
00994       nFree += size;
00995       pc = next;
00996     }
00997     pPage->nFree = nFree;
00998     if( nFree>=usableSize ){
00999       /* Free space cannot exceed total page size */
01000       return SQLITE_CORRUPT_BKPT; 
01001     }
01002 
01003 #if 0
01004   /* Check that all the offsets in the cell offset array are within range. 
01005   ** 
01006   ** Omitting this consistency check and using the pPage->maskPage mask
01007   ** to prevent overrunning the page buffer in findCell() results in a
01008   ** 2.5% performance gain.
01009   */
01010   {
01011     u8 *pOff;        /* Iterator used to check all cell offsets are in range */
01012     u8 *pEnd;        /* Pointer to end of cell offset array */
01013     u8 mask;         /* Mask of bits that must be zero in MSB of cell offsets */
01014     mask = ~(((u8)(pBt->pageSize>>8))-1);
01015     pEnd = &data[cellOffset + pPage->nCell*2];
01016     for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
01017     if( pOff!=pEnd ){
01018       return SQLITE_CORRUPT_BKPT;
01019     }
01020   }
01021 #endif
01022 
01023     pPage->isInit = 1;
01024   }
01025   return SQLITE_OK;
01026 }
01027 
01028 /*
01029 ** Set up a raw page so that it looks like a database page holding
01030 ** no entries.
01031 */
01032 static void zeroPage(MemPage *pPage, int flags){
01033   unsigned char *data = pPage->aData;
01034   BtShared *pBt = pPage->pBt;
01035   int hdr = pPage->hdrOffset;
01036   int first;
01037 
01038   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
01039   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
01040   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
01041   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
01042   assert( sqlite3_mutex_held(pBt->mutex) );
01043   /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
01044   data[hdr] = flags;
01045   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
01046   memset(&data[hdr+1], 0, 4);
01047   data[hdr+7] = 0;
01048   put2byte(&data[hdr+5], pBt->usableSize);
01049   pPage->nFree = pBt->usableSize - first;
01050   decodeFlags(pPage, flags);
01051   pPage->hdrOffset = hdr;
01052   pPage->cellOffset = first;
01053   pPage->nOverflow = 0;
01054   assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
01055   pPage->maskPage = pBt->pageSize - 1;
01056   pPage->nCell = 0;
01057   pPage->isInit = 1;
01058 }
01059 
01060 
01061 /*
01062 ** Convert a DbPage obtained from the pager into a MemPage used by
01063 ** the btree layer.
01064 */
01065 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
01066   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
01067   pPage->aData = sqlite3PagerGetData(pDbPage);
01068   pPage->pDbPage = pDbPage;
01069   pPage->pBt = pBt;
01070   pPage->pgno = pgno;
01071   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
01072   return pPage; 
01073 }
01074 
01075 /*
01076 ** Get a page from the pager.  Initialize the MemPage.pBt and
01077 ** MemPage.aData elements if needed.
01078 **
01079 ** If the noContent flag is set, it means that we do not care about
01080 ** the content of the page at this time.  So do not go to the disk
01081 ** to fetch the content.  Just fill in the content with zeros for now.
01082 ** If in the future we call sqlite3PagerWrite() on this page, that
01083 ** means we have started to be concerned about content and the disk
01084 ** read should occur at that point.
01085 */
01086 int sqlite3BtreeGetPage(
01087   BtShared *pBt,       /* The btree */
01088   Pgno pgno,           /* Number of the page to fetch */
01089   MemPage **ppPage,    /* Return the page in this parameter */
01090   int noContent        /* Do not load page content if true */
01091 ){
01092   int rc;
01093   DbPage *pDbPage;
01094 
01095   assert( sqlite3_mutex_held(pBt->mutex) );
01096   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
01097   if( rc ) return rc;
01098   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
01099   return SQLITE_OK;
01100 }
01101 
01102 /*
01103 ** Return the size of the database file in pages.  Or return -1 if
01104 ** there is any kind of error.
01105 */
01106 static int pagerPagecount(Pager *pPager){
01107   int rc;
01108   int nPage;
01109   rc = sqlite3PagerPagecount(pPager, &nPage);
01110   return (rc==SQLITE_OK?nPage:-1);
01111 }
01112 
01113 /*
01114 ** Get a page from the pager and initialize it.  This routine
01115 ** is just a convenience wrapper around separate calls to
01116 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
01117 */
01118 static int getAndInitPage(
01119   BtShared *pBt,          /* The database file */
01120   Pgno pgno,           /* Number of the page to get */
01121   MemPage **ppPage     /* Write the page pointer here */
01122 ){
01123   int rc;
01124   DbPage *pDbPage;
01125   MemPage *pPage;
01126 
01127   assert( sqlite3_mutex_held(pBt->mutex) );
01128   if( pgno==0 ){
01129     return SQLITE_CORRUPT_BKPT; 
01130   }
01131 
01132   /* It is often the case that the page we want is already in cache.
01133   ** If so, get it directly.  This saves us from having to call
01134   ** pagerPagecount() to make sure pgno is within limits, which results
01135   ** in a measureable performance improvements.
01136   */
01137   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
01138   if( pDbPage ){
01139     /* Page is already in cache */
01140     *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
01141     rc = SQLITE_OK;
01142   }else{
01143     /* Page not in cache.  Acquire it. */
01144     if( pgno>pagerPagecount(pBt->pPager) ){
01145       return SQLITE_CORRUPT_BKPT; 
01146     }
01147     rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
01148     if( rc ) return rc;
01149     pPage = *ppPage;
01150   }
01151   if( !pPage->isInit ){
01152     rc = sqlite3BtreeInitPage(pPage);
01153   }
01154   if( rc!=SQLITE_OK ){
01155     releasePage(pPage);
01156     *ppPage = 0;
01157   }
01158   return rc;
01159 }
01160 
01161 /*
01162 ** Release a MemPage.  This should be called once for each prior
01163 ** call to sqlite3BtreeGetPage.
01164 */
01165 static void releasePage(MemPage *pPage){
01166   if( pPage ){
01167     assert( pPage->aData );
01168     assert( pPage->pBt );
01169     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
01170     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
01171     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
01172     sqlite3PagerUnref(pPage->pDbPage);
01173   }
01174 }
01175 
01176 /*
01177 ** During a rollback, when the pager reloads information into the cache
01178 ** so that the cache is restored to its original state at the start of
01179 ** the transaction, for each page restored this routine is called.
01180 **
01181 ** This routine needs to reset the extra data section at the end of the
01182 ** page to agree with the restored data.
01183 */
01184 static void pageReinit(DbPage *pData){
01185   MemPage *pPage;
01186   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
01187   if( pPage->isInit ){
01188     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
01189     pPage->isInit = 0;
01190     if( sqlite3PagerPageRefcount(pData)>0 ){
01191       sqlite3BtreeInitPage(pPage);
01192     }
01193   }
01194 }
01195 
01196 /*
01197 ** Invoke the busy handler for a btree.
01198 */
01199 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
01200   BtShared *pBt = (BtShared*)pArg;
01201   assert( pBt->db );
01202   assert( sqlite3_mutex_held(pBt->db->mutex) );
01203   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
01204 }
01205 
01206 /*
01207 ** Open a database file and return a new Btree handle through *ppBtree.
01208 ** 
01209 ** zFilename is the name of the database file.  If zFilename is NULL
01210 ** a new database with a random name is created.  This randomly named
01211 ** database file will be deleted when sqlite3BtreeClose() is called.
01212 ** If zFilename is ":memory:" then an in-memory database is created
01213 ** that is automatically destroyed when it is closed.
      **
      ** When shared cache is compiled in and enabled, an existing BtShared
      ** for the same full pathname and VFS is reused (its nRef is bumped)
      ** instead of opening a second pager on the file.
      **
      ** Returns SQLITE_OK on success.  SQLITE_NOMEM is returned when an
      ** allocation fails; other error codes come from sqlite3PagerOpen() or
      ** sqlite3PagerReadFileheader().  Failures that pass through the
      ** btree_open_out label free everything allocated here and set
      ** *ppBtree to 0.
01214 */
01215 int sqlite3BtreeOpen(
01216   const char *zFilename,  /* Name of the file containing the BTree database */
01217   sqlite3 *db,            /* Associated database handle */
01218   Btree **ppBtree,        /* Pointer to new Btree object written here */
01219   int flags,              /* Options */
01220   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
01221 ){
01222   sqlite3_vfs *pVfs;      /* The VFS to use for this btree */
01223   BtShared *pBt = 0;      /* Shared part of btree structure */
01224   Btree *p;               /* Handle to return */
01225   int rc = SQLITE_OK;
01226   int nReserve;           /* Reserved bytes per page: pageSize - usableSize */
01227   unsigned char zDbHeader[100];  /* Filled by sqlite3PagerReadFileheader() */
01228 
01229   /* Set the variable isMemdb to true for an in-memory database, or 
01230   ** false for a file-based database. This symbol is only required if
01231   ** either of the shared-data or autovacuum features are compiled 
01232   ** into the library.
01233   */
01234 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
01235   #ifdef SQLITE_OMIT_MEMORYDB
01236     const int isMemdb = 0;
01237   #else
01238     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
01239   #endif
01240 #endif
01241 
01242   assert( db!=0 );
01243   assert( sqlite3_mutex_held(db->mutex) );
01244 
01245   pVfs = db->pVfs;
01246   p = sqlite3MallocZero(sizeof(Btree));
01247   if( !p ){
01248     return SQLITE_NOMEM;
01249   }
01250   p->inTrans = TRANS_NONE;
01251   p->db = db;
01252 
01253 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
01254   /*
01255   ** If this Btree is a candidate for shared cache, try to find an
01256   ** existing BtShared object that we can share with
      **
      ** Candidates are named, non-":memory:" files opened on a connection
      ** that does not have the SQLITE_Vtab flag set.
01257   */
01258   if( isMemdb==0
01259    && (db->flags & SQLITE_Vtab)==0
01260    && zFilename && zFilename[0]
01261   ){
01262     if( sqlite3GlobalConfig.sharedCacheEnabled ){
01263       int nFullPathname = pVfs->mxPathname+1;
01264       char *zFullPathname = sqlite3Malloc(nFullPathname);
01265       sqlite3_mutex *mutexShared;
01266       p->sharable = 1;
01267       db->flags |= SQLITE_SharedCache;
01268       if( !zFullPathname ){
              /* NOTE(review): unlike the btree_open_out path, this early
              ** return does not write *ppBtree, and the flag set on
              ** db->flags just above is left in place. */
01269         sqlite3_free(p);
01270         return SQLITE_NOMEM;
01271       }
01272       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
01273       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
01274       sqlite3_mutex_enter(mutexShared);
            /* Scan the global list for a BtShared on the same file and VFS. */
01275       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
01276         assert( pBt->nRef>0 );
01277         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
01278                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
01279           p->pBt = pBt;
01280           pBt->nRef++;
01281           break;
01282         }
01283       }
            /* pBt is NULL here when no match was found, which triggers the
            ** creation of a fresh BtShared below. */
01284       sqlite3_mutex_leave(mutexShared);
01285       sqlite3_free(zFullPathname);
01286     }
01287 #ifdef SQLITE_DEBUG
01288     else{
01289       /* In debug mode, we mark all persistent databases as sharable
01290       ** even when they are not.  This exercises the locking code and
01291       ** gives more opportunity for asserts(sqlite3_mutex_held())
01292       ** statements to find locking problems.
01293       */
01294       p->sharable = 1;
01295     }
01296 #endif
01297   }
01298 #endif
01299   if( pBt==0 ){
01300     /*
01301     ** The following asserts make sure that structures used by the btree are
01302     ** the right size.  This is to guard against size changes that result
01303     ** when compiling on a different architecture.
01304     */
01305     assert( sizeof(i64)==8 || sizeof(i64)==4 );
01306     assert( sizeof(u64)==8 || sizeof(u64)==4 );
01307     assert( sizeof(u32)==4 );
01308     assert( sizeof(u16)==2 );
01309     assert( sizeof(Pgno)==4 );
01310   
01311     pBt = sqlite3MallocZero( sizeof(*pBt) );
01312     if( pBt==0 ){
01313       rc = SQLITE_NOMEM;
01314       goto btree_open_out;
01315     }
01316     pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
01317     pBt->busyHdr.pArg = pBt;
01318     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
01319                           EXTRA_SIZE, flags, vfsFlags);
01320     if( rc==SQLITE_OK ){
01321       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
01322     }
01323     if( rc!=SQLITE_OK ){
01324       goto btree_open_out;
01325     }
01326     sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
01327     p->pBt = pBt;
01328   
01329     sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
01330     pBt->pCursor = 0;
01331     pBt->pPage1 = 0;
01332     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
01333     pBt->pageSize = get2byte(&zDbHeader[16]);  /* Header bytes 16..17 */
01334     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
01335          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
            /* The header does not hold a usable page size.  Pass 0 to the
            ** pager; presumably it writes its own choice back through the
            ** pointer - confirm against sqlite3PagerSetPagesize(). */
01336       pBt->pageSize = 0;
01337       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
01338 #ifndef SQLITE_OMIT_AUTOVACUUM
01339       /* If the magic name ":memory:" will create an in-memory database, then
01340       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
01341       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
01342       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
01343       ** regular file-name. In this case the auto-vacuum applies as per normal.
01344       */
01345       if( zFilename && !isMemdb ){
01346         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
01347         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
01348       }
01349 #endif
01350       nReserve = 0;
01351     }else{
            /* The header supplied a valid page size: take the reserved-byte
            ** count from header byte 20 and mark the page size as fixed. */
01352       nReserve = zDbHeader[20];
01353       pBt->pageSizeFixed = 1;
01354 #ifndef SQLITE_OMIT_AUTOVACUUM
            /* Non-zero 4-byte header fields at offsets 52 and 64 turn on
            ** autoVacuum and incrVacuum respectively. */
01355       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
01356       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
01357 #endif
01358     }
01359     pBt->usableSize = pBt->pageSize - nReserve;
01360     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
01361     sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
01362    
01363 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
01364     /* Add the new BtShared object to the linked list sharable BtShareds.
01365     */
01366     if( p->sharable ){
01367       sqlite3_mutex *mutexShared;
01368       pBt->nRef = 1;
01369       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
01370       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
01371         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
01372         if( pBt->mutex==0 ){
01373           rc = SQLITE_NOMEM;
01374           db->mallocFailed = 0;  /* NOTE(review): presumably undoes a flag set by the failed allocation - confirm */
01375           goto btree_open_out;
01376         }
01377       }
            /* Push the new BtShared onto the head of the global list. */
01378       sqlite3_mutex_enter(mutexShared);
01379       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
01380       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
01381       sqlite3_mutex_leave(mutexShared);
01382     }
01383 #endif
01384   }
01385 
01386 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
01387   /* If the new Btree uses a sharable pBtShared, then link the new
01388   ** Btree into the list of all sharable Btrees for the same connection.
01389   ** The list is kept in ascending order by pBt address.
      **
      ** The list is located through the first sharable Btree found in
      ** db->aDb[].  If no such Btree exists, p is left unlinked with its
      ** pNext and pPrev still zero from sqlite3MallocZero().
01390   */
01391   if( p->sharable ){
01392     int i;
01393     Btree *pSib;
01394     for(i=0; i<db->nDb; i++){
01395       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
01396         while( pSib->pPrev ){ pSib = pSib->pPrev; }  /* Rewind to the list head */
01397         if( p->pBt<pSib->pBt ){
                /* p sorts before the current head: it becomes the new head. */
01398           p->pNext = pSib;
01399           p->pPrev = 0;
01400           pSib->pPrev = p;
01401         }else{
                /* Advance to the last entry whose pBt is below p->pBt and
                ** insert p right after it. */
01402           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
01403             pSib = pSib->pNext;
01404           }
01405           p->pNext = pSib->pNext;
01406           p->pPrev = pSib;
01407           if( p->pNext ){
01408             p->pNext->pPrev = p;
01409           }
01410           pSib->pNext = p;
01411         }
01412         break;
01413       }
01414     }
01415   }
01416 #endif
01417   *ppBtree = p;
01418 
      /* Common exit.  On error, close the pager if one was opened, free
      ** both structures and report a NULL handle. */
01419 btree_open_out:
01420   if( rc!=SQLITE_OK ){
01421     if( pBt && pBt->pPager ){
01422       sqlite3PagerClose(pBt->pPager);
01423     }
01424     sqlite3_free(pBt);
01425     sqlite3_free(p);
01426     *ppBtree = 0;
01427   }
01428   return rc;
01429 }
01430 
01431 /*
01432 ** Decrement the BtShared.nRef counter.  When it reaches zero,
01433 ** remove the BtShared structure from the sharing list.  Return
01434 ** true if the BtShared.nRef counter reaches zero and return
01435 ** false if it is still positive.
01436 */
01437 static int removeFromSharingList(BtShared *pBt){
01438 #ifndef SQLITE_OMIT_SHARED_CACHE
01439   sqlite3_mutex *pMaster;
01440   BtShared *pList;
01441   int removed = 0;
01442 
01443   assert( sqlite3_mutex_notheld(pBt->mutex) );
01444   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
01445   sqlite3_mutex_enter(pMaster);
01446   pBt->nRef--;
01447   if( pBt->nRef<=0 ){
01448     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
01449       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
01450     }else{
01451       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
01452       while( ALWAYS(pList) && pList->pNext!=pBt ){
01453         pList=pList->pNext;
01454       }
01455       if( ALWAYS(pList) ){
01456         pList->pNext = pBt->pNext;
01457       }
01458     }
01459     if( SQLITE_THREADSAFE ){
01460       sqlite3_mutex_free(pBt->mutex);
01461     }
01462     removed = 1;
01463   }
01464   sqlite3_mutex_leave(pMaster);
01465   return removed;
01466 #else
01467   return 1;
01468 #endif
01469 }
01470 
01471 /*
01472 ** Make sure pBt->pTmpSpace points to an allocation of 
01473 ** MX_CELL_SIZE(pBt) bytes.
01474 */
01475 static void allocateTempSpace(BtShared *pBt){
01476   if( !pBt->pTmpSpace ){
01477     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
01478   }
01479 }
01480 
01481 /*
01482 ** Free the pBt->pTmpSpace allocation
01483 */
01484 static void freeTempSpace(BtShared *pBt){
01485   sqlite3PageFree( pBt->pTmpSpace);
01486   pBt->pTmpSpace = 0;
01487 }
01488 
01489 /*
01490 ** Close an open database and invalidate all cursors.
01491 */
01492 int sqlite3BtreeClose(Btree *p){
01493   BtShared *pBt = p->pBt;
01494   BtCursor *pCur;
01495 
01496   /* Close all cursors opened via this handle.  */
01497   assert( sqlite3_mutex_held(p->db->mutex) );
01498   sqlite3BtreeEnter(p);
01499   pBt->db = p->db;
01500   pCur = pBt->pCursor;
01501   while( pCur ){
01502     BtCursor *pTmp = pCur;
01503     pCur = pCur->pNext;
01504     if( pTmp->pBtree==p ){
01505       sqlite3BtreeCloseCursor(pTmp);
01506     }
01507   }
01508 
01509   /* Rollback any active transaction and free the handle structure.
01510   ** The call to sqlite3BtreeRollback() drops any table-locks held by
01511   ** this handle.
01512   */
01513   sqlite3BtreeRollback(p);
01514   sqlite3BtreeLeave(p);
01515 
01516   /* If there are still other outstanding references to the shared-btree
01517   ** structure, return now. The remainder of this procedure cleans 
01518   ** up the shared-btree.
01519   */
01520   assert( p->wantToLock==0 && p->locked==0 );
01521   if( !p->sharable || removeFromSharingList(pBt) ){
01522     /* The pBt is no longer on the sharing list, so we can access
01523     ** it without having to hold the mutex.
01524     **
01525     ** Clean out and delete the BtShared object.
01526     */
01527     assert( !pBt->pCursor );
01528     sqlite3PagerClose(pBt->pPager);
01529     if( pBt->xFreeSchema && pBt->pSchema ){
01530       pBt->xFreeSchema(pBt->pSchema);
01531     }
01532     sqlite3_free(pBt->pSchema);
01533     freeTempSpace(pBt);
01534     sqlite3_free(pBt);
01535   }
01536 
01537 #ifndef SQLITE_OMIT_SHARED_CACHE
01538   assert( p->wantToLock==0 );
01539   assert( p->locked==0 );
01540   if( p->pPrev ) p->pPrev->pNext = p->pNext;
01541   if( p->pNext ) p->pNext->pPrev = p->pPrev;
01542 #endif
01543 
01544   sqlite3_free(p);
01545   return SQLITE_OK;
01546 }
01547 
01548 /*
01549 ** Change the limit on the number of pages allowed in the cache.
01550 **
01551 ** The maximum number of cache pages is set to the absolute
01552 ** value of mxPage.  If mxPage is negative, the pager will
01553 ** operate asynchronously - it will not stop to do fsync()s
01554 ** to insure data is written to the disk surface before
01555 ** continuing.  Transactions still work if synchronous is off,
01556 ** and the database cannot be corrupted if this program
01557 ** crashes.  But if the operating system crashes or there is
01558 ** an abrupt power failure when synchronous is off, the database
01559 ** could be left in an inconsistent and unrecoverable state.
01560 ** Synchronous is on by default so database corruption is not
01561 ** normally a worry.
01562 */
01563 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
01564   BtShared *pBt = p->pBt;
01565   assert( sqlite3_mutex_held(p->db->mutex) );
01566   sqlite3BtreeEnter(p);
01567   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
01568   sqlite3BtreeLeave(p);
01569   return SQLITE_OK;
01570 }
01571 
01572 /*
01573 ** Change the way data is synced to disk in order to increase or decrease
01574 ** how well the database resists damage due to OS crashes and power
01575 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
01576 ** there is a high probability of damage)  Level 2 is the default.  There
01577 ** is a very low but non-zero probability of damage.  Level 3 reduces the
01578 ** probability of damage to near zero but with a write performance reduction.
01579 */
01580 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
01581 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
01582   BtShared *pBt = p->pBt;
01583   assert( sqlite3_mutex_held(p->db->mutex) );
01584   sqlite3BtreeEnter(p);
01585   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
01586   sqlite3BtreeLeave(p);
01587   return SQLITE_OK;
01588 }
01589 #endif
01590 
01591 /*
01592 ** Return TRUE if the given btree is set to safety level 1.  In other
01593 ** words, return TRUE if no sync() occurs on the disk files.
01594 */
01595 int sqlite3BtreeSyncDisabled(Btree *p){
01596   BtShared *pBt = p->pBt;
01597   int rc;
01598   assert( sqlite3_mutex_held(p->db->mutex) );  
01599   sqlite3BtreeEnter(p);
01600   assert( pBt && pBt->pPager );
01601   rc = sqlite3PagerNosync(pBt->pPager);
01602   sqlite3BtreeLeave(p);
01603   return rc;
01604 }
01605 
01606 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
01607 /*
01608 ** Change the default pages size and the number of reserved bytes per page.
01609 **
01610 ** The page size must be a power of 2 between 512 and 65536.  If the page
01611 ** size supplied does not meet this constraint then the page size is not
01612 ** changed.
01613 **
01614 ** Page sizes are constrained to be a power of two so that the region
01615 ** of the database file used for locking (beginning at PENDING_BYTE,
01616 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
01617 ** at the beginning of a page.
01618 **
01619 ** If parameter nReserve is less than zero, then the number of reserved
01620 ** bytes per page is left unchanged.
01621 */
01622 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
01623   int rc = SQLITE_OK;
01624   BtShared *pBt = p->pBt;
01625   sqlite3BtreeEnter(p);
01626   if( pBt->pageSizeFixed ){
01627     sqlite3BtreeLeave(p);
01628     return SQLITE_READONLY;
01629   }
01630   if( nReserve<0 ){
01631     nReserve = pBt->pageSize - pBt->usableSize;
01632   }
01633   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
01634         ((pageSize-1)&pageSize)==0 ){
01635     assert( (pageSize & 7)==0 );
01636     assert( !pBt->pPage1 && !pBt->pCursor );
01637     pBt->pageSize = pageSize;
01638     freeTempSpace(pBt);
01639     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
01640   }
01641   pBt->usableSize = pBt->pageSize - nReserve;
01642   sqlite3BtreeLeave(p);
01643   return rc;
01644 }
01645 
01646 /*
01647 ** Return the currently defined page size
01648 */
01649 int sqlite3BtreeGetPageSize(Btree *p){
01650   return p->pBt->pageSize;
01651 }
01652 int sqlite3BtreeGetReserve(Btree *p){
01653   int n;
01654   sqlite3BtreeEnter(p);
01655   n = p->pBt->pageSize - p->pBt->usableSize;
01656   sqlite3BtreeLeave(p);
01657   return n;
01658 }
01659 
01660 /*
01661 ** Set the maximum page count for a database if mxPage is positive.
01662 ** No changes are made if mxPage is 0 or negative.
01663 ** Regardless of the value of mxPage, return the maximum page count.
01664 */
01665 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
01666   int n;
01667   sqlite3BtreeEnter(p);
01668   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
01669   sqlite3BtreeLeave(p);
01670   return n;
01671 }
01672 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
01673 
01674 /*
01675 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
01676 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
01677 ** is disabled. The default value for the auto-vacuum property is 
01678 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
01679 */
01680 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
01681 #ifdef SQLITE_OMIT_AUTOVACUUM
01682   return SQLITE_READONLY;
01683 #else
01684   BtShared *pBt = p->pBt;
01685   int rc = SQLITE_OK;
01686   int av = (autoVacuum?1:0);
01687 
01688   sqlite3BtreeEnter(p);
01689   if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
01690     rc = SQLITE_READONLY;
01691   }else{
01692     pBt->autoVacuum = av;
01693   }
01694   sqlite3BtreeLeave(p);
01695   return rc;
01696 #endif
01697 }
01698 
01699 /*
01700 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
01701 ** enabled 1 is returned. Otherwise 0.
01702 */
01703 int sqlite3BtreeGetAutoVacuum(Btree *p){
01704 #ifdef SQLITE_OMIT_AUTOVACUUM
01705   return BTREE_AUTOVACUUM_NONE;
01706 #else
01707   int rc;
01708   sqlite3BtreeEnter(p);
01709   rc = (
01710     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
01711     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
01712     BTREE_AUTOVACUUM_INCR
01713   );
01714   sqlite3BtreeLeave(p);
01715   return rc;
01716 #endif
01717 }
01718 
01719 
01720 /*
01721 ** Get a reference to pPage1 of the database file.  This will
01722 ** also acquire a readlock on that file.
01723 **
01724 ** SQLITE_OK is returned on success.  If the file is not a
01725 ** well-formed database file, then SQLITE_CORRUPT is returned.
01726 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
01727 ** is returned if we run out of memory. 
01728 */
01729 static int lockBtree(BtShared *pBt){
01730   int rc;
01731   MemPage *pPage1;
01732   int nPage;
01733 
01734   assert( sqlite3_mutex_held(pBt->mutex) );
01735   if( pBt->pPage1 ) return SQLITE_OK;
01736   rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
01737   if( rc!=SQLITE_OK ) return rc;
01738 
01739   /* Do some checking to help insure the file we opened really is
01740   ** a valid database file. 
01741   */
01742   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
01743   if( rc!=SQLITE_OK ){
01744     goto page1_init_failed;
01745   }else if( nPage>0 ){
01746     int pageSize;
01747     int usableSize;
01748     u8 *page1 = pPage1->aData;
01749     rc = SQLITE_NOTADB;
01750     if( memcmp(page1, zMagicHeader, 16)!=0 ){
01751       goto page1_init_failed;
01752     }
01753     if( page1[18]>1 ){
01754       pBt->readOnly = 1;
01755     }
01756     if( page1[19]>1 ){
01757       goto page1_init_failed;
01758     }
01759 
01760     /* The maximum embedded fraction must be exactly 25%.  And the minimum
01761     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
01762     ** The original design allowed these amounts to vary, but as of
01763     ** version 3.6.0, we require them to be fixed.
01764     */
01765     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
01766       goto page1_init_failed;
01767     }
01768     pageSize = get2byte(&page1[16]);
01769     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
01770         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
01771     ){
01772       goto page1_init_failed;
01773     }
01774     assert( (pageSize & 7)==0 );
01775     usableSize = pageSize - page1[20];
01776     if( pageSize!=pBt->pageSize ){
01777       /* After reading the first page of the database assuming a page size
01778       ** of BtShared.pageSize, we have discovered that the page-size is
01779       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
01780       ** zero and return SQLITE_OK. The caller will call this function
01781       ** again with the correct page-size.
01782       */
01783       releasePage(pPage1);
01784       pBt->usableSize = usableSize;
01785       pBt->pageSize = pageSize;
01786       freeTempSpace(pBt);
01787       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
01788       return SQLITE_OK;
01789     }
01790     if( usableSize<500 ){
01791       goto page1_init_failed;
01792     }
01793     pBt->pageSize = pageSize;
01794     pBt->usableSize = usableSize;
01795 #ifndef SQLITE_OMIT_AUTOVACUUM
01796     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
01797     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
01798 #endif
01799   }
01800 
01801   /* maxLocal is the maximum amount of payload to store locally for
01802   ** a cell.  Make sure it is small enough so that at least minFanout
01803   ** cells can will fit on one page.  We assume a 10-byte page header.
01804   ** Besides the payload, the cell must store:
01805   **     2-byte pointer to the cell
01806   **     4-byte child pointer
01807   **     9-byte nKey value
01808   **     4-byte nData value
01809   **     4-byte overflow page pointer
01810   ** So a cell consists of a 2-byte poiner, a header which is as much as
01811   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
01812   ** page pointer.
01813   */
01814   pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
01815   pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
01816   pBt->maxLeaf = pBt->usableSize - 35;
01817   pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
01818   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
01819   pBt->pPage1 = pPage1;
01820   return SQLITE_OK;
01821 
01822 page1_init_failed:
01823   releasePage(pPage1);
01824   pBt->pPage1 = 0;
01825   return rc;
01826 }
01827 
01828 /*
01829 ** This routine works like lockBtree() except that it also invokes the
01830 ** busy callback if there is lock contention.
01831 */
01832 static int lockBtreeWithRetry(Btree *pRef){
01833   int rc = SQLITE_OK;
01834 
01835   assert( sqlite3BtreeHoldsMutex(pRef) );
01836   if( pRef->inTrans==TRANS_NONE ){
01837     u8 inTransaction = pRef->pBt->inTransaction;
01838     btreeIntegrity(pRef);
01839     rc = sqlite3BtreeBeginTrans(pRef, 0);
01840     pRef->pBt->inTransaction = inTransaction;
01841     pRef->inTrans = TRANS_NONE;
01842     if( rc==SQLITE_OK ){
01843       pRef->pBt->nTransaction--;
01844     }
01845     btreeIntegrity(pRef);
01846   }
01847   return rc;
01848 }
01849        
01850 
01851 /*
01852 ** If there are no outstanding cursors and we are not in the middle
01853 ** of a transaction but there is a read lock on the database, then
01854 ** this routine unrefs the first page of the database file which 
01855 ** has the effect of releasing the read lock.
01856 **
01857 ** If there are any outstanding cursors, this routine is a no-op.
01858 **
01859 ** If there is a transaction in progress, this routine is a no-op.
01860 */
01861 static void unlockBtreeIfUnused(BtShared *pBt){
01862   assert( sqlite3_mutex_held(pBt->mutex) );
01863   if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
01864     if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
01865       assert( pBt->pPage1->aData );
01866 #if 0
01867       if( pBt->pPage1->aData==0 ){
01868         MemPage *pPage = pBt->pPage1;
01869         pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
01870         pPage->pBt = pBt;
01871         pPage->pgno = 1;
01872       }
01873 #endif
01874       releasePage(pBt->pPage1);
01875     }
01876     pBt->pPage1 = 0;
01877     pBt->inStmt = 0;
01878   }
01879 }
01880 
01881 /*
01882 ** Create a new database by initializing the first page of the
01883 ** file.
01884 */
01885 static int newDatabase(BtShared *pBt){
01886   MemPage *pP1;
01887   unsigned char *data;
01888   int rc;
01889   int nPage;
01890 
01891   assert( sqlite3_mutex_held(pBt->mutex) );
01892   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
01893   if( rc!=SQLITE_OK || nPage>0 ){
01894     return rc;
01895   }
01896   pP1 = pBt->pPage1;
01897   assert( pP1!=0 );
01898   data = pP1->aData;
01899   rc = sqlite3PagerWrite(pP1->pDbPage);
01900   if( rc ) return rc;
01901   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
01902   assert( sizeof(zMagicHeader)==16 );
01903   put2byte(&data[16], pBt->pageSize);
01904   data[18] = 1;
01905   data[19] = 1;
01906   data[20] = pBt->pageSize - pBt->usableSize;
01907   data[21] = 64;
01908   data[22] = 32;
01909   data[23] = 32;
01910   memset(&data[24], 0, 100-24);
01911   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
01912   pBt->pageSizeFixed = 1;
01913 #ifndef SQLITE_OMIT_AUTOVACUUM
01914   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
01915   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
01916   put4byte(&data[36 + 4*4], pBt->autoVacuum);
01917   put4byte(&data[36 + 7*4], pBt->incrVacuum);
01918 #endif
01919   return SQLITE_OK;
01920 }
01921 
01922 /*
01923 ** Attempt to start a new transaction. A write-transaction
01924 ** is started if the second argument is nonzero, otherwise a read-
01925 ** transaction.  If the second argument is 2 or more and exclusive
01926 ** transaction is started, meaning that no other process is allowed
01927 ** to access the database.  A preexisting transaction may not be
01928 ** upgraded to exclusive by calling this routine a second time - the
01929 ** exclusivity flag only works for a new transaction.
01930 **
01931 ** A write-transaction must be started before attempting any 
01932 ** changes to the database.  None of the following routines 
01933 ** will work unless a transaction is started first:
01934 **
01935 **      sqlite3BtreeCreateTable()
01936 **      sqlite3BtreeCreateIndex()
01937 **      sqlite3BtreeClearTable()
01938 **      sqlite3BtreeDropTable()
01939 **      sqlite3BtreeInsert()
01940 **      sqlite3BtreeDelete()
01941 **      sqlite3BtreeUpdateMeta()
01942 **
01943 ** If an initial attempt to acquire the lock fails because of lock contention
01944 ** and the database was previously unlocked, then invoke the busy handler
01945 ** if there is one.  But if there was previously a read-lock, do not
01946 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
01947 ** returned when there is already a read-lock in order to avoid a deadlock.
01948 **
01949 ** Suppose there are two processes A and B.  A has a read lock and B has
01950 ** a reserved lock.  B tries to promote to exclusive but is blocked because
01951 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
01952 ** One or the other of the two processes must give way or there can be
01953 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
01954 ** when A already has a read lock, we encourage A to give up and let B
01955 ** proceed.
01956 */
01957 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
01958   BtShared *pBt = p->pBt;
01959   int rc = SQLITE_OK;
01960 
01961   sqlite3BtreeEnter(p);
01962   pBt->db = p->db;
01963   btreeIntegrity(p);
01964 
01965   /* If the btree is already in a write-transaction, or it
01966   ** is already in a read-transaction and a read-transaction
01967   ** is requested, this is a no-op.
01968   */
01969   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
01970     goto trans_begun;
01971   }
01972 
01973   /* Write transactions are not possible on a read-only database */
01974   if( pBt->readOnly && wrflag ){
01975     rc = SQLITE_READONLY;
01976     goto trans_begun;
01977   }
01978 
01979   /* If another database handle has already opened a write transaction 
01980   ** on this shared-btree structure and a second write transaction is
01981   ** requested, return SQLITE_BUSY.
01982   */
01983   if( pBt->inTransaction==TRANS_WRITE && wrflag ){
01984     rc = SQLITE_BUSY;
01985     goto trans_begun;
01986   }
01987 
01988 #ifndef SQLITE_OMIT_SHARED_CACHE
01989   if( wrflag>1 ){
01990     BtLock *pIter;
01991     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
01992       if( pIter->pBtree!=p ){
01993         rc = SQLITE_BUSY;
01994         goto trans_begun;
01995       }
01996     }
01997   }
01998 #endif
01999 
02000   do {
02001     if( pBt->pPage1==0 ){
02002       do{
02003         rc = lockBtree(pBt);
02004       }while( pBt->pPage1==0 && rc==SQLITE_OK );
02005     }
02006 
02007     if( rc==SQLITE_OK && wrflag ){
02008       if( pBt->readOnly ){
02009         rc = SQLITE_READONLY;
02010       }else{
02011         rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
02012         if( rc==SQLITE_OK ){
02013           rc = newDatabase(pBt);
02014         }
02015       }
02016     }
02017   
02018     if( rc==SQLITE_OK ){
02019       if( wrflag ) pBt->inStmt = 0;
02020     }else{
02021       unlockBtreeIfUnused(pBt);
02022     }
02023   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
02024           sqlite3BtreeInvokeBusyHandler(pBt, 0) );
02025 
02026   if( rc==SQLITE_OK ){
02027     if( p->inTrans==TRANS_NONE ){
02028       pBt->nTransaction++;
02029     }
02030     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
02031     if( p->inTrans>pBt->inTransaction ){
02032       pBt->inTransaction = p->inTrans;
02033     }
02034 #ifndef SQLITE_OMIT_SHARED_CACHE
02035     if( wrflag>1 ){
02036       assert( !pBt->pExclusive );
02037       pBt->pExclusive = p;
02038     }
02039 #endif
02040   }
02041 
02042 
02043 trans_begun:
02044   btreeIntegrity(p);
02045   sqlite3BtreeLeave(p);
02046   return rc;
02047 }
02048 
02049 #ifndef SQLITE_OMIT_AUTOVACUUM
02050 
02051 /*
02052 ** Set the pointer-map entries for all children of page pPage. Also, if
02053 ** pPage contains cells that point to overflow pages, set the pointer
02054 ** map entries for the overflow pages as well.
02055 */
02056 static int setChildPtrmaps(MemPage *pPage){
02057   int i;                             /* Counter variable */
02058   int nCell;                         /* Number of cells in page pPage */
02059   int rc;                            /* Return code */
02060   BtShared *pBt = pPage->pBt;
02061   int isInitOrig = pPage->isInit;
02062   Pgno pgno = pPage->pgno;
02063 
02064   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
02065   rc = sqlite3BtreeInitPage(pPage);
02066   if( rc!=SQLITE_OK ){
02067     goto set_child_ptrmaps_out;
02068   }
02069   nCell = pPage->nCell;
02070 
02071   for(i=0; i<nCell; i++){
02072     u8 *pCell = findCell(pPage, i);
02073 
02074     rc = ptrmapPutOvflPtr(pPage, pCell);
02075     if( rc!=SQLITE_OK ){
02076       goto set_child_ptrmaps_out;
02077     }
02078 
02079     if( !pPage->leaf ){
02080       Pgno childPgno = get4byte(pCell);
02081       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
02082       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
02083     }
02084   }
02085 
02086   if( !pPage->leaf ){
02087     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
02088     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
02089   }
02090 
02091 set_child_ptrmaps_out:
02092   pPage->isInit = isInitOrig;
02093   return rc;
02094 }
02095 
02096 /*
02097 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
02098 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
02099 ** iTo. Parameter eType describes the type of pointer to be modified, as 
02100 ** follows:
02101 **
02102 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
02103 **                   page of pPage.
02104 **
02105 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
02106 **                   page pointed to by one of the cells on pPage.
02107 **
02108 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
02109 **                   overflow page in the list.
02110 */
02111 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
02112   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
02113   if( eType==PTRMAP_OVERFLOW2 ){
02114     /* The pointer is always the first 4 bytes of the page in this case.  */
02115     if( get4byte(pPage->aData)!=iFrom ){
02116       return SQLITE_CORRUPT_BKPT;
02117     }
02118     put4byte(pPage->aData, iTo);
02119   }else{
02120     int isInitOrig = pPage->isInit;
02121     int i;
02122     int nCell;
02123 
02124     sqlite3BtreeInitPage(pPage);
02125     nCell = pPage->nCell;
02126 
02127     for(i=0; i<nCell; i++){
02128       u8 *pCell = findCell(pPage, i);
02129       if( eType==PTRMAP_OVERFLOW1 ){
02130         CellInfo info;
02131         sqlite3BtreeParseCellPtr(pPage, pCell, &info);
02132         if( info.iOverflow ){
02133           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
02134             put4byte(&pCell[info.iOverflow], iTo);
02135             break;
02136           }
02137         }
02138       }else{
02139         if( get4byte(pCell)==iFrom ){
02140           put4byte(pCell, iTo);
02141           break;
02142         }
02143       }
02144     }
02145   
02146     if( i==nCell ){
02147       if( eType!=PTRMAP_BTREE || 
02148           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
02149         return SQLITE_CORRUPT_BKPT;
02150       }
02151       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
02152     }
02153 
02154     pPage->isInit = isInitOrig;
02155   }
02156   return SQLITE_OK;
02157 }
02158 
02159 
02160 /*
02161 ** Move the open database page pDbPage to location iFreePage in the 
02162 ** database. The pDbPage reference remains valid.
02163 */
02164 static int relocatePage(
02165   BtShared *pBt,           /* Btree */
02166   MemPage *pDbPage,        /* Open page to move */
02167   u8 eType,                /* Pointer map 'type' entry for pDbPage */
02168   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
02169   Pgno iFreePage,          /* The location to move pDbPage to */
02170   int isCommit
02171 ){
02172   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
02173   Pgno iDbPage = pDbPage->pgno;
02174   Pager *pPager = pBt->pPager;
02175   int rc;
02176 
02177   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
02178       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
02179   assert( sqlite3_mutex_held(pBt->mutex) );
02180   assert( pDbPage->pBt==pBt );
02181 
02182   /* Move page iDbPage from its current location to page number iFreePage */
02183   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
02184       iDbPage, iFreePage, iPtrPage, eType));
02185   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
02186   if( rc!=SQLITE_OK ){
02187     return rc;
02188   }
02189   pDbPage->pgno = iFreePage;
02190 
02191   /* If pDbPage was a btree-page, then it may have child pages and/or cells
02192   ** that point to overflow pages. The pointer map entries for all these
02193   ** pages need to be changed.
02194   **
02195   ** If pDbPage is an overflow page, then the first 4 bytes may store a
02196   ** pointer to a subsequent overflow page. If this is the case, then
02197   ** the pointer map needs to be updated for the subsequent overflow page.
02198   */
02199   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
02200     rc = setChildPtrmaps(pDbPage);
02201     if( rc!=SQLITE_OK ){
02202       return rc;
02203     }
02204   }else{
02205     Pgno nextOvfl = get4byte(pDbPage->aData);
02206     if( nextOvfl!=0 ){
02207       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
02208       if( rc!=SQLITE_OK ){
02209         return rc;
02210       }
02211     }
02212   }
02213 
02214   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
02215   ** that it points at iFreePage. Also fix the pointer map entry for
02216   ** iPtrPage.
02217   */
02218   if( eType!=PTRMAP_ROOTPAGE ){
02219     rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
02220     if( rc!=SQLITE_OK ){
02221       return rc;
02222     }
02223     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
02224     if( rc!=SQLITE_OK ){
02225       releasePage(pPtrPage);
02226       return rc;
02227     }
02228     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
02229     releasePage(pPtrPage);
02230     if( rc==SQLITE_OK ){
02231       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
02232     }
02233   }
02234   return rc;
02235 }
02236 
02237 /* Forward declaration required by incrVacuumStep(). */
02238 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
02239 
02240 /*
02241 ** Perform a single step of an incremental-vacuum. If successful,
02242 ** return SQLITE_OK. If there is no work to do (and therefore no
02243 ** point in calling this function again), return SQLITE_DONE.
02244 **
02245 ** More specificly, this function attempts to re-organize the 
02246 ** database so that the last page of the file currently in use
02247 ** is no longer in use.
02248 **
02249 ** If the nFin parameter is non-zero, the implementation assumes
02250 ** that the caller will keep calling incrVacuumStep() until
02251 ** it returns SQLITE_DONE or an error, and that nFin is the
02252 ** number of pages the database file will contain after this 
02253 ** process is complete.
02254 */
02255 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
02256   Pgno iLastPg;             /* Last page in the database */
02257   Pgno nFreeList;           /* Number of pages still on the free-list */
02258 
02259   assert( sqlite3_mutex_held(pBt->mutex) );
02260   iLastPg = pBt->nTrunc;
02261   if( iLastPg==0 ){
02262     iLastPg = pagerPagecount(pBt->pPager);
02263   }
02264 
02265   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
02266     int rc;
02267     u8 eType;
02268     Pgno iPtrPage;
02269 
02270     nFreeList = get4byte(&pBt->pPage1->aData[36]);
02271     if( nFreeList==0 || nFin==iLastPg ){
02272       return SQLITE_DONE;
02273     }
02274 
02275     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
02276     if( rc!=SQLITE_OK ){
02277       return rc;
02278     }
02279     if( eType==PTRMAP_ROOTPAGE ){
02280       return SQLITE_CORRUPT_BKPT;
02281     }
02282 
02283     if( eType==PTRMAP_FREEPAGE ){
02284       if( nFin==0 ){
02285         /* Remove the page from the files free-list. This is not required
02286         ** if nFin is non-zero. In that case, the free-list will be
02287         ** truncated to zero after this function returns, so it doesn't 
02288         ** matter if it still contains some garbage entries.
02289         */
02290         Pgno iFreePg;
02291         MemPage *pFreePg;
02292         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
02293         if( rc!=SQLITE_OK ){
02294           return rc;
02295         }
02296         assert( iFreePg==iLastPg );
02297         releasePage(pFreePg);
02298       }
02299     } else {
02300       Pgno iFreePg;             /* Index of free page to move pLastPg to */
02301       MemPage *pLastPg;
02302 
02303       rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
02304       if( rc!=SQLITE_OK ){
02305         return rc;
02306       }
02307 
02308       /* If nFin is zero, this loop runs exactly once and page pLastPg
02309       ** is swapped with the first free page pulled off the free list.
02310       **
02311       ** On the other hand, if nFin is greater than zero, then keep
02312       ** looping until a free-page located within the first nFin pages
02313       ** of the file is found.
02314       */
02315       do {
02316         MemPage *pFreePg;
02317         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
02318         if( rc!=SQLITE_OK ){
02319           releasePage(pLastPg);
02320           return rc;
02321         }
02322         releasePage(pFreePg);
02323       }while( nFin!=0 && iFreePg>nFin );
02324       assert( iFreePg<iLastPg );
02325       
02326       rc = sqlite3PagerWrite(pLastPg->pDbPage);
02327       if( rc==SQLITE_OK ){
02328         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
02329       }
02330       releasePage(pLastPg);
02331       if( rc!=SQLITE_OK ){
02332         return rc;
02333       }
02334     }
02335   }
02336 
02337   pBt->nTrunc = iLastPg - 1;
02338   while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
02339     pBt->nTrunc--;
02340   }
02341   return SQLITE_OK;
02342 }
02343 
02344 /*
02345 ** A write-transaction must be opened before calling this function.
02346 ** It performs a single unit of work towards an incremental vacuum.
02347 **
02348 ** If the incremental vacuum is finished after this function has run,
02349 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
02350 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
02351 */
02352 int sqlite3BtreeIncrVacuum(Btree *p){
02353   int rc;
02354   BtShared *pBt = p->pBt;
02355 
02356   sqlite3BtreeEnter(p);
02357   pBt->db = p->db;
02358   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
02359   if( !pBt->autoVacuum ){
02360     rc = SQLITE_DONE;
02361   }else{
02362     invalidateAllOverflowCache(pBt);
02363     rc = incrVacuumStep(pBt, 0);
02364   }
02365   sqlite3BtreeLeave(p);
02366   return rc;
02367 }
02368 
02369 /*
02370 ** This routine is called prior to sqlite3PagerCommit when a transaction
02371 ** is commited for an auto-vacuum database.
02372 **
02373 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
02374 ** the database file should be truncated to during the commit process. 
02375 ** i.e. the database has been reorganized so that only the first *pnTrunc
02376 ** pages are in use.
02377 */
02378 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
02379   int rc = SQLITE_OK;
02380   Pager *pPager = pBt->pPager;
02381   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
02382 
02383   assert( sqlite3_mutex_held(pBt->mutex) );
02384   invalidateAllOverflowCache(pBt);
02385   assert(pBt->autoVacuum);
02386   if( !pBt->incrVacuum ){
02387     Pgno nFin = 0;
02388 
02389     if( pBt->nTrunc==0 ){
02390       Pgno nFree;
02391       Pgno nPtrmap;
02392       const int pgsz = pBt->pageSize;
02393       int nOrig = pagerPagecount(pBt->pPager);
02394 
02395       if( PTRMAP_ISPAGE(pBt, nOrig) ){
02396         return SQLITE_CORRUPT_BKPT;
02397       }
02398       if( nOrig==PENDING_BYTE_PAGE(pBt) ){
02399         nOrig--;
02400       }
02401       nFree = get4byte(&pBt->pPage1->aData[36]);
02402       nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
02403       nFin = nOrig - nFree - nPtrmap;
02404       if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
02405         nFin--;
02406       }
02407       while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
02408         nFin--;
02409       }
02410     }
02411 
02412     while( rc==SQLITE_OK ){
02413       rc = incrVacuumStep(pBt, nFin);
02414     }
02415     if( rc==SQLITE_DONE ){
02416       assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
02417       rc = SQLITE_OK;
02418       if( pBt->nTrunc && nFin ){
02419         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
02420         put4byte(&pBt->pPage1->aData[32], 0);
02421         put4byte(&pBt->pPage1->aData[36], 0);
02422         pBt->nTrunc = nFin;
02423       }
02424     }
02425     if( rc!=SQLITE_OK ){
02426       sqlite3PagerRollback(pPager);
02427     }
02428   }
02429 
02430   if( rc==SQLITE_OK ){
02431     *pnTrunc = pBt->nTrunc;
02432     pBt->nTrunc = 0;
02433   }
02434   assert( nRef==sqlite3PagerRefcount(pPager) );
02435   return rc;
02436 }
02437 
02438 #endif /* ifndef SQLITE_OMIT_AUTOVACUUM */
02439 
02440 /*
02441 ** This routine does the first phase of a two-phase commit.  This routine
02442 ** causes a rollback journal to be created (if it does not already exist)
02443 ** and populated with enough information so that if a power loss occurs
02444 ** the database can be restored to its original state by playing back
02445 ** the journal.  Then the contents of the journal are flushed out to
02446 ** the disk.  After the journal is safely on oxide, the changes to the
02447 ** database are written into the database file and flushed to oxide.
02448 ** At the end of this call, the rollback journal still exists on the
02449 ** disk and we are still holding all locks, so the transaction has not
02450 ** committed.  See sqlite3BtreeCommit() for the second phase of the
02451 ** commit process.
02452 **
02453 ** This call is a no-op if no write-transaction is currently active on pBt.
02454 **
02455 ** Otherwise, sync the database file for the btree pBt. zMaster points to
02456 ** the name of a master journal file that should be written into the
02457 ** individual journal file, or is NULL, indicating no master journal file 
02458 ** (single database transaction).
02459 **
02460 ** When this is called, the master journal should already have been
02461 ** created, populated with this journal pointer and synced to disk.
02462 **
02463 ** Once this is routine has returned, the only thing required to commit
02464 ** the write-transaction for this database file is to delete the journal.
02465 */
02466 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
02467   int rc = SQLITE_OK;
02468   if( p->inTrans==TRANS_WRITE ){
02469     BtShared *pBt = p->pBt;
02470     Pgno nTrunc = 0;
02471     sqlite3BtreeEnter(p);
02472     pBt->db = p->db;
02473 #ifndef SQLITE_OMIT_AUTOVACUUM
02474     if( pBt->autoVacuum ){
02475       rc = autoVacuumCommit(pBt, &nTrunc); 
02476       if( rc!=SQLITE_OK ){
02477         sqlite3BtreeLeave(p);
02478         return rc;
02479       }
02480     }
02481 #endif
02482     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
02483     sqlite3BtreeLeave(p);
02484   }
02485   return rc;
02486 }
02487 
02488 /*
02489 ** Commit the transaction currently in progress.
02490 **
02491 ** This routine implements the second phase of a 2-phase commit.  The
02492 ** sqlite3BtreeSync() routine does the first phase and should be invoked
02493 ** prior to calling this routine.  The sqlite3BtreeSync() routine did
02494 ** all the work of writing information out to disk and flushing the
02495 ** contents so that they are written onto the disk platter.  All this
02496 ** routine has to do is delete or truncate the rollback journal
02497 ** (which causes the transaction to commit) and drop locks.
02498 **
02499 ** This will release the write lock on the database file.  If there
02500 ** are no active cursors, it also releases the read lock.
02501 */
02502 int sqlite3BtreeCommitPhaseTwo(Btree *p){
02503   BtShared *pBt = p->pBt;
02504 
02505   sqlite3BtreeEnter(p);
02506   pBt->db = p->db;
02507   btreeIntegrity(p);
02508 
02509   /* If the handle has a write-transaction open, commit the shared-btrees 
02510   ** transaction and set the shared state to TRANS_READ.
02511   */
02512   if( p->inTrans==TRANS_WRITE ){
02513     int rc;
02514     assert( pBt->inTransaction==TRANS_WRITE );
02515     assert( pBt->nTransaction>0 );
02516     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
02517     if( rc!=SQLITE_OK ){
02518       sqlite3BtreeLeave(p);
02519       return rc;
02520     }
02521     pBt->inTransaction = TRANS_READ;
02522     pBt->inStmt = 0;
02523   }
02524   unlockAllTables(p);
02525 
02526   /* If the handle has any kind of transaction open, decrement the transaction
02527   ** count of the shared btree. If the transaction count reaches 0, set
02528   ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
02529   ** will unlock the pager.
02530   */
02531   if( p->inTrans!=TRANS_NONE ){
02532     pBt->nTransaction--;
02533     if( 0==pBt->nTransaction ){
02534       pBt->inTransaction = TRANS_NONE;
02535     }
02536   }
02537 
02538   /* Set the handles current transaction state to TRANS_NONE and unlock
02539   ** the pager if this call closed the only read or write transaction.
02540   */
02541   p->inTrans = TRANS_NONE;
02542   unlockBtreeIfUnused(pBt);
02543 
02544   btreeIntegrity(p);
02545   sqlite3BtreeLeave(p);
02546   return SQLITE_OK;
02547 }
02548 
02549 /*
02550 ** Do both phases of a commit.
02551 */
02552 int sqlite3BtreeCommit(Btree *p){
02553   int rc;
02554   sqlite3BtreeEnter(p);
02555   rc = sqlite3BtreeCommitPhaseOne(p, 0);
02556   if( rc==SQLITE_OK ){
02557     rc = sqlite3BtreeCommitPhaseTwo(p);
02558   }
02559   sqlite3BtreeLeave(p);
02560   return rc;
02561 }
02562 
02563 #ifndef NDEBUG
02564 /*
02565 ** Return the number of write-cursors open on this handle. This is for use
02566 ** in assert() expressions, so it is only compiled if NDEBUG is not
02567 ** defined.
02568 **
02569 ** For the purposes of this routine, a write-cursor is any cursor that
02570 ** is capable of writing to the databse.  That means the cursor was
02571 ** originally opened for writing and the cursor has not be disabled
02572 ** by having its state changed to CURSOR_FAULT.
02573 */
02574 static int countWriteCursors(BtShared *pBt){
02575   BtCursor *pCur;
02576   int r = 0;
02577   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
02578     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 
02579   }
02580   return r;
02581 }
02582 #endif
02583 
02584 /*
02585 ** This routine sets the state to CURSOR_FAULT and the error
02586 ** code to errCode for every cursor on BtShared that pBtree
02587 ** references.
02588 **
02589 ** Every cursor is tripped, including cursors that belong
02590 ** to other database connections that happen to be sharing
02591 ** the cache with pBtree.
02592 **
02593 ** This routine gets called when a rollback occurs.
02594 ** All cursors using the same cache must be tripped
02595 ** to prevent them from trying to use the btree after
02596 ** the rollback.  The rollback may have deleted tables
02597 ** or moved root pages, so it is not sufficient to
02598 ** save the state of the cursor.  The cursor must be
02599 ** invalidated.
02600 */
02601 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
02602   BtCursor *p;
02603   sqlite3BtreeEnter(pBtree);
02604   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
02605     sqlite3BtreeClearCursor(p);
02606     p->eState = CURSOR_FAULT;
02607     p->skip = errCode;
02608   }
02609   sqlite3BtreeLeave(pBtree);
02610 }
02611 
02612 /*
02613 ** Rollback the transaction in progress.  All cursors will be
02614 ** invalided by this operation.  Any attempt to use a cursor
02615 ** that was open at the beginning of this operation will result
02616 ** in an error.
02617 **
02618 ** This will release the write lock on the database file.  If there
02619 ** are no active cursors, it also releases the read lock.
02620 */
02621 int sqlite3BtreeRollback(Btree *p){
02622   int rc;
02623   BtShared *pBt = p->pBt;
02624   MemPage *pPage1;
02625 
02626   sqlite3BtreeEnter(p);
02627   pBt->db = p->db;
02628   rc = saveAllCursors(pBt, 0, 0);
02629 #ifndef SQLITE_OMIT_SHARED_CACHE
02630   if( rc!=SQLITE_OK ){
02631     /* This is a horrible situation. An IO or malloc() error occured whilst
02632     ** trying to save cursor positions. If this is an automatic rollback (as
02633     ** the result of a constraint, malloc() failure or IO error) then 
02634     ** the cache may be internally inconsistent (not contain valid trees) so
02635     ** we cannot simply return the error to the caller. Instead, abort 
02636     ** all queries that may be using any of the cursors that failed to save.
02637     */
02638     sqlite3BtreeTripAllCursors(p, rc);
02639   }
02640 #endif
02641   btreeIntegrity(p);
02642   unlockAllTables(p);
02643 
02644   if( p->inTrans==TRANS_WRITE ){
02645     int rc2;
02646 
02647 #ifndef SQLITE_OMIT_AUTOVACUUM
02648     pBt->nTrunc = 0;
02649 #endif
02650 
02651     assert( TRANS_WRITE==pBt->inTransaction );
02652     rc2 = sqlite3PagerRollback(pBt->pPager);
02653     if( rc2!=SQLITE_OK ){
02654       rc = rc2;
02655     }
02656 
02657     /* The rollback may have destroyed the pPage1->aData value.  So
02658     ** call sqlite3BtreeGetPage() on page 1 again to make
02659     ** sure pPage1->aData is set correctly. */
02660     if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
02661       releasePage(pPage1);
02662     }
02663     assert( countWriteCursors(pBt)==0 );
02664     pBt->inTransaction = TRANS_READ;
02665   }
02666 
02667   if( p->inTrans!=TRANS_NONE ){
02668     assert( pBt->nTransaction>0 );
02669     pBt->nTransaction--;
02670     if( 0==pBt->nTransaction ){
02671       pBt->inTransaction = TRANS_NONE;
02672     }
02673   }
02674 
02675   p->inTrans = TRANS_NONE;
02676   pBt->inStmt = 0;
02677   unlockBtreeIfUnused(pBt);
02678 
02679   btreeIntegrity(p);
02680   sqlite3BtreeLeave(p);
02681   return rc;
02682 }
02683 
02684 /*
02685 ** Start a statement subtransaction.  The subtransaction can
02686 ** can be rolled back independently of the main transaction.
02687 ** You must start a transaction before starting a subtransaction.
02688 ** The subtransaction is ended automatically if the main transaction
02689 ** commits or rolls back.
02690 **
02691 ** Only one subtransaction may be active at a time.  It is an error to try
02692 ** to start a new subtransaction if another subtransaction is already active.
02693 **
02694 ** Statement subtransactions are used around individual SQL statements
02695 ** that are contained within a BEGIN...COMMIT block.  If a constraint
02696 ** error occurs within the statement, the effect of that one statement
02697 ** can be rolled back without having to rollback the entire transaction.
02698 */
02699 int sqlite3BtreeBeginStmt(Btree *p){
02700   int rc;
02701   BtShared *pBt = p->pBt;
02702   sqlite3BtreeEnter(p);
02703   pBt->db = p->db;
02704   if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
02705     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
02706   }else{
02707     assert( pBt->inTransaction==TRANS_WRITE );
02708     rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
02709     pBt->inStmt = 1;
02710   }
02711   sqlite3BtreeLeave(p);
02712   return rc;
02713 }
02714 
02715 
02716 /*
02717 ** Commit the statment subtransaction currently in progress.  If no
02718 ** subtransaction is active, this is a no-op.
02719 */
02720 int sqlite3BtreeCommitStmt(Btree *p){
02721   int rc;
02722   BtShared *pBt = p->pBt;
02723   sqlite3BtreeEnter(p);
02724   pBt->db = p->db;
02725   if( pBt->inStmt && !pBt->readOnly ){
02726     rc = sqlite3PagerStmtCommit(pBt->pPager);
02727   }else{
02728     rc = SQLITE_OK;
02729   }
02730   pBt->inStmt = 0;
02731   sqlite3BtreeLeave(p);
02732   return rc;
02733 }
02734 
02735 /*
02736 ** Rollback the active statement subtransaction.  If no subtransaction
02737 ** is active this routine is a no-op.
02738 **
02739 ** All cursors will be invalidated by this operation.  Any attempt
02740 ** to use a cursor that was open at the beginning of this operation
02741 ** will result in an error.
02742 */
02743 int sqlite3BtreeRollbackStmt(Btree *p){
02744   int rc = SQLITE_OK;
02745   BtShared *pBt = p->pBt;
02746   sqlite3BtreeEnter(p);
02747   pBt->db = p->db;
02748   if( pBt->inStmt && !pBt->readOnly ){
02749     rc = sqlite3PagerStmtRollback(pBt->pPager);
02750     pBt->inStmt = 0;
02751   }
02752   sqlite3BtreeLeave(p);
02753   return rc;
02754 }
02755 
02756 /*
02757 ** Create a new cursor for the BTree whose root is on the page
02758 ** iTable.  The act of acquiring a cursor gets a read lock on 
02759 ** the database file.
02760 **
02761 ** If wrFlag==0, then the cursor can only be used for reading.
02762 ** If wrFlag==1, then the cursor can be used for reading or for
02763 ** writing if other conditions for writing are also met.  These
02764 ** are the conditions that must be met in order for writing to
02765 ** be allowed:
02766 **
02767 ** 1:  The cursor must have been opened with wrFlag==1
02768 **
02769 ** 2:  Other database connections that share the same pager cache
02770 **     but which are not in the READ_UNCOMMITTED state may not have
02771 **     cursors open with wrFlag==0 on the same table.  Otherwise
02772 **     the changes made by this write cursor would be visible to
02773 **     the read cursors in the other database connection.
02774 **
02775 ** 3:  The database must be writable (not on read-only media)
02776 **
02777 ** 4:  There must be an active transaction.
02778 **
02779 ** No checking is done to make sure that page iTable really is the
02780 ** root page of a b-tree.  If it is not, then the cursor acquired
02781 ** will not work correctly.
02782 **
02783 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory 
02784 ** pointed to by pCur have been zeroed by the caller.
02785 */
02786 static int btreeCursor(
02787   Btree *p,                              /* The btree */
02788   int iTable,                            /* Root page of table to open */
02789   int wrFlag,                            /* 1 to write. 0 read-only */
02790   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
02791   BtCursor *pCur                         /* Space for new cursor */
02792 ){
02793   int rc;
02794   BtShared *pBt = p->pBt;
02795 
02796   assert( sqlite3BtreeHoldsMutex(p) );
02797   if( wrFlag ){
02798     if( pBt->readOnly ){
02799       return SQLITE_READONLY;
02800     }
02801     if( checkReadLocks(p, iTable, 0, 0) ){
02802       return SQLITE_LOCKED;
02803     }
02804   }
02805 
02806   if( pBt->pPage1==0 ){
02807     rc = lockBtreeWithRetry(p);
02808     if( rc!=SQLITE_OK ){
02809       return rc;
02810     }
02811     if( pBt->readOnly && wrFlag ){
02812       return SQLITE_READONLY;
02813     }
02814   }
02815   pCur->pgnoRoot = (Pgno)iTable;
02816   if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
02817     rc = SQLITE_EMPTY;
02818     goto create_cursor_exception;
02819   }
02820   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
02821   if( rc!=SQLITE_OK ){
02822     goto create_cursor_exception;
02823   }
02824 
02825   /* Now that no other errors can occur, finish filling in the BtCursor
02826   ** variables, link the cursor into the BtShared list and set *ppCur (the
02827   ** output argument to this function).
02828   */
02829   pCur->pKeyInfo = pKeyInfo;
02830   pCur->pBtree = p;
02831   pCur->pBt = pBt;
02832   pCur->wrFlag = wrFlag;
02833   pCur->pNext = pBt->pCursor;
02834   if( pCur->pNext ){
02835     pCur->pNext->pPrev = pCur;
02836   }
02837   pBt->pCursor = pCur;
02838   pCur->eState = CURSOR_INVALID;
02839 
02840   return SQLITE_OK;
02841 
02842 create_cursor_exception:
02843   releasePage(pCur->apPage[0]);
02844   unlockBtreeIfUnused(pBt);
02845   return rc;
02846 }
02847 int sqlite3BtreeCursor(
02848   Btree *p,                                   /* The btree */
02849   int iTable,                                 /* Root page of table to open */
02850   int wrFlag,                                 /* 1 to write. 0 read-only */
02851   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
02852   BtCursor *pCur                              /* Write new cursor here */
02853 ){
02854   int rc;
02855   sqlite3BtreeEnter(p);
02856   p->pBt->db = p->db;
02857   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
02858   sqlite3BtreeLeave(p);
02859   return rc;
02860 }
02861 int sqlite3BtreeCursorSize(){
02862   return sizeof(BtCursor);
02863 }
02864 
02865 
02866 
02867 /*
02868 ** Close a cursor.  The read lock on the database file is released
02869 ** when the last cursor is closed.
02870 */
02871 int sqlite3BtreeCloseCursor(BtCursor *pCur){
02872   Btree *pBtree = pCur->pBtree;
02873   if( pBtree ){
02874     int i;
02875     BtShared *pBt = pCur->pBt;
02876     sqlite3BtreeEnter(pBtree);
02877     pBt->db = pBtree->db;
02878     sqlite3BtreeClearCursor(pCur);
02879     if( pCur->pPrev ){
02880       pCur->pPrev->pNext = pCur->pNext;
02881     }else{
02882       pBt->pCursor = pCur->pNext;
02883     }
02884     if( pCur->pNext ){
02885       pCur->pNext->pPrev = pCur->pPrev;
02886     }
02887     for(i=0; i<=pCur->iPage; i++){
02888       releasePage(pCur->apPage[i]);
02889     }
02890     unlockBtreeIfUnused(pBt);
02891     invalidateOverflowCache(pCur);
02892     /* sqlite3_free(pCur); */
02893     sqlite3BtreeLeave(pBtree);
02894   }
02895   return SQLITE_OK;
02896 }
02897 
02898 /*
02899 ** Make a temporary cursor by filling in the fields of pTempCur.
02900 ** The temporary cursor is not on the cursor list for the Btree.
02901 */
02902 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
02903   int i;
02904   assert( cursorHoldsMutex(pCur) );
02905   memcpy(pTempCur, pCur, sizeof(BtCursor));
02906   pTempCur->pNext = 0;
02907   pTempCur->pPrev = 0;
02908   for(i=0; i<=pTempCur->iPage; i++){
02909     sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
02910   }
02911 }
02912 
02913 /*
02914 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
02915 ** function above.
02916 */
02917 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
02918   int i;
02919   assert( cursorHoldsMutex(pCur) );
02920   for(i=0; i<=pCur->iPage; i++){
02921     sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
02922   }
02923 }
02924 
02925 /*
02926 ** Make sure the BtCursor* given in the argument has a valid
02927 ** BtCursor.info structure.  If it is not already valid, call
02928 ** sqlite3BtreeParseCell() to fill it in.
02929 **
02930 ** BtCursor.info is a cache of the information in the current cell.
02931 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
02932 **
02933 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
02934 ** compiler to crash when getCellInfo() is implemented as a macro.
02935 ** But there is a measureable speed advantage to using the macro on gcc
02936 ** (when less compiler optimizations like -Os or -O0 are used and the
02937 ** compiler is not doing agressive inlining.)  So we use a real function
02938 ** for MSVC and a macro for everything else.  Ticket #2457.
02939 */
02940 #ifndef NDEBUG
02941   static void assertCellInfo(BtCursor *pCur){
02942     CellInfo info;
02943     int iPage = pCur->iPage;
02944     memset(&info, 0, sizeof(info));
02945     sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
02946     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
02947   }
02948 #else
02949   #define assertCellInfo(x)
02950 #endif
/*
** getCellInfo(pCur): if the cached cell information in pCur->info is
** not valid (info.nSize==0), parse the cell the cursor currently points
** at into pCur->info and set pCur->validNKey.  Otherwise, in debugging
** builds, verify that the cache matches the cell via assertCellInfo().
*/
#ifdef _MSC_VER
  /* Use a real function in MSVC to work around bugs in that compiler. */
  static void getCellInfo(BtCursor *pCur){
    if( pCur->info.nSize==0 ){
      int iPage = pCur->iPage;
      sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
      pCur->validNKey = 1;
    }else{
      assertCellInfo(pCur);
    }
  }
#else /* if not _MSC_VER */
  /* Use a macro in all other compilers so that the function is inlined.
  ** The body is wrapped in do{...}while(0) so that "getCellInfo(p);"
  ** behaves as a single statement (for example as the body of an
  ** if/else), and the argument is parenthesized so that any pointer
  ** expression may be passed.
  */
#define getCellInfo(pCur)                                                      \
  do{                                                                          \
    if( (pCur)->info.nSize==0 ){                                               \
      int iPage = (pCur)->iPage;                                               \
      sqlite3BtreeParseCell((pCur)->apPage[iPage],(pCur)->aiIdx[iPage],        \
                            &(pCur)->info);                                    \
      (pCur)->validNKey = 1;                                                   \
    }else{                                                                     \
      assertCellInfo(pCur);                                                    \
    }                                                                          \
  }while(0)
#endif /* _MSC_VER */
02973 
02974 /*
02975 ** Set *pSize to the size of the buffer needed to hold the value of
02976 ** the key for the current entry.  If the cursor is not pointing
02977 ** to a valid entry, *pSize is set to 0. 
02978 **
02979 ** For a table with the INTKEY flag set, this routine returns the key
02980 ** itself, not the number of bytes in the key.
02981 */
02982 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
02983   int rc;
02984 
02985   assert( cursorHoldsMutex(pCur) );
02986   rc = restoreCursorPosition(pCur);
02987   if( rc==SQLITE_OK ){
02988     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
02989     if( pCur->eState==CURSOR_INVALID ){
02990       *pSize = 0;
02991     }else{
02992       getCellInfo(pCur);
02993       *pSize = pCur->info.nKey;
02994     }
02995   }
02996   return rc;
02997 }
02998 
02999 /*
03000 ** Set *pSize to the number of bytes of data in the entry the
03001 ** cursor currently points to.  Always return SQLITE_OK.
03002 ** Failure is not possible.  If the cursor is not currently
03003 ** pointing to an entry (which can happen, for example, if
03004 ** the database is empty) then *pSize is set to 0.
03005 */
03006 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
03007   int rc;
03008 
03009   assert( cursorHoldsMutex(pCur) );
03010   rc = restoreCursorPosition(pCur);
03011   if( rc==SQLITE_OK ){
03012     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
03013     if( pCur->eState==CURSOR_INVALID ){
03014       /* Not pointing at a valid entry - set *pSize to 0. */
03015       *pSize = 0;
03016     }else{
03017       getCellInfo(pCur);
03018       *pSize = pCur->info.nData;
03019     }
03020   }
03021   return rc;
03022 }
03023 
03024 /*
03025 ** Given the page number of an overflow page in the database (parameter
03026 ** ovfl), this function finds the page number of the next page in the 
03027 ** linked list of overflow pages. If possible, it uses the auto-vacuum
03028 ** pointer-map data instead of reading the content of page ovfl to do so. 
03029 **
03030 ** If an error occurs an SQLite error code is returned. Otherwise:
03031 **
03032 ** Unless pPgnoNext is NULL, the page number of the next overflow 
03033 ** page in the linked list is written to *pPgnoNext. If page ovfl
03034 ** is the last page in its linked list, *pPgnoNext is set to zero. 
03035 **
03036 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
03037 ** for page ovfl. The underlying pager page may have been requested
03038 ** with the noContent flag set, so the page data accessable via
03039 ** this handle may not be trusted.
03040 */
03041 static int getOverflowPage(
03042   BtShared *pBt, 
03043   Pgno ovfl,                   /* Overflow page */
03044   MemPage **ppPage,            /* OUT: MemPage handle */
03045   Pgno *pPgnoNext              /* OUT: Next overflow page number */
03046 ){
03047   Pgno next = 0;
03048   int rc;
03049 
03050   assert( sqlite3_mutex_held(pBt->mutex) );
03051   /* One of these must not be NULL. Otherwise, why call this function? */
03052   assert(ppPage || pPgnoNext);
03053 
03054   /* If pPgnoNext is NULL, then this function is being called to obtain
03055   ** a MemPage* reference only. No page-data is required in this case.
03056   */
03057   if( !pPgnoNext ){
03058     return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
03059   }
03060 
03061 #ifndef SQLITE_OMIT_AUTOVACUUM
03062   /* Try to find the next page in the overflow list using the
03063   ** autovacuum pointer-map pages. Guess that the next page in 
03064   ** the overflow list is page number (ovfl+1). If that guess turns 
03065   ** out to be wrong, fall back to loading the data of page 
03066   ** number ovfl to determine the next page number.
03067   */
03068   if( pBt->autoVacuum ){
03069     Pgno pgno;
03070     Pgno iGuess = ovfl+1;
03071     u8 eType;
03072 
03073     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
03074       iGuess++;
03075     }
03076 
03077     if( iGuess<=pagerPagecount(pBt->pPager) ){
03078       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
03079       if( rc!=SQLITE_OK ){
03080         return rc;
03081       }
03082       if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
03083         next = iGuess;
03084       }
03085     }
03086   }
03087 #endif
03088 
03089   if( next==0 || ppPage ){
03090     MemPage *pPage = 0;
03091 
03092     rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
03093     assert(rc==SQLITE_OK || pPage==0);
03094     if( next==0 && rc==SQLITE_OK ){
03095       next = get4byte(pPage->aData);
03096     }
03097 
03098     if( ppPage ){
03099       *ppPage = pPage;
03100     }else{
03101       releasePage(pPage);
03102     }
03103   }
03104   *pPgnoNext = next;
03105 
03106   return rc;
03107 }
03108 
03109 /*
03110 ** Copy data from a buffer to a page, or from a page to a buffer.
03111 **
03112 ** pPayload is a pointer to data stored on database page pDbPage.
03113 ** If argument eOp is false, then nByte bytes of data are copied
03114 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
03115 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
03116 ** of data are copied from the buffer pBuf to pPayload.
03117 **
03118 ** SQLITE_OK is returned on success, otherwise an error code.
03119 */
03120 static int copyPayload(
03121   void *pPayload,           /* Pointer to page data */
03122   void *pBuf,               /* Pointer to buffer */
03123   int nByte,                /* Number of bytes to copy */
03124   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
03125   DbPage *pDbPage           /* Page containing pPayload */
03126 ){
03127   if( eOp ){
03128     /* Copy data from buffer to page (a write operation) */
03129     int rc = sqlite3PagerWrite(pDbPage);
03130     if( rc!=SQLITE_OK ){
03131       return rc;
03132     }
03133     memcpy(pPayload, pBuf, nByte);
03134   }else{
03135     /* Copy data from page to buffer (a read operation) */
03136     memcpy(pBuf, pPayload, nByte);
03137   }
03138   return SQLITE_OK;
03139 }
03140 
03141 /*
03142 ** This function is used to read or overwrite payload information
03143 ** for the entry that the pCur cursor is pointing to. If the eOp
03144 ** parameter is 0, this is a read operation (data copied into
03145 ** buffer pBuf). If it is non-zero, a write (data copied from
03146 ** buffer pBuf).
03147 **
03148 ** A total of "amt" bytes are read or written beginning at "offset".
03149 ** Data is read to or from the buffer pBuf.
03150 **
03151 ** This routine does not make a distinction between key and data.
03152 ** It just reads or writes bytes from the payload area.  Data might 
03153 ** appear on the main page or be scattered out on multiple overflow 
03154 ** pages.
03155 **
03156 ** If the BtCursor.isIncrblobHandle flag is set, and the current
03157 ** cursor entry uses one or more overflow pages, this function
03158 ** allocates space for and lazily popluates the overflow page-list 
03159 ** cache array (BtCursor.aOverflow). Subsequent calls use this
03160 ** cache to make seeking to the supplied offset more efficient.
03161 **
03162 ** Once an overflow page-list cache has been allocated, it may be
03163 ** invalidated if some other cursor writes to the same table, or if
03164 ** the cursor is moved to a different row. Additionally, in auto-vacuum
03165 ** mode, the following events may invalidate an overflow page-list cache.
03166 **
03167 **   * An incremental vacuum,
03168 **   * A commit in auto_vacuum="full" mode,
03169 **   * Creating a table (may require moving an overflow page).
03170 */
03171 static int accessPayload(
03172   BtCursor *pCur,      /* Cursor pointing to entry to read from */
03173   int offset,          /* Begin reading this far into payload */
03174   int amt,             /* Read this many bytes */
03175   unsigned char *pBuf, /* Write the bytes into this buffer */ 
03176   int skipKey,         /* offset begins at data if this is true */
03177   int eOp              /* zero to read. non-zero to write. */
03178 ){
03179   unsigned char *aPayload;
03180   int rc = SQLITE_OK;
03181   u32 nKey;
03182   int iIdx = 0;
03183   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
03184   BtShared *pBt;                              /* Btree this cursor belongs to */
03185 
03186   assert( pPage );
03187   assert( pCur->eState==CURSOR_VALID );
03188   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
03189   assert( offset>=0 );
03190   assert( cursorHoldsMutex(pCur) );
03191 
03192   getCellInfo(pCur);
03193   aPayload = pCur->info.pCell + pCur->info.nHeader;
03194   nKey = (pPage->intKey ? 0 : pCur->info.nKey);
03195 
03196   if( skipKey ){
03197     offset += nKey;
03198   }
03199   if( offset+amt > nKey+pCur->info.nData ){
03200     /* Trying to read or write past the end of the data is an error */
03201     return SQLITE_CORRUPT_BKPT;
03202   }
03203 
03204   /* Check if data must be read/written to/from the btree page itself. */
03205   if( offset<pCur->info.nLocal ){
03206     int a = amt;
03207     if( a+offset>pCur->info.nLocal ){
03208       a = pCur->info.nLocal - offset;
03209     }
03210     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
03211     offset = 0;
03212     pBuf += a;
03213     amt -= a;
03214   }else{
03215     offset -= pCur->info.nLocal;
03216   }
03217 
03218   pBt = pCur->pBt;
03219   if( rc==SQLITE_OK && amt>0 ){
03220     const int ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
03221     Pgno nextPage;
03222 
03223     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
03224 
03225 #ifndef SQLITE_OMIT_INCRBLOB
03226     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
03227     ** has not been allocated, allocate it now. The array is sized at
03228     ** one entry for each overflow page in the overflow chain. The
03229     ** page number of the first overflow page is stored in aOverflow[0],
03230     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
03231     ** (the cache is lazily populated).
03232     */
03233     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
03234       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
03235       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
03236       if( nOvfl && !pCur->aOverflow ){
03237         rc = SQLITE_NOMEM;
03238       }
03239     }
03240 
03241     /* If the overflow page-list cache has been allocated and the
03242     ** entry for the first required overflow page is valid, skip
03243     ** directly to it.
03244     */
03245     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
03246       iIdx = (offset/ovflSize);
03247       nextPage = pCur->aOverflow[iIdx];
03248       offset = (offset%ovflSize);
03249     }
03250 #endif
03251 
03252     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
03253 
03254 #ifndef SQLITE_OMIT_INCRBLOB
03255       /* If required, populate the overflow page-list cache. */
03256       if( pCur->aOverflow ){
03257         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
03258         pCur->aOverflow[iIdx] = nextPage;
03259       }
03260 #endif
03261 
03262       if( offset>=ovflSize ){
03263         /* The only reason to read this page is to obtain the page
03264         ** number for the next page in the overflow chain. The page
03265         ** data is not required. So first try to lookup the overflow
03266         ** page-list cache, if any, then fall back to the getOverflowPage()
03267         ** function.
03268         */
03269 #ifndef SQLITE_OMIT_INCRBLOB
03270         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
03271           nextPage = pCur->aOverflow[iIdx+1];
03272         } else 
03273 #endif
03274           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
03275         offset -= ovflSize;
03276       }else{
03277         /* Need to read this page properly. It contains some of the
03278         ** range of data that is being read (eOp==0) or written (eOp!=0).
03279         */
03280         DbPage *pDbPage;
03281         int a = amt;
03282         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
03283         if( rc==SQLITE_OK ){
03284           aPayload = sqlite3PagerGetData(pDbPage);
03285           nextPage = get4byte(aPayload);
03286           if( a + offset > ovflSize ){
03287             a = ovflSize - offset;
03288           }
03289           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
03290           sqlite3PagerUnref(pDbPage);
03291           offset = 0;
03292           amt -= a;
03293           pBuf += a;
03294         }
03295       }
03296     }
03297   }
03298 
03299   if( rc==SQLITE_OK && amt>0 ){
03300     return SQLITE_CORRUPT_BKPT;
03301   }
03302   return rc;
03303 }
03304 
03305 /*
03306 ** Read part of the key associated with cursor pCur.  Exactly
03307 ** "amt" bytes will be transfered into pBuf[].  The transfer
03308 ** begins at "offset".
03309 **
03310 ** Return SQLITE_OK on success or an error code if anything goes
03311 ** wrong.  An error is returned if "offset+amt" is larger than
03312 ** the available payload.
03313 */
03314 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
03315   int rc;
03316 
03317   assert( cursorHoldsMutex(pCur) );
03318   rc = restoreCursorPosition(pCur);
03319   if( rc==SQLITE_OK ){
03320     assert( pCur->eState==CURSOR_VALID );
03321     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
03322     if( pCur->apPage[0]->intKey ){
03323       return SQLITE_CORRUPT_BKPT;
03324     }
03325     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
03326     rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
03327   }
03328   return rc;
03329 }
03330 
03331 /*
03332 ** Read part of the data associated with cursor pCur.  Exactly
03333 ** "amt" bytes will be transfered into pBuf[].  The transfer
03334 ** begins at "offset".
03335 **
03336 ** Return SQLITE_OK on success or an error code if anything goes
03337 ** wrong.  An error is returned if "offset+amt" is larger than
03338 ** the available payload.
03339 */
03340 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
03341   int rc;
03342 
03343 #ifndef SQLITE_OMIT_INCRBLOB
03344   if ( pCur->eState==CURSOR_INVALID ){
03345     return SQLITE_ABORT;
03346   }
03347 #endif
03348 
03349   assert( cursorHoldsMutex(pCur) );
03350   rc = restoreCursorPosition(pCur);
03351   if( rc==SQLITE_OK ){
03352     assert( pCur->eState==CURSOR_VALID );
03353     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
03354     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
03355     rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
03356   }
03357   return rc;
03358 }
03359 
03360 /*
03361 ** Return a pointer to payload information from the entry that the 
03362 ** pCur cursor is pointing to.  The pointer is to the beginning of
03363 ** the key if skipKey==0 and it points to the beginning of data if
03364 ** skipKey==1.  The number of bytes of available key/data is written
03365 ** into *pAmt.  If *pAmt==0, then the value returned will not be
03366 ** a valid pointer.
03367 **
03368 ** This routine is an optimization.  It is common for the entire key
03369 ** and data to fit on the local page and for there to be no overflow
03370 ** pages.  When that is so, this routine can be used to access the
03371 ** key and data without making a copy.  If the key and/or data spills
03372 ** onto overflow pages, then accessPayload() must be used to reassembly
03373 ** the key/data and copy it into a preallocated buffer.
03374 **
03375 ** The pointer returned by this routine looks directly into the cached
03376 ** page of the database.  The data might change or move the next time
03377 ** any btree routine is called.
03378 */
03379 static const unsigned char *fetchPayload(
03380   BtCursor *pCur,      /* Cursor pointing to entry to read from */
03381   int *pAmt,           /* Write the number of available bytes here */
03382   int skipKey          /* read beginning at data if this is true */
03383 ){
03384   unsigned char *aPayload;
03385   MemPage *pPage;
03386   u32 nKey;
03387   int nLocal;
03388 
03389   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
03390   assert( pCur->eState==CURSOR_VALID );
03391   assert( cursorHoldsMutex(pCur) );
03392   pPage = pCur->apPage[pCur->iPage];
03393   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
03394   getCellInfo(pCur);
03395   aPayload = pCur->info.pCell;
03396   aPayload += pCur->info.nHeader;
03397   if( pPage->intKey ){
03398     nKey = 0;
03399   }else{
03400     nKey = pCur->info.nKey;
03401   }
03402   if( skipKey ){
03403     aPayload += nKey;
03404     nLocal = pCur->info.nLocal - nKey;
03405   }else{
03406     nLocal = pCur->info.nLocal;
03407     if( nLocal>nKey ){
03408       nLocal = nKey;
03409     }
03410   }
03411   *pAmt = nLocal;
03412   return aPayload;
03413 }
03414 
03415 
03416 /*
03417 ** For the entry that cursor pCur is point to, return as
03418 ** many bytes of the key or data as are available on the local
03419 ** b-tree page.  Write the number of available bytes into *pAmt.
03420 **
03421 ** The pointer returned is ephemeral.  The key/data may move
03422 ** or be destroyed on the next call to any Btree routine,
03423 ** including calls from other threads against the same cache.
03424 ** Hence, a mutex on the BtShared should be held prior to calling
03425 ** this routine.
03426 **
03427 ** These routines is used to get quick access to key and data
03428 ** in the common case where no overflow pages are used.
03429 */
03430 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
03431   assert( cursorHoldsMutex(pCur) );
03432   if( pCur->eState==CURSOR_VALID ){
03433     return (const void*)fetchPayload(pCur, pAmt, 0);
03434   }
03435   return 0;
03436 }
03437 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
03438   assert( cursorHoldsMutex(pCur) );
03439   if( pCur->eState==CURSOR_VALID ){
03440     return (const void*)fetchPayload(pCur, pAmt, 1);
03441   }
03442   return 0;
03443 }
03444 
03445 
03446 /*
03447 ** Move the cursor down to a new child page.  The newPgno argument is the
03448 ** page number of the child page to move to.
03449 */
03450 static int moveToChild(BtCursor *pCur, u32 newPgno){
03451   int rc;
03452   int i = pCur->iPage;
03453   MemPage *pNewPage;
03454   BtShared *pBt = pCur->pBt;
03455 
03456   assert( cursorHoldsMutex(pCur) );
03457   assert( pCur->eState==CURSOR_VALID );
03458   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
03459   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
03460     return SQLITE_CORRUPT_BKPT;
03461   }
03462   rc = getAndInitPage(pBt, newPgno, &pNewPage);
03463   if( rc ) return rc;
03464   pCur->apPage[i+1] = pNewPage;
03465   pCur->aiIdx[i+1] = 0;
03466   pCur->iPage++;
03467 
03468   pCur->info.nSize = 0;
03469   pCur->validNKey = 0;
03470   if( pNewPage->nCell<1 ){
03471     return SQLITE_CORRUPT_BKPT;
03472   }
03473   return SQLITE_OK;
03474 }
03475 
03476 #ifndef NDEBUG
03477 /*
03478 ** Page pParent is an internal (non-leaf) tree page. This function 
03479 ** asserts that page number iChild is the left-child if the iIdx'th
03480 ** cell in page pParent. Or, if iIdx is equal to the total number of
03481 ** cells in pParent, that page number iChild is the right-child of
03482 ** the page.
03483 */
03484 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
03485   assert( iIdx<=pParent->nCell );
03486   if( iIdx==pParent->nCell ){
03487     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
03488   }else{
03489     assert( get4byte(findCell(pParent, iIdx))==iChild );
03490   }
03491 }
03492 #else
03493 #  define assertParentIndex(x,y,z) 
03494 #endif
03495 
03496 /*
03497 ** Move the cursor up to the parent page.
03498 **
03499 ** pCur->idx is set to the cell index that contains the pointer
03500 ** to the page we are coming from.  If we are coming from the
03501 ** right-most child page then pCur->idx is set to one more than
03502 ** the largest cell index.
03503 */
03504 void sqlite3BtreeMoveToParent(BtCursor *pCur){
03505   assert( cursorHoldsMutex(pCur) );
03506   assert( pCur->eState==CURSOR_VALID );
03507   assert( pCur->iPage>0 );
03508   assert( pCur->apPage[pCur->iPage] );
03509   assertParentIndex(
03510     pCur->apPage[pCur->iPage-1], 
03511     pCur->aiIdx[pCur->iPage-1], 
03512     pCur->apPage[pCur->iPage]->pgno
03513   );
03514   releasePage(pCur->apPage[pCur->iPage]);
03515   pCur->iPage--;
03516   pCur->info.nSize = 0;
03517   pCur->validNKey = 0;
03518 }
03519 
03520 /*
03521 ** Move the cursor to the root page
03522 */
03523 static int moveToRoot(BtCursor *pCur){
03524   MemPage *pRoot;
03525   int rc = SQLITE_OK;
03526   Btree *p = pCur->pBtree;
03527   BtShared *pBt = p->pBt;
03528 
03529   assert( cursorHoldsMutex(pCur) );
03530   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
03531   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
03532   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
03533   if( pCur->eState>=CURSOR_REQUIRESEEK ){
03534     if( pCur->eState==CURSOR_FAULT ){
03535       return pCur->skip;
03536     }
03537     sqlite3BtreeClearCursor(pCur);
03538   }
03539 
03540   if( pCur->iPage>=0 ){
03541     int i;
03542     for(i=1; i<=pCur->iPage; i++){
03543       releasePage(pCur->apPage[i]);
03544     }
03545   }else{
03546     if( 
03547       SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
03548     ){
03549       pCur->eState = CURSOR_INVALID;
03550       return rc;
03551     }
03552   }
03553 
03554   pRoot = pCur->apPage[0];
03555   assert( pRoot->pgno==pCur->pgnoRoot );
03556   pCur->iPage = 0;
03557   pCur->aiIdx[0] = 0;
03558   pCur->info.nSize = 0;
03559   pCur->atLast = 0;
03560   pCur->validNKey = 0;
03561 
03562   if( pRoot->nCell==0 && !pRoot->leaf ){
03563     Pgno subpage;
03564     assert( pRoot->pgno==1 );
03565     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
03566     assert( subpage>0 );
03567     pCur->eState = CURSOR_VALID;
03568     rc = moveToChild(pCur, subpage);
03569   }else{
03570     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
03571   }
03572   return rc;
03573 }
03574 
03575 /*
03576 ** Move the cursor down to the left-most leaf entry beneath the
03577 ** entry to which it is currently pointing.
03578 **
03579 ** The left-most leaf is the one with the smallest key - the first
03580 ** in ascending order.
03581 */
03582 static int moveToLeftmost(BtCursor *pCur){
03583   Pgno pgno;
03584   int rc = SQLITE_OK;
03585   MemPage *pPage;
03586 
03587   assert( cursorHoldsMutex(pCur) );
03588   assert( pCur->eState==CURSOR_VALID );
03589   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
03590     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
03591     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
03592     rc = moveToChild(pCur, pgno);
03593   }
03594   return rc;
03595 }
03596 
03597 /*
03598 ** Move the cursor down to the right-most leaf entry beneath the
03599 ** page to which it is currently pointing.  Notice the difference
03600 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
03601 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
03602 ** finds the right-most entry beneath the *page*.
03603 **
03604 ** The right-most entry is the one with the largest key - the last
03605 ** key in ascending order.
03606 */
03607 static int moveToRightmost(BtCursor *pCur){
03608   Pgno pgno;
03609   int rc = SQLITE_OK;
03610   MemPage *pPage;
03611 
03612   assert( cursorHoldsMutex(pCur) );
03613   assert( pCur->eState==CURSOR_VALID );
03614   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
03615     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
03616     pCur->aiIdx[pCur->iPage] = pPage->nCell;
03617     rc = moveToChild(pCur, pgno);
03618   }
03619   if( rc==SQLITE_OK ){
03620     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
03621     pCur->info.nSize = 0;
03622     pCur->validNKey = 0;
03623   }
03624   return rc;
03625 }
03626 
03627 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
03628 ** on success.  Set *pRes to 0 if the cursor actually points to something
03629 ** or set *pRes to 1 if the table is empty.
03630 */
03631 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
03632   int rc;
03633 
03634   assert( cursorHoldsMutex(pCur) );
03635   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
03636   rc = moveToRoot(pCur);
03637   if( rc==SQLITE_OK ){
03638     if( pCur->eState==CURSOR_INVALID ){
03639       assert( pCur->apPage[pCur->iPage]->nCell==0 );
03640       *pRes = 1;
03641       rc = SQLITE_OK;
03642     }else{
03643       assert( pCur->apPage[pCur->iPage]->nCell>0 );
03644       *pRes = 0;
03645       rc = moveToLeftmost(pCur);
03646     }
03647   }
03648   return rc;
03649 }
03650 
03651 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
03652 ** on success.  Set *pRes to 0 if the cursor actually points to something
03653 ** or set *pRes to 1 if the table is empty.
03654 */
03655 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
03656   int rc;
03657  
03658   assert( cursorHoldsMutex(pCur) );
03659   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
03660   rc = moveToRoot(pCur);
03661   if( rc==SQLITE_OK ){
03662     if( CURSOR_INVALID==pCur->eState ){
03663       assert( pCur->apPage[pCur->iPage]->nCell==0 );
03664       *pRes = 1;
03665     }else{
03666       assert( pCur->eState==CURSOR_VALID );
03667       *pRes = 0;
03668       rc = moveToRightmost(pCur);
03669       getCellInfo(pCur);
03670       pCur->atLast = rc==SQLITE_OK;
03671     }
03672   }
03673   return rc;
03674 }
03675 
03676 /* Move the cursor so that it points to an entry near the key 
03677 ** specified by pIdxKey or intKey.   Return a success code.
03678 **
03679 ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
03680 ** must be NULL.  For index tables, pIdxKey is used and intKey
03681 ** is ignored.
03682 **
03683 ** If an exact match is not found, then the cursor is always
03684 ** left pointing at a leaf page which would hold the entry if it
03685 ** were present.  The cursor might point to an entry that comes
03686 ** before or after the key.
03687 **
03688 ** The result of comparing the key with the entry to which the
03689 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
03690 ** this value is as follows:
03691 **
03692 **     *pRes<0      The cursor is left pointing at an entry that
03693 **                  is smaller than pKey or if the table is empty
03694 **                  and the cursor is therefore left point to nothing.
03695 **
03696 **     *pRes==0     The cursor is left pointing at an entry that
03697 **                  exactly matches pKey.
03698 **
03699 **     *pRes>0      The cursor is left pointing at an entry that
03700 **                  is larger than pKey.
03701 **
03702 */
03703 int sqlite3BtreeMovetoUnpacked(
03704   BtCursor *pCur,          /* The cursor to be moved */
03705   UnpackedRecord *pIdxKey, /* Unpacked index key */
03706   i64 intKey,              /* The table key */
03707   int biasRight,           /* If true, bias the search to the high end */
03708   int *pRes                /* Write search results here */
03709 ){
03710   int rc;
03711 
03712   assert( cursorHoldsMutex(pCur) );
03713   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
03714 
03715   /* If the cursor is already positioned at the point we are trying
03716   ** to move to, then just return without doing any work */
03717   if( pCur->eState==CURSOR_VALID && pCur->validNKey 
03718    && pCur->apPage[0]->intKey 
03719   ){
03720     if( pCur->info.nKey==intKey ){
03721       *pRes = 0;
03722       return SQLITE_OK;
03723     }
03724     if( pCur->atLast && pCur->info.nKey<intKey ){
03725       *pRes = -1;
03726       return SQLITE_OK;
03727     }
03728   }
03729 
03730   rc = moveToRoot(pCur);
03731   if( rc ){
03732     return rc;
03733   }
03734   assert( pCur->apPage[pCur->iPage] );
03735   assert( pCur->apPage[pCur->iPage]->isInit );
03736   if( pCur->eState==CURSOR_INVALID ){
03737     *pRes = -1;
03738     assert( pCur->apPage[pCur->iPage]->nCell==0 );
03739     return SQLITE_OK;
03740   }
03741   assert( pCur->apPage[0]->intKey || pIdxKey );
03742   for(;;){
03743     int lwr, upr;
03744     Pgno chldPg;
03745     MemPage *pPage = pCur->apPage[pCur->iPage];
03746     int c = -1;  /* pRes return if table is empty must be -1 */
03747     lwr = 0;
03748     upr = pPage->nCell-1;
03749     if( !pPage->intKey && pIdxKey==0 ){
03750       rc = SQLITE_CORRUPT_BKPT;
03751       goto moveto_finish;
03752     }
03753     if( biasRight ){
03754       pCur->aiIdx[pCur->iPage] = upr;
03755     }else{
03756       pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
03757     }
03758     if( lwr<=upr ) for(;;){
03759       void *pCellKey;
03760       i64 nCellKey;
03761       int idx = pCur->aiIdx[pCur->iPage];
03762       pCur->info.nSize = 0;
03763       pCur->validNKey = 1;
03764       if( pPage->intKey ){
03765         u8 *pCell;
03766         pCell = findCell(pPage, idx) + pPage->childPtrSize;
03767         if( pPage->hasData ){
03768           u32 dummy;
03769           pCell += getVarint32(pCell, dummy);
03770         }
03771         getVarint(pCell, (u64*)&nCellKey);
03772         if( nCellKey==intKey ){
03773           c = 0;
03774         }else if( nCellKey<intKey ){
03775           c = -1;
03776         }else{
03777           assert( nCellKey>intKey );
03778           c = +1;
03779         }
03780       }else{
03781         int available;
03782         pCellKey = (void *)fetchPayload(pCur, &available, 0);
03783         nCellKey = pCur->info.nKey;
03784         if( available>=nCellKey ){
03785           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
03786         }else{
03787           pCellKey = sqlite3Malloc( nCellKey );
03788           if( pCellKey==0 ){
03789             rc = SQLITE_NOMEM;
03790             goto moveto_finish;
03791           }
03792           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
03793           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
03794           sqlite3_free(pCellKey);
03795           if( rc ) goto moveto_finish;
03796         }
03797       }
03798       if( c==0 ){
03799         pCur->info.nKey = nCellKey;
03800         if( pPage->intKey && !pPage->leaf ){
03801           lwr = idx;
03802           upr = lwr - 1;
03803           break;
03804         }else{
03805           if( pRes ) *pRes = 0;
03806           rc = SQLITE_OK;
03807           goto moveto_finish;
03808         }
03809       }
03810       if( c<0 ){
03811         lwr = idx+1;
03812       }else{
03813         upr = idx-1;
03814       }
03815       if( lwr>upr ){
03816         pCur->info.nKey = nCellKey;
03817         break;
03818       }
03819       pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
03820     }
03821     assert( lwr==upr+1 );
03822     assert( pPage->isInit );
03823     if( pPage->leaf ){
03824       chldPg = 0;
03825     }else if( lwr>=pPage->nCell ){
03826       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
03827     }else{
03828       chldPg = get4byte(findCell(pPage, lwr));
03829     }
03830     if( chldPg==0 ){
03831       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
03832       if( pRes ) *pRes = c;
03833       rc = SQLITE_OK;
03834       goto moveto_finish;
03835     }
03836     pCur->aiIdx[pCur->iPage] = lwr;
03837     pCur->info.nSize = 0;
03838     pCur->validNKey = 0;
03839     rc = moveToChild(pCur, chldPg);
03840     if( rc ) goto moveto_finish;
03841   }
03842 moveto_finish:
03843   return rc;
03844 }
03845 
03846 /*
03847 ** In this version of BtreeMoveto, pKey is a packed index record
03848 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
03849 ** record and then call BtreeMovetoUnpacked() to do the work.
03850 */
03851 int sqlite3BtreeMoveto(
03852   BtCursor *pCur,     /* Cursor open on the btree to be searched */
03853   const void *pKey,   /* Packed key if the btree is an index */
03854   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
03855   int bias,           /* Bias search to the high end */
03856   int *pRes           /* Write search results here */
03857 ){
03858   int rc;                    /* Status code */
03859   UnpackedRecord *pIdxKey;   /* Unpacked index key */
03860   UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
03861 
03862   if( pKey ){
03863     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
03864                                       aSpace, sizeof(aSpace));
03865     if( pIdxKey==0 ) return SQLITE_NOMEM;
03866   }else{
03867     pIdxKey = 0;
03868   }
03869   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
03870   if( pKey ){
03871     sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
03872   }
03873   return rc;
03874 }
03875 
03876 
03877 /*
03878 ** Return TRUE if the cursor is not pointing at an entry of the table.
03879 **
03880 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
03881 ** past the last entry in the table or sqlite3BtreePrev() moves past
03882 ** the first entry.  TRUE is also returned if the table is empty.
03883 */
03884 int sqlite3BtreeEof(BtCursor *pCur){
03885   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
03886   ** have been deleted? This API will need to change to return an error code
03887   ** as well as the boolean result value.
03888   */
03889   return (CURSOR_VALID!=pCur->eState);
03890 }
03891 
03892 /*
03893 ** Return the database connection handle for a cursor.
03894 */
03895 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
03896   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
03897   return pCur->pBtree->db;
03898 }
03899 
03900 /*
03901 ** Advance the cursor to the next entry in the database.  If
03902 ** successful then set *pRes=0.  If the cursor
03903 ** was already pointing to the last entry in the database before
03904 ** this routine was called, then set *pRes=1.
03905 */
03906 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
03907   int rc;
03908   int idx;
03909   MemPage *pPage;
03910 
03911   assert( cursorHoldsMutex(pCur) );
03912   rc = restoreCursorPosition(pCur);
03913   if( rc!=SQLITE_OK ){
03914     return rc;
03915   }
03916   assert( pRes!=0 );
03917   if( CURSOR_INVALID==pCur->eState ){
03918     *pRes = 1;
03919     return SQLITE_OK;
03920   }
03921   if( pCur->skip>0 ){
03922     pCur->skip = 0;
03923     *pRes = 0;
03924     return SQLITE_OK;
03925   }
03926   pCur->skip = 0;
03927 
03928   pPage = pCur->apPage[pCur->iPage];
03929   idx = ++pCur->aiIdx[pCur->iPage];
03930   assert( pPage->isInit );
03931   assert( idx<=pPage->nCell );
03932 
03933   pCur->info.nSize = 0;
03934   pCur->validNKey = 0;
03935   if( idx>=pPage->nCell ){
03936     if( !pPage->leaf ){
03937       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
03938       if( rc ) return rc;
03939       rc = moveToLeftmost(pCur);
03940       *pRes = 0;
03941       return rc;
03942     }
03943     do{
03944       if( pCur->iPage==0 ){
03945         *pRes = 1;
03946         pCur->eState = CURSOR_INVALID;
03947         return SQLITE_OK;
03948       }
03949       sqlite3BtreeMoveToParent(pCur);
03950       pPage = pCur->apPage[pCur->iPage];
03951     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
03952     *pRes = 0;
03953     if( pPage->intKey ){
03954       rc = sqlite3BtreeNext(pCur, pRes);
03955     }else{
03956       rc = SQLITE_OK;
03957     }
03958     return rc;
03959   }
03960   *pRes = 0;
03961   if( pPage->leaf ){
03962     return SQLITE_OK;
03963   }
03964   rc = moveToLeftmost(pCur);
03965   return rc;
03966 }
03967 
03968 
03969 /*
03970 ** Step the cursor to the back to the previous entry in the database.  If
03971 ** successful then set *pRes=0.  If the cursor
03972 ** was already pointing to the first entry in the database before
03973 ** this routine was called, then set *pRes=1.
03974 */
03975 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
03976   int rc;
03977   MemPage *pPage;
03978 
03979   assert( cursorHoldsMutex(pCur) );
03980   rc = restoreCursorPosition(pCur);
03981   if( rc!=SQLITE_OK ){
03982     return rc;
03983   }
03984   pCur->atLast = 0;
03985   if( CURSOR_INVALID==pCur->eState ){
03986     *pRes = 1;
03987     return SQLITE_OK;
03988   }
03989   if( pCur->skip<0 ){
03990     pCur->skip = 0;
03991     *pRes = 0;
03992     return SQLITE_OK;
03993   }
03994   pCur->skip = 0;
03995 
03996   pPage = pCur->apPage[pCur->iPage];
03997   assert( pPage->isInit );
03998   if( !pPage->leaf ){
03999     int idx = pCur->aiIdx[pCur->iPage];
04000     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
04001     if( rc ){
04002       return rc;
04003     }
04004     rc = moveToRightmost(pCur);
04005   }else{
04006     while( pCur->aiIdx[pCur->iPage]==0 ){
04007       if( pCur->iPage==0 ){
04008         pCur->eState = CURSOR_INVALID;
04009         *pRes = 1;
04010         return SQLITE_OK;
04011       }
04012       sqlite3BtreeMoveToParent(pCur);
04013     }
04014     pCur->info.nSize = 0;
04015     pCur->validNKey = 0;
04016 
04017     pCur->aiIdx[pCur->iPage]--;
04018     pPage = pCur->apPage[pCur->iPage];
04019     if( pPage->intKey && !pPage->leaf ){
04020       rc = sqlite3BtreePrevious(pCur, pRes);
04021     }else{
04022       rc = SQLITE_OK;
04023     }
04024   }
04025   *pRes = 0;
04026   return rc;
04027 }
04028 
04029 /*
04030 ** Allocate a new page from the database file.
04031 **
04032 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
04033 ** has already been called on the new page.)  The new page has also
04034 ** been referenced and the calling routine is responsible for calling
04035 ** sqlite3PagerUnref() on the new page when it is done.
04036 **
04037 ** SQLITE_OK is returned on success.  Any other return value indicates
04038 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
04039 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
04040 **
04041 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 
04042 ** locate a page close to the page number "nearby".  This can be used in an
04043 ** attempt to keep related pages close to each other in the database file,
04044 ** which in turn can make database access faster.
04045 **
04046 ** If the "exact" parameter is not 0, and the page-number nearby exists 
04047 ** anywhere on the free-list, then it is guarenteed to be returned. This
04048 ** is only used by auto-vacuum databases when allocating a new table.
04049 */
04050 static int allocateBtreePage(
04051   BtShared *pBt, 
04052   MemPage **ppPage, 
04053   Pgno *pPgno, 
04054   Pgno nearby,
04055   u8 exact
04056 ){
04057   MemPage *pPage1;
04058   int rc;
04059   int n;     /* Number of pages on the freelist */
04060   int k;     /* Number of leaves on the trunk of the freelist */
04061   MemPage *pTrunk = 0;
04062   MemPage *pPrevTrunk = 0;
04063 
04064   assert( sqlite3_mutex_held(pBt->mutex) );
04065   pPage1 = pBt->pPage1;
04066   n = get4byte(&pPage1->aData[36]);
04067   if( n>0 ){
04068     /* There are pages on the freelist.  Reuse one of those pages. */
04069     Pgno iTrunk;
04070     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
04071     
04072     /* If the 'exact' parameter was true and a query of the pointer-map
04073     ** shows that the page 'nearby' is somewhere on the free-list, then
04074     ** the entire-list will be searched for that page.
04075     */
04076 #ifndef SQLITE_OMIT_AUTOVACUUM
04077     if( exact && nearby<=pagerPagecount(pBt->pPager) ){
04078       u8 eType;
04079       assert( nearby>0 );
04080       assert( pBt->autoVacuum );
04081       rc = ptrmapGet(pBt, nearby, &eType, 0);
04082       if( rc ) return rc;
04083       if( eType==PTRMAP_FREEPAGE ){
04084         searchList = 1;
04085       }
04086       *pPgno = nearby;
04087     }
04088 #endif
04089 
04090     /* Decrement the free-list count by 1. Set iTrunk to the index of the
04091     ** first free-list trunk page. iPrevTrunk is initially 1.
04092     */
04093     rc = sqlite3PagerWrite(pPage1->pDbPage);
04094     if( rc ) return rc;
04095     put4byte(&pPage1->aData[36], n-1);
04096 
04097     /* The code within this loop is run only once if the 'searchList' variable
04098     ** is not true. Otherwise, it runs once for each trunk-page on the
04099     ** free-list until the page 'nearby' is located.
04100     */
04101     do {
04102       pPrevTrunk = pTrunk;
04103       if( pPrevTrunk ){
04104         iTrunk = get4byte(&pPrevTrunk->aData[0]);
04105       }else{
04106         iTrunk = get4byte(&pPage1->aData[32]);
04107       }
04108       rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
04109       if( rc ){
04110         pTrunk = 0;
04111         goto end_allocate_page;
04112       }
04113 
04114       k = get4byte(&pTrunk->aData[4]);
04115       if( k==0 && !searchList ){
04116         /* The trunk has no leaves and the list is not being searched. 
04117         ** So extract the trunk page itself and use it as the newly 
04118         ** allocated page */
04119         assert( pPrevTrunk==0 );
04120         rc = sqlite3PagerWrite(pTrunk->pDbPage);
04121         if( rc ){
04122           goto end_allocate_page;
04123         }
04124         *pPgno = iTrunk;
04125         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
04126         *ppPage = pTrunk;
04127         pTrunk = 0;
04128         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
04129       }else if( k>pBt->usableSize/4 - 2 ){
04130         /* Value of k is out of range.  Database corruption */
04131         rc = SQLITE_CORRUPT_BKPT;
04132         goto end_allocate_page;
04133 #ifndef SQLITE_OMIT_AUTOVACUUM
04134       }else if( searchList && nearby==iTrunk ){
04135         /* The list is being searched and this trunk page is the page
04136         ** to allocate, regardless of whether it has leaves.
04137         */
04138         assert( *pPgno==iTrunk );
04139         *ppPage = pTrunk;
04140         searchList = 0;
04141         rc = sqlite3PagerWrite(pTrunk->pDbPage);
04142         if( rc ){
04143           goto end_allocate_page;
04144         }
04145         if( k==0 ){
04146           if( !pPrevTrunk ){
04147             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
04148           }else{
04149             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
04150           }
04151         }else{
04152           /* The trunk page is required by the caller but it contains 
04153           ** pointers to free-list leaves. The first leaf becomes a trunk
04154           ** page in this case.
04155           */
04156           MemPage *pNewTrunk;
04157           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
04158           rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
04159           if( rc!=SQLITE_OK ){
04160             goto end_allocate_page;
04161           }
04162           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
04163           if( rc!=SQLITE_OK ){
04164             releasePage(pNewTrunk);
04165             goto end_allocate_page;
04166           }
04167           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
04168           put4byte(&pNewTrunk->aData[4], k-1);
04169           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
04170           releasePage(pNewTrunk);
04171           if( !pPrevTrunk ){
04172             put4byte(&pPage1->aData[32], iNewTrunk);
04173           }else{
04174             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
04175             if( rc ){
04176               goto end_allocate_page;
04177             }
04178             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
04179           }
04180         }
04181         pTrunk = 0;
04182         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
04183 #endif
04184       }else{
04185         /* Extract a leaf from the trunk */
04186         int closest;
04187         Pgno iPage;
04188         unsigned char *aData = pTrunk->aData;
04189         rc = sqlite3PagerWrite(pTrunk->pDbPage);
04190         if( rc ){
04191           goto end_allocate_page;
04192         }
04193         if( nearby>0 ){
04194           int i, dist;
04195           closest = 0;
04196           dist = get4byte(&aData[8]) - nearby;
04197           if( dist<0 ) dist = -dist;
04198           for(i=1; i<k; i++){
04199             int d2 = get4byte(&aData[8+i*4]) - nearby;
04200             if( d2<0 ) d2 = -d2;
04201             if( d2<dist ){
04202               closest = i;
04203               dist = d2;
04204             }
04205           }
04206         }else{
04207           closest = 0;
04208         }
04209 
04210         iPage = get4byte(&aData[8+closest*4]);
04211         if( !searchList || iPage==nearby ){
04212           int nPage;
04213           *pPgno = iPage;
04214           nPage = pagerPagecount(pBt->pPager);
04215           if( *pPgno>nPage ){
04216             /* Free page off the end of the file */
04217             rc = SQLITE_CORRUPT_BKPT;
04218             goto end_allocate_page;
04219           }
04220           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
04221                  ": %d more free pages\n",
04222                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
04223           if( closest<k-1 ){
04224             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
04225           }
04226           put4byte(&aData[4], k-1);
04227           rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
04228           if( rc==SQLITE_OK ){
04229             sqlite3PagerDontRollback((*ppPage)->pDbPage);
04230             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
04231             if( rc!=SQLITE_OK ){
04232               releasePage(*ppPage);
04233             }
04234           }
04235           searchList = 0;
04236         }
04237       }
04238       releasePage(pPrevTrunk);
04239       pPrevTrunk = 0;
04240     }while( searchList );
04241   }else{
04242     /* There are no pages on the freelist, so create a new page at the
04243     ** end of the file */
04244     int nPage = pagerPagecount(pBt->pPager);
04245     *pPgno = nPage + 1;
04246 
04247 #ifndef SQLITE_OMIT_AUTOVACUUM
04248     if( pBt->nTrunc ){
04249       /* An incr-vacuum has already run within this transaction. So the
04250       ** page to allocate is not from the physical end of the file, but
04251       ** at pBt->nTrunc. 
04252       */
04253       *pPgno = pBt->nTrunc+1;
04254       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
04255         (*pPgno)++;
04256       }
04257     }
04258     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
04259       /* If *pPgno refers to a pointer-map page, allocate two new pages
04260       ** at the end of the file instead of one. The first allocated page
04261       ** becomes a new pointer-map page, the second is used by the caller.
04262       */
04263       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
04264       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
04265       (*pPgno)++;
04266       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
04267     }
04268     if( pBt->nTrunc ){
04269       pBt->nTrunc = *pPgno;
04270     }
04271 #endif
04272 
04273     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
04274     rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
04275     if( rc ) return rc;
04276     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
04277     if( rc!=SQLITE_OK ){
04278       releasePage(*ppPage);
04279     }
04280     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
04281   }
04282 
04283   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
04284 
04285 end_allocate_page:
04286   releasePage(pTrunk);
04287   releasePage(pPrevTrunk);
04288   if( rc==SQLITE_OK && sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
04289     releasePage(*ppPage);
04290     return SQLITE_CORRUPT_BKPT;
04291   }
04292   return rc;
04293 }
04294 
04295 /*
04296 ** Add a page of the database file to the freelist.
04297 **
04298 ** sqlite3PagerUnref() is NOT called for pPage.
04299 */
04300 static int freePage(MemPage *pPage){
04301   BtShared *pBt = pPage->pBt;
04302   MemPage *pPage1 = pBt->pPage1;
04303   int rc, n, k;
04304 
04305   /* Prepare the page for freeing */
04306   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04307   assert( pPage->pgno>1 );
04308   pPage->isInit = 0;
04309 
04310   /* Increment the free page count on pPage1 */
04311   rc = sqlite3PagerWrite(pPage1->pDbPage);
04312   if( rc ) return rc;
04313   n = get4byte(&pPage1->aData[36]);
04314   put4byte(&pPage1->aData[36], n+1);
04315 
04316 #ifdef SQLITE_SECURE_DELETE
04317   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
04318   ** always fully overwrite deleted information with zeros.
04319   */
04320   rc = sqlite3PagerWrite(pPage->pDbPage);
04321   if( rc ) return rc;
04322   memset(pPage->aData, 0, pPage->pBt->pageSize);
04323 #endif
04324 
04325   /* If the database supports auto-vacuum, write an entry in the pointer-map
04326   ** to indicate that the page is free.
04327   */
04328   if( ISAUTOVACUUM ){
04329     rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
04330     if( rc ) return rc;
04331   }
04332 
04333   if( n==0 ){
04334     /* This is the first free page */
04335     rc = sqlite3PagerWrite(pPage->pDbPage);
04336     if( rc ) return rc;
04337     memset(pPage->aData, 0, 8);
04338     put4byte(&pPage1->aData[32], pPage->pgno);
04339     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
04340   }else{
04341     /* Other free pages already exist.  Retrive the first trunk page
04342     ** of the freelist and find out how many leaves it has. */
04343     MemPage *pTrunk;
04344     rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
04345     if( rc ) return rc;
04346     k = get4byte(&pTrunk->aData[4]);
04347     if( k>=pBt->usableSize/4 - 8 ){
04348       /* The trunk is full.  Turn the page being freed into a new
04349       ** trunk page with no leaves.
04350       **
04351       ** Note that the trunk page is not really full until it contains
04352       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
04353       ** coded.  But due to a coding error in versions of SQLite prior to
04354       ** 3.6.0, databases with freelist trunk pages holding more than
04355       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
04356       ** to maintain backwards compatibility with older versions of SQLite,
04357       ** we will continue to restrict the number of entries to usableSize/4 - 8
04358       ** for now.  At some point in the future (once everyone has upgraded
04359       ** to 3.6.0 or later) we should consider fixing the conditional above
04360       ** to read "usableSize/4-2" instead of "usableSize/4-8".
04361       */
04362       rc = sqlite3PagerWrite(pPage->pDbPage);
04363       if( rc==SQLITE_OK ){
04364         put4byte(pPage->aData, pTrunk->pgno);
04365         put4byte(&pPage->aData[4], 0);
04366         put4byte(&pPage1->aData[32], pPage->pgno);
04367         TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
04368                 pPage->pgno, pTrunk->pgno));
04369       }
04370     }else if( k<0 ){
04371       rc = SQLITE_CORRUPT;
04372     }else{
04373       /* Add the newly freed page as a leaf on the current trunk */
04374       rc = sqlite3PagerWrite(pTrunk->pDbPage);
04375       if( rc==SQLITE_OK ){
04376         put4byte(&pTrunk->aData[4], k+1);
04377         put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
04378 #ifndef SQLITE_SECURE_DELETE
04379         rc = sqlite3PagerDontWrite(pPage->pDbPage);
04380 #endif
04381       }
04382       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
04383     }
04384     releasePage(pTrunk);
04385   }
04386   return rc;
04387 }
04388 
04389 /*
04390 ** Free any overflow pages associated with the given Cell.
04391 */
04392 static int clearCell(MemPage *pPage, unsigned char *pCell){
04393   BtShared *pBt = pPage->pBt;
04394   CellInfo info;
04395   Pgno ovflPgno;
04396   int rc;
04397   int nOvfl;
04398   int ovflPageSize;
04399 
04400   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04401   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
04402   if( info.iOverflow==0 ){
04403     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
04404   }
04405   ovflPgno = get4byte(&pCell[info.iOverflow]);
04406   ovflPageSize = pBt->usableSize - 4;
04407   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
04408   assert( ovflPgno==0 || nOvfl>0 );
04409   while( nOvfl-- ){
04410     MemPage *pOvfl;
04411     if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
04412       return SQLITE_CORRUPT_BKPT;
04413     }
04414 
04415     rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
04416     if( rc ) return rc;
04417     rc = freePage(pOvfl);
04418     sqlite3PagerUnref(pOvfl->pDbPage);
04419     if( rc ) return rc;
04420   }
04421   return SQLITE_OK;
04422 }
04423 
04424 /*
04425 ** Create the byte sequence used to represent a cell on page pPage
04426 ** and write that byte sequence into pCell[].  Overflow pages are
04427 ** allocated and filled in as necessary.  The calling procedure
04428 ** is responsible for making sure sufficient space has been allocated
04429 ** for pCell[].
04430 **
04431 ** Note that pCell does not necessary need to point to the pPage->aData
04432 ** area.  pCell might point to some temporary storage.  The cell will
04433 ** be constructed in this temporary area then copied into pPage->aData
04434 ** later.
04435 */
04436 static int fillInCell(
04437   MemPage *pPage,                /* The page that contains the cell */
04438   unsigned char *pCell,          /* Complete text of the cell */
04439   const void *pKey, i64 nKey,    /* The key */
04440   const void *pData,int nData,   /* The data */
04441   int nZero,                     /* Extra zero bytes to append to pData */
04442   int *pnSize                    /* Write cell size here */
04443 ){
04444   int nPayload;
04445   const u8 *pSrc;
04446   int nSrc, n, rc;
04447   int spaceLeft;
04448   MemPage *pOvfl = 0;
04449   MemPage *pToRelease = 0;
04450   unsigned char *pPrior;
04451   unsigned char *pPayload;
04452   BtShared *pBt = pPage->pBt;
04453   Pgno pgnoOvfl = 0;
04454   int nHeader;
04455   CellInfo info;
04456 
04457   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04458 
04459   /* Fill in the header. */
04460   nHeader = 0;
04461   if( !pPage->leaf ){
04462     nHeader += 4;
04463   }
04464   if( pPage->hasData ){
04465     nHeader += putVarint(&pCell[nHeader], nData+nZero);
04466   }else{
04467     nData = nZero = 0;
04468   }
04469   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
04470   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
04471   assert( info.nHeader==nHeader );
04472   assert( info.nKey==nKey );
04473   assert( info.nData==nData+nZero );
04474   
04475   /* Fill in the payload */
04476   nPayload = nData + nZero;
04477   if( pPage->intKey ){
04478     pSrc = pData;
04479     nSrc = nData;
04480     nData = 0;
04481   }else{
04482     nPayload += nKey;
04483     pSrc = pKey;
04484     nSrc = nKey;
04485   }
04486   *pnSize = info.nSize;
04487   spaceLeft = info.nLocal;
04488   pPayload = &pCell[nHeader];
04489   pPrior = &pCell[info.iOverflow];
04490 
04491   while( nPayload>0 ){
04492     if( spaceLeft==0 ){
04493       int isExact = 0;
04494 #ifndef SQLITE_OMIT_AUTOVACUUM
04495       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
04496       if( pBt->autoVacuum ){
04497         do{
04498           pgnoOvfl++;
04499         } while( 
04500           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
04501         );
04502         if( pgnoOvfl>1 ){
04503           /* isExact = 1; */
04504         }
04505       }
04506 #endif
04507       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
04508 #ifndef SQLITE_OMIT_AUTOVACUUM
04509       /* If the database supports auto-vacuum, and the second or subsequent
04510       ** overflow page is being allocated, add an entry to the pointer-map
04511       ** for that page now. 
04512       **
04513       ** If this is the first overflow page, then write a partial entry 
04514       ** to the pointer-map. If we write nothing to this pointer-map slot,
04515       ** then the optimistic overflow chain processing in clearCell()
04516       ** may misinterpret the uninitialised values and delete the
04517       ** wrong pages from the database.
04518       */
04519       if( pBt->autoVacuum && rc==SQLITE_OK ){
04520         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
04521         rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
04522         if( rc ){
04523           releasePage(pOvfl);
04524         }
04525       }
04526 #endif
04527       if( rc ){
04528         releasePage(pToRelease);
04529         return rc;
04530       }
04531       put4byte(pPrior, pgnoOvfl);
04532       releasePage(pToRelease);
04533       pToRelease = pOvfl;
04534       pPrior = pOvfl->aData;
04535       put4byte(pPrior, 0);
04536       pPayload = &pOvfl->aData[4];
04537       spaceLeft = pBt->usableSize - 4;
04538     }
04539     n = nPayload;
04540     if( n>spaceLeft ) n = spaceLeft;
04541     if( nSrc>0 ){
04542       if( n>nSrc ) n = nSrc;
04543       assert( pSrc );
04544       memcpy(pPayload, pSrc, n);
04545     }else{
04546       memset(pPayload, 0, n);
04547     }
04548     nPayload -= n;
04549     pPayload += n;
04550     pSrc += n;
04551     nSrc -= n;
04552     spaceLeft -= n;
04553     if( nSrc==0 ){
04554       nSrc = nData;
04555       pSrc = pData;
04556     }
04557   }
04558   releasePage(pToRelease);
04559   return SQLITE_OK;
04560 }
04561 
04562 /*
04563 ** Remove the i-th cell from pPage.  This routine effects pPage only.
04564 ** The cell content is not freed or deallocated.  It is assumed that
04565 ** the cell content has been copied someplace else.  This routine just
04566 ** removes the reference to the cell from pPage.
04567 **
04568 ** "sz" must be the number of bytes in the cell.
04569 */
04570 static int dropCell(MemPage *pPage, int idx, int sz){
04571   int i;          /* Loop counter */
04572   int pc;         /* Offset to cell content of cell being deleted */
04573   u8 *data;       /* pPage->aData */
04574   u8 *ptr;        /* Used to move bytes around within data[] */
04575 
04576   assert( idx>=0 && idx<pPage->nCell );
04577   assert( sz==cellSize(pPage, idx) );
04578   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
04579   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04580   data = pPage->aData;
04581   ptr = &data[pPage->cellOffset + 2*idx];
04582   pc = get2byte(ptr);
04583   if ( pc<=10 || pc+sz>pPage->pBt->usableSize ) {
04584     return SQLITE_CORRUPT_BKPT;
04585   }
04586   freeSpace(pPage, pc, sz);
04587   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
04588     ptr[0] = ptr[2];
04589     ptr[1] = ptr[3];
04590   }
04591   pPage->nCell--;
04592   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
04593   pPage->nFree += 2;
04594   return SQLITE_OK;
04595 }
04596 
04597 /*
04598 ** Insert a new cell on pPage at cell index "i".  pCell points to the
04599 ** content of the cell.
04600 **
04601 ** If the cell content will fit on the page, then put it there.  If it
04602 ** will not fit, then make a copy of the cell content into pTemp if
04603 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
04604 ** in pPage->aOvfl[] and make it point to the cell content (either
04605 ** in pTemp or the original pCell) and also record its index. 
04606 ** Allocating a new entry in pPage->aCell[] implies that 
04607 ** pPage->nOverflow is incremented.
04608 **
04609 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
04610 ** cell. The caller will overwrite them after this function returns. If
04611 ** nSkip is non-zero, then pCell may not point to an invalid memory location 
04612 ** (but pCell+nSkip is always valid).
04613 */
04614 static int insertCell(
04615   MemPage *pPage,   /* Page into which we are copying */
04616   int i,            /* New cell becomes the i-th cell of the page */
04617   u8 *pCell,        /* Content of the new cell */
04618   int sz,           /* Bytes of content in pCell */
04619   u8 *pTemp,        /* Temp storage space for pCell, if needed */
04620   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
04621 ){
04622   int idx;          /* Where to write new cell content in data[] */
04623   int j;            /* Loop counter */
04624   int top;          /* First byte of content for any cell in data[] */
04625   int end;          /* First byte past the last cell pointer in data[] */
04626   int ins;          /* Index in data[] where new cell pointer is inserted */
04627   int hdr;          /* Offset into data[] of the page header */
04628   int cellOffset;   /* Address of first cell pointer in data[] */
04629   u8 *data;         /* The content of the whole page */
04630   u8 *ptr;          /* Used for moving information around in data[] */
04631 
04632   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
04633   assert( sz==cellSizePtr(pPage, pCell) );
04634   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04635   if( pPage->nOverflow || sz+2>pPage->nFree ){
04636     if( pTemp ){
04637       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
04638       pCell = pTemp;
04639     }
04640     j = pPage->nOverflow++;
04641     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
04642     pPage->aOvfl[j].pCell = pCell;
04643     pPage->aOvfl[j].idx = i;
04644     pPage->nFree = 0;
04645   }else{
04646     int rc = sqlite3PagerWrite(pPage->pDbPage);
04647     if( rc!=SQLITE_OK ){
04648       return rc;
04649     }
04650     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
04651     data = pPage->aData;
04652     hdr = pPage->hdrOffset;
04653     top = get2byte(&data[hdr+5]);
04654     cellOffset = pPage->cellOffset;
04655     end = cellOffset + 2*pPage->nCell + 2;
04656     ins = cellOffset + 2*i;
04657     if( end > top - sz ){
04658       rc = defragmentPage(pPage);
04659       if( rc!=SQLITE_OK ){
04660         return rc;
04661       }
04662       top = get2byte(&data[hdr+5]);
04663       assert( end + sz <= top );
04664     }
04665     idx = allocateSpace(pPage, sz);
04666     assert( idx>0 );
04667     assert( end <= get2byte(&data[hdr+5]) );
04668     if (idx+sz > pPage->pBt->usableSize) {
04669       return SQLITE_CORRUPT_BKPT;
04670     }
04671     pPage->nCell++;
04672     pPage->nFree -= 2;
04673     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
04674     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
04675       ptr[0] = ptr[-2];
04676       ptr[1] = ptr[-1];
04677     }
04678     put2byte(&data[ins], idx);
04679     put2byte(&data[hdr+3], pPage->nCell);
04680 #ifndef SQLITE_OMIT_AUTOVACUUM
04681     if( pPage->pBt->autoVacuum ){
04682       /* The cell may contain a pointer to an overflow page. If so, write
04683       ** the entry for the overflow page into the pointer map.
04684       */
04685       CellInfo info;
04686       sqlite3BtreeParseCellPtr(pPage, pCell, &info);
04687       assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
04688       if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
04689         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
04690         rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
04691         if( rc!=SQLITE_OK ) return rc;
04692       }
04693     }
04694 #endif
04695   }
04696 
04697   return SQLITE_OK;
04698 }
04699 
04700 /*
04701 ** Add a list of cells to a page.  The page should be initially empty.
04702 ** The cells are guaranteed to fit on the page.
04703 */
04704 static void assemblePage(
04705   MemPage *pPage,   /* The page to be assemblied */
04706   int nCell,        /* The number of cells to add to this page */
04707   u8 **apCell,      /* Pointers to cell bodies */
04708   u16 *aSize        /* Sizes of the cells */
04709 ){
04710   int i;            /* Loop counter */
04711   int totalSize;    /* Total size of all cells */
04712   int hdr;          /* Index of page header */
04713   int cellptr;      /* Address of next cell pointer */
04714   int cellbody;     /* Address of next cell body */
04715   u8 *data;         /* Data for the page */
04716 
04717   assert( pPage->nOverflow==0 );
04718   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04719   totalSize = 0;
04720   for(i=0; i<nCell; i++){
04721     totalSize += aSize[i];
04722   }
04723   assert( totalSize+2*nCell<=pPage->nFree );
04724   assert( pPage->nCell==0 );
04725   cellptr = pPage->cellOffset;
04726   data = pPage->aData;
04727   hdr = pPage->hdrOffset;
04728   put2byte(&data[hdr+3], nCell);
04729   if( nCell ){
04730     cellbody = allocateSpace(pPage, totalSize);
04731     assert( cellbody>0 );
04732     assert( pPage->nFree >= 2*nCell );
04733     pPage->nFree -= 2*nCell;
04734     for(i=0; i<nCell; i++){
04735       put2byte(&data[cellptr], cellbody);
04736       memcpy(&data[cellbody], apCell[i], aSize[i]);
04737       cellptr += 2;
04738       cellbody += aSize[i];
04739     }
04740     assert( cellbody==pPage->pBt->usableSize );
04741   }
04742   pPage->nCell = nCell;
04743 }
04744 
04745 /*
04746 ** The following parameters determine how many adjacent pages get involved
04747 ** in a balancing operation.  NN is the number of neighbors on either side
04748 ** of the page that participate in the balancing operation.  NB is the
04749 ** total number of pages that participate, including the target page and
04750 ** NN neighbors on either side.
04751 **
04752 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
04753 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
04754 ** in exchange for a larger degradation in INSERT and UPDATE performance.
04755 ** The value of NN appears to give the best results overall.
04756 */
04757 #define NN 1             /* Number of neighbors on either side of pPage */
04758 #define NB (NN*2+1)      /* Total pages involved in the balance */
04759 
04760 /* Forward reference */
04761 static int balance(BtCursor*, int);
04762 
04763 #ifndef SQLITE_OMIT_QUICKBALANCE
04764 /*
04765 ** This version of balance() handles the common special case where
04766 ** a new entry is being inserted on the extreme right-end of the
04767 ** tree, in other words, when the new entry will become the largest
04768 ** entry in the tree.
04769 **
04770 ** Instead of trying balance the 3 right-most leaf pages, just add
04771 ** a new page to the right-hand side and put the one new entry in
04772 ** that page.  This leaves the right side of the tree somewhat
04773 ** unbalanced.  But odds are that we will be inserting new entries
04774 ** at the end soon afterwards so the nearly empty page will quickly
04775 ** fill up.  On average.
04776 **
04777 ** pPage is the leaf page which is the right-most page in the tree.
04778 ** pParent is its parent.  pPage must have a single overflow entry
04779 ** which is also the right-most entry on the page.
04780 */
04781 static int balance_quick(BtCursor *pCur){
04782   int rc;
04783   MemPage *pNew = 0;
04784   Pgno pgnoNew;
04785   u8 *pCell;
04786   u16 szCell;
04787   CellInfo info;
04788   MemPage *pPage = pCur->apPage[pCur->iPage];
04789   MemPage *pParent = pCur->apPage[pCur->iPage-1];
04790   BtShared *pBt = pPage->pBt;
04791   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
04792   int parentSize;                   /* Size of new divider cell */
04793   u8 parentCell[64];                /* Space for the new divider cell */
04794 
04795   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04796 
04797   /* Allocate a new page. Insert the overflow cell from pPage
04798   ** into it. Then remove the overflow cell from pPage.
04799   */
04800   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
04801   if( rc==SQLITE_OK ){
04802     pCell = pPage->aOvfl[0].pCell;
04803     szCell = cellSizePtr(pPage, pCell);
04804     zeroPage(pNew, pPage->aData[0]);
04805     assemblePage(pNew, 1, &pCell, &szCell);
04806     pPage->nOverflow = 0;
04807   
04808     /* pPage is currently the right-child of pParent. Change this
04809     ** so that the right-child is the new page allocated above and
04810     ** pPage is the next-to-right child. 
04811     **
04812     ** Ignore the return value of the call to fillInCell(). fillInCell()
04813     ** may only return other than SQLITE_OK if it is required to allocate
04814     ** one or more overflow pages. Since an internal table B-Tree cell 
04815     ** may never spill over onto an overflow page (it is a maximum of 
04816     ** 13 bytes in size), it is not neccessary to check the return code.
04817     **
04818     ** Similarly, the insertCell() function cannot fail if the page
04819     ** being inserted into is already writable and the cell does not 
04820     ** contain an overflow pointer. So ignore this return code too.
04821     */
04822     assert( pPage->nCell>0 );
04823     pCell = findCell(pPage, pPage->nCell-1);
04824     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
04825     fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
04826     assert( parentSize<64 );
04827     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
04828     insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
04829     put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
04830     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
04831   
04832     /* If this is an auto-vacuum database, update the pointer map
04833     ** with entries for the new page, and any pointer from the 
04834     ** cell on the page to an overflow page.
04835     */
04836     if( ISAUTOVACUUM ){
04837       rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
04838       if( rc==SQLITE_OK ){
04839         rc = ptrmapPutOvfl(pNew, 0);
04840       }
04841     }
04842 
04843     /* Release the reference to the new page. */
04844     releasePage(pNew);
04845   }
04846 
04847   /* At this point the pPage->nFree variable is not set correctly with
04848   ** respect to the content of the page (because it was set to 0 by 
04849   ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
04850   ** correct.
04851   **
04852   ** This has to be done even if an error will be returned. Normally, if
04853   ** an error occurs during tree balancing, the contents of MemPage are
04854   ** not important, as they will be recalculated when the page is rolled
04855   ** back. But here, in balance_quick(), it is possible that pPage has 
04856   ** not yet been marked dirty or written into the journal file. Therefore
04857   ** it will not be rolled back and so it is important to make sure that
04858   ** the page data and contents of MemPage are consistent.
04859   */
04860   pPage->isInit = 0;
04861   sqlite3BtreeInitPage(pPage);
04862 
04863   /* If everything else succeeded, balance the parent page, in 
04864   ** case the divider cell inserted caused it to become overfull.
04865   */
04866   if( rc==SQLITE_OK ){
04867     releasePage(pPage);
04868     pCur->iPage--;
04869     rc = balance(pCur, 0);
04870   }
04871   return rc;
04872 }
04873 #endif /* SQLITE_OMIT_QUICKBALANCE */
04874 
04875 /*
04876 ** This routine redistributes Cells on pPage and up to NN*2 siblings
04877 ** of pPage so that all pages have about the same amount of free space.
04878 ** Usually NN siblings on either side of pPage are used in the balancing,
04879 ** though more siblings might come from one side if pPage is the first
04880 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
04881 ** (something which can only happen if pPage is the root page or a 
04882 ** child of root) then all available siblings participate in the balancing.
04883 **
04884 ** The number of siblings of pPage might be increased or decreased by one or
04885 ** two in an effort to keep pages nearly full but not over full. The root page
04886 ** is special and is allowed to be nearly empty. If pPage is 
04887 ** the root page, then the depth of the tree might be increased
04888 ** or decreased by one, as necessary, to keep the root page from being
04889 ** overfull or completely empty.
04890 **
04891 ** Note that when this routine is called, some of the Cells on pPage
04892 ** might not actually be stored in pPage->aData[].  This can happen
04893 ** if the page is overfull.  Part of the job of this routine is to
04894 ** make sure all Cells for pPage once again fit in pPage->aData[].
04895 **
04896 ** In the course of balancing the siblings of pPage, the parent of pPage
04897 ** might become overfull or underfull.  If that happens, then this routine
04898 ** is called recursively on the parent.
04899 **
04900 ** If this routine fails for any reason, it might leave the database
04901 ** in a corrupted state.  So if this routine fails, the database should
04902 ** be rolled back.
04903 */
04904 static int balance_nonroot(BtCursor *pCur){
04905   MemPage *pPage;              /* The over or underfull page to balance */
04906   MemPage *pParent;            /* The parent of pPage */
04907   BtShared *pBt;               /* The whole database */
04908   int nCell = 0;               /* Number of cells in apCell[] */
04909   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
04910   int nOld;                    /* Number of pages in apOld[] */
04911   int nNew;                    /* Number of pages in apNew[] */
04912   int nDiv;                    /* Number of cells in apDiv[] */
04913   int i, j, k;                 /* Loop counters */
04914   int idx;                     /* Index of pPage in pParent->aCell[] */
04915   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
04916   int rc;                      /* The return code */
04917   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
04918   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
04919   int usableSpace;             /* Bytes in pPage beyond the header */
04920   int pageFlags;               /* Value of pPage->aData[0] */
04921   int subtotal;                /* Subtotal of bytes in cells on one page */
04922   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
04923   int iSpace2 = 0;             /* First unused byte of aSpace2[] */
04924   int szScratch;               /* Size of scratch memory requested */
04925   MemPage *apOld[NB];          /* pPage and up to two siblings */
04926   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
04927   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
04928   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
04929   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
04930   u8 *apDiv[NB];               /* Divider cells in pParent */
04931   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
04932   int szNew[NB+2];             /* Combined size of cells place on i-th page */
04933   u8 **apCell = 0;             /* All cells begin balanced */
04934   u16 *szCell;                 /* Local size of all cells in apCell[] */
04935   u8 *aCopy[NB];         /* Space for holding data of apCopy[] */
04936   u8 *aSpace1;           /* Space for copies of dividers cells before balance */
04937   u8 *aSpace2 = 0;       /* Space for overflow dividers cells after balance */
04938   u8 *aFrom = 0;
04939 
04940   pPage = pCur->apPage[pCur->iPage];
04941   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
04942   VVA_ONLY( pCur->pagesShuffled = 1 );
04943 
04944   /* 
04945   ** Find the parent page.
04946   */
04947   assert( pCur->iPage>0 );
04948   assert( pPage->isInit );
04949   assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
04950   pBt = pPage->pBt;
04951   pParent = pCur->apPage[pCur->iPage-1];
04952   assert( pParent );
04953   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
04954     return rc;
04955   }
04956 
04957   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
04958 
04959 #ifndef SQLITE_OMIT_QUICKBALANCE
04960   /*
04961   ** A special case:  If a new entry has just been inserted into a
04962   ** table (that is, a btree with integer keys and all data at the leaves)
04963   ** and the new entry is the right-most entry in the tree (it has the
04964   ** largest key) then use the special balance_quick() routine for
04965   ** balancing.  balance_quick() is much faster and results in a tighter
04966   ** packing of data in the common case.
04967   */
04968   if( pPage->leaf &&
04969       pPage->intKey &&
04970       pPage->nOverflow==1 &&
04971       pPage->aOvfl[0].idx==pPage->nCell &&
04972       pParent->pgno!=1 &&
04973       get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
04974   ){
04975     assert( pPage->intKey );
04976     /*
04977     ** TODO: Check the siblings to the left of pPage. It may be that
04978     ** they are not full and no new page is required.
04979     */
04980     return balance_quick(pCur);
04981   }
04982 #endif
04983 
04984   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
04985     return rc;
04986   }
04987 
04988   /*
04989   ** Find the cell in the parent page whose left child points back
04990   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
04991   ** is the rightmost child of pParent then set idx to pParent->nCell 
04992   */
04993   idx = pCur->aiIdx[pCur->iPage-1];
04994   assertParentIndex(pParent, idx, pPage->pgno);
04995 
04996   /*
04997   ** Initialize variables so that it will be safe to jump
04998   ** directly to balance_cleanup at any moment.
04999   */
05000   nOld = nNew = 0;
05001 
05002   /*
05003   ** Find sibling pages to pPage and the cells in pParent that divide
05004   ** the siblings.  An attempt is made to find NN siblings on either
05005   ** side of pPage.  More siblings are taken from one side, however, if
05006   ** there are fewer than NN siblings on the other side.  If pParent
05007   ** has NB or fewer children then all children of pParent are taken.
05008   */
05009   nxDiv = idx - NN;
05010   if( nxDiv + NB > pParent->nCell ){
05011     nxDiv = pParent->nCell - NB + 1;
05012   }
05013   if( nxDiv<0 ){
05014     nxDiv = 0;
05015   }
05016   nDiv = 0;
05017   for(i=0, k=nxDiv; i<NB; i++, k++){
05018     if( k<pParent->nCell ){
05019       apDiv[i] = findCell(pParent, k);
05020       nDiv++;
05021       assert( !pParent->leaf );
05022       pgnoOld[i] = get4byte(apDiv[i]);
05023     }else if( k==pParent->nCell ){
05024       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
05025     }else{
05026       break;
05027     }
05028     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
05029     if( rc ) goto balance_cleanup;
05030     /* apOld[i]->idxParent = k; */
05031     apCopy[i] = 0;
05032     assert( i==nOld );
05033     nOld++;
05034     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
05035   }
05036 
05037   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
05038   ** alignment */
05039   nMaxCells = (nMaxCells + 3)&~3;
05040 
05041   /*
05042   ** Allocate space for memory structures
05043   */
05044   szScratch =
05045        nMaxCells*sizeof(u8*)                       /* apCell */
05046      + nMaxCells*sizeof(u16)                       /* szCell */
05047      + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
05048      + pBt->pageSize                               /* aSpace1 */
05049      + (ISAUTOVACUUM ? nMaxCells : 0);             /* aFrom */
05050   apCell = sqlite3ScratchMalloc( szScratch ); 
05051   if( apCell==0 ){
05052     rc = SQLITE_NOMEM;
05053     goto balance_cleanup;
05054   }
05055   szCell = (u16*)&apCell[nMaxCells];
05056   aCopy[0] = (u8*)&szCell[nMaxCells];
05057   assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
05058   for(i=1; i<NB; i++){
05059     aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
05060     assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
05061   }
05062   aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
05063   assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
05064   if( ISAUTOVACUUM ){
05065     aFrom = &aSpace1[pBt->pageSize];
05066   }
05067   aSpace2 = sqlite3PageMalloc(pBt->pageSize);
05068   if( aSpace2==0 ){
05069     rc = SQLITE_NOMEM;
05070     goto balance_cleanup;
05071   }
05072   
05073   /*
05074   ** Make copies of the content of pPage and its siblings into apCopy[].
05075   ** The rest of this function will use data from the copies rather
05076   ** than the original pages since the original pages will be in the
05077   ** process of being overwritten.
05078   */
05079   for(i=0; i<nOld; i++){
05080     MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
05081     memcpy(p, apOld[i], sizeof(MemPage));
05082     p->aData = (void*)&p[1];
05083     memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
05084   }
05085 
05086   /*
05087   ** Load pointers to all cells on sibling pages and the divider cells
05088   ** into the local apCell[] array.  Make copies of the divider cells
05089   ** into space obtained from aSpace1[] and remove the divider Cells
05090   ** from pParent.
05091   **
05092   ** If the siblings are on leaf pages, then the child pointers of the
05093   ** divider cells are stripped from the cells before they are copied
05094   ** into aSpace1[].  In this way, all cells in apCell[] are without
05095   ** child pointers.  If siblings are not leaves, then all cell in
05096   ** apCell[] include child pointers.  Either way, all cells in apCell[]
05097   ** are alike.
05098   **
05099   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
05100   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
05101   */
05102   nCell = 0;
05103   leafCorrection = pPage->leaf*4;
05104   leafData = pPage->hasData;
05105   for(i=0; i<nOld; i++){
05106     MemPage *pOld = apCopy[i];
05107     int limit = pOld->nCell+pOld->nOverflow;
05108     for(j=0; j<limit; j++){
05109       assert( nCell<nMaxCells );
05110       apCell[nCell] = findOverflowCell(pOld, j);
05111       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
05112       if( ISAUTOVACUUM ){
05113         int a;
05114         aFrom[nCell] = i;
05115         for(a=0; a<pOld->nOverflow; a++){
05116           if( pOld->aOvfl[a].pCell==apCell[nCell] ){
05117             aFrom[nCell] = 0xFF;
05118             break;
05119           }
05120         }
05121       }
05122       nCell++;
05123     }
05124     if( i<nOld-1 ){
05125       u16 sz = cellSizePtr(pParent, apDiv[i]);
05126       if( leafData ){
05127         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
05128         ** are duplicates of keys on the child pages.  We need to remove
05129         ** the divider cells from pParent, but the dividers cells are not
05130         ** added to apCell[] because they are duplicates of child cells.
05131         */
05132         dropCell(pParent, nxDiv, sz);
05133       }else{
05134         u8 *pTemp;
05135         assert( nCell<nMaxCells );
05136         szCell[nCell] = sz;
05137         pTemp = &aSpace1[iSpace1];
05138         iSpace1 += sz;
05139         assert( sz<=pBt->pageSize/4 );
05140         assert( iSpace1<=pBt->pageSize );
05141         memcpy(pTemp, apDiv[i], sz);
05142         apCell[nCell] = pTemp+leafCorrection;
05143         if( ISAUTOVACUUM ){
05144           aFrom[nCell] = 0xFF;
05145         }
05146         dropCell(pParent, nxDiv, sz);
05147         szCell[nCell] -= leafCorrection;
05148         assert( get4byte(pTemp)==pgnoOld[i] );
05149         if( !pOld->leaf ){
05150           assert( leafCorrection==0 );
05151           /* The right pointer of the child page pOld becomes the left
05152           ** pointer of the divider cell */
05153           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
05154         }else{
05155           assert( leafCorrection==4 );
05156           if( szCell[nCell]<4 ){
05157             /* Do not allow any cells smaller than 4 bytes. */
05158             szCell[nCell] = 4;
05159           }
05160         }
05161         nCell++;
05162       }
05163     }
05164   }
05165 
05166   /*
05167   ** Figure out the number of pages needed to hold all nCell cells.
05168   ** Store this number in "k".  Also compute szNew[] which is the total
05169   ** size of all cells on the i-th page and cntNew[] which is the index
05170   ** in apCell[] of the cell that divides page i from page i+1.  
05171   ** cntNew[k] should equal nCell.
05172   **
05173   ** Values computed by this block:
05174   **
05175   **           k: The total number of sibling pages
05176   **    szNew[i]: Space used on the i-th sibling page.
05177   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
05178   **              the right of the i-th sibling page.
05179   ** usableSpace: Number of bytes of space available on each sibling.
05180   ** 
05181   */
05182   usableSpace = pBt->usableSize - 12 + leafCorrection;
05183   for(subtotal=k=i=0; i<nCell; i++){
05184     assert( i<nMaxCells );
05185     subtotal += szCell[i] + 2;
05186     if( subtotal > usableSpace ){
05187       szNew[k] = subtotal - szCell[i];
05188       cntNew[k] = i;
05189       if( leafData ){ i--; }
05190       subtotal = 0;
05191       k++;
05192     }
05193   }
05194   szNew[k] = subtotal;
05195   cntNew[k] = nCell;
05196   k++;
05197 
05198   /*
05199   ** The packing computed by the previous block is biased toward the siblings
05200   ** on the left side.  The left siblings are always nearly full, while the
05201   ** right-most sibling might be nearly empty.  This block of code attempts
05202   ** to adjust the packing of siblings to get a better balance.
05203   **
05204   ** This adjustment is more than an optimization.  The packing above might
05205   ** be so out of balance as to be illegal.  For example, the right-most
05206   ** sibling might be completely empty.  This adjustment is not optional.
05207   */
05208   for(i=k-1; i>0; i--){
05209     int szRight = szNew[i];  /* Size of sibling on the right */
05210     int szLeft = szNew[i-1]; /* Size of sibling on the left */
05211     int r;              /* Index of right-most cell in left sibling */
05212     int d;              /* Index of first cell to the left of right sibling */
05213 
05214     r = cntNew[i-1] - 1;
05215     d = r + 1 - leafData;
05216     assert( d<nMaxCells );
05217     assert( r<nMaxCells );
05218     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
05219       szRight += szCell[d] + 2;
05220       szLeft -= szCell[r] + 2;
05221       cntNew[i-1]--;
05222       r = cntNew[i-1] - 1;
05223       d = r + 1 - leafData;
05224     }
05225     szNew[i] = szRight;
05226     szNew[i-1] = szLeft;
05227   }
05228 
05229   /* Either we found one or more cells (cntNew[0]>0) or we are
05230   ** a virtual root page.  A virtual root page is when the real root
05231   ** page is page 1 and we are the only child of that page.
05232   */
05233   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
05234 
05235   /*
05236   ** Allocate k new pages.  Reuse old pages where possible.
05237   */
05238   assert( pPage->pgno>1 );
05239   pageFlags = pPage->aData[0];
05240   for(i=0; i<k; i++){
05241     MemPage *pNew;
05242     if( i<nOld ){
05243       pNew = apNew[i] = apOld[i];
05244       pgnoNew[i] = pgnoOld[i];
05245       apOld[i] = 0;
05246       rc = sqlite3PagerWrite(pNew->pDbPage);
05247       nNew++;
05248       if( rc ) goto balance_cleanup;
05249     }else{
05250       assert( i>0 );
05251       rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
05252       if( rc ) goto balance_cleanup;
05253       apNew[i] = pNew;
05254       nNew++;
05255     }
05256   }
05257 
05258   /* Free any old pages that were not reused as new pages.
05259   */
05260   while( i<nOld ){
05261     rc = freePage(apOld[i]);
05262     if( rc ) goto balance_cleanup;
05263     releasePage(apOld[i]);
05264     apOld[i] = 0;
05265     i++;
05266   }
05267 
05268   /*
05269   ** Put the new pages in ascending order.  This helps to
05270   ** keep entries in the disk file in order so that a scan
05271   ** of the table is a linear scan through the file.  That
05272   ** in turn helps the operating system to deliver pages
05273   ** from the disk more rapidly.
05274   **
05275   ** An O(n^2) insertion sort algorithm is used, but since
05276   ** n is never more than NB (a small constant), that should
05277   ** not be a problem.
05278   **
05279   ** When NB==3, this one optimization makes the database
05280   ** about 25% faster for large insertions and deletions.
05281   */
05282   for(i=0; i<k-1; i++){
05283     int minV = pgnoNew[i];
05284     int minI = i;
05285     for(j=i+1; j<k; j++){
05286       if( pgnoNew[j]<(unsigned)minV ){
05287         minI = j;
05288         minV = pgnoNew[j];
05289       }
05290     }
05291     if( minI>i ){
05292       int t;
05293       MemPage *pT;
05294       t = pgnoNew[i];
05295       pT = apNew[i];
05296       pgnoNew[i] = pgnoNew[minI];
05297       apNew[i] = apNew[minI];
05298       pgnoNew[minI] = t;
05299       apNew[minI] = pT;
05300     }
05301   }
05302   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
05303     pgnoOld[0], 
05304     nOld>=2 ? pgnoOld[1] : 0,
05305     nOld>=3 ? pgnoOld[2] : 0,
05306     pgnoNew[0], szNew[0],
05307     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
05308     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
05309     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
05310     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
05311 
05312   /*
05313   ** Evenly distribute the data in apCell[] across the new pages.
05314   ** Insert divider cells into pParent as necessary.
05315   */
05316   j = 0;
05317   for(i=0; i<nNew; i++){
05318     /* Assemble the new sibling page. */
05319     MemPage *pNew = apNew[i];
05320     assert( j<nMaxCells );
05321     assert( pNew->pgno==pgnoNew[i] );
05322     zeroPage(pNew, pageFlags);
05323     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
05324     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
05325     assert( pNew->nOverflow==0 );
05326 
05327     /* If this is an auto-vacuum database, update the pointer map entries
05328     ** that point to the siblings that were rearranged. These can be: left
05329     ** children of cells, the right-child of the page, or overflow pages
05330     ** pointed to by cells.
05331     */
05332     if( ISAUTOVACUUM ){
05333       for(k=j; k<cntNew[i]; k++){
05334         assert( k<nMaxCells );
05335         if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
05336           rc = ptrmapPutOvfl(pNew, k-j);
05337           if( rc==SQLITE_OK && leafCorrection==0 ){
05338             rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
05339           }
05340           if( rc!=SQLITE_OK ){
05341             goto balance_cleanup;
05342           }
05343         }
05344       }
05345     }
05346 
05347     j = cntNew[i];
05348 
05349     /* If the sibling page assembled above was not the right-most sibling,
05350     ** insert a divider cell into the parent page.
05351     */
05352     if( i<nNew-1 && j<nCell ){
05353       u8 *pCell;
05354       u8 *pTemp;
05355       int sz;
05356 
05357       assert( j<nMaxCells );
05358       pCell = apCell[j];
05359       sz = szCell[j] + leafCorrection;
05360       pTemp = &aSpace2[iSpace2];
05361       if( !pNew->leaf ){
05362         memcpy(&pNew->aData[8], pCell, 4);
05363         if( ISAUTOVACUUM 
05364          && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
05365         ){
05366           rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
05367           if( rc!=SQLITE_OK ){
05368             goto balance_cleanup;
05369           }
05370         }
05371       }else if( leafData ){
05372         /* If the tree is a leaf-data tree, and the siblings are leaves, 
05373         ** then there is no divider cell in apCell[]. Instead, the divider 
05374         ** cell consists of the integer key for the right-most cell of 
05375         ** the sibling-page assembled above only.
05376         */
05377         CellInfo info;
05378         j--;
05379         sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
05380         pCell = pTemp;
05381         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
05382         pTemp = 0;
05383       }else{
05384         pCell -= 4;
05385         /* Obscure case for non-leaf-data trees: If the cell at pCell was
05386         ** previously stored on a leaf node, and its reported size was 4
05387         ** bytes, then it may actually be smaller than this 
05388         ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
05389         ** any cell). But it is important to pass the correct size to 
05390         ** insertCell(), so reparse the cell now.
05391         **
05392         ** Note that this can never happen in an SQLite data file, as all
05393         ** cells are at least 4 bytes. It only happens in b-trees used
05394         ** to evaluate "IN (SELECT ...)" and similar clauses.
05395         */
05396         if( szCell[j]==4 ){
05397           assert(leafCorrection==4);
05398           sz = cellSizePtr(pParent, pCell);
05399         }
05400       }
05401       iSpace2 += sz;
05402       assert( sz<=pBt->pageSize/4 );
05403       assert( iSpace2<=pBt->pageSize );
05404       rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
05405       if( rc!=SQLITE_OK ) goto balance_cleanup;
05406       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
05407 
05408       /* If this is an auto-vacuum database, and not a leaf-data tree,
05409       ** then update the pointer map with an entry for the overflow page
05410       ** that the cell just inserted points to (if any).
05411       */
05412       if( ISAUTOVACUUM && !leafData ){
05413         rc = ptrmapPutOvfl(pParent, nxDiv);
05414         if( rc!=SQLITE_OK ){
05415           goto balance_cleanup;
05416         }
05417       }
05418       j++;
05419       nxDiv++;
05420     }
05421 
05422     /* Set the pointer-map entry for the new sibling page. */
05423     if( ISAUTOVACUUM ){
05424       rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
05425       if( rc!=SQLITE_OK ){
05426         goto balance_cleanup;
05427       }
05428     }
05429   }
05430   assert( j==nCell );
05431   assert( nOld>0 );
05432   assert( nNew>0 );
05433   if( (pageFlags & PTF_LEAF)==0 ){
05434     u8 *zChild = &apCopy[nOld-1]->aData[8];
05435     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
05436     if( ISAUTOVACUUM ){
05437       rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
05438       if( rc!=SQLITE_OK ){
05439         goto balance_cleanup;
05440       }
05441     }
05442   }
05443   if( nxDiv==pParent->nCell+pParent->nOverflow ){
05444     /* Right-most sibling is the right-most child of pParent */
05445     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
05446   }else{
05447     /* Right-most sibling is the left child of the first entry in pParent
05448     ** past the right-most divider entry */
05449     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
05450   }
05451 
05452   /*
05453   ** Balance the parent page.  Note that the current page (pPage) might
05454   ** have been added to the freelist so it might no longer be initialized.
05455   ** But the parent page will always be initialized.
05456   */
05457   assert( pParent->isInit );
05458   sqlite3ScratchFree(apCell);
05459   apCell = 0;
05460   releasePage(pPage);
05461   pCur->iPage--;
05462   rc = balance(pCur, 0);
05463   
05464   /*
05465   ** Cleanup before returning.
05466   */
05467 balance_cleanup:
05468   sqlite3PageFree(aSpace2);
05469   sqlite3ScratchFree(apCell);
05470   for(i=0; i<nOld; i++){
05471     releasePage(apOld[i]);
05472   }
05473   for(i=0; i<nNew; i++){
05474     releasePage(apNew[i]);
05475   }
05476 
05477   /* releasePage(pParent); */
05478   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
05479           pPage->pgno, nOld, nNew, nCell));
05480 
05481   return rc;
05482 }
05483 
05484 /*
05485 ** This routine is called for the root page of a btree when the root
05486 ** page contains no cells.  This is an opportunity to make the tree
05487 ** shallower by one level.
05488 */
05489 static int balance_shallower(BtCursor *pCur){
05490   MemPage *pPage;              /* Root page of B-Tree */
05491   MemPage *pChild;             /* The only child page of pPage */
05492   Pgno pgnoChild;              /* Page number for pChild */
05493   int rc = SQLITE_OK;          /* Return code from subprocedures */
05494   BtShared *pBt;                  /* The main BTree structure */
05495   int mxCellPerPage;           /* Maximum number of cells per page */
05496   u8 **apCell;                 /* All cells from pages being balanced */
05497   u16 *szCell;                 /* Local size of all cells */
05498 
05499   assert( pCur->iPage==0 );
05500   pPage = pCur->apPage[0];
05501 
05502   assert( pPage->nCell==0 );
05503   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
05504   pBt = pPage->pBt;
05505   mxCellPerPage = MX_CELL(pBt);
05506   apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
05507   if( apCell==0 ) return SQLITE_NOMEM;
05508   szCell = (u16*)&apCell[mxCellPerPage];
05509   if( pPage->leaf ){
05510     /* The table is completely empty */
05511     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
05512   }else{
05513     /* The root page is empty but has one child.  Transfer the
05514     ** information from that one child into the root page if it 
05515     ** will fit.  This reduces the depth of the tree by one.
05516     **
05517     ** If the root page is page 1, it has less space available than
05518     ** its child (due to the 100 byte header that occurs at the beginning
05519     ** of the database fle), so it might not be able to hold all of the 
05520     ** information currently contained in the child.  If this is the 
05521     ** case, then do not do the transfer.  Leave page 1 empty except
05522     ** for the right-pointer to the child page.  The child page becomes
05523     ** the virtual root of the tree.
05524     */
05525     VVA_ONLY( pCur->pagesShuffled = 1 );
05526     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
05527     assert( pgnoChild>0 );
05528     assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
05529     rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
05530     if( rc ) goto end_shallow_balance;
05531     if( pPage->pgno==1 ){
05532       rc = sqlite3BtreeInitPage(pChild);
05533       if( rc ) goto end_shallow_balance;
05534       assert( pChild->nOverflow==0 );
05535       if( pChild->nFree>=100 ){
05536         /* The child information will fit on the root page, so do the
05537         ** copy */
05538         int i;
05539         zeroPage(pPage, pChild->aData[0]);
05540         for(i=0; i<pChild->nCell; i++){
05541           apCell[i] = findCell(pChild,i);
05542           szCell[i] = cellSizePtr(pChild, apCell[i]);
05543         }
05544         assemblePage(pPage, pChild->nCell, apCell, szCell);
05545         /* Copy the right-pointer of the child to the parent. */
05546         put4byte(&pPage->aData[pPage->hdrOffset+8], 
05547             get4byte(&pChild->aData[pChild->hdrOffset+8]));
05548         freePage(pChild);
05549         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
05550       }else{
05551         /* The child has more information that will fit on the root.
05552         ** The tree is already balanced.  Do nothing. */
05553         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
05554       }
05555     }else{
05556       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
05557       pPage->isInit = 0;
05558       rc = sqlite3BtreeInitPage(pPage);
05559       assert( rc==SQLITE_OK );
05560       freePage(pChild);
05561       TRACE(("BALANCE: transfer child %d into root %d\n",
05562               pChild->pgno, pPage->pgno));
05563     }
05564     assert( pPage->nOverflow==0 );
05565 #ifndef SQLITE_OMIT_AUTOVACUUM
05566     if( ISAUTOVACUUM ){
05567       rc = setChildPtrmaps(pPage);
05568     }
05569 #endif
05570     releasePage(pChild);
05571   }
05572 end_shallow_balance:
05573   sqlite3_free(apCell);
05574   return rc;
05575 }
05576 
05577 
05578 /*
05579 ** The root page is overfull
05580 **
05581 ** When this happens, Create a new child page and copy the
05582 ** contents of the root into the child.  Then make the root
05583 ** page an empty page with rightChild pointing to the new
05584 ** child.   Finally, call balance_internal() on the new child
05585 ** to cause it to split.
05586 */
05587 static int balance_deeper(BtCursor *pCur){
05588   int rc;             /* Return value from subprocedures */
05589   MemPage *pPage;     /* Pointer to the root page */
05590   MemPage *pChild;    /* Pointer to a new child page */
05591   Pgno pgnoChild;     /* Page number of the new child page */
05592   BtShared *pBt;         /* The BTree */
05593   int usableSize;     /* Total usable size of a page */
05594   u8 *data;           /* Content of the parent page */
05595   u8 *cdata;          /* Content of the child page */
05596   int hdr;            /* Offset to page header in parent */
05597   int cbrk;           /* Offset to content of first cell in parent */
05598 
05599   assert( pCur->iPage==0 );
05600   assert( pCur->apPage[0]->nOverflow>0 );
05601 
05602   VVA_ONLY( pCur->pagesShuffled = 1 );
05603   pPage = pCur->apPage[0];
05604   pBt = pPage->pBt;
05605   assert( sqlite3_mutex_held(pBt->mutex) );
05606   rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
05607   if( rc ) return rc;
05608   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
05609   usableSize = pBt->usableSize;
05610   data = pPage->aData;
05611   hdr = pPage->hdrOffset;
05612   cbrk = get2byte(&data[hdr+5]);
05613   cdata = pChild->aData;
05614   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
05615   memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
05616   
05617   rc = sqlite3BtreeInitPage(pChild);
05618   if( rc==SQLITE_OK ){
05619     int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
05620     memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
05621     pChild->nOverflow = pPage->nOverflow;
05622     if( pChild->nOverflow ){
05623       pChild->nFree = 0;
05624     }
05625     assert( pChild->nCell==pPage->nCell );
05626     zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
05627     put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
05628     TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
05629     if( ISAUTOVACUUM ){
05630       rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
05631 #ifndef SQLITE_OMIT_AUTOVACUUM
05632       if( rc==SQLITE_OK ){
05633         rc = setChildPtrmaps(pChild);
05634       }
05635 #endif
05636     }
05637   }
05638 
05639   if( rc==SQLITE_OK ){
05640     pCur->iPage++;
05641     pCur->apPage[1] = pChild;
05642     pCur->aiIdx[0] = 0;
05643     rc = balance_nonroot(pCur);
05644   }else{
05645     releasePage(pChild);
05646   }
05647 
05648   return rc;
05649 }
05650 
05651 /*
05652 ** The page that pCur currently points to has just been modified in
05653 ** some way. This function figures out if this modification means the
05654 ** tree needs to be balanced, and if so calls the appropriate balancing 
05655 ** routine.
05656 ** 
05657 ** Parameter isInsert is true if a new cell was just inserted into the
05658 ** page, or false otherwise.
05659 */
05660 static int balance(BtCursor *pCur, int isInsert){
05661   int rc = SQLITE_OK;
05662   MemPage *pPage = pCur->apPage[pCur->iPage];
05663 
05664   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
05665   if( pCur->iPage==0 ){
05666     rc = sqlite3PagerWrite(pPage->pDbPage);
05667     if( rc==SQLITE_OK && pPage->nOverflow>0 ){
05668       rc = balance_deeper(pCur);
05669     }
05670     if( rc==SQLITE_OK && pPage->nCell==0 ){
05671       rc = balance_shallower(pCur);
05672     }
05673   }else{
05674     if( pPage->nOverflow>0 || 
05675         (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
05676       rc = balance_nonroot(pCur);
05677     }
05678   }
05679   return rc;
05680 }
05681 
05682 /*
05683 ** This routine checks all cursors that point to table pgnoRoot.
05684 ** If any of those cursors were opened with wrFlag==0 in a different
05685 ** database connection (a database connection that shares the pager
05686 ** cache with the current connection) and that other connection 
05687 ** is not in the ReadUncommmitted state, then this routine returns 
05688 ** SQLITE_LOCKED.
05689 **
05690 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and 
05691 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental 
05692 ** blob cursors are used for both reading and writing.
05693 **
05694 ** When pgnoRoot is the root page of an intkey table, this function is also
05695 ** responsible for invalidating incremental blob cursors when the table row
05696 ** on which they are opened is deleted or modified. Cursors are invalidated
05697 ** according to the following rules:
05698 **
05699 **   1) When BtreeClearTable() is called to completely delete the contents
05700 **      of a B-Tree table, pExclude is set to zero and parameter iRow is 
05701 **      set to non-zero. In this case all incremental blob cursors open
05702 **      on the table rooted at pgnoRoot are invalidated.
05703 **
05704 **   2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to 
05705 **      modify a table row via an SQL statement, pExclude is set to the 
05706 **      write cursor used to do the modification and parameter iRow is set
05707 **      to the integer row id of the B-Tree entry being modified. Unless
05708 **      pExclude is itself an incremental blob cursor, then all incremental
05709 **      blob cursors open on row iRow of the B-Tree are invalidated.
05710 **
05711 **   3) If both pExclude and iRow are set to zero, no incremental blob 
05712 **      cursors are invalidated.
05713 */
05714 static int checkReadLocks(
05715   Btree *pBtree, 
05716   Pgno pgnoRoot, 
05717   BtCursor *pExclude,
05718   i64 iRow
05719 ){
05720   BtCursor *p;
05721   BtShared *pBt = pBtree->pBt;
05722   sqlite3 *db = pBtree->db;
05723   assert( sqlite3BtreeHoldsMutex(pBtree) );
05724   for(p=pBt->pCursor; p; p=p->pNext){
05725     if( p==pExclude ) continue;
05726     if( p->pgnoRoot!=pgnoRoot ) continue;
05727 #ifndef SQLITE_OMIT_INCRBLOB
05728     if( p->isIncrblobHandle && ( 
05729          (!pExclude && iRow)
05730       || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
05731     )){
05732       p->eState = CURSOR_INVALID;
05733     }
05734 #endif
05735     if( p->eState!=CURSOR_VALID ) continue;
05736     if( p->wrFlag==0 
05737 #ifndef SQLITE_OMIT_INCRBLOB
05738      || p->isIncrblobHandle
05739 #endif
05740     ){
05741       sqlite3 *dbOther = p->pBtree->db;
05742       if( dbOther==0 ||
05743          (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
05744         return SQLITE_LOCKED;
05745       }
05746     }
05747   }
05748   return SQLITE_OK;
05749 }
05750 
05751 /*
05752 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
05753 ** and the data is given by (pData,nData).  The cursor is used only to
05754 ** define what table the record should be inserted into.  The cursor
05755 ** is left pointing at a random location.
05756 **
05757 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
05758 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
05759 */
05760 int sqlite3BtreeInsert(
05761   BtCursor *pCur,                /* Insert data into the table of this cursor */
05762   const void *pKey, i64 nKey,    /* The key of the new record */
05763   const void *pData, int nData,  /* The data of the new record */
05764   int nZero,                     /* Number of extra 0 bytes to append to data */
05765   int appendBias                 /* True if this is likely an append */
05766 ){
05767   int rc;
05768   int loc;
05769   int szNew;
05770   int idx;
05771   MemPage *pPage;
05772   Btree *p = pCur->pBtree;
05773   BtShared *pBt = p->pBt;
05774   unsigned char *oldCell;
05775   unsigned char *newCell = 0;
05776 
05777   assert( cursorHoldsMutex(pCur) );
05778   if( pBt->inTransaction!=TRANS_WRITE ){
05779     /* Must start a transaction before doing an insert */
05780     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
05781     return rc;
05782   }
05783   assert( !pBt->readOnly );
05784   if( !pCur->wrFlag ){
05785     return SQLITE_PERM;   /* Cursor not open for writing */
05786   }
05787   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
05788     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
05789   }
05790   if( pCur->eState==CURSOR_FAULT ){
05791     return pCur->skip;
05792   }
05793 
05794   /* Save the positions of any other cursors open on this table */
05795   sqlite3BtreeClearCursor(pCur);
05796   if( 
05797     SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
05798     SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
05799   ){
05800     return rc;
05801   }
05802 
05803   pPage = pCur->apPage[pCur->iPage];
05804   assert( pPage->intKey || nKey>=0 );
05805   assert( pPage->leaf || !pPage->intKey );
05806   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
05807           pCur->pgnoRoot, nKey, nData, pPage->pgno,
05808           loc==0 ? "overwrite" : "new entry"));
05809   assert( pPage->isInit );
05810   allocateTempSpace(pBt);
05811   newCell = pBt->pTmpSpace;
05812   if( newCell==0 ) return SQLITE_NOMEM;
05813   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
05814   if( rc ) goto end_insert;
05815   assert( szNew==cellSizePtr(pPage, newCell) );
05816   assert( szNew<=MX_CELL_SIZE(pBt) );
05817   idx = pCur->aiIdx[pCur->iPage];
05818   if( loc==0 && CURSOR_VALID==pCur->eState ){
05819     u16 szOld;
05820     assert( idx<pPage->nCell );
05821     rc = sqlite3PagerWrite(pPage->pDbPage);
05822     if( rc ){
05823       goto end_insert;
05824     }
05825     oldCell = findCell(pPage, idx);
05826     if( !pPage->leaf ){
05827       memcpy(newCell, oldCell, 4);
05828     }
05829     szOld = cellSizePtr(pPage, oldCell);
05830     rc = clearCell(pPage, oldCell);
05831     if( rc ) goto end_insert;
05832     rc = dropCell(pPage, idx, szOld);
05833     if( rc!=SQLITE_OK ) {
05834       goto end_insert;
05835     }
05836   }else if( loc<0 && pPage->nCell>0 ){
05837     assert( pPage->leaf );
05838     idx = ++pCur->aiIdx[pCur->iPage];
05839     pCur->info.nSize = 0;
05840     pCur->validNKey = 0;
05841   }else{
05842     assert( pPage->leaf );
05843   }
05844   rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
05845   if( rc!=SQLITE_OK ) goto end_insert;
05846   rc = balance(pCur, 1);
05847   if( rc==SQLITE_OK ){
05848     moveToRoot(pCur);
05849   }
05850 end_insert:
05851   return rc;
05852 }
05853 
05854 /*
05855 ** Delete the entry that the cursor is pointing to.  The cursor
05856 ** is left pointing at an arbitrary location.
05857 */
05858 int sqlite3BtreeDelete(BtCursor *pCur){
05859   MemPage *pPage = pCur->apPage[pCur->iPage];
05860   int idx;
05861   unsigned char *pCell;
05862   int rc;
05863   Pgno pgnoChild = 0;
05864   Btree *p = pCur->pBtree;
05865   BtShared *pBt = p->pBt;
05866 
05867   assert( cursorHoldsMutex(pCur) );
05868   assert( pPage->isInit );
05869   if( pBt->inTransaction!=TRANS_WRITE ){
05870     /* Must start a transaction before doing a delete */
05871     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
05872     return rc;
05873   }
05874   assert( !pBt->readOnly );
05875   if( pCur->eState==CURSOR_FAULT ){
05876     return pCur->skip;
05877   }
05878   if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
05879     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
05880   }
05881   if( !pCur->wrFlag ){
05882     return SQLITE_PERM;   /* Did not open this cursor for writing */
05883   }
05884   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
05885     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
05886   }
05887 
05888   /* Restore the current cursor position (a no-op if the cursor is not in 
05889   ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 
05890   ** open on the same table. Then call sqlite3PagerWrite() on the page
05891   ** that the entry will be deleted from.
05892   */
05893   if( 
05894     (rc = restoreCursorPosition(pCur))!=0 ||
05895     (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
05896     (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
05897   ){
05898     return rc;
05899   }
05900 
05901   /* Locate the cell within its page and leave pCell pointing to the
05902   ** data. The clearCell() call frees any overflow pages associated with the
05903   ** cell. The cell itself is still intact.
05904   */
05905   idx = pCur->aiIdx[pCur->iPage];
05906   pCell = findCell(pPage, idx);
05907   if( !pPage->leaf ){
05908     pgnoChild = get4byte(pCell);
05909   }
05910   rc = clearCell(pPage, pCell);
05911   if( rc ){
05912     return rc;
05913   }
05914 
05915   if( !pPage->leaf ){
05916     /*
05917     ** The entry we are about to delete is not a leaf so if we do not
05918     ** do something we will leave a hole on an internal page.
05919     ** We have to fill the hole by moving in a cell from a leaf.  The
05920     ** next Cell after the one to be deleted is guaranteed to exist and
05921     ** to be a leaf so we can use it.
05922     */
05923     BtCursor leafCur;
05924     MemPage *pLeafPage;
05925 
05926     unsigned char *pNext;
05927     int notUsed;
05928     unsigned char *tempCell = 0;
05929     assert( !pPage->intKey );
05930     sqlite3BtreeGetTempCursor(pCur, &leafCur);
05931     rc = sqlite3BtreeNext(&leafCur, &notUsed);
05932     if( rc==SQLITE_OK ){
05933       assert( leafCur.aiIdx[leafCur.iPage]==0 );
05934       pLeafPage = leafCur.apPage[leafCur.iPage];
05935       rc = sqlite3PagerWrite(pLeafPage->pDbPage);
05936     }
05937     if( rc==SQLITE_OK ){
05938       int leafCursorInvalid = 0;
05939       u16 szNext;
05940       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
05941          pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
05942       dropCell(pPage, idx, cellSizePtr(pPage, pCell));
05943       pNext = findCell(pLeafPage, 0);
05944       szNext = cellSizePtr(pLeafPage, pNext);
05945       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
05946       allocateTempSpace(pBt);
05947       tempCell = pBt->pTmpSpace;
05948       if( tempCell==0 ){
05949         rc = SQLITE_NOMEM;
05950       }
05951       if( rc==SQLITE_OK ){
05952         rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
05953       }
05954 
05955 
05956       /* The "if" statement in the next code block is critical.  The
05957       ** slightest error in that statement would allow SQLite to operate
05958       ** correctly most of the time but produce very rare failures.  To
05959       ** guard against this, the following macros help to verify that
05960       ** the "if" statement is well tested.
05961       */
05962       testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3 
05963                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
05964       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3 
05965                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
05966       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1 
05967                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
05968       testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
05969                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
05970       testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
05971                  && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
05972 
05973 
05974       if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
05975           (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
05976       ){
05977         /* This branch is taken if the internal node is now either overflowing
05978         ** or underfull and the leaf node will be underfull after the cell just
05979         ** copied to the internal node is deleted from it. This is a special
05980         ** case because the call to balance() to correct the internal node
05981         ** may change the tree structure and invalidate the contents of
05982         ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
05983         ** used by the balance() required to correct the underfull leaf
05984         ** node.
05985         **
05986         ** The formula used in the expression above are based on facets of
05987         ** the SQLite file-format that do not change over time.
05988         */
05989         testcase( pPage->nFree==pBt->usableSize*2/3+1 );
05990         testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
05991         leafCursorInvalid = 1;
05992       }        
05993 
05994       if( rc==SQLITE_OK ){
05995         put4byte(findOverflowCell(pPage, idx), pgnoChild);
05996         VVA_ONLY( pCur->pagesShuffled = 0 );
05997         rc = balance(pCur, 0);
05998       }
05999 
06000       if( rc==SQLITE_OK && leafCursorInvalid ){
06001         /* The leaf-node is now underfull and so the tree needs to be 
06002         ** rebalanced. However, the balance() operation on the internal
06003         ** node above may have modified the structure of the B-Tree and
06004         ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
06005         ** may not be trusted.
06006         **
06007         ** It is not possible to copy the ancestry from pCur, as the same
06008         ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
06009         ** arrays. 
06010         **
06011         ** The call to saveCursorPosition() below internally saves the 
06012         ** key that leafCur is currently pointing to. Currently, there
06013         ** are two copies of that key in the tree - one here on the leaf
06014         ** page and one on some internal node in the tree. The copy on
06015         ** the leaf node is always the next key in tree-order after the 
06016         ** copy on the internal node. So, the call to sqlite3BtreeNext()
06017         ** calls restoreCursorPosition() to point the cursor to the copy
06018         ** stored on the internal node, then advances to the next entry,
06019         ** which happens to be the copy of the key on the leaf node.
06020         ** Net effect: leafCur is pointing back to the duplicate cell
06021         ** that needs to be removed, and the leafCur.apPage[] and
06022         ** leafCur.aiIdx[] arrays are correct.
06023         */
06024         VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
06025         rc = saveCursorPosition(&leafCur);
06026         if( rc==SQLITE_OK ){
06027           rc = sqlite3BtreeNext(&leafCur, &notUsed);
06028         }
06029         pLeafPage = leafCur.apPage[leafCur.iPage];
06030         assert( pLeafPage->pgno==leafPgno );
06031         assert( leafCur.aiIdx[leafCur.iPage]==0 );
06032       }
06033 
06034       if( rc==SQLITE_OK ){
06035         dropCell(pLeafPage, 0, szNext);
06036         VVA_ONLY( leafCur.pagesShuffled = 0 );
06037         rc = balance(&leafCur, 0);
06038         assert( leafCursorInvalid || !leafCur.pagesShuffled
06039                                    || !pCur->pagesShuffled );
06040       }
06041     }
06042     sqlite3BtreeReleaseTempCursor(&leafCur);
06043   }else{
06044     TRACE(("DELETE: table=%d delete from leaf %d\n",
06045        pCur->pgnoRoot, pPage->pgno));
06046     dropCell(pPage, idx, cellSizePtr(pPage, pCell));
06047     rc = balance(pCur, 0);
06048   }
06049   if( rc==SQLITE_OK ){
06050     moveToRoot(pCur);
06051   }
06052   return rc;
06053 }
06054 
06055 /*
06056 ** Create a new BTree table.  Write into *piTable the page
06057 ** number for the root page of the new table.
06058 **
06059 ** The type of table is determined by the flags parameter.  Only the
06060 ** following values of flags are currently in use.  Other values for
06061 ** flags might not work:
06062 **
06063 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
06064 **     BTREE_ZERODATA                  Used for SQL indices
      **
      ** The caller must already hold the b-tree mutex.  A write transaction
      ** must be open, otherwise SQLITE_READONLY (read-only database) or
      ** SQLITE_ERROR is returned.  On success SQLITE_OK is returned and
      ** *piTable is set; on every error path *piTable is left untouched.
      **
      ** In auto-vacuum mode the new root page is placed at the first usable
      ** page number after the largest existing root page, which may require
      ** relocating the page that currently lives there.  If the pointer-map
      ** says that page is itself a root page or a free page, the routine
      ** returns SQLITE_CORRUPT_BKPT instead of silently doing nothing.
06065 */
06066 static int btreeCreateTable(Btree *p, int *piTable, int flags){
06067   BtShared *pBt = p->pBt;
06068   MemPage *pRoot;
06069   Pgno pgnoRoot;
06070   int rc;
06071 
06072   assert( sqlite3BtreeHoldsMutex(p) );
06073   if( pBt->inTransaction!=TRANS_WRITE ){
06074     /* Must start a transaction first */
06075     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
06076     return rc;
06077   }
06078   assert( !pBt->readOnly );
06079 
06080 #ifdef SQLITE_OMIT_AUTOVACUUM
06081   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
06082   if( rc ){
06083     return rc;
06084   }
06085 #else
06086   if( pBt->autoVacuum ){
06087     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
06088     MemPage *pPageMove; /* The page to move to. */
06089 
06090     /* Creating a new table may require moving an existing database page
06091     ** to make room for the new table's root page. In case this page turns
06092     ** out to be an overflow page, delete all overflow page-map caches
06093     ** held by open cursors.
06094     */
06095     invalidateAllOverflowCache(pBt);
06096 
06097     /* Read the value of meta[3] from the database to determine where the
06098     ** root page of the new table should go. meta[3] is the largest root-page
06099     ** created so far, so the new root-page is (meta[3]+1).  (meta[3] uses
      ** the schema-layer numbering; it is index 4 in sqlite3BtreeGetMeta().)
06100     */
06101     rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
06102     if( rc!=SQLITE_OK ){
06103       return rc;
06104     }
06105     pgnoRoot++;
06106 
06107     /* The new root-page may not be allocated on a pointer-map page, or the
06108     ** PENDING_BYTE page.
06109     */
06110     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
06111         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
06112       pgnoRoot++;
06113     }
06114     assert( pgnoRoot>=3 );
06115 
06116     /* Allocate a page. The page that currently resides at pgnoRoot will
06117     ** be moved to the allocated page (unless the allocated page happens
06118     ** to reside at pgnoRoot).
06119     */
06120     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
06121     if( rc!=SQLITE_OK ){
06122       return rc;
06123     }
06124 
06125     if( pgnoMove!=pgnoRoot ){
06126       /* pgnoRoot is the page that will be used for the root-page of
06127       ** the new table (assuming an error did not occur). But we were
06128       ** allocated pgnoMove. If required (i.e. if it was not allocated
06129       ** by extending the file), the current page at position pgnoMove
06130       ** is already journaled.
06131       */
06132       u8 eType;
06133       Pgno iPtrPage;
06134 
06135       releasePage(pPageMove);
06136 
06137       /* Move the page currently at pgnoRoot to pgnoMove. */
06138       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
06139       if( rc!=SQLITE_OK ){
06140         return rc;
06141       }
06142       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
06143       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
              /* A pointer-map entry of type ROOTPAGE or FREEPAGE for the page
              ** that has to be displaced cannot be relocated.  Report it as
              ** corruption; returning rc here would hand SQLITE_OK back to the
              ** caller although no table was created and *piTable is unset.
              ** eType is only examined when ptrmapGet() succeeded.
              */
06144         releasePage(pRoot);
06145         return rc!=SQLITE_OK ? rc : SQLITE_CORRUPT_BKPT;
06146       }
06147       assert( eType!=PTRMAP_ROOTPAGE );
06148       assert( eType!=PTRMAP_FREEPAGE );
06149       rc = sqlite3PagerWrite(pRoot->pDbPage);
06150       if( rc!=SQLITE_OK ){
06151         releasePage(pRoot);
06152         return rc;
06153       }
06154       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
06155       releasePage(pRoot);
06156 
06157       /* Obtain the page at pgnoRoot */
06158       if( rc!=SQLITE_OK ){
06159         return rc;
06160       }
06161       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
06162       if( rc!=SQLITE_OK ){
06163         return rc;
06164       }
06165       rc = sqlite3PagerWrite(pRoot->pDbPage);
06166       if( rc!=SQLITE_OK ){
06167         releasePage(pRoot);
06168         return rc;
06169       }
06170     }else{
06171       pRoot = pPageMove;
06172     } 
06173 
06174     /* Update the pointer-map and meta-data with the new root-page number. */
06175     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
06176     if( rc ){
06177       releasePage(pRoot);
06178       return rc;
06179     }
06180     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
06181     if( rc ){
06182       releasePage(pRoot);
06183       return rc;
06184     }
06185 
06186   }else{
06187     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
06188     if( rc ) return rc;
06189   }
06190 #endif
        /* pRoot is writable on every path that reaches this point.  Format it
        ** as an empty leaf of the requested type and drop our reference.
        */
06191   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
06192   zeroPage(pRoot, flags | PTF_LEAF);
06193   sqlite3PagerUnref(pRoot->pDbPage);
06194   *piTable = (int)pgnoRoot;
06195   return SQLITE_OK;
06196 }
06197 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
06198   int status;                  /* Result code from btreeCreateTable() */
06199   sqlite3BtreeEnter(p);
06200   p->pBt->db = p->db;          /* Shared b-tree now refers to this handle's db */
06201   status = btreeCreateTable(p, piTable, flags);
06202   sqlite3BtreeLeave(p);
06203   return status;
06204 }
06205 
06206 /*
06207 ** Erase the given database page and all its children.  If freePageFlag
06208 ** is true the page itself is returned to the freelist; otherwise it is
      ** made writable and reset to an empty leaf that keeps its original flag
      ** byte.  Child pages are always cleared with freePageFlag set.
      **
      ** If pnChange is not NULL, *pnChange is incremented by the number of
      ** cells on every leaf page visited (the table must then be an intkey
      ** table).  pParent is only handed on to the recursive calls; this
      ** routine does not otherwise read it.
      **
      ** Returns SQLITE_OK, SQLITE_CORRUPT_BKPT if pgno lies beyond the end of
      ** the database, or the first error reported by a helper.
06209 */
06210 static int clearDatabasePage(
06211   BtShared *pBt,           /* The BTree that contains the table */
06212   Pgno pgno,            /* Page number to clear */
06213   MemPage *pParent,     /* Parent page.  NULL for the root */
06214   int freePageFlag,     /* Deallocate page if true */
06215   int *pnChange         /* If not NULL, add the number of leaf cells here */
06216 ){
06217   MemPage *pPage = 0;
06218   int rc;
06219   unsigned char *pCell;
06220   int i;
06221 
06222   assert( sqlite3_mutex_held(pBt->mutex) );
06223   if( pgno>pagerPagecount(pBt->pPager) ){
06224     return SQLITE_CORRUPT_BKPT;
06225   }
06226 
06227   rc = getAndInitPage(pBt, pgno, &pPage);
06228   if( rc ) goto cleardatabasepage_out;
06229   for(i=0; i<pPage->nCell; i++){
06230     pCell = findCell(pPage, i);
06231     if( !pPage->leaf ){
            /* The first four bytes of an interior cell hold its left child. */
06232       rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1, pnChange);
06233       if( rc ) goto cleardatabasepage_out;
06234     }
06235     rc = clearCell(pPage, pCell);
06236     if( rc ) goto cleardatabasepage_out;
06237   }
06238   if( !pPage->leaf ){
          /* The right-child pointer sits 8 bytes into the page header.  The
          ** header starts at pPage->hdrOffset, so that offset must be added;
          ** a bare aData[8] is only correct when hdrOffset is zero.
          */
06239     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
                                 pPage, 1, pnChange);
06240     if( rc ) goto cleardatabasepage_out;
06241   }else if( pnChange ){
06242     assert( pPage->intKey );
06243     *pnChange += pPage->nCell;
06244   }
06245   if( freePageFlag ){
06246     rc = freePage(pPage);
06247   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
          /* Keep the page's own flag byte (first byte of its header). */
06248     zeroPage(pPage, pPage->aData[pPage->hdrOffset] | PTF_LEAF);
06249   }
06250 
06251 cleardatabasepage_out:
06252   releasePage(pPage);
06253   return rc;
06254 }
06255 
06256 /*
06257 ** Remove every entry from the table whose root page is iTable.  The
06258 ** root page itself is kept, but is empty when this routine returns
06259 ** successfully.
06260 **
06261 ** A write transaction must be open on p; if not, SQLITE_READONLY or
06262 ** SQLITE_ERROR is returned.  The routine fails with SQLITE_LOCKED if
06263 ** there are any open read cursors on the table (the result of
06264 ** checkReadLocks() is returned unchanged).  Open write cursors are
06265 ** moved to the root of the table.
06266 **
06267 ** If pnChange is not NULL, then table iTable must be an intkey table.
06268 ** The integer that pnChange points to is incremented by the number of
06269 ** entries in the table.
06270 */
06271 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
06272   BtShared *pShared = p->pBt;
06273   int rc;
06274   sqlite3BtreeEnter(p);
06275   pShared->db = p->db;
06276   if( p->inTrans!=TRANS_WRITE ){
06277     rc = pShared->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
06278   }else{
06279     rc = checkReadLocks(p, iTable, 0, 1);
06280     if( rc==SQLITE_OK ) rc = saveAllCursors(pShared, iTable, 0);
06281     if( rc==SQLITE_OK ) rc = clearDatabasePage(pShared, (Pgno)iTable, 0, 0, pnChange);
06282   }
06283   sqlite3BtreeLeave(p);
06284   return rc;
06285 }
06286 
06287 /*
06288 ** Erase all information in a table and add the root of the table to
06289 ** the freelist.  Except, the root of the principal table (the one on
06290 ** page 1) is never added to the freelist.
06291 **
06292 ** This routine will fail with SQLITE_LOCKED if there are any open
06293 ** cursors on the database (pBt->pCursor is not NULL).
06294 **
06295 ** If AUTOVACUUM is enabled and the page at iTable is not the last
06296 ** root page in the database file, then the last root page 
06297 ** in the database file is moved into the slot formerly occupied by
06298 ** iTable and that last slot formerly occupied by the last root page
06299 ** is added to the freelist instead of iTable.  In this way, all
06300 ** root pages are kept at the beginning of the database file, which
06301 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
06302 ** page number that used to be the last root page in the file before
06303 ** the move.  If no page gets moved, *piMoved is set to 0.
06304 ** The last root page is recorded in meta[3] and the value of
06305 ** meta[3] is updated by this procedure.
      **
      ** The caller must hold the b-tree mutex and have a write transaction
      ** open (otherwise SQLITE_READONLY or SQLITE_ERROR is returned).  Note
      ** that *piMoved is only written once the table content has been
      ** cleared; the earlier error returns leave it untouched.
06306 */
06307 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
06308   int rc;
06309   MemPage *pPage = 0;
06310   BtShared *pBt = p->pBt;
06311 
06312   assert( sqlite3BtreeHoldsMutex(p) );
06313   if( p->inTrans!=TRANS_WRITE ){
06314     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
06315   }
06316 
06317   /* It is illegal to drop a table if any cursors are open on the
06318   ** database. This is because in auto-vacuum mode the backend may
06319   ** need to move another root-page to fill a gap left by the deleted
06320   ** root page. If an open cursor was using this page a problem would 
06321   ** occur.
06322   */
06323   if( pBt->pCursor ){
06324     return SQLITE_LOCKED;
06325   }
06326 
        /* Take a reference to the root page, then empty the table beneath it. */
06327   rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
06328   if( rc ) return rc;
06329   rc = sqlite3BtreeClearTable(p, iTable, 0);
06330   if( rc ){
06331     releasePage(pPage);
06332     return rc;
06333   }
06334 
06335   *piMoved = 0;
06336 
06337   if( iTable>1 ){
06338 #ifdef SQLITE_OMIT_AUTOVACUUM
06339     rc = freePage(pPage);
06340     releasePage(pPage);
06341 #else
06342     if( pBt->autoVacuum ){
06343       Pgno maxRootPgno;
            /* Index 4 here is the slot the header comment calls meta[3]. */
06344       rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
06345       if( rc!=SQLITE_OK ){
06346         releasePage(pPage);
06347         return rc;
06348       }
06349 
06350       if( iTable==maxRootPgno ){
06351         /* If the table being dropped is the table with the largest root-page
06352         ** number in the database, put the root page on the free list. 
06353         */
06354         rc = freePage(pPage);
06355         releasePage(pPage);
06356         if( rc!=SQLITE_OK ){
06357           return rc;
06358         }
06359       }else{
06360         /* The table being dropped does not have the largest root-page
06361         ** number in the database. So move the page that does into the 
06362         ** gap left by the deleted root-page.
06363         */
06364         MemPage *pMove;
06365         releasePage(pPage);
06366         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
06367         if( rc!=SQLITE_OK ){
06368           return rc;
06369         }
06370         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
06371         releasePage(pMove);
06372         if( rc!=SQLITE_OK ){
06373           return rc;
06374         }
              /* Fetch page maxRootPgno again after the relocation and put it
              ** on the freelist.
              */
06375         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
06376         if( rc!=SQLITE_OK ){
06377           return rc;
06378         }
06379         rc = freePage(pMove);
06380         releasePage(pMove);
06381         if( rc!=SQLITE_OK ){
06382           return rc;
06383         }
06384         *piMoved = maxRootPgno;
06385       }
06386 
06387       /* Set the new 'max-root-page' value in the database header. This
06388       ** is the old value less one, less one more if that happens to
06389       ** be the PENDING_BYTE_PAGE, less one again if that is a
06390       ** pointer-map page.
06391       */
06392       maxRootPgno--;
06393       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
06394         maxRootPgno--;
06395       }
06396       if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
06397         maxRootPgno--;
06398       }
06399       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
06400 
06401       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
06402     }else{
06403       rc = freePage(pPage);
06404       releasePage(pPage);
06405     }
06406 #endif
06407   }else{
06408     /* If sqlite3BtreeDropTable was called on page 1.  The page is never
          ** freed; it is reset to an empty intkey leaf instead.
          */
06409     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
06410     releasePage(pPage);
06411   }
06412   return rc;  
06413 }
06414 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
06415   int status;                  /* Result code from btreeDropTable() */
06416   sqlite3BtreeEnter(p);
06417   p->pBt->db = p->db;          /* Shared b-tree now refers to this handle's db */
06418   status = btreeDropTable(p, iTable, piMoved);
06419   sqlite3BtreeLeave(p);
06420   return status;
06421 }
06422 
06423 
06424 /*
06425 ** Read the meta-information out of a database file.  Meta[0]
06426 ** is the number of free pages currently in the database.  Meta[1]
06427 ** through meta[15] are available for use by higher layers.  Meta[0]
06428 ** is read-only, the others are read/write.
06429 ** 
06430 ** The schema layer numbers meta values differently.  At the schema
06431 ** layer (and the SetCookie and ReadCookie opcodes) the number of
06432 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
      **
      ** idx selects the value (0..15) and the result is stored in *pMeta.
      ** The value is the 4-byte field at byte offset 36+idx*4 of page 1.
      ** Returns the result of the lock query, of fetching page 1, or of
      ** taking the read-lock, whichever fails first; SQLITE_OK otherwise.
06433 */
06434 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
06435   DbPage *pDbPage;      /* Only set and used when pBt->pPage1 is NULL */
06436   int rc;
06437   unsigned char *pP1;   /* Raw content of page 1 */
06438   BtShared *pBt = p->pBt;
06439 
06440   sqlite3BtreeEnter(p);
06441   pBt->db = p->db;
06442 
06443   /* Reading a meta-data value requires a read-lock on page 1 (and hence
06444   ** the sqlite_master table). We grab this lock regardless of whether or
06445   ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
06446   ** 1 is treated as a special case by queryTableLock() and lockTable()).
06447   */
06448   rc = queryTableLock(p, 1, READ_LOCK);
06449   if( rc!=SQLITE_OK ){
06450     sqlite3BtreeLeave(p);
06451     return rc;
06452   }
06453 
06454   assert( idx>=0 && idx<=15 );
06455   if( pBt->pPage1 ){
06456     /* The b-tree is already holding a reference to page 1 of the database
06457     ** file. In this case the required meta-data value can be read directly
06458     ** from the page data of this reference. This is slightly faster than
06459     ** requesting a new reference from the pager layer.
06460     */
06461     pP1 = (unsigned char *)pBt->pPage1->aData;
06462   }else{
06463     /* The b-tree does not have a reference to page 1 of the database file.
06464     ** Obtain one from the pager layer.
06465     */
06466     rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
06467     if( rc ){
06468       sqlite3BtreeLeave(p);
06469       return rc;
06470     }
06471     pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
06472   }
06473   *pMeta = get4byte(&pP1[36 + idx*4]);
06474 
06475   /* If the b-tree is not holding a reference to page 1, then one was 
06476   ** requested from the pager layer in the above block. Release it now.
06477   */
06478   if( !pBt->pPage1 ){
06479     sqlite3PagerUnref(pDbPage);
06480   }
06481 
06482   /* If autovacuum is disabled in this build but we are trying to 
06483   ** access an autovacuumed database, then make the database readonly. 
06484   */
06485 #ifdef SQLITE_OMIT_AUTOVACUUM
06486   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
06487 #endif
06488 
06489   /* Grab the read-lock on page 1. */
06490   rc = lockTable(p, 1, READ_LOCK);
06491   sqlite3BtreeLeave(p);
06492   return rc;
06493 }
06494 
06495 /*
06496 ** Store iMeta in meta-data slot idx, a 4-byte field at byte offset
06497 ** 36+idx*4 of page 1.  Meta[0] is read-only, so idx must be 1..15.
06498 */
06499 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
06500   BtShared *pShared = p->pBt;
06501   int status;
06502   assert( idx>=1 && idx<=15 );
06503   sqlite3BtreeEnter(p);
06504   pShared->db = p->db;
06505   if( p->inTrans==TRANS_WRITE ){
06506     unsigned char *aPage1;     /* Raw content of page 1 */
06507     assert( pShared->pPage1!=0 );
06508     aPage1 = pShared->pPage1->aData;
06509     status = sqlite3PagerWrite(pShared->pPage1->pDbPage);
06510     if( status==SQLITE_OK ){
06511       put4byte(&aPage1[36 + idx*4], iMeta);
06512 #ifndef SQLITE_OMIT_AUTOVACUUM
06513       if( idx==7 ){
06514         assert( pShared->autoVacuum || iMeta==0 );
06515         assert( iMeta==0 || iMeta==1 );
06516         pShared->incrVacuum = iMeta;
06517       }
06518 #endif
06519     }
06520   }else{
06521     status = pShared->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
06522   }
06523   sqlite3BtreeLeave(p);
06524   return status;
06525 }
06526 
06527 /*
06528 ** Return the flag byte at the beginning of the page that the cursor
06529 ** is currently pointing to, or 0 if the cursor has no current page.
06530 */
06531 int sqlite3BtreeFlags(BtCursor *pCur){
06532   /* restoreCursorPosition() deals with a cursor that is in the
06533   ** CURSOR_REQUIRESEEK state.  Its return code is not checked here.
06534   */
06535   MemPage *pPage;
06536   restoreCursorPosition(pCur);
06537   pPage = pCur->apPage[pCur->iPage];
06538   assert( cursorHoldsMutex(pCur) );
        /* pPage may be NULL (handled by the return below), so the assert
        ** must not dereference it unconditionally.
        */
06539   assert( pPage==0 || pPage->pBt==pCur->pBt );
06540   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
06541 }
06542 
06543 
06544 /*
06545 ** Testing and debugging aid: return the Pager that backs b-tree p.
06546 */
06547 Pager *sqlite3BtreePager(Btree *p){
06548   BtShared *pShared = p->pBt;
06549   return pShared->pPager;
06550 }
06551 
06552 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
06553 /*
06554 ** Record one problem found by the integrity checker.  The text appended
06555 ** to pCheck->errMsg is zMsg1 (when not NULL) followed by zFormat
06556 ** expanded with the variable arguments; a newline is inserted first if
06557 ** the buffer already holds text.  Nothing is recorded once
06558 ** pCheck->mxErr has counted down to zero.
06559 */
06560 static void checkAppendMsg(
06561   IntegrityCk *pCheck,
06562   char *zMsg1,
06563   const char *zFormat,
06564   ...
06565 ){
06566   va_list ap;
06567   if( pCheck->mxErr==0 ) return;
06568   pCheck->mxErr--;
06569   pCheck->nErr++;
06570   if( pCheck->errMsg.nChar ){
06571     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
06572   }
06573   if( zMsg1 ) sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
06574   va_start(ap, zFormat);
06575   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
06576   va_end(ap);
06577   if( pCheck->errMsg.mallocFailed ) pCheck->mallocFailed = 1;
06578 }
06579 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
06580 
06581 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
06582 /*
06583 ** Note a reference to page iPage in pCheck->anRef[].  Returns 1 and
06584 ** records nothing if iPage is 0, if iPage is outside 0..pCheck->nPage
06585 ** (an "invalid page number" message is logged) or if anRef[iPage] is
06586 ** already 1 (a "2nd reference" message is logged).  Otherwise
06587 ** anRef[iPage] is incremented and the return value tells whether its
06588 ** previous value was greater than 1.
06589 */
06590 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
06591   if( iPage==0 ) return 1;
06592   if( iPage<0 || iPage>pCheck->nPage ){
06593     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
06594     return 1;
06595   }
06596   if( pCheck->anRef[iPage]!=1 ){
06597     return (pCheck->anRef[iPage]++)>1;
06598   }
06599   checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
06600   return 1;
06601 }
06602 
06603 #ifndef SQLITE_OMIT_AUTOVACUUM
06604 /*
06605 ** Look up the pointer-map entry for page iChild and compare it with
06606 ** the expected type eType and the expected parent page iParent.  A
06607 ** failed lookup or a mismatch is reported through checkAppendMsg()
06608 ** with zContext as the message prefix.
06609 */
06610 static void checkPtrmap(
06611   IntegrityCk *pCheck,   /* Integrity check context */
06612   Pgno iChild,           /* Page whose pointer-map entry is examined */
06613   u8 eType,              /* Pointer-map type the entry should have */
06614   Pgno iParent,          /* Parent page number the entry should have */
06615   char *zContext         /* Prefix for any error message */
06616 ){
06617   u8 eFound;             /* Type actually stored in the pointer-map */
06618   Pgno iFoundParent;     /* Parent actually stored in the pointer-map */
06619 
06620   if( ptrmapGet(pCheck->pBt, iChild, &eFound, &iFoundParent)!=SQLITE_OK ){
06621     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
06622     return;
06623   }
06624 
06625   if( eFound==eType && iFoundParent==iParent ){
06626     return;              /* Entry matches what the caller expects */
06627   }
06628   checkAppendMsg(pCheck, zContext, 
06629     "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
06630     iChild, eType, iParent, eFound, iFoundParent);
06631 }
06632 #endif
06633 
06634 /*
06635 ** Check the integrity of the freelist or of an overflow page list.
06636 ** Verify that the number of pages on the list is N.
      **
      ** The list is walked from iPage, following the 4-byte page number at
      ** the start of each page, until N pages have been accounted for or
      ** pCheck->mxErr reaches zero.  Every page visited is registered with
      ** checkRef().  For a freelist, the page numbers stored from byte
      ** offset 8 onward are registered too and counted against N.  Problems
      ** are reported through checkAppendMsg() with zContext as the prefix.
06637 */
06638 static void checkList(
06639   IntegrityCk *pCheck,  /* Integrity checking context */
06640   int isFreeList,       /* True for a freelist.  False for overflow page list */
06641   int iPage,            /* Page number for first page in the list */
06642   int N,                /* Expected number of pages in the list */
06643   char *zContext        /* Context for error messages */
06644 ){
06645   int i;
06646   int expected = N;     /* Original N, kept for the error message */
06647   int iFirst = iPage;   /* First page of the list, kept for the error message */
06648   while( N-- > 0 && pCheck->mxErr ){
06649     DbPage *pOvflPage;
06650     unsigned char *pOvflData;
06651     if( iPage<1 ){
            /* The chain ended while pages were still expected.
            ** NOTE(review): the text says "overflow list" even when
            ** isFreeList is true.
            */
06652       checkAppendMsg(pCheck, zContext,
06653          "%d of %d pages missing from overflow list starting at %d",
06654           N+1, expected, iFirst);
06655       break;
06656     }
          /* Both breaks below happen before a page reference is held, so
          ** there is nothing to unref on those paths.
          */
06657     if( checkRef(pCheck, iPage, zContext) ) break;
06658     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
06659       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
06660       break;
06661     }
06662     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
06663     if( isFreeList ){
            /* n is the 4-byte count stored at byte offset 4 of this page. */
06664       int n = get4byte(&pOvflData[4]);
06665 #ifndef SQLITE_OMIT_AUTOVACUUM
06666       if( pCheck->pBt->autoVacuum ){
06667         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
06668       }
06669 #endif
06670       if( n>pCheck->pBt->usableSize/4-2 ){
06671         checkAppendMsg(pCheck, zContext,
06672            "freelist leaf count too big on page %d", iPage);
06673         N--;
06674       }else{
              /* Register each of the n page numbers listed on this page and
              ** subtract them from the number of pages still expected.
              */
06675         for(i=0; i<n; i++){
06676           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
06677 #ifndef SQLITE_OMIT_AUTOVACUUM
06678           if( pCheck->pBt->autoVacuum ){
06679             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
06680           }
06681 #endif
06682           checkRef(pCheck, iFreePage, zContext);
06683         }
06684         N -= n;
06685       }
06686     }
06687 #ifndef SQLITE_OMIT_AUTOVACUUM
06688     else{
06689       /* If this database supports auto-vacuum and iPage is not the last
06690       ** page in this overflow list, check that the pointer-map entry for
06691       ** the following page matches iPage.
06692       */
06693       if( pCheck->pBt->autoVacuum && N>0 ){
06694         i = get4byte(pOvflData);
06695         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
06696       }
06697     }
06698 #endif
          /* The first four bytes of the page give the next page in the list. */
06699     iPage = get4byte(pOvflData);
06700     sqlite3PagerUnref(pOvflPage);
06701   }
06702 }
06703 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
06704 
06705 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
06706 /*
06707 ** Do various sanity checks on a single page of a tree.  Return
06708 ** the tree depth.  Root pages return 0.  Parents of root pages
06709 ** return 1, and so forth.
06710 ** 
06711 ** These checks are done:
06712 **
06713 **      1.  Make sure that cells and freeblocks do not overlap
06714 **          but combine to completely cover the page.
06715 **  NO  2.  Make sure cell keys are in order.
06716 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
06717 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
06718 **      5.  Check the integrity of overflow pages.
06719 **      6.  Recursively call checkTreePage on all children.
06720 **      7.  Verify that the depth of all children is the same.
06721 **      8.  Make sure this page is at least 33% full or else it is
06722 **          the root of the tree.
06723 */
06724 static int checkTreePage(
06725   IntegrityCk *pCheck,  /* Context for the sanity check */
06726   int iPage,            /* Page number of the page to check */
06727   MemPage *pParent,     /* Parent page */
06728   char *zParentContext  /* Parent context */
06729 ){
06730   MemPage *pPage;
06731   int i, rc, depth, d2, pgno, cnt;
06732   int hdr, cellStart;
06733   int nCell;
06734   u8 *data;
06735   BtShared *pBt;
06736   int usableSize;
06737   char zContext[100];
06738   char *hit = 0;
06739 
06740   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
06741 
06742   /* Check that the page exists
06743   */
06744   pBt = pCheck->pBt;
06745   usableSize = pBt->usableSize;
06746   if( iPage==0 ) return 0;
06747   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
06748   if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
06749     checkAppendMsg(pCheck, zContext,
06750        "unable to get the page. error code=%d", rc);
06751     return 0;
06752   }
06753   if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
06754     checkAppendMsg(pCheck, zContext, 
06755                    "sqlite3BtreeInitPage() returns error code %d", rc);
06756     releasePage(pPage);
06757     return 0;
06758   }
06759 
06760   /* Check out all the cells.
06761   */
06762   depth = 0;
06763   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
06764     u8 *pCell;
06765     int sz;
06766     CellInfo info;
06767 
06768     /* Check payload overflow pages
06769     */
06770     sqlite3_snprintf(sizeof(zContext), zContext,
06771              "On tree page %d cell %d: ", iPage, i);
06772     pCell = findCell(pPage,i);
06773     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
06774     sz = info.nData;
06775     if( !pPage->intKey ) sz += info.nKey;
06776     assert( sz==info.nPayload );
06777     if( sz>info.nLocal ){
06778       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
06779       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
06780 #ifndef SQLITE_OMIT_AUTOVACUUM
06781       if( pBt->autoVacuum ){
06782         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
06783       }
06784 #endif
06785       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
06786     }
06787 
06788     /* Check sanity of left child page.
06789     */
06790     if( !pPage->leaf ){
06791       pgno = get4byte(pCell);
06792 #ifndef SQLITE_OMIT_AUTOVACUUM
06793       if( pBt->autoVacuum ){
06794         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
06795       }
06796 #endif
06797       d2 = checkTreePage(pCheck,pgno,pPage,zContext);
06798       if( i>0 && d2!=depth ){
06799         checkAppendMsg(pCheck, zContext, "Child page depth differs");
06800       }
06801       depth = d2;
06802     }
06803   }
06804   if( !pPage->leaf ){
06805     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
06806     sqlite3_snprintf(sizeof(zContext), zContext, 
06807                      "On page %d at right child: ", iPage);
06808 #ifndef SQLITE_OMIT_AUTOVACUUM
06809     if( pBt->autoVacuum ){
06810       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
06811     }
06812 #endif
06813     checkTreePage(pCheck, pgno, pPage, zContext);
06814   }
06815  
06816   /* Check for complete coverage of the page
06817   */
06818   data = pPage->aData;
06819   hdr = pPage->hdrOffset;
06820   hit = sqlite3PageMalloc( pBt->pageSize );
06821   if( hit==0 ){
06822     pCheck->mallocFailed = 1;
06823   }else{
06824     u16 contentOffset = get2byte(&data[hdr+5]);
06825     if (contentOffset > usableSize) {
06826       checkAppendMsg(pCheck, 0, 
06827                      "Corruption detected in header on page %d",iPage,0);
06828       goto check_page_abort;
06829     }
06830     memset(hit+contentOffset, 0, usableSize-contentOffset);
06831     memset(hit, 1, contentOffset);
06832     nCell = get2byte(&data[hdr+3]);
06833     cellStart = hdr + 12 - 4*pPage->leaf;
06834     for(i=0; i<nCell; i++){
06835       int pc = get2byte(&data[cellStart+i*2]);
06836       u16 size = 1024;
06837       int j;
06838       if( pc<=usableSize ){
06839         size = cellSizePtr(pPage, &data[pc]);
06840       }
06841       if( (pc+size-1)>=usableSize || pc<0 ){
06842         checkAppendMsg(pCheck, 0, 
06843             "Corruption detected in cell %d on page %d",i,iPage,0);
06844       }else{
06845         for(j=pc+size-1; j>=pc; j--) hit[j]++;
06846       }
06847     }
06848     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 
06849            cnt++){
06850       int size = get2byte(&data[i+2]);
06851       int j;
06852       if( (i+size-1)>=usableSize || i<0 ){
06853         checkAppendMsg(pCheck, 0,  
06854             "Corruption detected in cell %d on page %d",i,iPage,0);
06855       }else{
06856         for(j=i+size-1; j>=i; j--) hit[j]++;
06857       }
06858       i = get2byte(&data[i]);
06859     }
06860     for(i=cnt=0; i<usableSize; i++){
06861       if( hit[i]==0 ){
06862         cnt++;
06863       }else if( hit[i]>1 ){
06864         checkAppendMsg(pCheck, 0,
06865           "Multiple uses for byte %d of page %d", i, iPage);
06866         break;
06867       }
06868     }
06869     if( cnt!=data[hdr+7] ){
06870       checkAppendMsg(pCheck, 0, 
06871           "Fragmented space is %d byte reported as %d on page %d",
06872           cnt, data[hdr+7], iPage);
06873     }
06874   }
06875 check_page_abort:
06876   if (hit) sqlite3PageFree(hit);
06877 
06878   releasePage(pPage);
06879   return depth+1;
06880 }
06881 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
06882 
06883 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
06884 /*
06885 ** This routine does a complete check of the given BTree file.  aRoot[] is
06886 ** an array of pages numbers were each page number is the root page of
06887 ** a table.  nRoot is the number of entries in aRoot.
06888 **
06889 ** Write the number of error seen in *pnErr.  Except for some memory
06890 ** allocation errors,  nn error message is held in memory obtained from
06891 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
06892 ** returned.
06893 */
06894 char *sqlite3BtreeIntegrityCheck(
06895   Btree *p,     /* The btree to be checked */
06896   int *aRoot,   /* An array of root pages numbers for individual trees */
06897   int nRoot,    /* Number of entries in aRoot[] */
06898   int mxErr,    /* Stop reporting errors after this many */
06899   int *pnErr    /* Write number of errors seen to this variable */
06900 ){
06901   int i;
06902   int nRef;
06903   IntegrityCk sCheck;
06904   BtShared *pBt = p->pBt;
06905   char zErr[100];
06906 
06907   sqlite3BtreeEnter(p);
06908   pBt->db = p->db;
06909   nRef = sqlite3PagerRefcount(pBt->pPager);
06910   if( lockBtreeWithRetry(p)!=SQLITE_OK ){
06911     *pnErr = 1;
06912     sqlite3BtreeLeave(p);
06913     return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
06914   }
06915   sCheck.pBt = pBt;
06916   sCheck.pPager = pBt->pPager;
06917   sCheck.nPage = pagerPagecount(sCheck.pPager);
06918   sCheck.mxErr = mxErr;
06919   sCheck.nErr = 0;
06920   sCheck.mallocFailed = 0;
06921   *pnErr = 0;
06922 #ifndef SQLITE_OMIT_AUTOVACUUM
06923   if( pBt->nTrunc!=0 ){
06924     sCheck.nPage = pBt->nTrunc;
06925   }
06926 #endif
06927   if( sCheck.nPage==0 ){
06928     unlockBtreeIfUnused(pBt);
06929     sqlite3BtreeLeave(p);
06930     return 0;
06931   }
06932   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
06933   if( !sCheck.anRef ){
06934     unlockBtreeIfUnused(pBt);
06935     *pnErr = 1;
06936     sqlite3BtreeLeave(p);
06937     return 0;
06938   }
06939   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
06940   i = PENDING_BYTE_PAGE(pBt);
06941   if( i<=sCheck.nPage ){
06942     sCheck.anRef[i] = 1;
06943   }
06944   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
06945 
06946   /* Check the integrity of the freelist
06947   */
06948   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
06949             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
06950 
06951   /* Check all the tables.
06952   */
06953   for(i=0; i<nRoot && sCheck.mxErr; i++){
06954     if( aRoot[i]==0 ) continue;
06955 #ifndef SQLITE_OMIT_AUTOVACUUM
06956     if( pBt->autoVacuum && aRoot[i]>1 ){
06957       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
06958     }
06959 #endif
06960     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
06961   }
06962 
06963   /* Make sure every page in the file is referenced
06964   */
06965   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
06966 #ifdef SQLITE_OMIT_AUTOVACUUM
06967     if( sCheck.anRef[i]==0 ){
06968       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
06969     }
06970 #else
06971     /* If the database supports auto-vacuum, make sure no tables contain
06972     ** references to pointer-map pages.
06973     */
06974     if( sCheck.anRef[i]==0 && 
06975        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
06976       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
06977     }
06978     if( sCheck.anRef[i]!=0 && 
06979        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
06980       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
06981     }
06982 #endif
06983   }
06984 
06985   /* Make sure this analysis did not leave any unref() pages
06986   */
06987   unlockBtreeIfUnused(pBt);
06988   if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
06989     checkAppendMsg(&sCheck, 0, 
06990       "Outstanding page count goes from %d to %d during this analysis",
06991       nRef, sqlite3PagerRefcount(pBt->pPager)
06992     );
06993   }
06994 
06995   /* Clean  up and report errors.
06996   */
06997   sqlite3BtreeLeave(p);
06998   sqlite3_free(sCheck.anRef);
06999   if( sCheck.mallocFailed ){
07000     sqlite3StrAccumReset(&sCheck.errMsg);
07001     *pnErr = sCheck.nErr+1;
07002     return 0;
07003   }
07004   *pnErr = sCheck.nErr;
07005   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
07006   return sqlite3StrAccumFinish(&sCheck.errMsg);
07007 }
07008 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
07009 
07010 /*
07011 ** Return the full pathname of the underlying database file.
07012 **
07013 ** The pager filename is invariant as long as the pager is
07014 ** open so it is safe to access without the BtShared mutex.
07015 */
07016 const char *sqlite3BtreeGetFilename(Btree *p){
07017   assert( p->pBt->pPager!=0 );
07018   return sqlite3PagerFilename(p->pBt->pPager);
07019 }
07020 
07021 /*
07022 ** Return the pathname of the directory that contains the database file.
07023 **
07024 ** The pager directory name is invariant as long as the pager is
07025 ** open so it is safe to access without the BtShared mutex.
07026 */
07027 const char *sqlite3BtreeGetDirname(Btree *p){
07028   assert( p->pBt->pPager!=0 );
07029   return sqlite3PagerDirname(p->pBt->pPager);
07030 }
07031 
07032 /*
07033 ** Return the pathname of the journal file for this database. The return
07034 ** value of this routine is the same regardless of whether the journal file
07035 ** has been created or not.
07036 **
07037 ** The pager journal filename is invariant as long as the pager is
07038 ** open so it is safe to access without the BtShared mutex.
07039 */
07040 const char *sqlite3BtreeGetJournalname(Btree *p){
07041   assert( p->pBt->pPager!=0 );
07042   return sqlite3PagerJournalname(p->pBt->pPager);
07043 }
07044 
07045 #ifndef SQLITE_OMIT_VACUUM
07046 /*
07047 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
07048 ** must be active for both files.
07049 **
07050 ** The size of file pTo may be reduced by this operation.
07051 ** If anything goes wrong, the transaction on pTo is rolled back. 
07052 **
07053 ** If successful, CommitPhaseOne() may be called on pTo before returning. 
07054 ** The caller should finish committing the transaction on pTo by calling
07055 ** sqlite3BtreeCommit().
07056 */
07057 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
07058   int rc = SQLITE_OK;
07059   Pgno i;
07060 
07061   Pgno nFromPage;     /* Number of pages in pFrom */
07062   Pgno nToPage;       /* Number of pages in pTo */
07063   Pgno nNewPage;      /* Number of pages in pTo after the copy */
07064 
07065   Pgno iSkip;         /* Pending byte page in pTo */
07066   int nToPageSize;    /* Page size of pTo in bytes */
07067   int nFromPageSize;  /* Page size of pFrom in bytes */
07068 
07069   BtShared *pBtTo = pTo->pBt;
07070   BtShared *pBtFrom = pFrom->pBt;
07071   pBtTo->db = pTo->db;
07072   pBtFrom->db = pFrom->db;
07073 
07074   nToPageSize = pBtTo->pageSize;
07075   nFromPageSize = pBtFrom->pageSize;
07076 
07077   if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
07078     return SQLITE_ERROR;
07079   }
07080   if( pBtTo->pCursor ){
07081     return SQLITE_BUSY;
07082   }
07083 
07084   nToPage = pagerPagecount(pBtTo->pPager);
07085   nFromPage = pagerPagecount(pBtFrom->pPager);
07086   iSkip = PENDING_BYTE_PAGE(pBtTo);
07087 
07088   /* Variable nNewPage is the number of pages required to store the
07089   ** contents of pFrom using the current page-size of pTo.
07090   */
07091   nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) / 
07092       (i64)nToPageSize;
07093 
07094   for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
07095 
07096     /* Journal the original page.
07097     **
07098     ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
07099     ** in database *pTo (before the copy). This page is never written 
07100     ** into the journal file. Unless i==iSkip or the page was not
07101     ** present in pTo before the copy operation, journal page i from pTo.
07102     */
07103     if( i!=iSkip && i<=nToPage ){
07104       DbPage *pDbPage = 0;
07105       rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
07106       if( rc==SQLITE_OK ){
07107         rc = sqlite3PagerWrite(pDbPage);
07108         if( rc==SQLITE_OK && i>nFromPage ){
07109           /* Yeah.  It seems wierd to call DontWrite() right after Write(). But
07110           ** that is because the names of those procedures do not exactly 
07111           ** represent what they do.  Write() really means "put this page in the
07112           ** rollback journal and mark it as dirty so that it will be written
07113           ** to the database file later."  DontWrite() undoes the second part of
07114           ** that and prevents the page from being written to the database. The
07115           ** page is still on the rollback journal, though.  And that is the 
07116           ** whole point of this block: to put pages on the rollback journal. 
07117           */
07118           rc = sqlite3PagerDontWrite(pDbPage);
07119         }
07120         sqlite3PagerUnref(pDbPage);
07121       }
07122     }
07123 
07124     /* Overwrite the data in page i of the target database */
07125     if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
07126 
07127       DbPage *pToPage = 0;
07128       sqlite3_int64 iOff;
07129 
07130       rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
07131       if( rc==SQLITE_OK ){
07132         rc = sqlite3PagerWrite(pToPage);
07133       }
07134 
07135       for(
07136         iOff=(i-1)*nToPageSize; 
07137         rc==SQLITE_OK && iOff<i*nToPageSize; 
07138         iOff += nFromPageSize
07139       ){
07140         DbPage *pFromPage = 0;
07141         Pgno iFrom = (iOff/nFromPageSize)+1;
07142 
07143         if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
07144           continue;
07145         }
07146 
07147         rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
07148         if( rc==SQLITE_OK ){
07149           char *zTo = sqlite3PagerGetData(pToPage);
07150           char *zFrom = sqlite3PagerGetData(pFromPage);
07151           int nCopy;
07152 
07153           if( nFromPageSize>=nToPageSize ){
07154             zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
07155             nCopy = nToPageSize;
07156           }else{
07157             zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
07158             nCopy = nFromPageSize;
07159           }
07160 
07161           memcpy(zTo, zFrom, nCopy);
07162           sqlite3PagerUnref(pFromPage);
07163         }
07164       }
07165 
07166       if( pToPage ){
07167         MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
07168         p->isInit = 0;
07169         sqlite3PagerUnref(pToPage);
07170       }
07171     }
07172   }
07173 
07174   /* If things have worked so far, the database file may need to be 
07175   ** truncated. The complex part is that it may need to be truncated to
07176   ** a size that is not an integer multiple of nToPageSize - the current
07177   ** page size used by the pager associated with B-Tree pTo.
07178   **
07179   ** For example, say the page-size of pTo is 2048 bytes and the original 
07180   ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024 
07181   ** bytes and 9 pages, then the file needs to be truncated to 9KB.
07182   */
07183   if( rc==SQLITE_OK ){
07184     if( nFromPageSize!=nToPageSize ){
07185       sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
07186       i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
07187       i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize; 
07188       i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
07189   
07190       assert( iSize<=iNow );
07191   
07192       /* Commit phase one syncs the journal file associated with pTo 
07193       ** containing the original data. It does not sync the database file
07194       ** itself. After doing this it is safe to use OsTruncate() and other
07195       ** file APIs on the database file directly.
07196       */
07197       pBtTo->db = pTo->db;
07198       rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
07199       if( iSize<iNow && rc==SQLITE_OK ){
07200         rc = sqlite3OsTruncate(pFile, iSize);
07201       }
07202   
07203       /* The loop that copied data from database pFrom to pTo did not
07204       ** populate the locking page of database pTo. If the page-size of
07205       ** pFrom is smaller than that of pTo, this means some data will
07206       ** not have been copied. 
07207       **
07208       ** This block copies the missing data from database pFrom to pTo 
07209       ** using file APIs. This is safe because at this point we know that
07210       ** all of the original data from pTo has been synced into the 
07211       ** journal file. At this point it would be safe to do anything at
07212       ** all to the database file except truncate it to zero bytes.
07213       */
07214       if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
07215         i64 iOff;
07216         for(
07217           iOff=iPending; 
07218           rc==SQLITE_OK && iOff<(iPending+nToPageSize); 
07219           iOff += nFromPageSize
07220         ){
07221           DbPage *pFromPage = 0;
07222           Pgno iFrom = (iOff/nFromPageSize)+1;
07223   
07224           if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
07225             continue;
07226           }
07227   
07228           rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
07229           if( rc==SQLITE_OK ){
07230             char *zFrom = sqlite3PagerGetData(pFromPage);
07231             rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
07232             sqlite3PagerUnref(pFromPage);
07233           }
07234         }
07235       }
07236   
07237       /* Sync the database file */
07238       if( rc==SQLITE_OK ){
07239         rc = sqlite3PagerSync(pBtTo->pPager);
07240       }
07241     }else{
07242       rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
07243     }
07244     if( rc==SQLITE_OK ){
07245       pBtTo->pageSizeFixed = 0;
07246     }
07247   }
07248 
07249   if( rc ){
07250     sqlite3BtreeRollback(pTo);
07251   }
07252 
07253   return rc;  
07254 }
07255 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
07256   int rc;
07257   sqlite3BtreeEnter(pTo);
07258   sqlite3BtreeEnter(pFrom);
07259   rc = btreeCopyFile(pTo, pFrom);
07260   sqlite3BtreeLeave(pFrom);
07261   sqlite3BtreeLeave(pTo);
07262   return rc;
07263 }
07264 
07265 #endif /* SQLITE_OMIT_VACUUM */
07266 
07267 /*
07268 ** Return non-zero if a transaction is active.
07269 */
07270 int sqlite3BtreeIsInTrans(Btree *p){
07271   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
07272   return (p && (p->inTrans==TRANS_WRITE));
07273 }
07274 
07275 /*
07276 ** Return non-zero if a statement transaction is active.
07277 */
07278 int sqlite3BtreeIsInStmt(Btree *p){
07279   assert( sqlite3BtreeHoldsMutex(p) );
07280   return (p->pBt && p->pBt->inStmt);
07281 }
07282 
07283 /*
07284 ** Return non-zero if a read (or write) transaction is active.
07285 */
07286 int sqlite3BtreeIsInReadTrans(Btree *p){
07287   assert( sqlite3_mutex_held(p->db->mutex) );
07288   return (p && (p->inTrans!=TRANS_NONE));
07289 }
07290 
07291 /*
07292 ** This function returns a pointer to a blob of memory associated with
07293 ** a single shared-btree. The memory is used by client code for its own
07294 ** purposes (for example, to store a high-level schema associated with 
07295 ** the shared-btree). The btree layer manages reference counting issues.
07296 **
07297 ** The first time this is called on a shared-btree, nBytes bytes of memory
07298 ** are allocated, zeroed, and returned to the caller. For each subsequent 
07299 ** call the nBytes parameter is ignored and a pointer to the same blob
07300 ** of memory returned. 
07301 **
07302 ** If the nBytes parameter is 0 and the blob of memory has not yet been
07303 ** allocated, a null pointer is returned. If the blob has already been
07304 ** allocated, it is returned as normal.
07305 **
07306 ** Just before the shared-btree is closed, the function passed as the 
07307 ** xFree argument when the memory allocation was made is invoked on the 
07308 ** blob of allocated memory. This function should not call sqlite3_free()
07309 ** on the memory, the btree layer does that.
07310 */
07311 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
07312   BtShared *pBt = p->pBt;
07313   sqlite3BtreeEnter(p);
07314   if( !pBt->pSchema && nBytes ){
07315     pBt->pSchema = sqlite3MallocZero(nBytes);
07316     pBt->xFreeSchema = xFree;
07317   }
07318   sqlite3BtreeLeave(p);
07319   return pBt->pSchema;
07320 }
07321 
07322 /*
07323 ** Return true if another user of the same shared btree as the argument
07324 ** handle holds an exclusive lock on the sqlite_master table.
07325 */
07326 int sqlite3BtreeSchemaLocked(Btree *p){
07327   int rc;
07328   assert( sqlite3_mutex_held(p->db->mutex) );
07329   sqlite3BtreeEnter(p);
07330   rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
07331   sqlite3BtreeLeave(p);
07332   return rc;
07333 }
07334 
07335 
07336 #ifndef SQLITE_OMIT_SHARED_CACHE
07337 /*
07338 ** Obtain a lock on the table whose root page is iTab.  The
07339 ** lock is a write lock if isWritelock is true or a read lock
07340 ** if it is false.
07341 */
07342 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
07343   int rc = SQLITE_OK;
07344   if( p->sharable ){
07345     u8 lockType = READ_LOCK + isWriteLock;
07346     assert( READ_LOCK+1==WRITE_LOCK );
07347     assert( isWriteLock==0 || isWriteLock==1 );
07348     sqlite3BtreeEnter(p);
07349     rc = queryTableLock(p, iTab, lockType);
07350     if( rc==SQLITE_OK ){
07351       rc = lockTable(p, iTab, lockType);
07352     }
07353     sqlite3BtreeLeave(p);
07354   }
07355   return rc;
07356 }
07357 #endif
07358 
07359 #ifndef SQLITE_OMIT_INCRBLOB
07360 /*
07361 ** Argument pCsr must be a cursor opened for writing on an 
07362 ** INTKEY table currently pointing at a valid table entry. 
07363 ** This function modifies the data stored as part of that entry.
07364 ** Only the data content may only be modified, it is not possible
07365 ** to change the length of the data stored.
07366 */
07367 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
07368   assert( cursorHoldsMutex(pCsr) );
07369   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
07370   assert(pCsr->isIncrblobHandle);
07371 
07372   restoreCursorPosition(pCsr);
07373   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
07374   if( pCsr->eState!=CURSOR_VALID ){
07375     return SQLITE_ABORT;
07376   }
07377 
07378   /* Check some preconditions: 
07379   **   (a) the cursor is open for writing,
07380   **   (b) there is no read-lock on the table being modified and
07381   **   (c) the cursor points at a valid row of an intKey table.
07382   */
07383   if( !pCsr->wrFlag ){
07384     return SQLITE_READONLY;
07385   }
07386   assert( !pCsr->pBt->readOnly 
07387           && pCsr->pBt->inTransaction==TRANS_WRITE );
07388   if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
07389     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
07390   }
07391   if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
07392     return SQLITE_ERROR;
07393   }
07394 
07395   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
07396 }
07397 
07398 /* 
07399 ** Set a flag on this cursor to cache the locations of pages from the 
07400 ** overflow list for the current row. This is used by cursors opened
07401 ** for incremental blob IO only.
07402 **
07403 ** This function sets a flag only. The actual page location cache
07404 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
07405 ** accessPayload() (the worker function for sqlite3BtreeData() and
07406 ** sqlite3BtreePutData()).
07407 */
07408 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
07409   assert( cursorHoldsMutex(pCur) );
07410   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
07411   assert(!pCur->isIncrblobHandle);
07412   assert(!pCur->aOverflow);
07413   pCur->isIncrblobHandle = 1;
07414 }
07415 #endif

ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:52 2011 by Doxygen 1.6.1