fts1.c

Go to the documentation of this file.
00001 /* fts1 has a design flaw which can lead to database corruption (see
00002 ** below).  It is recommended not to use it any longer, instead use
00003 ** fts3 (or higher).  If you believe that your use of fts1 is safe,
00004 ** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
00005 */
00006 #if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)) \
00007         && !defined(SQLITE_ENABLE_BROKEN_FTS1)
00008 #error fts1 has a design flaw and has been deprecated.
00009 #endif
00010 /* The flaw is that fts1 uses the content table's unaliased rowid as
00011 ** the unique docid.  fts1 embeds the rowid in the index it builds,
00012 ** and expects the rowid to not change.  The SQLite VACUUM operation
00013 ** will renumber such rowids, thereby breaking fts1.  If you are using
00014 ** fts1 in a system which has disabled VACUUM, then you can continue
00015 ** to use it safely.  Note that PRAGMA auto_vacuum does NOT disable
00016 ** VACUUM, though systems using auto_vacuum are unlikely to invoke
00017 ** VACUUM.
00018 **
00019 ** fts1 should be safe even across VACUUM if you only insert documents
00020 ** and never delete.
00021 */
00022 
00023 /* The author disclaims copyright to this source code.
00024  *
00025  * This is an SQLite module implementing full-text search.
00026  */
00027 
00028 /*
00029 ** The code in this file is only compiled if:
00030 **
00031 **     * The FTS1 module is being built as an extension
00032 **       (in which case SQLITE_CORE is not defined), or
00033 **
00034 **     * The FTS1 module is being built into the core of
00035 **       SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
00036 */
00037 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
00038 
00039 #if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
00040 # define SQLITE_CORE 1
00041 #endif
00042 
00043 #include <assert.h>
00044 #include <stdlib.h>
00045 #include <stdio.h>
00046 #include <string.h>
00047 #include <ctype.h>
00048 
00049 #include "fts1.h"
00050 #include "fts1_hash.h"
00051 #include "fts1_tokenizer.h"
00052 #include "sqlite3.h"
00053 #include "sqlite3ext.h"
00054 SQLITE_EXTENSION_INIT1
00055 
00056 
00057 #if 0
00058 # define TRACE(A)  printf A; fflush(stdout)
00059 #else
00060 # define TRACE(A)
00061 #endif
00062 
00063 /* utility functions */
00064 
00065 typedef struct StringBuffer {
00066   int len;      /* length, not including null terminator */
00067   int alloced;  /* Space allocated for s[] */ 
00068   char *s;      /* Content of the string */
00069 } StringBuffer;
00070 
00071 static void initStringBuffer(StringBuffer *sb){
00072   sb->len = 0;
00073   sb->alloced = 100;
00074   sb->s = malloc(100);
00075   sb->s[0] = '\0';
00076 }
00077 
00078 static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
00079   if( sb->len + nFrom >= sb->alloced ){
00080     sb->alloced = sb->len + nFrom + 100;
00081     sb->s = realloc(sb->s, sb->alloced+1);
00082     if( sb->s==0 ){
00083       initStringBuffer(sb);
00084       return;
00085     }
00086   }
00087   memcpy(sb->s + sb->len, zFrom, nFrom);
00088   sb->len += nFrom;
00089   sb->s[sb->len] = 0;
00090 }
00091 static void append(StringBuffer *sb, const char *zFrom){
00092   nappend(sb, zFrom, strlen(zFrom));
00093 }
00094 
00095 /* We encode variable-length integers in little-endian order using seven bits
00096  * per byte as follows:
00097 **
00098 ** KEY:
00099 **         A = 0xxxxxxx    7 bits of data and one flag bit
00100 **         B = 1xxxxxxx    7 bits of data and one flag bit
00101 **
00102 **  7 bits - A
00103 ** 14 bits - BA
00104 ** 21 bits - BBA
00105 ** and so on.
00106 */
00107 
00108 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
00109 #define VARINT_MAX 10
00110 
00111 /* Write a 64-bit variable-length integer to memory starting at p[0].
00112  * The length of data written will be between 1 and VARINT_MAX bytes.
00113  * The number of bytes written is returned. */
00114 static int putVarint(char *p, sqlite_int64 v){
00115   unsigned char *q = (unsigned char *) p;
00116   sqlite_uint64 vu = v;
00117   do{
00118     *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
00119     vu >>= 7;
00120   }while( vu!=0 );
00121   q[-1] &= 0x7f;  /* turn off high bit in final byte */
00122   assert( q - (unsigned char *)p <= VARINT_MAX );
00123   return (int) (q - (unsigned char *)p);
00124 }
00125 
00126 /* Read a 64-bit variable-length integer from memory starting at p[0].
00127  * Return the number of bytes read, or 0 on error.
00128  * The value is stored in *v. */
00129 static int getVarint(const char *p, sqlite_int64 *v){
00130   const unsigned char *q = (const unsigned char *) p;
00131   sqlite_uint64 x = 0, y = 1;
00132   while( (*q & 0x80) == 0x80 ){
00133     x += y * (*q++ & 0x7f);
00134     y <<= 7;
00135     if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
00136       assert( 0 );
00137       return 0;
00138     }
00139   }
00140   x += y * (*q++);
00141   *v = (sqlite_int64) x;
00142   return (int) (q - (unsigned char *)p);
00143 }
00144 
00145 static int getVarint32(const char *p, int *pi){
00146  sqlite_int64 i;
00147  int ret = getVarint(p, &i);
00148  *pi = (int) i;
00149  assert( *pi==i );
00150  return ret;
00151 }
00152 
00153 /*** Document lists ***
00154  *
00155  * A document list holds a sorted list of varint-encoded document IDs.
00156  *
00157  * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
00158  *
00159  * array {
00160  *   varint docid;
00161  *   array {
00162  *     varint position;     (delta from previous position plus POS_BASE)
00163  *     varint startOffset;  (delta from previous startOffset)
00164  *     varint endOffset;    (delta from startOffset)
00165  *   }
00166  * }
00167  *
00168  * Here, array { X } means zero or more occurrences of X, adjacent in memory.
00169  *
00170  * A position list may hold positions for text in multiple columns.  A position
00171  * POS_COLUMN is followed by a varint containing the index of the column for
00172  * following positions in the list.  Any positions appearing before any
00173  * occurrences of POS_COLUMN are for column 0.
00174  *
00175  * A doclist with type DL_POSITIONS is like the above, but holds only docids
00176  * and positions without offset information.
00177  *
00178  * A doclist with type DL_DOCIDS is like the above, but holds only docids
00179  * without positions or offset information.
00180  *
00181  * On disk, every document list has positions and offsets, so we don't bother
00182  * to serialize a doclist's type.
00183  * 
00184  * We don't yet delta-encode document IDs; doing so will probably be a
00185  * modest win.
00186  *
00187  * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
00188  * After the first offset, estimate the next offset by using the
00189  * current token position and the previous token position and offset,
00190  * offset to handle some variance.  So the estimate would be
00191  * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
00192  * as normal.  Offsets more than 64 chars from the estimate are
00193  * encoded as the delta to the previous start offset + 128.  An
00194  * additional tiny increment can be gained by using the end offset of
00195  * the previous token to make the estimate a tiny bit more precise.
00196 */
00197 
00198 /* It is not safe to call isspace(), tolower(), or isalnum() on
00199 ** hi-bit-set characters.  This is the same solution used in the
00200 ** tokenizer.
00201 */
00202 /* TODO(shess) The snippet-generation code should be using the
00203 ** tokenizer-generated tokens rather than doing its own local
00204 ** tokenization.
00205 */
00206 /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
00207 static int safe_isspace(char c){
00208   return (c&0x80)==0 ? isspace(c) : 0;
00209 }
00210 static int safe_tolower(char c){
00211   return (c&0x80)==0 ? tolower(c) : c;
00212 }
00213 static int safe_isalnum(char c){
00214   return (c&0x80)==0 ? isalnum(c) : 0;
00215 }
00216 
00217 typedef enum DocListType {
00218   DL_DOCIDS,              /* docids only */
00219   DL_POSITIONS,           /* docids + positions */
00220   DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
00221 } DocListType;
00222 
00223 /*
00224 ** By default, only positions and not offsets are stored in the doclists.
00225 ** To change this so that offsets are stored too, compile with
00226 **
00227 **          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
00228 **
00229 */
00230 #ifndef DL_DEFAULT
00231 # define DL_DEFAULT DL_POSITIONS
00232 #endif
00233 
00234 typedef struct DocList {
00235   char *pData;
00236   int nData;
00237   DocListType iType;
00238   int iLastColumn;    /* the last column written */
00239   int iLastPos;       /* the last position written */
00240   int iLastOffset;    /* the last start offset written */
00241 } DocList;
00242 
00243 enum {
00244   POS_END = 0,        /* end of this position list */
00245   POS_COLUMN,         /* followed by new column number */
00246   POS_BASE
00247 };
00248 
00249 /* Initialize a new DocList to hold the given data. */
00250 static void docListInit(DocList *d, DocListType iType,
00251                         const char *pData, int nData){
00252   d->nData = nData;
00253   if( nData>0 ){
00254     d->pData = malloc(nData);
00255     memcpy(d->pData, pData, nData);
00256   } else {
00257     d->pData = NULL;
00258   }
00259   d->iType = iType;
00260   d->iLastColumn = 0;
00261   d->iLastPos = d->iLastOffset = 0;
00262 }
00263 
00264 /* Create a new dynamically-allocated DocList. */
00265 static DocList *docListNew(DocListType iType){
00266   DocList *d = (DocList *) malloc(sizeof(DocList));
00267   docListInit(d, iType, 0, 0);
00268   return d;
00269 }
00270 
00271 static void docListDestroy(DocList *d){
00272   free(d->pData);
00273 #ifndef NDEBUG
00274   memset(d, 0x55, sizeof(*d));
00275 #endif
00276 }
00277 
00278 static void docListDelete(DocList *d){
00279   docListDestroy(d);
00280   free(d);
00281 }
00282 
00283 static char *docListEnd(DocList *d){
00284   return d->pData + d->nData;
00285 }
00286 
00287 /* Append a varint to a DocList's data. */
00288 static void appendVarint(DocList *d, sqlite_int64 i){
00289   char c[VARINT_MAX];
00290   int n = putVarint(c, i);
00291   d->pData = realloc(d->pData, d->nData + n);
00292   memcpy(d->pData + d->nData, c, n);
00293   d->nData += n;
00294 }
00295 
00296 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
00297   appendVarint(d, iDocid);
00298   if( d->iType>=DL_POSITIONS ){
00299     appendVarint(d, POS_END);  /* initially empty position list */
00300     d->iLastColumn = 0;
00301     d->iLastPos = d->iLastOffset = 0;
00302   }
00303 }
00304 
00305 /* helper function for docListAddPos and docListAddPosOffset */
00306 static void addPos(DocList *d, int iColumn, int iPos){
00307   assert( d->nData>0 );
00308   --d->nData;  /* remove previous terminator */
00309   if( iColumn!=d->iLastColumn ){
00310     assert( iColumn>d->iLastColumn );
00311     appendVarint(d, POS_COLUMN);
00312     appendVarint(d, iColumn);
00313     d->iLastColumn = iColumn;
00314     d->iLastPos = d->iLastOffset = 0;
00315   }
00316   assert( iPos>=d->iLastPos );
00317   appendVarint(d, iPos-d->iLastPos+POS_BASE);
00318   d->iLastPos = iPos;
00319 }
00320 
00321 /* Add a position to the last position list in a doclist. */
00322 static void docListAddPos(DocList *d, int iColumn, int iPos){
00323   assert( d->iType==DL_POSITIONS );
00324   addPos(d, iColumn, iPos);
00325   appendVarint(d, POS_END);  /* add new terminator */
00326 }
00327 
00328 /*
00329 ** Add a position and starting and ending offsets to a doclist.
00330 **
00331 ** If the doclist is setup to handle only positions, then insert
00332 ** the position only and ignore the offsets.
00333 */
00334 static void docListAddPosOffset(
00335   DocList *d,             /* Doclist under construction */
00336   int iColumn,            /* Column the inserted term is part of */
00337   int iPos,               /* Position of the inserted term */
00338   int iStartOffset,       /* Starting offset of inserted term */
00339   int iEndOffset          /* Ending offset of inserted term */
00340 ){
00341   assert( d->iType>=DL_POSITIONS );
00342   addPos(d, iColumn, iPos);
00343   if( d->iType==DL_POSITIONS_OFFSETS ){
00344     assert( iStartOffset>=d->iLastOffset );
00345     appendVarint(d, iStartOffset-d->iLastOffset);
00346     d->iLastOffset = iStartOffset;
00347     assert( iEndOffset>=iStartOffset );
00348     appendVarint(d, iEndOffset-iStartOffset);
00349   }
00350   appendVarint(d, POS_END);  /* add new terminator */
00351 }
00352 
00353 /*
00354 ** A DocListReader object is a cursor into a doclist.  Initialize
00355 ** the cursor to the beginning of the doclist by calling readerInit().
00356 ** Then use routines
00357 **
00358 **      peekDocid()
00359 **      readDocid()
00360 **      readPosition()
00361 **      skipPositionList()
00362 **      and so forth...
00363 **
00364 ** to read information out of the doclist.  When we reach the end
00365 ** of the doclist, atEnd() returns TRUE.
00366 */
00367 typedef struct DocListReader {
00368   DocList *pDoclist;  /* The document list we are stepping through */
00369   char *p;            /* Pointer to next unread byte in the doclist */
00370   int iLastColumn;
00371   int iLastPos;  /* the last position read, or -1 when not in a position list */
00372 } DocListReader;
00373 
00374 /*
00375 ** Initialize the DocListReader r to point to the beginning of pDoclist.
00376 */
00377 static void readerInit(DocListReader *r, DocList *pDoclist){
00378   r->pDoclist = pDoclist;
00379   if( pDoclist!=NULL ){
00380     r->p = pDoclist->pData;
00381   }
00382   r->iLastColumn = -1;
00383   r->iLastPos = -1;
00384 }
00385 
00386 /*
00387 ** Return TRUE if we have reached then end of pReader and there is
00388 ** nothing else left to read.
00389 */
00390 static int atEnd(DocListReader *pReader){
00391   return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
00392 }
00393 
00394 /* Peek at the next docid without advancing the read pointer. 
00395 */
00396 static sqlite_int64 peekDocid(DocListReader *pReader){
00397   sqlite_int64 ret;
00398   assert( !atEnd(pReader) );
00399   assert( pReader->iLastPos==-1 );
00400   getVarint(pReader->p, &ret);
00401   return ret;
00402 }
00403 
00404 /* Read the next docid.   See also nextDocid().
00405 */
00406 static sqlite_int64 readDocid(DocListReader *pReader){
00407   sqlite_int64 ret;
00408   assert( !atEnd(pReader) );
00409   assert( pReader->iLastPos==-1 );
00410   pReader->p += getVarint(pReader->p, &ret);
00411   if( pReader->pDoclist->iType>=DL_POSITIONS ){
00412     pReader->iLastColumn = 0;
00413     pReader->iLastPos = 0;
00414   }
00415   return ret;
00416 }
00417 
00418 /* Read the next position and column index from a position list.
00419  * Returns the position, or -1 at the end of the list. */
00420 static int readPosition(DocListReader *pReader, int *iColumn){
00421   int i;
00422   int iType = pReader->pDoclist->iType;
00423 
00424   if( pReader->iLastPos==-1 ){
00425     return -1;
00426   }
00427   assert( !atEnd(pReader) );
00428 
00429   if( iType<DL_POSITIONS ){
00430     return -1;
00431   }
00432   pReader->p += getVarint32(pReader->p, &i);
00433   if( i==POS_END ){
00434     pReader->iLastColumn = pReader->iLastPos = -1;
00435     *iColumn = -1;
00436     return -1;
00437   }
00438   if( i==POS_COLUMN ){
00439     pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
00440     pReader->iLastPos = 0;
00441     pReader->p += getVarint32(pReader->p, &i);
00442     assert( i>=POS_BASE );
00443   }
00444   pReader->iLastPos += ((int) i)-POS_BASE;
00445   if( iType>=DL_POSITIONS_OFFSETS ){
00446     /* Skip over offsets, ignoring them for now. */
00447     int iStart, iEnd;
00448     pReader->p += getVarint32(pReader->p, &iStart);
00449     pReader->p += getVarint32(pReader->p, &iEnd);
00450   }
00451   *iColumn = pReader->iLastColumn;
00452   return pReader->iLastPos;
00453 }
00454 
00455 /* Skip past the end of a position list. */
00456 static void skipPositionList(DocListReader *pReader){
00457   DocList *p = pReader->pDoclist;
00458   if( p && p->iType>=DL_POSITIONS ){
00459     int iColumn;
00460     while( readPosition(pReader, &iColumn)!=-1 ){}
00461   }
00462 }
00463 
00464 /* Skip over a docid, including its position list if the doclist has
00465  * positions. */
00466 static void skipDocument(DocListReader *pReader){
00467   readDocid(pReader);
00468   skipPositionList(pReader);
00469 }
00470 
00471 /* Skip past all docids which are less than [iDocid].  Returns 1 if a docid
00472  * matching [iDocid] was found.  */
00473 static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
00474   sqlite_int64 d = 0;
00475   while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
00476     skipDocument(pReader);
00477   }
00478   return !atEnd(pReader) && d==iDocid;
00479 }
00480 
00481 /* Return the first document in a document list.
00482 */
00483 static sqlite_int64 firstDocid(DocList *d){
00484   DocListReader r;
00485   readerInit(&r, d);
00486   return readDocid(&r);
00487 }
00488 
00489 #ifdef SQLITE_DEBUG
00490 /*
00491 ** This routine is used for debugging purpose only.
00492 **
00493 ** Write the content of a doclist to standard output.
00494 */
00495 static void printDoclist(DocList *p){
00496   DocListReader r;
00497   const char *zSep = "";
00498 
00499   readerInit(&r, p);
00500   while( !atEnd(&r) ){
00501     sqlite_int64 docid = readDocid(&r);
00502     if( docid==0 ){
00503       skipPositionList(&r);
00504       continue;
00505     }
00506     printf("%s%lld", zSep, docid);
00507     zSep =  ",";
00508     if( p->iType>=DL_POSITIONS ){
00509       int iPos, iCol;
00510       const char *zDiv = "";
00511       printf("(");
00512       while( (iPos = readPosition(&r, &iCol))>=0 ){
00513         printf("%s%d:%d", zDiv, iCol, iPos);
00514         zDiv = ":";
00515       }
00516       printf(")");
00517     }
00518   }
00519   printf("\n");
00520   fflush(stdout);
00521 }
00522 #endif /* SQLITE_DEBUG */
00523 
00524 /* Trim the given doclist to contain only positions in column
00525  * [iRestrictColumn]. */
00526 static void docListRestrictColumn(DocList *in, int iRestrictColumn){
00527   DocListReader r;
00528   DocList out;
00529 
00530   assert( in->iType>=DL_POSITIONS );
00531   readerInit(&r, in);
00532   docListInit(&out, DL_POSITIONS, NULL, 0);
00533 
00534   while( !atEnd(&r) ){
00535     sqlite_int64 iDocid = readDocid(&r);
00536     int iPos, iColumn;
00537 
00538     docListAddDocid(&out, iDocid);
00539     while( (iPos = readPosition(&r, &iColumn)) != -1 ){
00540       if( iColumn==iRestrictColumn ){
00541         docListAddPos(&out, iColumn, iPos);
00542       }
00543     }
00544   }
00545 
00546   docListDestroy(in);
00547   *in = out;
00548 }
00549 
00550 /* Trim the given doclist by discarding any docids without any remaining
00551  * positions. */
00552 static void docListDiscardEmpty(DocList *in) {
00553   DocListReader r;
00554   DocList out;
00555 
00556   /* TODO: It would be nice to implement this operation in place; that
00557    * could save a significant amount of memory in queries with long doclists. */
00558   assert( in->iType>=DL_POSITIONS );
00559   readerInit(&r, in);
00560   docListInit(&out, DL_POSITIONS, NULL, 0);
00561 
00562   while( !atEnd(&r) ){
00563     sqlite_int64 iDocid = readDocid(&r);
00564     int match = 0;
00565     int iPos, iColumn;
00566     while( (iPos = readPosition(&r, &iColumn)) != -1 ){
00567       if( !match ){
00568         docListAddDocid(&out, iDocid);
00569         match = 1;
00570       }
00571       docListAddPos(&out, iColumn, iPos);
00572     }
00573   }
00574 
00575   docListDestroy(in);
00576   *in = out;
00577 }
00578 
00579 /* Helper function for docListUpdate() and docListAccumulate().
00580 ** Splices a doclist element into the doclist represented by r,
00581 ** leaving r pointing after the newly spliced element.
00582 */
00583 static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
00584                                  const char *pSource, int nSource){
00585   DocList *d = r->pDoclist;
00586   char *pTarget;
00587   int nTarget, found;
00588 
00589   found = skipToDocid(r, iDocid);
00590 
00591   /* Describe slice in d to place pSource/nSource. */
00592   pTarget = r->p;
00593   if( found ){
00594     skipDocument(r);
00595     nTarget = r->p-pTarget;
00596   }else{
00597     nTarget = 0;
00598   }
00599 
00600   /* The sense of the following is that there are three possibilities.
00601   ** If nTarget==nSource, we should not move any memory nor realloc.
00602   ** If nTarget>nSource, trim target and realloc.
00603   ** If nTarget<nSource, realloc then expand target.
00604   */
00605   if( nTarget>nSource ){
00606     memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
00607   }
00608   if( nTarget!=nSource ){
00609     int iDoclist = pTarget-d->pData;
00610     d->pData = realloc(d->pData, d->nData+nSource-nTarget);
00611     pTarget = d->pData+iDoclist;
00612   }
00613   if( nTarget<nSource ){
00614     memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
00615   }
00616 
00617   memcpy(pTarget, pSource, nSource);
00618   d->nData += nSource-nTarget;
00619   r->p = pTarget+nSource;
00620 }
00621 
00622 /* Insert/update pUpdate into the doclist. */
00623 static void docListUpdate(DocList *d, DocList *pUpdate){
00624   DocListReader reader;
00625 
00626   assert( d!=NULL && pUpdate!=NULL );
00627   assert( d->iType==pUpdate->iType);
00628 
00629   readerInit(&reader, d);
00630   docListSpliceElement(&reader, firstDocid(pUpdate),
00631                        pUpdate->pData, pUpdate->nData);
00632 }
00633 
00634 /* Propagate elements from pUpdate to pAcc, overwriting elements with
00635 ** matching docids.
00636 */
00637 static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
00638   DocListReader accReader, updateReader;
00639 
00640   /* Handle edge cases where one doclist is empty. */
00641   assert( pAcc!=NULL );
00642   if( pUpdate==NULL || pUpdate->nData==0 ) return;
00643   if( pAcc->nData==0 ){
00644     pAcc->pData = malloc(pUpdate->nData);
00645     memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
00646     pAcc->nData = pUpdate->nData;
00647     return;
00648   }
00649 
00650   readerInit(&accReader, pAcc);
00651   readerInit(&updateReader, pUpdate);
00652 
00653   while( !atEnd(&updateReader) ){
00654     char *pSource = updateReader.p;
00655     sqlite_int64 iDocid = readDocid(&updateReader);
00656     skipPositionList(&updateReader);
00657     docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
00658   }
00659 }
00660 
00661 /*
00662 ** Read the next docid off of pIn.  Return 0 if we reach the end.
00663 *
00664 * TODO: This assumes that docids are never 0, but they may actually be 0 since
00665 * users can choose docids when inserting into a full-text table.  Fix this.
00666 */
00667 static sqlite_int64 nextDocid(DocListReader *pIn){
00668   skipPositionList(pIn);
00669   return atEnd(pIn) ? 0 : readDocid(pIn);
00670 }
00671 
00672 /*
00673 ** pLeft and pRight are two DocListReaders that are pointing to
00674 ** positions lists of the same document: iDocid. 
00675 **
00676 ** If there are no instances in pLeft or pRight where the position
00677 ** of pLeft is one less than the position of pRight, then this
00678 ** routine adds nothing to pOut.
00679 **
00680 ** If there are one or more instances where positions from pLeft
00681 ** are exactly one less than positions from pRight, then add a new
00682 ** document record to pOut.  If pOut wants to hold positions, then
00683 ** include the positions from pRight that are one more than a
00684 ** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1.
00685 **
00686 ** pLeft and pRight are left pointing at the next document record.
00687 */
00688 static void mergePosList(
00689   DocListReader *pLeft,    /* Left position list */
00690   DocListReader *pRight,   /* Right position list */
00691   sqlite_int64 iDocid,     /* The docid from pLeft and pRight */
00692   DocList *pOut            /* Write the merged document record here */
00693 ){
00694   int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
00695   int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
00696   int match = 0;
00697 
00698   /* Loop until we've reached the end of both position lists. */
00699   while( iLeftPos!=-1 && iRightPos!=-1 ){
00700     if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
00701       if( !match ){
00702         docListAddDocid(pOut, iDocid);
00703         match = 1;
00704       }
00705       if( pOut->iType>=DL_POSITIONS ){
00706         docListAddPos(pOut, iRightCol, iRightPos);
00707       }
00708       iLeftPos = readPosition(pLeft, &iLeftCol);
00709       iRightPos = readPosition(pRight, &iRightCol);
00710     }else if( iRightCol<iLeftCol ||
00711               (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
00712       iRightPos = readPosition(pRight, &iRightCol);
00713     }else{
00714       iLeftPos = readPosition(pLeft, &iLeftCol);
00715     }
00716   }
00717   if( iLeftPos>=0 ) skipPositionList(pLeft);
00718   if( iRightPos>=0 ) skipPositionList(pRight);
00719 }
00720 
00721 /* We have two doclists:  pLeft and pRight.
00722 ** Write the phrase intersection of these two doclists into pOut.
00723 **
00724 ** A phrase intersection means that two documents only match
00725 ** if pLeft.iPos+1==pRight.iPos.
00726 **
00727 ** The output pOut may or may not contain positions.  If pOut
00728 ** does contain positions, they are the positions of pRight.
00729 */
00730 static void docListPhraseMerge(
00731   DocList *pLeft,    /* Doclist resulting from the words on the left */
00732   DocList *pRight,   /* Doclist for the next word to the right */
00733   DocList *pOut      /* Write the combined doclist here */
00734 ){
00735   DocListReader left, right;
00736   sqlite_int64 docidLeft, docidRight;
00737 
00738   readerInit(&left, pLeft);
00739   readerInit(&right, pRight);
00740   docidLeft = nextDocid(&left);
00741   docidRight = nextDocid(&right);
00742 
00743   while( docidLeft>0 && docidRight>0 ){
00744     if( docidLeft<docidRight ){
00745       docidLeft = nextDocid(&left);
00746     }else if( docidRight<docidLeft ){
00747       docidRight = nextDocid(&right);
00748     }else{
00749       mergePosList(&left, &right, docidLeft, pOut);
00750       docidLeft = nextDocid(&left);
00751       docidRight = nextDocid(&right);
00752     }
00753   }
00754 }
00755 
00756 /* We have two doclists:  pLeft and pRight.
00757 ** Write the intersection of these two doclists into pOut.
00758 ** Only docids are matched.  Position information is ignored.
00759 **
00760 ** The output pOut never holds positions.
00761 */
00762 static void docListAndMerge(
00763   DocList *pLeft,    /* Doclist resulting from the words on the left */
00764   DocList *pRight,   /* Doclist for the next word to the right */
00765   DocList *pOut      /* Write the combined doclist here */
00766 ){
00767   DocListReader left, right;
00768   sqlite_int64 docidLeft, docidRight;
00769 
00770   assert( pOut->iType<DL_POSITIONS );
00771 
00772   readerInit(&left, pLeft);
00773   readerInit(&right, pRight);
00774   docidLeft = nextDocid(&left);
00775   docidRight = nextDocid(&right);
00776 
00777   while( docidLeft>0 && docidRight>0 ){
00778     if( docidLeft<docidRight ){
00779       docidLeft = nextDocid(&left);
00780     }else if( docidRight<docidLeft ){
00781       docidRight = nextDocid(&right);
00782     }else{
00783       docListAddDocid(pOut, docidLeft);
00784       docidLeft = nextDocid(&left);
00785       docidRight = nextDocid(&right);
00786     }
00787   }
00788 }
00789 
00790 /* We have two doclists:  pLeft and pRight.
00791 ** Write the union of these two doclists into pOut.
00792 ** Only docids are matched.  Position information is ignored.
00793 **
00794 ** The output pOut never holds positions.
00795 */
00796 static void docListOrMerge(
00797   DocList *pLeft,    /* Doclist resulting from the words on the left */
00798   DocList *pRight,   /* Doclist for the next word to the right */
00799   DocList *pOut      /* Write the combined doclist here */
00800 ){
00801   DocListReader left, right;
00802   sqlite_int64 docidLeft, docidRight, priorLeft;
00803 
00804   readerInit(&left, pLeft);
00805   readerInit(&right, pRight);
00806   docidLeft = nextDocid(&left);
00807   docidRight = nextDocid(&right);
00808 
00809   while( docidLeft>0 && docidRight>0 ){
00810     if( docidLeft<=docidRight ){
00811       docListAddDocid(pOut, docidLeft);
00812     }else{
00813       docListAddDocid(pOut, docidRight);
00814     }
00815     priorLeft = docidLeft;
00816     if( docidLeft<=docidRight ){
00817       docidLeft = nextDocid(&left);
00818     }
00819     if( docidRight>0 && docidRight<=priorLeft ){
00820       docidRight = nextDocid(&right);
00821     }
00822   }
00823   while( docidLeft>0 ){
00824     docListAddDocid(pOut, docidLeft);
00825     docidLeft = nextDocid(&left);
00826   }
00827   while( docidRight>0 ){
00828     docListAddDocid(pOut, docidRight);
00829     docidRight = nextDocid(&right);
00830   }
00831 }
00832 
00833 /* We have two doclists:  pLeft and pRight.
00834 ** Write into pOut all documents that occur in pLeft but not
00835 ** in pRight.
00836 **
00837 ** Only docids are matched.  Position information is ignored.
00838 **
00839 ** The output pOut never holds positions.
00840 */
00841 static void docListExceptMerge(
00842   DocList *pLeft,    /* Doclist resulting from the words on the left */
00843   DocList *pRight,   /* Doclist for the next word to the right */
00844   DocList *pOut      /* Write the combined doclist here */
00845 ){
00846   DocListReader left, right;
00847   sqlite_int64 docidLeft, docidRight, priorLeft;
00848 
00849   readerInit(&left, pLeft);
00850   readerInit(&right, pRight);
00851   docidLeft = nextDocid(&left);
00852   docidRight = nextDocid(&right);
00853 
00854   while( docidLeft>0 && docidRight>0 ){
00855     priorLeft = docidLeft;
00856     if( docidLeft<docidRight ){
00857       docListAddDocid(pOut, docidLeft);
00858     }
00859     if( docidLeft<=docidRight ){
00860       docidLeft = nextDocid(&left);
00861     }
00862     if( docidRight>0 && docidRight<=priorLeft ){
00863       docidRight = nextDocid(&right);
00864     }
00865   }
00866   while( docidLeft>0 ){
00867     docListAddDocid(pOut, docidLeft);
00868     docidLeft = nextDocid(&left);
00869   }
00870 }
00871 
00872 static char *string_dup_n(const char *s, int n){
00873   char *str = malloc(n + 1);
00874   memcpy(str, s, n);
00875   str[n] = '\0';
00876   return str;
00877 }
00878 
00879 /* Duplicate a string; the caller must free() the returned string.
00880  * (We don't use strdup() since it is not part of the standard C library and
00881  * may not be available everywhere.) */
00882 static char *string_dup(const char *s){
00883   return string_dup_n(s, strlen(s));
00884 }
00885 
00886 /* Format a string, replacing each occurrence of the % character with
00887  * zDb.zName.  This may be more convenient than sqlite_mprintf()
00888  * when one string is used repeatedly in a format string.
00889  * The caller must free() the returned string. */
00890 static char *string_format(const char *zFormat,
00891                            const char *zDb, const char *zName){
00892   const char *p;
00893   size_t len = 0;
00894   size_t nDb = strlen(zDb);
00895   size_t nName = strlen(zName);
00896   size_t nFullTableName = nDb+1+nName;
00897   char *result;
00898   char *r;
00899 
00900   /* first compute length needed */
00901   for(p = zFormat ; *p ; ++p){
00902     len += (*p=='%' ? nFullTableName : 1);
00903   }
00904   len += 1;  /* for null terminator */
00905 
00906   r = result = malloc(len);
00907   for(p = zFormat; *p; ++p){
00908     if( *p=='%' ){
00909       memcpy(r, zDb, nDb);
00910       r += nDb;
00911       *r++ = '.';
00912       memcpy(r, zName, nName);
00913       r += nName;
00914     } else {
00915       *r++ = *p;
00916     }
00917   }
00918   *r++ = '\0';
00919   assert( r == result + len );
00920   return result;
00921 }
00922 
00923 static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
00924                     const char *zFormat){
00925   char *zCommand = string_format(zFormat, zDb, zName);
00926   int rc;
00927   TRACE(("FTS1 sql: %s\n", zCommand));
00928   rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
00929   free(zCommand);
00930   return rc;
00931 }
00932 
00933 static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
00934                        sqlite3_stmt **ppStmt, const char *zFormat){
00935   char *zCommand = string_format(zFormat, zDb, zName);
00936   int rc;
00937   TRACE(("FTS1 prepare: %s\n", zCommand));
00938   rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
00939   free(zCommand);
00940   return rc;
00941 }
00942 
00943 /* end utility functions */
00944 
00945 /* Forward reference */
00946 typedef struct fulltext_vtab fulltext_vtab;
00947 
00948 /* A single term in a query is represented by an instances of
00949 ** the following structure.
00950 */
00951 typedef struct QueryTerm {
00952   short int nPhrase; /* How many following terms are part of the same phrase */
00953   short int iPhrase; /* This is the i-th term of a phrase. */
00954   short int iColumn; /* Column of the index that must match this term */
00955   signed char isOr;  /* this term is preceded by "OR" */
00956   signed char isNot; /* this term is preceded by "-" */
00957   char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
00958   int nTerm;         /* Number of bytes in pTerm[] */
00959 } QueryTerm;
00960 
00961 
00962 /* A query string is parsed into a Query structure.
00963  *
00964  * We could, in theory, allow query strings to be complicated
00965  * nested expressions with precedence determined by parentheses.
00966  * But none of the major search engines do this.  (Perhaps the
00967  * feeling is that an parenthesized expression is two complex of
00968  * an idea for the average user to grasp.)  Taking our lead from
00969  * the major search engines, we will allow queries to be a list
00970  * of terms (with an implied AND operator) or phrases in double-quotes,
00971  * with a single optional "-" before each non-phrase term to designate
00972  * negation and an optional OR connector.
00973  *
00974  * OR binds more tightly than the implied AND, which is what the
00975  * major search engines seem to do.  So, for example:
00976  * 
00977  *    [one two OR three]     ==>    one AND (two OR three)
00978  *    [one OR two three]     ==>    (one OR two) AND three
00979  *
00980  * A "-" before a term matches all entries that lack that term.
00981  * The "-" must occur immediately before the term with in intervening
00982  * space.  This is how the search engines do it.
00983  *
00984  * A NOT term cannot be the right-hand operand of an OR.  If this
00985  * occurs in the query string, the NOT is ignored:
00986  *
00987  *    [one OR -two]          ==>    one OR two
00988  *
00989  */
00990 typedef struct Query {
00991   fulltext_vtab *pFts;  /* The full text index */
00992   int nTerms;           /* Number of terms in the query */
00993   QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
00994   int nextIsOr;         /* Set the isOr flag on the next inserted term */
00995   int nextColumn;       /* Next word parsed must be in this column */
00996   int dfltColumn;       /* The default column */
00997 } Query;
00998 
00999 
01000 /*
01001 ** An instance of the following structure keeps track of generated
01002 ** matching-word offset information and snippets.
01003 */
01004 typedef struct Snippet {
01005   int nMatch;     /* Total number of matches */
01006   int nAlloc;     /* Space allocated for aMatch[] */
01007   struct snippetMatch { /* One entry for each matching term */
01008     char snStatus;       /* Status flag for use while constructing snippets */
01009     short int iCol;      /* The column that contains the match */
01010     short int iTerm;     /* The index in Query.pTerms[] of the matching term */
01011     short int nByte;     /* Number of bytes in the term */
01012     int iStart;          /* The offset to the first character of the term */
01013   } *aMatch;      /* Points to space obtained from malloc */
01014   char *zOffset;  /* Text rendering of aMatch[] */
01015   int nOffset;    /* strlen(zOffset) */
01016   char *zSnippet; /* Snippet text */
01017   int nSnippet;   /* strlen(zSnippet) */
01018 } Snippet;
01019 
01020 
01021 typedef enum QueryType {
01022   QUERY_GENERIC,   /* table scan */
01023   QUERY_ROWID,     /* lookup by rowid */
01024   QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
01025 } QueryType;
01026 
01027 /* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
01028 ** before we start aggregating into larger segments.  Lower CHUNK_MAX
01029 ** means that for a given input we have more individual segments per
01030 ** term, which means more rows in the table and a bigger index (due to
01031 ** both more rows and bigger rowids).  But it also reduces the average
01032 ** cost of adding new elements to the segment 0 doclist, and it seems
01033 ** to reduce the number of pages read and written during inserts.  256
01034 ** was chosen by measuring insertion times for a certain input (first
01035 ** 10k documents of Enron corpus), though including query performance
01036 ** in the decision may argue for a larger value.
01037 */
01038 #define CHUNK_MAX 256
01039 
01040 typedef enum fulltext_statement {
01041   CONTENT_INSERT_STMT,
01042   CONTENT_SELECT_STMT,
01043   CONTENT_UPDATE_STMT,
01044   CONTENT_DELETE_STMT,
01045 
01046   TERM_SELECT_STMT,
01047   TERM_SELECT_ALL_STMT,
01048   TERM_INSERT_STMT,
01049   TERM_UPDATE_STMT,
01050   TERM_DELETE_STMT,
01051 
01052   MAX_STMT                     /* Always at end! */
01053 } fulltext_statement;
01054 
01055 /* These must exactly match the enum above. */
01056 /* TODO(adam): Is there some risk that a statement (in particular,
01057 ** pTermSelectStmt) will be used in two cursors at once, e.g.  if a
01058 ** query joins a virtual table to itself?  If so perhaps we should
01059 ** move some of these to the cursor object.
01060 */
01061 static const char *const fulltext_zStatement[MAX_STMT] = {
01062   /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
01063   /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
01064   /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
01065   /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
01066 
01067   /* TERM_SELECT */
01068   "select rowid, doclist from %_term where term = ? and segment = ?",
01069   /* TERM_SELECT_ALL */
01070   "select doclist from %_term where term = ? order by segment",
01071   /* TERM_INSERT */
01072   "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
01073   /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
01074   /* TERM_DELETE */ "delete from %_term where rowid = ?",
01075 };
01076 
01077 /*
01078 ** A connection to a fulltext index is an instance of the following
01079 ** structure.  The xCreate and xConnect methods create an instance
01080 ** of this structure and xDestroy and xDisconnect free that instance.
01081 ** All other methods receive a pointer to the structure as one of their
01082 ** arguments.
01083 */
01084 struct fulltext_vtab {
01085   sqlite3_vtab base;               /* Base class used by SQLite core */
01086   sqlite3 *db;                     /* The database connection */
01087   const char *zDb;                 /* logical database name */
01088   const char *zName;               /* virtual table name */
01089   int nColumn;                     /* number of columns in virtual table */
01090   char **azColumn;                 /* column names.  malloced */
01091   char **azContentColumn;          /* column names in content table; malloced */
01092   sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */
01093 
01094   /* Precompiled statements which we keep as long as the table is
01095   ** open.
01096   */
01097   sqlite3_stmt *pFulltextStatements[MAX_STMT];
01098 };
01099 
01100 /*
01101 ** When the core wants to do a query, it create a cursor using a
01102 ** call to xOpen.  This structure is an instance of a cursor.  It
01103 ** is destroyed by xClose.
01104 */
01105 typedef struct fulltext_cursor {
01106   sqlite3_vtab_cursor base;        /* Base class used by SQLite core */
01107   QueryType iCursorType;           /* Copy of sqlite3_index_info.idxNum */
01108   sqlite3_stmt *pStmt;             /* Prepared statement in use by the cursor */
01109   int eof;                         /* True if at End Of Results */
01110   Query q;                         /* Parsed query string */
01111   Snippet snippet;                 /* Cached snippet for the current row */
01112   int iColumn;                     /* Column being searched */
01113   DocListReader result;  /* used when iCursorType == QUERY_FULLTEXT */ 
01114 } fulltext_cursor;
01115 
01116 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
01117   return (fulltext_vtab *) c->base.pVtab;
01118 }
01119 
01120 static const sqlite3_module fulltextModule;   /* forward declaration */
01121 
01122 /* Append a list of strings separated by commas to a StringBuffer. */
01123 static void appendList(StringBuffer *sb, int nString, char **azString){
01124   int i;
01125   for(i=0; i<nString; ++i){
01126     if( i>0 ) append(sb, ", ");
01127     append(sb, azString[i]);
01128   }
01129 }
01130 
01131 /* Return a dynamically generated statement of the form
01132  *   insert into %_content (rowid, ...) values (?, ...)
01133  */
01134 static const char *contentInsertStatement(fulltext_vtab *v){
01135   StringBuffer sb;
01136   int i;
01137 
01138   initStringBuffer(&sb);
01139   append(&sb, "insert into %_content (rowid, ");
01140   appendList(&sb, v->nColumn, v->azContentColumn);
01141   append(&sb, ") values (?");
01142   for(i=0; i<v->nColumn; ++i)
01143     append(&sb, ", ?");
01144   append(&sb, ")");
01145   return sb.s;
01146 }
01147 
01148 /* Return a dynamically generated statement of the form
01149  *   update %_content set [col_0] = ?, [col_1] = ?, ...
01150  *                    where rowid = ?
01151  */
01152 static const char *contentUpdateStatement(fulltext_vtab *v){
01153   StringBuffer sb;
01154   int i;
01155 
01156   initStringBuffer(&sb);
01157   append(&sb, "update %_content set ");
01158   for(i=0; i<v->nColumn; ++i) {
01159     if( i>0 ){
01160       append(&sb, ", ");
01161     }
01162     append(&sb, v->azContentColumn[i]);
01163     append(&sb, " = ?");
01164   }
01165   append(&sb, " where rowid = ?");
01166   return sb.s;
01167 }
01168 
01169 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
01170 ** If the indicated statement has never been prepared, it is prepared
01171 ** and cached, otherwise the cached version is reset.
01172 */
01173 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
01174                              sqlite3_stmt **ppStmt){
01175   assert( iStmt<MAX_STMT );
01176   if( v->pFulltextStatements[iStmt]==NULL ){
01177     const char *zStmt;
01178     int rc;
01179     switch( iStmt ){
01180       case CONTENT_INSERT_STMT:
01181         zStmt = contentInsertStatement(v); break;
01182       case CONTENT_UPDATE_STMT:
01183         zStmt = contentUpdateStatement(v); break;
01184       default:
01185         zStmt = fulltext_zStatement[iStmt];
01186     }
01187     rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
01188                          zStmt);
01189     if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
01190     if( rc!=SQLITE_OK ) return rc;
01191   } else {
01192     int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
01193     if( rc!=SQLITE_OK ) return rc;
01194   }
01195 
01196   *ppStmt = v->pFulltextStatements[iStmt];
01197   return SQLITE_OK;
01198 }
01199 
01200 /* Step the indicated statement, handling errors SQLITE_BUSY (by
01201 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
01202 ** bindings to the new statement).
01203 ** TODO(adam): We should extend this function so that it can work with
01204 ** statements declared locally, not only globally cached statements.
01205 */
01206 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
01207                               sqlite3_stmt **ppStmt){
01208   int rc;
01209   sqlite3_stmt *s = *ppStmt;
01210   assert( iStmt<MAX_STMT );
01211   assert( s==v->pFulltextStatements[iStmt] );
01212 
01213   while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
01214     if( rc==SQLITE_BUSY ) continue;
01215     if( rc!=SQLITE_ERROR ) return rc;
01216 
01217     /* If an SQLITE_SCHEMA error has occured, then finalizing this
01218      * statement is going to delete the fulltext_vtab structure. If
01219      * the statement just executed is in the pFulltextStatements[]
01220      * array, it will be finalized twice. So remove it before
01221      * calling sqlite3_finalize().
01222      */
01223     v->pFulltextStatements[iStmt] = NULL;
01224     rc = sqlite3_finalize(s);
01225     break;
01226   }
01227   return rc;
01228 
01229  err:
01230   sqlite3_finalize(s);
01231   return rc;
01232 }
01233 
01234 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
01235 ** Useful for statements like UPDATE, where we expect no results.
01236 */
01237 static int sql_single_step_statement(fulltext_vtab *v,
01238                                      fulltext_statement iStmt,
01239                                      sqlite3_stmt **ppStmt){
01240   int rc = sql_step_statement(v, iStmt, ppStmt);
01241   return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
01242 }
01243 
01244 /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
01245 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
01246                           sqlite3_value **pValues){
01247   sqlite3_stmt *s;
01248   int i;
01249   int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
01250   if( rc!=SQLITE_OK ) return rc;
01251 
01252   rc = sqlite3_bind_value(s, 1, rowid);
01253   if( rc!=SQLITE_OK ) return rc;
01254 
01255   for(i=0; i<v->nColumn; ++i){
01256     rc = sqlite3_bind_value(s, 2+i, pValues[i]);
01257     if( rc!=SQLITE_OK ) return rc;
01258   }
01259 
01260   return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
01261 }
01262 
01263 /* update %_content set col0 = pValues[0], col1 = pValues[1], ...
01264  *                  where rowid = [iRowid] */
01265 static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
01266                           sqlite_int64 iRowid){
01267   sqlite3_stmt *s;
01268   int i;
01269   int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
01270   if( rc!=SQLITE_OK ) return rc;
01271 
01272   for(i=0; i<v->nColumn; ++i){
01273     rc = sqlite3_bind_value(s, 1+i, pValues[i]);
01274     if( rc!=SQLITE_OK ) return rc;
01275   }
01276 
01277   rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
01278   if( rc!=SQLITE_OK ) return rc;
01279 
01280   return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
01281 }
01282 
01283 static void freeStringArray(int nString, const char **pString){
01284   int i;
01285 
01286   for (i=0 ; i < nString ; ++i) {
01287     if( pString[i]!=NULL ) free((void *) pString[i]);
01288   }
01289   free((void *) pString);
01290 }
01291 
01292 /* select * from %_content where rowid = [iRow]
01293  * The caller must delete the returned array and all strings in it.
01294  * null fields will be NULL in the returned array.
01295  *
01296  * TODO: Perhaps we should return pointer/length strings here for consistency
01297  * with other code which uses pointer/length. */
01298 static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
01299                           const char ***pValues){
01300   sqlite3_stmt *s;
01301   const char **values;
01302   int i;
01303   int rc;
01304 
01305   *pValues = NULL;
01306 
01307   rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
01308   if( rc!=SQLITE_OK ) return rc;
01309 
01310   rc = sqlite3_bind_int64(s, 1, iRow);
01311   if( rc!=SQLITE_OK ) return rc;
01312 
01313   rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
01314   if( rc!=SQLITE_ROW ) return rc;
01315 
01316   values = (const char **) malloc(v->nColumn * sizeof(const char *));
01317   for(i=0; i<v->nColumn; ++i){
01318     if( sqlite3_column_type(s, i)==SQLITE_NULL ){
01319       values[i] = NULL;
01320     }else{
01321       values[i] = string_dup((char*)sqlite3_column_text(s, i));
01322     }
01323   }
01324 
01325   /* We expect only one row.  We must execute another sqlite3_step()
01326    * to complete the iteration; otherwise the table will remain locked. */
01327   rc = sqlite3_step(s);
01328   if( rc==SQLITE_DONE ){
01329     *pValues = values;
01330     return SQLITE_OK;
01331   }
01332 
01333   freeStringArray(v->nColumn, values);
01334   return rc;
01335 }
01336 
01337 /* delete from %_content where rowid = [iRow ] */
01338 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
01339   sqlite3_stmt *s;
01340   int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
01341   if( rc!=SQLITE_OK ) return rc;
01342 
01343   rc = sqlite3_bind_int64(s, 1, iRow);
01344   if( rc!=SQLITE_OK ) return rc;
01345 
01346   return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
01347 }
01348 
01349 /* select rowid, doclist from %_term
01350  *  where term = [pTerm] and segment = [iSegment]
01351  * If found, returns SQLITE_ROW; the caller must free the
01352  * returned doclist.  If no rows found, returns SQLITE_DONE. */
01353 static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
01354                        int iSegment,
01355                        sqlite_int64 *rowid, DocList *out){
01356   sqlite3_stmt *s;
01357   int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
01358   if( rc!=SQLITE_OK ) return rc;
01359 
01360   rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
01361   if( rc!=SQLITE_OK ) return rc;
01362 
01363   rc = sqlite3_bind_int(s, 2, iSegment);
01364   if( rc!=SQLITE_OK ) return rc;
01365 
01366   rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
01367   if( rc!=SQLITE_ROW ) return rc;
01368 
01369   *rowid = sqlite3_column_int64(s, 0);
01370   docListInit(out, DL_DEFAULT,
01371               sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
01372 
01373   /* We expect only one row.  We must execute another sqlite3_step()
01374    * to complete the iteration; otherwise the table will remain locked. */
01375   rc = sqlite3_step(s);
01376   return rc==SQLITE_DONE ? SQLITE_ROW : rc;
01377 }
01378 
01379 /* Load the segment doclists for term pTerm and merge them in
01380 ** appropriate order into out.  Returns SQLITE_OK if successful.  If
01381 ** there are no segments for pTerm, successfully returns an empty
01382 ** doclist in out.
01383 **
01384 ** Each document consists of 1 or more "columns".  The number of
01385 ** columns is v->nColumn.  If iColumn==v->nColumn, then return
01386 ** position information about all columns.  If iColumn<v->nColumn,
01387 ** then only return position information about the iColumn-th column
01388 ** (where the first column is 0).
01389 */
01390 static int term_select_all(
01391   fulltext_vtab *v,     /* The fulltext index we are querying against */
01392   int iColumn,          /* If <nColumn, only look at the iColumn-th column */
01393   const char *pTerm,    /* The term whose posting lists we want */
01394   int nTerm,            /* Number of bytes in pTerm */
01395   DocList *out          /* Write the resulting doclist here */
01396 ){
01397   DocList doclist;
01398   sqlite3_stmt *s;
01399   int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
01400   if( rc!=SQLITE_OK ) return rc;
01401 
01402   rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
01403   if( rc!=SQLITE_OK ) return rc;
01404 
01405   docListInit(&doclist, DL_DEFAULT, 0, 0);
01406 
01407   /* TODO(shess) Handle schema and busy errors. */
01408   while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
01409     DocList old;
01410 
01411     /* TODO(shess) If we processed doclists from oldest to newest, we
01412     ** could skip the malloc() involved with the following call.  For
01413     ** now, I'd rather keep this logic similar to index_insert_term().
01414     ** We could additionally drop elements when we see deletes, but
01415     ** that would require a distinct version of docListAccumulate().
01416     */
01417     docListInit(&old, DL_DEFAULT,
01418                 sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
01419 
01420     if( iColumn<v->nColumn ){   /* querying a single column */
01421       docListRestrictColumn(&old, iColumn);
01422     }
01423 
01424     /* doclist contains the newer data, so write it over old.  Then
01425     ** steal accumulated result for doclist.
01426     */
01427     docListAccumulate(&old, &doclist);
01428     docListDestroy(&doclist);
01429     doclist = old;
01430   }
01431   if( rc!=SQLITE_DONE ){
01432     docListDestroy(&doclist);
01433     return rc;
01434   }
01435 
01436   docListDiscardEmpty(&doclist);
01437   *out = doclist;
01438   return SQLITE_OK;
01439 }
01440 
01441 /* insert into %_term (rowid, term, segment, doclist)
01442                values ([piRowid], [pTerm], [iSegment], [doclist])
01443 ** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
01444 **
01445 ** NOTE(shess) piRowid is IN, with values of "space of int64" plus
01446 ** null, it is not used to pass data back to the caller.
01447 */
01448 static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
01449                        const char *pTerm, int nTerm,
01450                        int iSegment, DocList *doclist){
01451   sqlite3_stmt *s;
01452   int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
01453   if( rc!=SQLITE_OK ) return rc;
01454 
01455   if( piRowid==NULL ){
01456     rc = sqlite3_bind_null(s, 1);
01457   }else{
01458     rc = sqlite3_bind_int64(s, 1, *piRowid);
01459   }
01460   if( rc!=SQLITE_OK ) return rc;
01461 
01462   rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
01463   if( rc!=SQLITE_OK ) return rc;
01464 
01465   rc = sqlite3_bind_int(s, 3, iSegment);
01466   if( rc!=SQLITE_OK ) return rc;
01467 
01468   rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
01469   if( rc!=SQLITE_OK ) return rc;
01470 
01471   return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
01472 }
01473 
01474 /* update %_term set doclist = [doclist] where rowid = [rowid] */
01475 static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
01476                        DocList *doclist){
01477   sqlite3_stmt *s;
01478   int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
01479   if( rc!=SQLITE_OK ) return rc;
01480 
01481   rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
01482   if( rc!=SQLITE_OK ) return rc;
01483 
01484   rc = sqlite3_bind_int64(s, 2, rowid);
01485   if( rc!=SQLITE_OK ) return rc;
01486 
01487   return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
01488 }
01489 
01490 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
01491   sqlite3_stmt *s;
01492   int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
01493   if( rc!=SQLITE_OK ) return rc;
01494 
01495   rc = sqlite3_bind_int64(s, 1, rowid);
01496   if( rc!=SQLITE_OK ) return rc;
01497 
01498   return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
01499 }
01500 
01501 /*
01502 ** Free the memory used to contain a fulltext_vtab structure.
01503 */
01504 static void fulltext_vtab_destroy(fulltext_vtab *v){
01505   int iStmt, i;
01506 
01507   TRACE(("FTS1 Destroy %p\n", v));
01508   for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
01509     if( v->pFulltextStatements[iStmt]!=NULL ){
01510       sqlite3_finalize(v->pFulltextStatements[iStmt]);
01511       v->pFulltextStatements[iStmt] = NULL;
01512     }
01513   }
01514 
01515   if( v->pTokenizer!=NULL ){
01516     v->pTokenizer->pModule->xDestroy(v->pTokenizer);
01517     v->pTokenizer = NULL;
01518   }
01519   
01520   free(v->azColumn);
01521   for(i = 0; i < v->nColumn; ++i) {
01522     sqlite3_free(v->azContentColumn[i]);
01523   }
01524   free(v->azContentColumn);
01525   free(v);
01526 }
01527 
01528 /*
01529 ** Token types for parsing the arguments to xConnect or xCreate.
01530 */
01531 #define TOKEN_EOF         0    /* End of file */
01532 #define TOKEN_SPACE       1    /* Any kind of whitespace */
01533 #define TOKEN_ID          2    /* An identifier */
01534 #define TOKEN_STRING      3    /* A string literal */
01535 #define TOKEN_PUNCT       4    /* A single punctuation character */
01536 
01537 /*
01538 ** If X is a character that can be used in an identifier then
01539 ** IdChar(X) will be true.  Otherwise it is false.
01540 **
01541 ** For ASCII, any character with the high-order bit set is
01542 ** allowed in an identifier.  For 7-bit characters, 
01543 ** sqlite3IsIdChar[X] must be 1.
01544 **
01545 ** Ticket #1066.  the SQL standard does not allow '$' in the
01546 ** middle of identfiers.  But many SQL implementations do. 
01547 ** SQLite will allow '$' in identifiers for compatibility.
01548 ** But the feature is undocumented.
01549 */
01550 static const char isIdChar[] = {
01551 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
01552     0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
01553     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
01554     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
01555     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
01556     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
01557     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
01558 };
01559 #define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
01560 
01561 
01562 /*
01563 ** Return the length of the token that begins at z[0]. 
01564 ** Store the token type in *tokenType before returning.
01565 */
01566 static int getToken(const char *z, int *tokenType){
01567   int i, c;
01568   switch( *z ){
01569     case 0: {
01570       *tokenType = TOKEN_EOF;
01571       return 0;
01572     }
01573     case ' ': case '\t': case '\n': case '\f': case '\r': {
01574       for(i=1; safe_isspace(z[i]); i++){}
01575       *tokenType = TOKEN_SPACE;
01576       return i;
01577     }
01578     case '`':
01579     case '\'':
01580     case '"': {
01581       int delim = z[0];
01582       for(i=1; (c=z[i])!=0; i++){
01583         if( c==delim ){
01584           if( z[i+1]==delim ){
01585             i++;
01586           }else{
01587             break;
01588           }
01589         }
01590       }
01591       *tokenType = TOKEN_STRING;
01592       return i + (c!=0);
01593     }
01594     case '[': {
01595       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
01596       *tokenType = TOKEN_ID;
01597       return i;
01598     }
01599     default: {
01600       if( !IdChar(*z) ){
01601         break;
01602       }
01603       for(i=1; IdChar(z[i]); i++){}
01604       *tokenType = TOKEN_ID;
01605       return i;
01606     }
01607   }
01608   *tokenType = TOKEN_PUNCT;
01609   return 1;
01610 }
01611 
01612 /*
01613 ** A token extracted from a string is an instance of the following
01614 ** structure.
01615 */
01616 typedef struct Token {
01617   const char *z;       /* Pointer to token text.  Not '\000' terminated */
01618   short int n;         /* Length of the token text in bytes. */
01619 } Token;
01620 
01621 /*
01622 ** Given a input string (which is really one of the argv[] parameters
01623 ** passed into xConnect or xCreate) split the string up into tokens.
01624 ** Return an array of pointers to '\000' terminated strings, one string
01625 ** for each non-whitespace token.
01626 **
01627 ** The returned array is terminated by a single NULL pointer.
01628 **
01629 ** Space to hold the returned array is obtained from a single
01630 ** malloc and should be freed by passing the return value to free().
01631 ** The individual strings within the token list are all a part of
01632 ** the single memory allocation and will all be freed at once.
01633 */
01634 static char **tokenizeString(const char *z, int *pnToken){
01635   int nToken = 0;
01636   Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
01637   int n = 1;
01638   int e, i;
01639   int totalSize = 0;
01640   char **azToken;
01641   char *zCopy;
01642   while( n>0 ){
01643     n = getToken(z, &e);
01644     if( e!=TOKEN_SPACE ){
01645       aToken[nToken].z = z;
01646       aToken[nToken].n = n;
01647       nToken++;
01648       totalSize += n+1;
01649     }
01650     z += n;
01651   }
01652   azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
01653   zCopy = (char*)&azToken[nToken];
01654   nToken--;
01655   for(i=0; i<nToken; i++){
01656     azToken[i] = zCopy;
01657     n = aToken[i].n;
01658     memcpy(zCopy, aToken[i].z, n);
01659     zCopy[n] = 0;
01660     zCopy += n+1;
01661   }
01662   azToken[nToken] = 0;
01663   free(aToken);
01664   *pnToken = nToken;
01665   return azToken;
01666 }
01667 
01668 /*
01669 ** Convert an SQL-style quoted string into a normal string by removing
01670 ** the quote characters.  The conversion is done in-place.  If the
01671 ** input does not begin with a quote character, then this routine
01672 ** is a no-op.
01673 **
01674 ** Examples:
01675 **
01676 **     "abc"   becomes   abc
01677 **     'xyz'   becomes   xyz
01678 **     [pqr]   becomes   pqr
01679 **     `mno`   becomes   mno
01680 */
01681 static void dequoteString(char *z){
01682   int quote;
01683   int i, j;
01684   if( z==0 ) return;
01685   quote = z[0];
01686   switch( quote ){
01687     case '\'':  break;
01688     case '"':   break;
01689     case '`':   break;                /* For MySQL compatibility */
01690     case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
01691     default:    return;
01692   }
01693   for(i=1, j=0; z[i]; i++){
01694     if( z[i]==quote ){
01695       if( z[i+1]==quote ){
01696         z[j++] = quote;
01697         i++;
01698       }else{
01699         z[j++] = 0;
01700         break;
01701       }
01702     }else{
01703       z[j++] = z[i];
01704     }
01705   }
01706 }
01707 
01708 /*
01709 ** The input azIn is a NULL-terminated list of tokens.  Remove the first
01710 ** token and all punctuation tokens.  Remove the quotes from
01711 ** around string literal tokens.
01712 **
01713 ** Example:
01714 **
01715 **     input:      tokenize chinese ( 'simplifed' , 'mixed' )
01716 **     output:     chinese simplifed mixed
01717 **
01718 ** Another example:
01719 **
01720 **     input:      delimiters ( '[' , ']' , '...' )
01721 **     output:     [ ] ...
01722 */
01723 static void tokenListToIdList(char **azIn){
01724   int i, j;
01725   if( azIn ){
01726     for(i=0, j=-1; azIn[i]; i++){
01727       if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
01728         dequoteString(azIn[i]);
01729         if( j>=0 ){
01730           azIn[j] = azIn[i];
01731         }
01732         j++;
01733       }
01734     }
01735     azIn[j] = 0;
01736   }
01737 }
01738 
01739 
01740 /*
01741 ** Find the first alphanumeric token in the string zIn.  Null-terminate
01742 ** this token.  Remove any quotation marks.  And return a pointer to
01743 ** the result.
01744 */
01745 static char *firstToken(char *zIn, char **pzTail){
01746   int n, ttype;
01747   while(1){
01748     n = getToken(zIn, &ttype);
01749     if( ttype==TOKEN_SPACE ){
01750       zIn += n;
01751     }else if( ttype==TOKEN_EOF ){
01752       *pzTail = zIn;
01753       return 0;
01754     }else{
01755       zIn[n] = 0;
01756       *pzTail = &zIn[1];
01757       dequoteString(zIn);
01758       return zIn;
01759     }
01760   }
01761   /*NOTREACHED*/
01762 }
01763 
01764 /* Return true if...
01765 **
01766 **   *  s begins with the string t, ignoring case
01767 **   *  s is longer than t
01768 **   *  The first character of s beyond t is not a alphanumeric
01769 ** 
01770 ** Ignore leading space in *s.
01771 **
01772 ** To put it another way, return true if the first token of
01773 ** s[] is t[].
01774 */
01775 static int startsWith(const char *s, const char *t){
01776   while( safe_isspace(*s) ){ s++; }
01777   while( *t ){
01778     if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
01779   }
01780   return *s!='_' && !safe_isalnum(*s);
01781 }
01782 
01783 /*
01784 ** An instance of this structure defines the "spec" of a
01785 ** full text index.  This structure is populated by parseSpec
01786 ** and use by fulltextConnect and fulltextCreate.
01787 */
01788 typedef struct TableSpec {
01789   const char *zDb;         /* Logical database name */
01790   const char *zName;       /* Name of the full-text index */
01791   int nColumn;             /* Number of columns to be indexed */
01792   char **azColumn;         /* Original names of columns to be indexed */
01793   char **azContentColumn;  /* Column names for %_content */
01794   char **azTokenizer;      /* Name of tokenizer and its arguments */
01795 } TableSpec;
01796 
01797 /*
01798 ** Reclaim all of the memory used by a TableSpec
01799 */
01800 static void clearTableSpec(TableSpec *p) {
01801   free(p->azColumn);
01802   free(p->azContentColumn);
01803   free(p->azTokenizer);
01804 }
01805 
01806 /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
01807  *
01808  * CREATE VIRTUAL TABLE email
01809  *        USING fts1(subject, body, tokenize mytokenizer(myarg))
01810  *
01811  * We return parsed information in a TableSpec structure.
01812  * 
01813  */
01814 static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
01815                      char**pzErr){
01816   int i, n;
01817   char *z, *zDummy;
01818   char **azArg;
01819   const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */
01820 
01821   assert( argc>=3 );
01822   /* Current interface:
01823   ** argv[0] - module name
01824   ** argv[1] - database name
01825   ** argv[2] - table name
01826   ** argv[3..] - columns, optionally followed by tokenizer specification
01827   **             and snippet delimiters specification.
01828   */
01829 
01830   /* Make a copy of the complete argv[][] array in a single allocation.
01831   ** The argv[][] array is read-only and transient.  We can write to the
01832   ** copy in order to modify things and the copy is persistent.
01833   */
01834   memset(pSpec, 0, sizeof(*pSpec));
01835   for(i=n=0; i<argc; i++){
01836     n += strlen(argv[i]) + 1;
01837   }
01838   azArg = malloc( sizeof(char*)*argc + n );
01839   if( azArg==0 ){
01840     return SQLITE_NOMEM;
01841   }
01842   z = (char*)&azArg[argc];
01843   for(i=0; i<argc; i++){
01844     azArg[i] = z;
01845     strcpy(z, argv[i]);
01846     z += strlen(z)+1;
01847   }
01848 
01849   /* Identify the column names and the tokenizer and delimiter arguments
01850   ** in the argv[][] array.
01851   */
01852   pSpec->zDb = azArg[1];
01853   pSpec->zName = azArg[2];
01854   pSpec->nColumn = 0;
01855   pSpec->azColumn = azArg;
01856   zTokenizer = "tokenize simple";
01857   for(i=3; i<argc; ++i){
01858     if( startsWith(azArg[i],"tokenize") ){
01859       zTokenizer = azArg[i];
01860     }else{
01861       z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
01862       pSpec->nColumn++;
01863     }
01864   }
01865   if( pSpec->nColumn==0 ){
01866     azArg[0] = "content";
01867     pSpec->nColumn = 1;
01868   }
01869 
01870   /*
01871   ** Construct the list of content column names.
01872   **
01873   ** Each content column name will be of the form cNNAAAA
01874   ** where NN is the column number and AAAA is the sanitized
01875   ** column name.  "sanitized" means that special characters are
01876   ** converted to "_".  The cNN prefix guarantees that all column
01877   ** names are unique.
01878   **
01879   ** The AAAA suffix is not strictly necessary.  It is included
01880   ** for the convenience of people who might examine the generated
01881   ** %_content table and wonder what the columns are used for.
01882   */
01883   pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
01884   if( pSpec->azContentColumn==0 ){
01885     clearTableSpec(pSpec);
01886     return SQLITE_NOMEM;
01887   }
01888   for(i=0; i<pSpec->nColumn; i++){
01889     char *p;
01890     pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
01891     for (p = pSpec->azContentColumn[i]; *p ; ++p) {
01892       if( !safe_isalnum(*p) ) *p = '_';
01893     }
01894   }
01895 
01896   /*
01897   ** Parse the tokenizer specification string.
01898   */
01899   pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
01900   tokenListToIdList(pSpec->azTokenizer);
01901 
01902   return SQLITE_OK;
01903 }
01904 
01905 /*
01906 ** Generate a CREATE TABLE statement that describes the schema of
01907 ** the virtual table.  Return a pointer to this schema string.
01908 **
01909 ** Space is obtained from sqlite3_mprintf() and should be freed
01910 ** using sqlite3_free().
01911 */
01912 static char *fulltextSchema(
01913   int nColumn,                  /* Number of columns */
01914   const char *const* azColumn,  /* List of columns */
01915   const char *zTableName        /* Name of the table */
01916 ){
01917   int i;
01918   char *zSchema, *zNext;
01919   const char *zSep = "(";
01920   zSchema = sqlite3_mprintf("CREATE TABLE x");
01921   for(i=0; i<nColumn; i++){
01922     zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
01923     sqlite3_free(zSchema);
01924     zSchema = zNext;
01925     zSep = ",";
01926   }
01927   zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
01928   sqlite3_free(zSchema);
01929   return zNext;
01930 }
01931 
01932 /*
01933 ** Build a new sqlite3_vtab structure that will describe the
01934 ** fulltext index defined by spec.
01935 */
01936 static int constructVtab(
01937   sqlite3 *db,              /* The SQLite database connection */
01938   TableSpec *spec,          /* Parsed spec information from parseSpec() */
01939   sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
01940   char **pzErr              /* Write any error message here */
01941 ){
01942   int rc;
01943   int n;
01944   fulltext_vtab *v = 0;
01945   const sqlite3_tokenizer_module *m = NULL;
01946   char *schema;
01947 
01948   v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
01949   if( v==0 ) return SQLITE_NOMEM;
01950   memset(v, 0, sizeof(*v));
01951   /* sqlite will initialize v->base */
01952   v->db = db;
01953   v->zDb = spec->zDb;       /* Freed when azColumn is freed */
01954   v->zName = spec->zName;   /* Freed when azColumn is freed */
01955   v->nColumn = spec->nColumn;
01956   v->azContentColumn = spec->azContentColumn;
01957   spec->azContentColumn = 0;
01958   v->azColumn = spec->azColumn;
01959   spec->azColumn = 0;
01960 
01961   if( spec->azTokenizer==0 ){
01962     return SQLITE_NOMEM;
01963   }
01964   /* TODO(shess) For now, add new tokenizers as else if clauses. */
01965   if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
01966     sqlite3Fts1SimpleTokenizerModule(&m);
01967   }else if( startsWith(spec->azTokenizer[0], "porter") ){
01968     sqlite3Fts1PorterTokenizerModule(&m);
01969   }else{
01970     *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
01971     rc = SQLITE_ERROR;
01972     goto err;
01973   }
01974   for(n=0; spec->azTokenizer[n]; n++){}
01975   if( n ){
01976     rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
01977                     &v->pTokenizer);
01978   }else{
01979     rc = m->xCreate(0, 0, &v->pTokenizer);
01980   }
01981   if( rc!=SQLITE_OK ) goto err;
01982   v->pTokenizer->pModule = m;
01983 
01984   /* TODO: verify the existence of backing tables foo_content, foo_term */
01985 
01986   schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
01987                           spec->zName);
01988   rc = sqlite3_declare_vtab(db, schema);
01989   sqlite3_free(schema);
01990   if( rc!=SQLITE_OK ) goto err;
01991 
01992   memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
01993 
01994   *ppVTab = &v->base;
01995   TRACE(("FTS1 Connect %p\n", v));
01996 
01997   return rc;
01998 
01999 err:
02000   fulltext_vtab_destroy(v);
02001   return rc;
02002 }
02003 
02004 static int fulltextConnect(
02005   sqlite3 *db,
02006   void *pAux,
02007   int argc, const char *const*argv,
02008   sqlite3_vtab **ppVTab,
02009   char **pzErr
02010 ){
02011   TableSpec spec;
02012   int rc = parseSpec(&spec, argc, argv, pzErr);
02013   if( rc!=SQLITE_OK ) return rc;
02014 
02015   rc = constructVtab(db, &spec, ppVTab, pzErr);
02016   clearTableSpec(&spec);
02017   return rc;
02018 }
02019 
02020   /* The %_content table holds the text of each document, with
02021   ** the rowid used as the docid.
02022   **
02023   ** The %_term table maps each term to a document list blob
02024   ** containing elements sorted by ascending docid, each element
02025   ** encoded as:
02026   **
02027   **   docid varint-encoded
02028   **   token elements:
02029   **     position+1 varint-encoded as delta from previous position
02030   **     start offset varint-encoded as delta from previous start offset
02031   **     end offset varint-encoded as delta from start offset
02032   **
02033   ** The sentinel position of 0 indicates the end of the token list.
02034   **
02035   ** Additionally, doclist blobs are chunked into multiple segments,
02036   ** using segment to order the segments.  New elements are added to
02037   ** the segment at segment 0, until it exceeds CHUNK_MAX.  Then
02038   ** segment 0 is deleted, and the doclist is inserted at segment 1.
02039   ** If there is already a doclist at segment 1, the segment 0 doclist
02040   ** is merged with it, the segment 1 doclist is deleted, and the
02041   ** merged doclist is inserted at segment 2, repeating those
02042   ** operations until an insert succeeds.
02043   **
02044   ** Since this structure doesn't allow us to update elements in place
02045   ** in case of deletion or update, these are simply written to
02046   ** segment 0 (with an empty token list in case of deletion), with
02047   ** docListAccumulate() taking care to retain lower-segment
02048   ** information in preference to higher-segment information.
02049   */
02050   /* TODO(shess) Provide a VACUUM type operation which both removes
02051   ** deleted elements which are no longer necessary, and duplicated
02052   ** elements.  I suspect this will probably not be necessary in
02053   ** practice, though.
02054   */
02055 static int fulltextCreate(sqlite3 *db, void *pAux,
02056                           int argc, const char * const *argv,
02057                           sqlite3_vtab **ppVTab, char **pzErr){
02058   int rc;
02059   TableSpec spec;
02060   StringBuffer schema;
02061   TRACE(("FTS1 Create\n"));
02062 
02063   rc = parseSpec(&spec, argc, argv, pzErr);
02064   if( rc!=SQLITE_OK ) return rc;
02065 
02066   initStringBuffer(&schema);
02067   append(&schema, "CREATE TABLE %_content(");
02068   appendList(&schema, spec.nColumn, spec.azContentColumn);
02069   append(&schema, ")");
02070   rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
02071   free(schema.s);
02072   if( rc!=SQLITE_OK ) goto out;
02073 
02074   rc = sql_exec(db, spec.zDb, spec.zName,
02075     "create table %_term(term text, segment integer, doclist blob, "
02076                         "primary key(term, segment));");
02077   if( rc!=SQLITE_OK ) goto out;
02078 
02079   rc = constructVtab(db, &spec, ppVTab, pzErr);
02080 
02081 out:
02082   clearTableSpec(&spec);
02083   return rc;
02084 }
02085 
02086 /* Decide how to handle an SQL query. */
02087 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
02088   int i;
02089   TRACE(("FTS1 BestIndex\n"));
02090 
02091   for(i=0; i<pInfo->nConstraint; ++i){
02092     const struct sqlite3_index_constraint *pConstraint;
02093     pConstraint = &pInfo->aConstraint[i];
02094     if( pConstraint->usable ) {
02095       if( pConstraint->iColumn==-1 &&
02096           pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
02097         pInfo->idxNum = QUERY_ROWID;      /* lookup by rowid */
02098         TRACE(("FTS1 QUERY_ROWID\n"));
02099       } else if( pConstraint->iColumn>=0 &&
02100                  pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
02101         /* full-text search */
02102         pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
02103         TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
02104       } else continue;
02105 
02106       pInfo->aConstraintUsage[i].argvIndex = 1;
02107       pInfo->aConstraintUsage[i].omit = 1;
02108 
02109       /* An arbitrary value for now.
02110        * TODO: Perhaps rowid matches should be considered cheaper than
02111        * full-text searches. */
02112       pInfo->estimatedCost = 1.0;   
02113 
02114       return SQLITE_OK;
02115     }
02116   }
02117   pInfo->idxNum = QUERY_GENERIC;
02118   return SQLITE_OK;
02119 }
02120 
02121 static int fulltextDisconnect(sqlite3_vtab *pVTab){
02122   TRACE(("FTS1 Disconnect %p\n", pVTab));
02123   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
02124   return SQLITE_OK;
02125 }
02126 
02127 static int fulltextDestroy(sqlite3_vtab *pVTab){
02128   fulltext_vtab *v = (fulltext_vtab *)pVTab;
02129   int rc;
02130 
02131   TRACE(("FTS1 Destroy %p\n", pVTab));
02132   rc = sql_exec(v->db, v->zDb, v->zName,
02133                 "drop table if exists %_content;"
02134                 "drop table if exists %_term;"
02135                 );
02136   if( rc!=SQLITE_OK ) return rc;
02137 
02138   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
02139   return SQLITE_OK;
02140 }
02141 
02142 static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
02143   fulltext_cursor *c;
02144 
02145   c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
02146   /* sqlite will initialize c->base */
02147   *ppCursor = &c->base;
02148   TRACE(("FTS1 Open %p: %p\n", pVTab, c));
02149 
02150   return SQLITE_OK;
02151 }
02152 
02153 
02154 /* Free all of the dynamically allocated memory held by *q
02155 */
02156 static void queryClear(Query *q){
02157   int i;
02158   for(i = 0; i < q->nTerms; ++i){
02159     free(q->pTerms[i].pTerm);
02160   }
02161   free(q->pTerms);
02162   memset(q, 0, sizeof(*q));
02163 }
02164 
02165 /* Free all of the dynamically allocated memory held by the
02166 ** Snippet
02167 */
02168 static void snippetClear(Snippet *p){
02169   free(p->aMatch);
02170   free(p->zOffset);
02171   free(p->zSnippet);
02172   memset(p, 0, sizeof(*p));
02173 }
02174 /*
02175 ** Append a single entry to the p->aMatch[] log.
02176 */
02177 static void snippetAppendMatch(
02178   Snippet *p,               /* Append the entry to this snippet */
02179   int iCol, int iTerm,      /* The column and query term */
02180   int iStart, int nByte     /* Offset and size of the match */
02181 ){
02182   int i;
02183   struct snippetMatch *pMatch;
02184   if( p->nMatch+1>=p->nAlloc ){
02185     p->nAlloc = p->nAlloc*2 + 10;
02186     p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
02187     if( p->aMatch==0 ){
02188       p->nMatch = 0;
02189       p->nAlloc = 0;
02190       return;
02191     }
02192   }
02193   i = p->nMatch++;
02194   pMatch = &p->aMatch[i];
02195   pMatch->iCol = iCol;
02196   pMatch->iTerm = iTerm;
02197   pMatch->iStart = iStart;
02198   pMatch->nByte = nByte;
02199 }
02200 
02201 /*
02202 ** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
02203 */
02204 #define FTS1_ROTOR_SZ   (32)
02205 #define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
02206 
02207 /*
02208 ** Add entries to pSnippet->aMatch[] for every match that occurs against
02209 ** document zDoc[0..nDoc-1] which is stored in column iColumn.
02210 */
02211 static void snippetOffsetsOfColumn(
02212   Query *pQuery,
02213   Snippet *pSnippet,
02214   int iColumn,
02215   const char *zDoc,
02216   int nDoc
02217 ){
02218   const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
02219   sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
02220   sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
02221   fulltext_vtab *pVtab;                /* The full text index */
02222   int nColumn;                         /* Number of columns in the index */
02223   const QueryTerm *aTerm;              /* Query string terms */
02224   int nTerm;                           /* Number of query string terms */  
02225   int i, j;                            /* Loop counters */
02226   int rc;                              /* Return code */
02227   unsigned int match, prevMatch;       /* Phrase search bitmasks */
02228   const char *zToken;                  /* Next token from the tokenizer */
02229   int nToken;                          /* Size of zToken */
02230   int iBegin, iEnd, iPos;              /* Offsets of beginning and end */
02231 
02232   /* The following variables keep a circular buffer of the last
02233   ** few tokens */
02234   unsigned int iRotor = 0;             /* Index of current token */
02235   int iRotorBegin[FTS1_ROTOR_SZ];      /* Beginning offset of token */
02236   int iRotorLen[FTS1_ROTOR_SZ];        /* Length of token */
02237 
02238   pVtab = pQuery->pFts;
02239   nColumn = pVtab->nColumn;
02240   pTokenizer = pVtab->pTokenizer;
02241   pTModule = pTokenizer->pModule;
02242   rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
02243   if( rc ) return;
02244   pTCursor->pTokenizer = pTokenizer;
02245   aTerm = pQuery->pTerms;
02246   nTerm = pQuery->nTerms;
02247   if( nTerm>=FTS1_ROTOR_SZ ){
02248     nTerm = FTS1_ROTOR_SZ - 1;
02249   }
02250   prevMatch = 0;
02251   while(1){
02252     rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
02253     if( rc ) break;
02254     iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
02255     iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
02256     match = 0;
02257     for(i=0; i<nTerm; i++){
02258       int iCol;
02259       iCol = aTerm[i].iColumn;
02260       if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
02261       if( aTerm[i].nTerm!=nToken ) continue;
02262       if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
02263       if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
02264       match |= 1<<i;
02265       if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
02266         for(j=aTerm[i].iPhrase-1; j>=0; j--){
02267           int k = (iRotor-j) & FTS1_ROTOR_MASK;
02268           snippetAppendMatch(pSnippet, iColumn, i-j,
02269                 iRotorBegin[k], iRotorLen[k]);
02270         }
02271       }
02272     }
02273     prevMatch = match<<1;
02274     iRotor++;
02275   }
02276   pTModule->xClose(pTCursor);  
02277 }
02278 
02279 
02280 /*
02281 ** Compute all offsets for the current row of the query.  
02282 ** If the offsets have already been computed, this routine is a no-op.
02283 */
02284 static void snippetAllOffsets(fulltext_cursor *p){
02285   int nColumn;
02286   int iColumn, i;
02287   int iFirst, iLast;
02288   fulltext_vtab *pFts;
02289 
02290   if( p->snippet.nMatch ) return;
02291   if( p->q.nTerms==0 ) return;
02292   pFts = p->q.pFts;
02293   nColumn = pFts->nColumn;
02294   iColumn = p->iCursorType - QUERY_FULLTEXT;
02295   if( iColumn<0 || iColumn>=nColumn ){
02296     iFirst = 0;
02297     iLast = nColumn-1;
02298   }else{
02299     iFirst = iColumn;
02300     iLast = iColumn;
02301   }
02302   for(i=iFirst; i<=iLast; i++){
02303     const char *zDoc;
02304     int nDoc;
02305     zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
02306     nDoc = sqlite3_column_bytes(p->pStmt, i+1);
02307     snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
02308   }
02309 }
02310 
02311 /*
02312 ** Convert the information in the aMatch[] array of the snippet
02313 ** into the string zOffset[0..nOffset-1].
02314 */
02315 static void snippetOffsetText(Snippet *p){
02316   int i;
02317   int cnt = 0;
02318   StringBuffer sb;
02319   char zBuf[200];
02320   if( p->zOffset ) return;
02321   initStringBuffer(&sb);
02322   for(i=0; i<p->nMatch; i++){
02323     struct snippetMatch *pMatch = &p->aMatch[i];
02324     zBuf[0] = ' ';
02325     sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
02326         pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
02327     append(&sb, zBuf);
02328     cnt++;
02329   }
02330   p->zOffset = sb.s;
02331   p->nOffset = sb.len;
02332 }
02333 
02334 /*
02335 ** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
02336 ** of matching words some of which might be in zDoc.  zDoc is column
02337 ** number iCol.
02338 **
02339 ** iBreak is suggested spot in zDoc where we could begin or end an
02340 ** excerpt.  Return a value similar to iBreak but possibly adjusted
02341 ** to be a little left or right so that the break point is better.
02342 */
02343 static int wordBoundary(
02344   int iBreak,                   /* The suggested break point */
02345   const char *zDoc,             /* Document text */
02346   int nDoc,                     /* Number of bytes in zDoc[] */
02347   struct snippetMatch *aMatch,  /* Matching words */
02348   int nMatch,                   /* Number of entries in aMatch[] */
02349   int iCol                      /* The column number for zDoc[] */
02350 ){
02351   int i;
02352   if( iBreak<=10 ){
02353     return 0;
02354   }
02355   if( iBreak>=nDoc-10 ){
02356     return nDoc;
02357   }
02358   for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
02359   while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
02360   if( i<nMatch ){
02361     if( aMatch[i].iStart<iBreak+10 ){
02362       return aMatch[i].iStart;
02363     }
02364     if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
02365       return aMatch[i-1].iStart;
02366     }
02367   }
02368   for(i=1; i<=10; i++){
02369     if( safe_isspace(zDoc[iBreak-i]) ){
02370       return iBreak - i + 1;
02371     }
02372     if( safe_isspace(zDoc[iBreak+i]) ){
02373       return iBreak + i + 1;
02374     }
02375   }
02376   return iBreak;
02377 }
02378 
02379 /*
02380 ** If the StringBuffer does not end in white space, add a single
02381 ** space character to the end.
02382 */
02383 static void appendWhiteSpace(StringBuffer *p){
02384   if( p->len==0 ) return;
02385   if( safe_isspace(p->s[p->len-1]) ) return;
02386   append(p, " ");
02387 }
02388 
02389 /*
02390 ** Remove white space from teh end of the StringBuffer
02391 */
02392 static void trimWhiteSpace(StringBuffer *p){
02393   while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
02394     p->len--;
02395   }
02396 }
02397 
02398 
02399 
02400 /*
02401 ** Allowed values for Snippet.aMatch[].snStatus
02402 */
02403 #define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
02404 #define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
02405 
02406 /*
02407 ** Generate the text of a snippet.
02408 */
02409 static void snippetText(
02410   fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
02411   const char *zStartMark,     /* Markup to appear before each match */
02412   const char *zEndMark,       /* Markup to appear after each match */
02413   const char *zEllipsis       /* Ellipsis mark */
02414 ){
02415   int i, j;
02416   struct snippetMatch *aMatch;
02417   int nMatch;
02418   int nDesired;
02419   StringBuffer sb;
02420   int tailCol;
02421   int tailOffset;
02422   int iCol;
02423   int nDoc;
02424   const char *zDoc;
02425   int iStart, iEnd;
02426   int tailEllipsis = 0;
02427   int iMatch;
02428   
02429 
02430   free(pCursor->snippet.zSnippet);
02431   pCursor->snippet.zSnippet = 0;
02432   aMatch = pCursor->snippet.aMatch;
02433   nMatch = pCursor->snippet.nMatch;
02434   initStringBuffer(&sb);
02435 
02436   for(i=0; i<nMatch; i++){
02437     aMatch[i].snStatus = SNIPPET_IGNORE;
02438   }
02439   nDesired = 0;
02440   for(i=0; i<pCursor->q.nTerms; i++){
02441     for(j=0; j<nMatch; j++){
02442       if( aMatch[j].iTerm==i ){
02443         aMatch[j].snStatus = SNIPPET_DESIRED;
02444         nDesired++;
02445         break;
02446       }
02447     }
02448   }
02449 
02450   iMatch = 0;
02451   tailCol = -1;
02452   tailOffset = 0;
02453   for(i=0; i<nMatch && nDesired>0; i++){
02454     if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
02455     nDesired--;
02456     iCol = aMatch[i].iCol;
02457     zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
02458     nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
02459     iStart = aMatch[i].iStart - 40;
02460     iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
02461     if( iStart<=10 ){
02462       iStart = 0;
02463     }
02464     if( iCol==tailCol && iStart<=tailOffset+20 ){
02465       iStart = tailOffset;
02466     }
02467     if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
02468       trimWhiteSpace(&sb);
02469       appendWhiteSpace(&sb);
02470       append(&sb, zEllipsis);
02471       appendWhiteSpace(&sb);
02472     }
02473     iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
02474     iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
02475     if( iEnd>=nDoc-10 ){
02476       iEnd = nDoc;
02477       tailEllipsis = 0;
02478     }else{
02479       tailEllipsis = 1;
02480     }
02481     while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
02482     while( iStart<iEnd ){
02483       while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
02484              && aMatch[iMatch].iCol<=iCol ){
02485         iMatch++;
02486       }
02487       if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
02488              && aMatch[iMatch].iCol==iCol ){
02489         nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
02490         iStart = aMatch[iMatch].iStart;
02491         append(&sb, zStartMark);
02492         nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
02493         append(&sb, zEndMark);
02494         iStart += aMatch[iMatch].nByte;
02495         for(j=iMatch+1; j<nMatch; j++){
02496           if( aMatch[j].iTerm==aMatch[iMatch].iTerm
02497               && aMatch[j].snStatus==SNIPPET_DESIRED ){
02498             nDesired--;
02499             aMatch[j].snStatus = SNIPPET_IGNORE;
02500           }
02501         }
02502       }else{
02503         nappend(&sb, &zDoc[iStart], iEnd - iStart);
02504         iStart = iEnd;
02505       }
02506     }
02507     tailCol = iCol;
02508     tailOffset = iEnd;
02509   }
02510   trimWhiteSpace(&sb);
02511   if( tailEllipsis ){
02512     appendWhiteSpace(&sb);
02513     append(&sb, zEllipsis);
02514   }
02515   pCursor->snippet.zSnippet = sb.s;
02516   pCursor->snippet.nSnippet = sb.len;  
02517 }
02518 
02519 
02520 /*
02521 ** Close the cursor.  For additional information see the documentation
02522 ** on the xClose method of the virtual table interface.
02523 */
02524 static int fulltextClose(sqlite3_vtab_cursor *pCursor){
02525   fulltext_cursor *c = (fulltext_cursor *) pCursor;
02526   TRACE(("FTS1 Close %p\n", c));
02527   sqlite3_finalize(c->pStmt);
02528   queryClear(&c->q);
02529   snippetClear(&c->snippet);
02530   if( c->result.pDoclist!=NULL ){
02531     docListDelete(c->result.pDoclist);
02532   }
02533   free(c);
02534   return SQLITE_OK;
02535 }
02536 
02537 static int fulltextNext(sqlite3_vtab_cursor *pCursor){
02538   fulltext_cursor *c = (fulltext_cursor *) pCursor;
02539   sqlite_int64 iDocid;
02540   int rc;
02541 
02542   TRACE(("FTS1 Next %p\n", pCursor));
02543   snippetClear(&c->snippet);
02544   if( c->iCursorType < QUERY_FULLTEXT ){
02545     /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
02546     rc = sqlite3_step(c->pStmt);
02547     switch( rc ){
02548       case SQLITE_ROW:
02549         c->eof = 0;
02550         return SQLITE_OK;
02551       case SQLITE_DONE:
02552         c->eof = 1;
02553         return SQLITE_OK;
02554       default:
02555         c->eof = 1;
02556         return rc;
02557     }
02558   } else {  /* full-text query */
02559     rc = sqlite3_reset(c->pStmt);
02560     if( rc!=SQLITE_OK ) return rc;
02561 
02562     iDocid = nextDocid(&c->result);
02563     if( iDocid==0 ){
02564       c->eof = 1;
02565       return SQLITE_OK;
02566     }
02567     rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
02568     if( rc!=SQLITE_OK ) return rc;
02569     /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
02570     rc = sqlite3_step(c->pStmt);
02571     if( rc==SQLITE_ROW ){   /* the case we expect */
02572       c->eof = 0;
02573       return SQLITE_OK;
02574     }
02575     /* an error occurred; abort */
02576     return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
02577   }
02578 }
02579 
02580 
02581 /* Return a DocList corresponding to the query term *pTerm.  If *pTerm
02582 ** is the first term of a phrase query, go ahead and evaluate the phrase
02583 ** query and return the doclist for the entire phrase query.
02584 **
02585 ** The result is stored in pTerm->doclist.
02586 */
02587 static int docListOfTerm(
02588   fulltext_vtab *v,     /* The full text index */
02589   int iColumn,          /* column to restrict to.  No restrition if >=nColumn */
02590   QueryTerm *pQTerm,    /* Term we are looking for, or 1st term of a phrase */
02591   DocList **ppResult    /* Write the result here */
02592 ){
02593   DocList *pLeft, *pRight, *pNew;
02594   int i, rc;
02595 
02596   pLeft = docListNew(DL_POSITIONS);
02597   rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
02598   if( rc ){
02599     docListDelete(pLeft);
02600     return rc;
02601   }
02602   for(i=1; i<=pQTerm->nPhrase; i++){
02603     pRight = docListNew(DL_POSITIONS);
02604     rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
02605     if( rc ){
02606       docListDelete(pLeft);
02607       return rc;
02608     }
02609     pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
02610     docListPhraseMerge(pLeft, pRight, pNew);
02611     docListDelete(pLeft);
02612     docListDelete(pRight);
02613     pLeft = pNew;
02614   }
02615   *ppResult = pLeft;
02616   return SQLITE_OK;
02617 }
02618 
02619 /* Add a new term pTerm[0..nTerm-1] to the query *q.
02620 */
02621 static void queryAdd(Query *q, const char *pTerm, int nTerm){
02622   QueryTerm *t;
02623   ++q->nTerms;
02624   q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
02625   if( q->pTerms==0 ){
02626     q->nTerms = 0;
02627     return;
02628   }
02629   t = &q->pTerms[q->nTerms - 1];
02630   memset(t, 0, sizeof(*t));
02631   t->pTerm = malloc(nTerm+1);
02632   memcpy(t->pTerm, pTerm, nTerm);
02633   t->pTerm[nTerm] = 0;
02634   t->nTerm = nTerm;
02635   t->isOr = q->nextIsOr;
02636   q->nextIsOr = 0;
02637   t->iColumn = q->nextColumn;
02638   q->nextColumn = q->dfltColumn;
02639 }
02640 
02641 /*
02642 ** Check to see if the string zToken[0...nToken-1] matches any
02643 ** column name in the virtual table.   If it does,
02644 ** return the zero-indexed column number.  If not, return -1.
02645 */
02646 static int checkColumnSpecifier(
02647   fulltext_vtab *pVtab,    /* The virtual table */
02648   const char *zToken,      /* Text of the token */
02649   int nToken               /* Number of characters in the token */
02650 ){
02651   int i;
02652   for(i=0; i<pVtab->nColumn; i++){
02653     if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
02654         && pVtab->azColumn[i][nToken]==0 ){
02655       return i;
02656     }
02657   }
02658   return -1;
02659 }
02660 
02661 /*
02662 ** Parse the text at pSegment[0..nSegment-1].  Add additional terms
02663 ** to the query being assemblied in pQuery.
02664 **
02665 ** inPhrase is true if pSegment[0..nSegement-1] is contained within
02666 ** double-quotes.  If inPhrase is true, then the first term
02667 ** is marked with the number of terms in the phrase less one and
02668 ** OR and "-" syntax is ignored.  If inPhrase is false, then every
02669 ** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
02670 */
02671 static int tokenizeSegment(
02672   sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
02673   const char *pSegment, int nSegment,     /* Query expression being parsed */
02674   int inPhrase,                           /* True if within "..." */
02675   Query *pQuery                           /* Append results here */
02676 ){
02677   const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
02678   sqlite3_tokenizer_cursor *pCursor;
02679   int firstIndex = pQuery->nTerms;
02680   int iCol;
02681   int nTerm = 1;
02682   
02683   int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
02684   if( rc!=SQLITE_OK ) return rc;
02685   pCursor->pTokenizer = pTokenizer;
02686 
02687   while( 1 ){
02688     const char *pToken;
02689     int nToken, iBegin, iEnd, iPos;
02690 
02691     rc = pModule->xNext(pCursor,
02692                         &pToken, &nToken,
02693                         &iBegin, &iEnd, &iPos);
02694     if( rc!=SQLITE_OK ) break;
02695     if( !inPhrase &&
02696         pSegment[iEnd]==':' &&
02697          (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
02698       pQuery->nextColumn = iCol;
02699       continue;
02700     }
02701     if( !inPhrase && pQuery->nTerms>0 && nToken==2
02702          && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
02703       pQuery->nextIsOr = 1;
02704       continue;
02705     }
02706     queryAdd(pQuery, pToken, nToken);
02707     if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
02708       pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
02709     }
02710     pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
02711     if( inPhrase ){
02712       nTerm++;
02713     }
02714   }
02715 
02716   if( inPhrase && pQuery->nTerms>firstIndex ){
02717     pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
02718   }
02719 
02720   return pModule->xClose(pCursor);
02721 }
02722 
02723 /* Parse a query string, yielding a Query object pQuery.
02724 **
02725 ** The calling function will need to queryClear() to clean up
02726 ** the dynamically allocated memory held by pQuery.
02727 */
02728 static int parseQuery(
02729   fulltext_vtab *v,        /* The fulltext index */
02730   const char *zInput,      /* Input text of the query string */
02731   int nInput,              /* Size of the input text */
02732   int dfltColumn,          /* Default column of the index to match against */
02733   Query *pQuery            /* Write the parse results here. */
02734 ){
02735   int iInput, inPhrase = 0;
02736 
02737   if( zInput==0 ) nInput = 0;
02738   if( nInput<0 ) nInput = strlen(zInput);
02739   pQuery->nTerms = 0;
02740   pQuery->pTerms = NULL;
02741   pQuery->nextIsOr = 0;
02742   pQuery->nextColumn = dfltColumn;
02743   pQuery->dfltColumn = dfltColumn;
02744   pQuery->pFts = v;
02745 
02746   for(iInput=0; iInput<nInput; ++iInput){
02747     int i;
02748     for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
02749     if( i>iInput ){
02750       tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
02751                        pQuery);
02752     }
02753     iInput = i;
02754     if( i<nInput ){
02755       assert( zInput[i]=='"' );
02756       inPhrase = !inPhrase;
02757     }
02758   }
02759 
02760   if( inPhrase ){
02761     /* unmatched quote */
02762     queryClear(pQuery);
02763     return SQLITE_ERROR;
02764   }
02765   return SQLITE_OK;
02766 }
02767 
02768 /* Perform a full-text query using the search expression in
02769 ** zInput[0..nInput-1].  Return a list of matching documents
02770 ** in pResult.
02771 **
02772 ** Queries must match column iColumn.  Or if iColumn>=nColumn
02773 ** they are allowed to match against any column.
02774 */
02775 static int fulltextQuery(
02776   fulltext_vtab *v,      /* The full text index */
02777   int iColumn,           /* Match against this column by default */
02778   const char *zInput,    /* The query string */
02779   int nInput,            /* Number of bytes in zInput[] */
02780   DocList **pResult,     /* Write the result doclist here */
02781   Query *pQuery          /* Put parsed query string here */
02782 ){
02783   int i, iNext, rc;
02784   DocList *pLeft = NULL;
02785   DocList *pRight, *pNew, *pOr;
02786   int nNot = 0;
02787   QueryTerm *aTerm;
02788 
02789   rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
02790   if( rc!=SQLITE_OK ) return rc;
02791 
02792   /* Merge AND terms. */
02793   aTerm = pQuery->pTerms;
02794   for(i = 0; i<pQuery->nTerms; i=iNext){
02795     if( aTerm[i].isNot ){
02796       /* Handle all NOT terms in a separate pass */
02797       nNot++;
02798       iNext = i + aTerm[i].nPhrase+1;
02799       continue;
02800     }
02801     iNext = i + aTerm[i].nPhrase + 1;
02802     rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
02803     if( rc ){
02804       queryClear(pQuery);
02805       return rc;
02806     }
02807     while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
02808       rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
02809       iNext += aTerm[iNext].nPhrase + 1;
02810       if( rc ){
02811         queryClear(pQuery);
02812         return rc;
02813       }
02814       pNew = docListNew(DL_DOCIDS);
02815       docListOrMerge(pRight, pOr, pNew);
02816       docListDelete(pRight);
02817       docListDelete(pOr);
02818       pRight = pNew;
02819     }
02820     if( pLeft==0 ){
02821       pLeft = pRight;
02822     }else{
02823       pNew = docListNew(DL_DOCIDS);
02824       docListAndMerge(pLeft, pRight, pNew);
02825       docListDelete(pRight);
02826       docListDelete(pLeft);
02827       pLeft = pNew;
02828     }
02829   }
02830 
02831   if( nNot && pLeft==0 ){
02832     /* We do not yet know how to handle a query of only NOT terms */
02833     return SQLITE_ERROR;
02834   }
02835 
02836   /* Do the EXCEPT terms */
02837   for(i=0; i<pQuery->nTerms;  i += aTerm[i].nPhrase + 1){
02838     if( !aTerm[i].isNot ) continue;
02839     rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
02840     if( rc ){
02841       queryClear(pQuery);
02842       docListDelete(pLeft);
02843       return rc;
02844     }
02845     pNew = docListNew(DL_DOCIDS);
02846     docListExceptMerge(pLeft, pRight, pNew);
02847     docListDelete(pRight);
02848     docListDelete(pLeft);
02849     pLeft = pNew;
02850   }
02851 
02852   *pResult = pLeft;
02853   return rc;
02854 }
02855 
02856 /*
02857 ** This is the xFilter interface for the virtual table.  See
02858 ** the virtual table xFilter method documentation for additional
02859 ** information.
02860 **
02861 ** If idxNum==QUERY_GENERIC then do a full table scan against
02862 ** the %_content table.
02863 **
02864 ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
02865 ** in the %_content table.
02866 **
02867 ** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
02868 ** column on the left-hand side of the MATCH operator is column
02869 ** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
02870 ** side of the MATCH operator.
02871 */
02872 /* TODO(shess) Upgrade the cursor initialization and destruction to
02873 ** account for fulltextFilter() being called multiple times on the
02874 ** same cursor.  The current solution is very fragile.  Apply fix to
02875 ** fts2 as appropriate.
02876 */
02877 static int fulltextFilter(
02878   sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
02879   int idxNum, const char *idxStr,   /* Which indexing scheme to use */
02880   int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
02881 ){
02882   fulltext_cursor *c = (fulltext_cursor *) pCursor;
02883   fulltext_vtab *v = cursor_vtab(c);
02884   int rc;
02885   char *zSql;
02886 
02887   TRACE(("FTS1 Filter %p\n",pCursor));
02888 
02889   zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
02890                           idxNum==QUERY_GENERIC ? "" : "where rowid=?");
02891   sqlite3_finalize(c->pStmt);
02892   rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
02893   sqlite3_free(zSql);
02894   if( rc!=SQLITE_OK ) return rc;
02895 
02896   c->iCursorType = idxNum;
02897   switch( idxNum ){
02898     case QUERY_GENERIC:
02899       break;
02900 
02901     case QUERY_ROWID:
02902       rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
02903       if( rc!=SQLITE_OK ) return rc;
02904       break;
02905 
02906     default:   /* full-text search */
02907     {
02908       const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
02909       DocList *pResult;
02910       assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
02911       assert( argc==1 );
02912       queryClear(&c->q);
02913       rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
02914       if( rc!=SQLITE_OK ) return rc;
02915       if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
02916       readerInit(&c->result, pResult);
02917       break;
02918     }
02919   }
02920 
02921   return fulltextNext(pCursor);
02922 }
02923 
02924 /* This is the xEof method of the virtual table.  The SQLite core
02925 ** calls this routine to find out if it has reached the end of
02926 ** a query's results set.
02927 */
02928 static int fulltextEof(sqlite3_vtab_cursor *pCursor){
02929   fulltext_cursor *c = (fulltext_cursor *) pCursor;
02930   return c->eof;
02931 }
02932 
02933 /* This is the xColumn method of the virtual table.  The SQLite
02934 ** core calls this method during a query when it needs the value
02935 ** of a column from the virtual table.  This method needs to use
02936 ** one of the sqlite3_result_*() routines to store the requested
02937 ** value back in the pContext.
02938 */
02939 static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
02940                           sqlite3_context *pContext, int idxCol){
02941   fulltext_cursor *c = (fulltext_cursor *) pCursor;
02942   fulltext_vtab *v = cursor_vtab(c);
02943 
02944   if( idxCol<v->nColumn ){
02945     sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
02946     sqlite3_result_value(pContext, pVal);
02947   }else if( idxCol==v->nColumn ){
02948     /* The extra column whose name is the same as the table.
02949     ** Return a blob which is a pointer to the cursor
02950     */
02951     sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
02952   }
02953   return SQLITE_OK;
02954 }
02955 
02956 /* This is the xRowid method.  The SQLite core calls this routine to
02957 ** retrive the rowid for the current row of the result set.  The
02958 ** rowid should be written to *pRowid.
02959 */
02960 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
02961   fulltext_cursor *c = (fulltext_cursor *) pCursor;
02962 
02963   *pRowid = sqlite3_column_int64(c->pStmt, 0);
02964   return SQLITE_OK;
02965 }
02966 
02967 /* Add all terms in [zText] to the given hash table.  If [iColumn] > 0,
02968  * we also store positions and offsets in the hash table using the given
02969  * column number. */
02970 static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
02971                       const char *zText, int iColumn){
02972   sqlite3_tokenizer *pTokenizer = v->pTokenizer;
02973   sqlite3_tokenizer_cursor *pCursor;
02974   const char *pToken;
02975   int nTokenBytes;
02976   int iStartOffset, iEndOffset, iPosition;
02977   int rc;
02978 
02979   rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
02980   if( rc!=SQLITE_OK ) return rc;
02981 
02982   pCursor->pTokenizer = pTokenizer;
02983   while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
02984                                                &pToken, &nTokenBytes,
02985                                                &iStartOffset, &iEndOffset,
02986                                                &iPosition) ){
02987     DocList *p;
02988 
02989     /* Positions can't be negative; we use -1 as a terminator internally. */
02990     if( iPosition<0 ){
02991       pTokenizer->pModule->xClose(pCursor);
02992       return SQLITE_ERROR;
02993     }
02994 
02995     p = fts1HashFind(terms, pToken, nTokenBytes);
02996     if( p==NULL ){
02997       p = docListNew(DL_DEFAULT);
02998       docListAddDocid(p, iDocid);
02999       fts1HashInsert(terms, pToken, nTokenBytes, p);
03000     }
03001     if( iColumn>=0 ){
03002       docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
03003     }
03004   }
03005 
03006   /* TODO(shess) Check return?  Should this be able to cause errors at
03007   ** this point?  Actually, same question about sqlite3_finalize(),
03008   ** though one could argue that failure there means that the data is
03009   ** not durable.  *ponder*
03010   */
03011   pTokenizer->pModule->xClose(pCursor);
03012   return rc;
03013 }
03014 
03015 /* Update the %_terms table to map the term [pTerm] to the given rowid. */
03016 static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
03017                              DocList *d){
03018   sqlite_int64 iIndexRow;
03019   DocList doclist;
03020   int iSegment = 0, rc;
03021 
03022   rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
03023   if( rc==SQLITE_DONE ){
03024     docListInit(&doclist, DL_DEFAULT, 0, 0);
03025     docListUpdate(&doclist, d);
03026     /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
03027     rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
03028     goto err;
03029   }
03030   if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
03031 
03032   docListUpdate(&doclist, d);
03033   if( doclist.nData<=CHUNK_MAX ){
03034     rc = term_update(v, iIndexRow, &doclist);
03035     goto err;
03036   }
03037 
03038   /* Doclist doesn't fit, delete what's there, and accumulate
03039   ** forward.
03040   */
03041   rc = term_delete(v, iIndexRow);
03042   if( rc!=SQLITE_OK ) goto err;
03043 
03044   /* Try to insert the doclist into a higher segment bucket.  On
03045   ** failure, accumulate existing doclist with the doclist from that
03046   ** bucket, and put results in the next bucket.
03047   */
03048   iSegment++;
03049   while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
03050                          &doclist))!=SQLITE_OK ){
03051     sqlite_int64 iSegmentRow;
03052     DocList old;
03053     int rc2;
03054 
03055     /* Retain old error in case the term_insert() error was really an
03056     ** error rather than a bounced insert.
03057     */
03058     rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
03059     if( rc2!=SQLITE_ROW ) goto err;
03060 
03061     rc = term_delete(v, iSegmentRow);
03062     if( rc!=SQLITE_OK ) goto err;
03063 
03064     /* Reusing lowest-number deleted row keeps the index smaller. */
03065     if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;
03066 
03067     /* doclist contains the newer data, so accumulate it over old.
03068     ** Then steal accumulated data for doclist.
03069     */
03070     docListAccumulate(&old, &doclist);
03071     docListDestroy(&doclist);
03072     doclist = old;
03073 
03074     iSegment++;
03075   }
03076 
03077  err:
03078   docListDestroy(&doclist);
03079   return rc;
03080 }
03081 
03082 /* Add doclists for all terms in [pValues] to the hash table [terms]. */
03083 static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid,
03084                 sqlite3_value **pValues){
03085   int i;
03086   for(i = 0; i < v->nColumn ; ++i){
03087     char *zText = (char*)sqlite3_value_text(pValues[i]);
03088     int rc = buildTerms(v, terms, iRowid, zText, i);
03089     if( rc!=SQLITE_OK ) return rc;
03090   }
03091   return SQLITE_OK;
03092 }
03093 
03094 /* Add empty doclists for all terms in the given row's content to the hash
03095  * table [pTerms]. */
03096 static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
03097   const char **pValues;
03098   int i;
03099 
03100   int rc = content_select(v, iRowid, &pValues);
03101   if( rc!=SQLITE_OK ) return rc;
03102 
03103   for(i = 0 ; i < v->nColumn; ++i) {
03104     rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
03105     if( rc!=SQLITE_OK ) break;
03106   }
03107 
03108   freeStringArray(v->nColumn, pValues);
03109   return SQLITE_OK;
03110 }
03111 
03112 /* Insert a row into the %_content table; set *piRowid to be the ID of the
03113  * new row.  Fill [pTerms] with new doclists for the %_term table. */
03114 static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
03115                         sqlite3_value **pValues,
03116                         sqlite_int64 *piRowid, fts1Hash *pTerms){
03117   int rc;
03118 
03119   rc = content_insert(v, pRequestRowid, pValues);  /* execute an SQL INSERT */
03120   if( rc!=SQLITE_OK ) return rc;
03121   *piRowid = sqlite3_last_insert_rowid(v->db);
03122   return insertTerms(v, pTerms, *piRowid, pValues);
03123 }
03124 
03125 /* Delete a row from the %_content table; fill [pTerms] with empty doclists
03126  * to be written to the %_term table. */
03127 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
03128   int rc = deleteTerms(v, pTerms, iRow);
03129   if( rc!=SQLITE_OK ) return rc;
03130   return content_delete(v, iRow);  /* execute an SQL DELETE */
03131 }
03132 
03133 /* Update a row in the %_content table; fill [pTerms] with new doclists for the
03134  * %_term table. */
03135 static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
03136                         sqlite3_value **pValues, fts1Hash *pTerms){
03137   /* Generate an empty doclist for each term that previously appeared in this
03138    * row. */
03139   int rc = deleteTerms(v, pTerms, iRow);
03140   if( rc!=SQLITE_OK ) return rc;
03141 
03142   rc = content_update(v, pValues, iRow);  /* execute an SQL UPDATE */
03143   if( rc!=SQLITE_OK ) return rc;
03144 
03145   /* Now add positions for terms which appear in the updated row. */
03146   return insertTerms(v, pTerms, iRow, pValues);
03147 }
03148 
03149 /* This function implements the xUpdate callback; it is the top-level entry
03150  * point for inserting, deleting or updating a row in a full-text table. */
03151 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
03152                    sqlite_int64 *pRowid){
03153   fulltext_vtab *v = (fulltext_vtab *) pVtab;
03154   fts1Hash terms;   /* maps term string -> PosList */
03155   int rc;
03156   fts1HashElem *e;
03157 
03158   TRACE(("FTS1 Update %p\n", pVtab));
03159   
03160   fts1HashInit(&terms, FTS1_HASH_STRING, 1);
03161 
03162   if( nArg<2 ){
03163     rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
03164   } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
03165     /* An update:
03166      * ppArg[0] = old rowid
03167      * ppArg[1] = new rowid
03168      * ppArg[2..2+v->nColumn-1] = values
03169      * ppArg[2+v->nColumn] = value for magic column (we ignore this)
03170      */
03171     sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
03172     if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
03173       sqlite3_value_int64(ppArg[1]) != rowid ){
03174       rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
03175     } else {
03176       assert( nArg==2+v->nColumn+1);
03177       rc = index_update(v, rowid, &ppArg[2], &terms);
03178     }
03179   } else {
03180     /* An insert:
03181      * ppArg[1] = requested rowid
03182      * ppArg[2..2+v->nColumn-1] = values
03183      * ppArg[2+v->nColumn] = value for magic column (we ignore this)
03184      */
03185     assert( nArg==2+v->nColumn+1);
03186     rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
03187   }
03188 
03189   if( rc==SQLITE_OK ){
03190     /* Write updated doclists to disk. */
03191     for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
03192       DocList *p = fts1HashData(e);
03193       rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
03194       if( rc!=SQLITE_OK ) break;
03195     }
03196   }
03197 
03198   /* clean up */
03199   for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
03200     DocList *p = fts1HashData(e);
03201     docListDelete(p);
03202   }
03203   fts1HashClear(&terms);
03204 
03205   return rc;
03206 }
03207 
03208 /*
03209 ** Implementation of the snippet() function for FTS1
03210 */
03211 static void snippetFunc(
03212   sqlite3_context *pContext,
03213   int argc,
03214   sqlite3_value **argv
03215 ){
03216   fulltext_cursor *pCursor;
03217   if( argc<1 ) return;
03218   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
03219       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
03220     sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
03221   }else{
03222     const char *zStart = "<b>";
03223     const char *zEnd = "</b>";
03224     const char *zEllipsis = "<b>...</b>";
03225     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
03226     if( argc>=2 ){
03227       zStart = (const char*)sqlite3_value_text(argv[1]);
03228       if( argc>=3 ){
03229         zEnd = (const char*)sqlite3_value_text(argv[2]);
03230         if( argc>=4 ){
03231           zEllipsis = (const char*)sqlite3_value_text(argv[3]);
03232         }
03233       }
03234     }
03235     snippetAllOffsets(pCursor);
03236     snippetText(pCursor, zStart, zEnd, zEllipsis);
03237     sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
03238                         pCursor->snippet.nSnippet, SQLITE_STATIC);
03239   }
03240 }
03241 
03242 /*
03243 ** Implementation of the offsets() function for FTS1
03244 */
03245 static void snippetOffsetsFunc(
03246   sqlite3_context *pContext,
03247   int argc,
03248   sqlite3_value **argv
03249 ){
03250   fulltext_cursor *pCursor;
03251   if( argc<1 ) return;
03252   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
03253       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
03254     sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
03255   }else{
03256     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
03257     snippetAllOffsets(pCursor);
03258     snippetOffsetText(&pCursor->snippet);
03259     sqlite3_result_text(pContext,
03260                         pCursor->snippet.zOffset, pCursor->snippet.nOffset,
03261                         SQLITE_STATIC);
03262   }
03263 }
03264 
03265 /*
03266 ** This routine implements the xFindFunction method for the FTS1
03267 ** virtual table.
03268 */
03269 static int fulltextFindFunction(
03270   sqlite3_vtab *pVtab,
03271   int nArg,
03272   const char *zName,
03273   void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
03274   void **ppArg
03275 ){
03276   if( strcmp(zName,"snippet")==0 ){
03277     *pxFunc = snippetFunc;
03278     return 1;
03279   }else if( strcmp(zName,"offsets")==0 ){
03280     *pxFunc = snippetOffsetsFunc;
03281     return 1;
03282   }
03283   return 0;
03284 }
03285 
03286 /*
03287 ** Rename an fts1 table.
03288 */
03289 static int fulltextRename(
03290   sqlite3_vtab *pVtab,
03291   const char *zName
03292 ){
03293   fulltext_vtab *p = (fulltext_vtab *)pVtab;
03294   int rc = SQLITE_NOMEM;
03295   char *zSql = sqlite3_mprintf(
03296     "ALTER TABLE %Q.'%q_content'  RENAME TO '%q_content';"
03297     "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
03298     , p->zDb, p->zName, zName
03299     , p->zDb, p->zName, zName
03300   );
03301   if( zSql ){
03302     rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
03303     sqlite3_free(zSql);
03304   }
03305   return rc;
03306 }
03307 
03308 static const sqlite3_module fulltextModule = {
03309   /* iVersion      */ 0,
03310   /* xCreate       */ fulltextCreate,
03311   /* xConnect      */ fulltextConnect,
03312   /* xBestIndex    */ fulltextBestIndex,
03313   /* xDisconnect   */ fulltextDisconnect,
03314   /* xDestroy      */ fulltextDestroy,
03315   /* xOpen         */ fulltextOpen,
03316   /* xClose        */ fulltextClose,
03317   /* xFilter       */ fulltextFilter,
03318   /* xNext         */ fulltextNext,
03319   /* xEof          */ fulltextEof,
03320   /* xColumn       */ fulltextColumn,
03321   /* xRowid        */ fulltextRowid,
03322   /* xUpdate       */ fulltextUpdate,
03323   /* xBegin        */ 0, 
03324   /* xSync         */ 0,
03325   /* xCommit       */ 0,
03326   /* xRollback     */ 0,
03327   /* xFindFunction */ fulltextFindFunction,
03328   /* xRename       */ fulltextRename,
03329 };
03330 
03331 int sqlite3Fts1Init(sqlite3 *db){
03332   sqlite3_overload_function(db, "snippet", -1);
03333   sqlite3_overload_function(db, "offsets", -1);
03334   return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
03335 }
03336 
03337 #if !SQLITE_CORE
03338 int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
03339                            const sqlite3_api_routines *pApi){
03340   SQLITE_EXTENSION_INIT2(pApi)
03341   return sqlite3Fts1Init(db);
03342 }
03343 #endif
03344 
03345 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1