00001 /* fts1 has a design flaw which can lead to database corruption (see 00002 ** below). It is recommended not to use it any longer, instead use 00003 ** fts3 (or higher). If you believe that your use of fts1 is safe, 00004 ** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS. 00005 */ 00006 #if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)) \ 00007 && !defined(SQLITE_ENABLE_BROKEN_FTS1) 00008 #error fts1 has a design flaw and has been deprecated. 00009 #endif 00010 /* The flaw is that fts1 uses the content table's unaliased rowid as 00011 ** the unique docid. fts1 embeds the rowid in the index it builds, 00012 ** and expects the rowid to not change. The SQLite VACUUM operation 00013 ** will renumber such rowids, thereby breaking fts1. If you are using 00014 ** fts1 in a system which has disabled VACUUM, then you can continue 00015 ** to use it safely. Note that PRAGMA auto_vacuum does NOT disable 00016 ** VACUUM, though systems using auto_vacuum are unlikely to invoke 00017 ** VACUUM. 00018 ** 00019 ** fts1 should be safe even across VACUUM if you only insert documents 00020 ** and never delete. 00021 */ 00022 00023 /* The author disclaims copyright to this source code. 00024 * 00025 * This is an SQLite module implementing full-text search. 00026 */ 00027 00028 /* 00029 ** The code in this file is only compiled if: 00030 ** 00031 ** * The FTS1 module is being built as an extension 00032 ** (in which case SQLITE_CORE is not defined), or 00033 ** 00034 ** * The FTS1 module is being built into the core of 00035 ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). 00036 */ 00037 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) 00038 00039 #if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE) 00040 # define SQLITE_CORE 1 00041 #endif 00042 00043 #include <assert.h> 00044 #include <stdlib.h> 00045 #include <stdio.h> 00046 #include <string.h> 00047 #include <ctype.h> 00048 00049 #include "fts1.h" 00050 #include "fts1_hash.h" 00051 #include "fts1_tokenizer.h" 00052 #include "sqlite3.h" 00053 #include "sqlite3ext.h" 00054 SQLITE_EXTENSION_INIT1 00055 00056 00057 #if 0 00058 # define TRACE(A) printf A; fflush(stdout) 00059 #else 00060 # define TRACE(A) 00061 #endif 00062 00063 /* utility functions */ 00064 00065 typedef struct StringBuffer { 00066 int len; /* length, not including null terminator */ 00067 int alloced; /* Space allocated for s[] */ 00068 char *s; /* Content of the string */ 00069 } StringBuffer; 00070 00071 static void initStringBuffer(StringBuffer *sb){ 00072 sb->len = 0; 00073 sb->alloced = 100; 00074 sb->s = malloc(100); 00075 sb->s[0] = '\0'; 00076 } 00077 00078 static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){ 00079 if( sb->len + nFrom >= sb->alloced ){ 00080 sb->alloced = sb->len + nFrom + 100; 00081 sb->s = realloc(sb->s, sb->alloced+1); 00082 if( sb->s==0 ){ 00083 initStringBuffer(sb); 00084 return; 00085 } 00086 } 00087 memcpy(sb->s + sb->len, zFrom, nFrom); 00088 sb->len += nFrom; 00089 sb->s[sb->len] = 0; 00090 } 00091 static void append(StringBuffer *sb, const char *zFrom){ 00092 nappend(sb, zFrom, strlen(zFrom)); 00093 } 00094 00095 /* We encode variable-length integers in little-endian order using seven bits 00096 * per byte as follows: 00097 ** 00098 ** KEY: 00099 ** A = 0xxxxxxx 7 bits of data and one flag bit 00100 ** B = 1xxxxxxx 7 bits of data and one flag bit 00101 ** 00102 ** 7 bits - A 00103 ** 14 bits - BA 00104 ** 21 bits - BBA 00105 ** and so on. 00106 */ 00107 00108 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ 00109 #define VARINT_MAX 10 00110 00111 /* Write a 64-bit variable-length integer to memory starting at p[0]. 00112 * The length of data written will be between 1 and VARINT_MAX bytes. 00113 * The number of bytes written is returned. */ 00114 static int putVarint(char *p, sqlite_int64 v){ 00115 unsigned char *q = (unsigned char *) p; 00116 sqlite_uint64 vu = v; 00117 do{ 00118 *q++ = (unsigned char) ((vu & 0x7f) | 0x80); 00119 vu >>= 7; 00120 }while( vu!=0 ); 00121 q[-1] &= 0x7f; /* turn off high bit in final byte */ 00122 assert( q - (unsigned char *)p <= VARINT_MAX ); 00123 return (int) (q - (unsigned char *)p); 00124 } 00125 00126 /* Read a 64-bit variable-length integer from memory starting at p[0]. 00127 * Return the number of bytes read, or 0 on error. 00128 * The value is stored in *v. */ 00129 static int getVarint(const char *p, sqlite_int64 *v){ 00130 const unsigned char *q = (const unsigned char *) p; 00131 sqlite_uint64 x = 0, y = 1; 00132 while( (*q & 0x80) == 0x80 ){ 00133 x += y * (*q++ & 0x7f); 00134 y <<= 7; 00135 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ 00136 assert( 0 ); 00137 return 0; 00138 } 00139 } 00140 x += y * (*q++); 00141 *v = (sqlite_int64) x; 00142 return (int) (q - (unsigned char *)p); 00143 } 00144 00145 static int getVarint32(const char *p, int *pi){ 00146 sqlite_int64 i; 00147 int ret = getVarint(p, &i); 00148 *pi = (int) i; 00149 assert( *pi==i ); 00150 return ret; 00151 } 00152 00153 /*** Document lists *** 00154 * 00155 * A document list holds a sorted list of varint-encoded document IDs. 00156 * 00157 * A doclist with type DL_POSITIONS_OFFSETS is stored like this: 00158 * 00159 * array { 00160 * varint docid; 00161 * array { 00162 * varint position; (delta from previous position plus POS_BASE) 00163 * varint startOffset; (delta from previous startOffset) 00164 * varint endOffset; (delta from startOffset) 00165 * } 00166 * } 00167 * 00168 * Here, array { X } means zero or more occurrences of X, adjacent in memory. 00169 * 00170 * A position list may hold positions for text in multiple columns. A position 00171 * POS_COLUMN is followed by a varint containing the index of the column for 00172 * following positions in the list. Any positions appearing before any 00173 * occurrences of POS_COLUMN are for column 0. 00174 * 00175 * A doclist with type DL_POSITIONS is like the above, but holds only docids 00176 * and positions without offset information. 00177 * 00178 * A doclist with type DL_DOCIDS is like the above, but holds only docids 00179 * without positions or offset information. 00180 * 00181 * On disk, every document list has positions and offsets, so we don't bother 00182 * to serialize a doclist's type. 00183 * 00184 * We don't yet delta-encode document IDs; doing so will probably be a 00185 * modest win. 00186 * 00187 * NOTE(shess) I've thought of a slightly (1%) better offset encoding. 00188 * After the first offset, estimate the next offset by using the 00189 * current token position and the previous token position and offset, 00190 * offset to handle some variance. So the estimate would be 00191 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded 00192 * as normal. Offsets more than 64 chars from the estimate are 00193 * encoded as the delta to the previous start offset + 128. An 00194 * additional tiny increment can be gained by using the end offset of 00195 * the previous token to make the estimate a tiny bit more precise. 00196 */ 00197 00198 /* It is not safe to call isspace(), tolower(), or isalnum() on 00199 ** hi-bit-set characters. This is the same solution used in the 00200 ** tokenizer. 00201 */ 00202 /* TODO(shess) The snippet-generation code should be using the 00203 ** tokenizer-generated tokens rather than doing its own local 00204 ** tokenization. 00205 */ 00206 /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ 00207 static int safe_isspace(char c){ 00208 return (c&0x80)==0 ? isspace(c) : 0; 00209 } 00210 static int safe_tolower(char c){ 00211 return (c&0x80)==0 ? tolower(c) : c; 00212 } 00213 static int safe_isalnum(char c){ 00214 return (c&0x80)==0 ? isalnum(c) : 0; 00215 } 00216 00217 typedef enum DocListType { 00218 DL_DOCIDS, /* docids only */ 00219 DL_POSITIONS, /* docids + positions */ 00220 DL_POSITIONS_OFFSETS /* docids + positions + offsets */ 00221 } DocListType; 00222 00223 /* 00224 ** By default, only positions and not offsets are stored in the doclists. 00225 ** To change this so that offsets are stored too, compile with 00226 ** 00227 ** -DDL_DEFAULT=DL_POSITIONS_OFFSETS 00228 ** 00229 */ 00230 #ifndef DL_DEFAULT 00231 # define DL_DEFAULT DL_POSITIONS 00232 #endif 00233 00234 typedef struct DocList { 00235 char *pData; 00236 int nData; 00237 DocListType iType; 00238 int iLastColumn; /* the last column written */ 00239 int iLastPos; /* the last position written */ 00240 int iLastOffset; /* the last start offset written */ 00241 } DocList; 00242 00243 enum { 00244 POS_END = 0, /* end of this position list */ 00245 POS_COLUMN, /* followed by new column number */ 00246 POS_BASE 00247 }; 00248 00249 /* Initialize a new DocList to hold the given data. */ 00250 static void docListInit(DocList *d, DocListType iType, 00251 const char *pData, int nData){ 00252 d->nData = nData; 00253 if( nData>0 ){ 00254 d->pData = malloc(nData); 00255 memcpy(d->pData, pData, nData); 00256 } else { 00257 d->pData = NULL; 00258 } 00259 d->iType = iType; 00260 d->iLastColumn = 0; 00261 d->iLastPos = d->iLastOffset = 0; 00262 } 00263 00264 /* Create a new dynamically-allocated DocList. */ 00265 static DocList *docListNew(DocListType iType){ 00266 DocList *d = (DocList *) malloc(sizeof(DocList)); 00267 docListInit(d, iType, 0, 0); 00268 return d; 00269 } 00270 00271 static void docListDestroy(DocList *d){ 00272 free(d->pData); 00273 #ifndef NDEBUG 00274 memset(d, 0x55, sizeof(*d)); 00275 #endif 00276 } 00277 00278 static void docListDelete(DocList *d){ 00279 docListDestroy(d); 00280 free(d); 00281 } 00282 00283 static char *docListEnd(DocList *d){ 00284 return d->pData + d->nData; 00285 } 00286 00287 /* Append a varint to a DocList's data. */ 00288 static void appendVarint(DocList *d, sqlite_int64 i){ 00289 char c[VARINT_MAX]; 00290 int n = putVarint(c, i); 00291 d->pData = realloc(d->pData, d->nData + n); 00292 memcpy(d->pData + d->nData, c, n); 00293 d->nData += n; 00294 } 00295 00296 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ 00297 appendVarint(d, iDocid); 00298 if( d->iType>=DL_POSITIONS ){ 00299 appendVarint(d, POS_END); /* initially empty position list */ 00300 d->iLastColumn = 0; 00301 d->iLastPos = d->iLastOffset = 0; 00302 } 00303 } 00304 00305 /* helper function for docListAddPos and docListAddPosOffset */ 00306 static void addPos(DocList *d, int iColumn, int iPos){ 00307 assert( d->nData>0 ); 00308 --d->nData; /* remove previous terminator */ 00309 if( iColumn!=d->iLastColumn ){ 00310 assert( iColumn>d->iLastColumn ); 00311 appendVarint(d, POS_COLUMN); 00312 appendVarint(d, iColumn); 00313 d->iLastColumn = iColumn; 00314 d->iLastPos = d->iLastOffset = 0; 00315 } 00316 assert( iPos>=d->iLastPos ); 00317 appendVarint(d, iPos-d->iLastPos+POS_BASE); 00318 d->iLastPos = iPos; 00319 } 00320 00321 /* Add a position to the last position list in a doclist. */ 00322 static void docListAddPos(DocList *d, int iColumn, int iPos){ 00323 assert( d->iType==DL_POSITIONS ); 00324 addPos(d, iColumn, iPos); 00325 appendVarint(d, POS_END); /* add new terminator */ 00326 } 00327 00328 /* 00329 ** Add a position and starting and ending offsets to a doclist. 00330 ** 00331 ** If the doclist is setup to handle only positions, then insert 00332 ** the position only and ignore the offsets. 00333 */ 00334 static void docListAddPosOffset( 00335 DocList *d, /* Doclist under construction */ 00336 int iColumn, /* Column the inserted term is part of */ 00337 int iPos, /* Position of the inserted term */ 00338 int iStartOffset, /* Starting offset of inserted term */ 00339 int iEndOffset /* Ending offset of inserted term */ 00340 ){ 00341 assert( d->iType>=DL_POSITIONS ); 00342 addPos(d, iColumn, iPos); 00343 if( d->iType==DL_POSITIONS_OFFSETS ){ 00344 assert( iStartOffset>=d->iLastOffset ); 00345 appendVarint(d, iStartOffset-d->iLastOffset); 00346 d->iLastOffset = iStartOffset; 00347 assert( iEndOffset>=iStartOffset ); 00348 appendVarint(d, iEndOffset-iStartOffset); 00349 } 00350 appendVarint(d, POS_END); /* add new terminator */ 00351 } 00352 00353 /* 00354 ** A DocListReader object is a cursor into a doclist. Initialize 00355 ** the cursor to the beginning of the doclist by calling readerInit(). 00356 ** Then use routines 00357 ** 00358 ** peekDocid() 00359 ** readDocid() 00360 ** readPosition() 00361 ** skipPositionList() 00362 ** and so forth... 00363 ** 00364 ** to read information out of the doclist. When we reach the end 00365 ** of the doclist, atEnd() returns TRUE. 00366 */ 00367 typedef struct DocListReader { 00368 DocList *pDoclist; /* The document list we are stepping through */ 00369 char *p; /* Pointer to next unread byte in the doclist */ 00370 int iLastColumn; 00371 int iLastPos; /* the last position read, or -1 when not in a position list */ 00372 } DocListReader; 00373 00374 /* 00375 ** Initialize the DocListReader r to point to the beginning of pDoclist. 00376 */ 00377 static void readerInit(DocListReader *r, DocList *pDoclist){ 00378 r->pDoclist = pDoclist; 00379 if( pDoclist!=NULL ){ 00380 r->p = pDoclist->pData; 00381 } 00382 r->iLastColumn = -1; 00383 r->iLastPos = -1; 00384 } 00385 00386 /* 00387 ** Return TRUE if we have reached then end of pReader and there is 00388 ** nothing else left to read. 00389 */ 00390 static int atEnd(DocListReader *pReader){ 00391 return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist)); 00392 } 00393 00394 /* Peek at the next docid without advancing the read pointer. 00395 */ 00396 static sqlite_int64 peekDocid(DocListReader *pReader){ 00397 sqlite_int64 ret; 00398 assert( !atEnd(pReader) ); 00399 assert( pReader->iLastPos==-1 ); 00400 getVarint(pReader->p, &ret); 00401 return ret; 00402 } 00403 00404 /* Read the next docid. See also nextDocid(). 00405 */ 00406 static sqlite_int64 readDocid(DocListReader *pReader){ 00407 sqlite_int64 ret; 00408 assert( !atEnd(pReader) ); 00409 assert( pReader->iLastPos==-1 ); 00410 pReader->p += getVarint(pReader->p, &ret); 00411 if( pReader->pDoclist->iType>=DL_POSITIONS ){ 00412 pReader->iLastColumn = 0; 00413 pReader->iLastPos = 0; 00414 } 00415 return ret; 00416 } 00417 00418 /* Read the next position and column index from a position list. 00419 * Returns the position, or -1 at the end of the list. */ 00420 static int readPosition(DocListReader *pReader, int *iColumn){ 00421 int i; 00422 int iType = pReader->pDoclist->iType; 00423 00424 if( pReader->iLastPos==-1 ){ 00425 return -1; 00426 } 00427 assert( !atEnd(pReader) ); 00428 00429 if( iType<DL_POSITIONS ){ 00430 return -1; 00431 } 00432 pReader->p += getVarint32(pReader->p, &i); 00433 if( i==POS_END ){ 00434 pReader->iLastColumn = pReader->iLastPos = -1; 00435 *iColumn = -1; 00436 return -1; 00437 } 00438 if( i==POS_COLUMN ){ 00439 pReader->p += getVarint32(pReader->p, &pReader->iLastColumn); 00440 pReader->iLastPos = 0; 00441 pReader->p += getVarint32(pReader->p, &i); 00442 assert( i>=POS_BASE ); 00443 } 00444 pReader->iLastPos += ((int) i)-POS_BASE; 00445 if( iType>=DL_POSITIONS_OFFSETS ){ 00446 /* Skip over offsets, ignoring them for now. */ 00447 int iStart, iEnd; 00448 pReader->p += getVarint32(pReader->p, &iStart); 00449 pReader->p += getVarint32(pReader->p, &iEnd); 00450 } 00451 *iColumn = pReader->iLastColumn; 00452 return pReader->iLastPos; 00453 } 00454 00455 /* Skip past the end of a position list. */ 00456 static void skipPositionList(DocListReader *pReader){ 00457 DocList *p = pReader->pDoclist; 00458 if( p && p->iType>=DL_POSITIONS ){ 00459 int iColumn; 00460 while( readPosition(pReader, &iColumn)!=-1 ){} 00461 } 00462 } 00463 00464 /* Skip over a docid, including its position list if the doclist has 00465 * positions. */ 00466 static void skipDocument(DocListReader *pReader){ 00467 readDocid(pReader); 00468 skipPositionList(pReader); 00469 } 00470 00471 /* Skip past all docids which are less than [iDocid]. Returns 1 if a docid 00472 * matching [iDocid] was found. */ 00473 static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){ 00474 sqlite_int64 d = 0; 00475 while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){ 00476 skipDocument(pReader); 00477 } 00478 return !atEnd(pReader) && d==iDocid; 00479 } 00480 00481 /* Return the first document in a document list. 00482 */ 00483 static sqlite_int64 firstDocid(DocList *d){ 00484 DocListReader r; 00485 readerInit(&r, d); 00486 return readDocid(&r); 00487 } 00488 00489 #ifdef SQLITE_DEBUG 00490 /* 00491 ** This routine is used for debugging purpose only. 00492 ** 00493 ** Write the content of a doclist to standard output. 00494 */ 00495 static void printDoclist(DocList *p){ 00496 DocListReader r; 00497 const char *zSep = ""; 00498 00499 readerInit(&r, p); 00500 while( !atEnd(&r) ){ 00501 sqlite_int64 docid = readDocid(&r); 00502 if( docid==0 ){ 00503 skipPositionList(&r); 00504 continue; 00505 } 00506 printf("%s%lld", zSep, docid); 00507 zSep = ","; 00508 if( p->iType>=DL_POSITIONS ){ 00509 int iPos, iCol; 00510 const char *zDiv = ""; 00511 printf("("); 00512 while( (iPos = readPosition(&r, &iCol))>=0 ){ 00513 printf("%s%d:%d", zDiv, iCol, iPos); 00514 zDiv = ":"; 00515 } 00516 printf(")"); 00517 } 00518 } 00519 printf("\n"); 00520 fflush(stdout); 00521 } 00522 #endif /* SQLITE_DEBUG */ 00523 00524 /* Trim the given doclist to contain only positions in column 00525 * [iRestrictColumn]. */ 00526 static void docListRestrictColumn(DocList *in, int iRestrictColumn){ 00527 DocListReader r; 00528 DocList out; 00529 00530 assert( in->iType>=DL_POSITIONS ); 00531 readerInit(&r, in); 00532 docListInit(&out, DL_POSITIONS, NULL, 0); 00533 00534 while( !atEnd(&r) ){ 00535 sqlite_int64 iDocid = readDocid(&r); 00536 int iPos, iColumn; 00537 00538 docListAddDocid(&out, iDocid); 00539 while( (iPos = readPosition(&r, &iColumn)) != -1 ){ 00540 if( iColumn==iRestrictColumn ){ 00541 docListAddPos(&out, iColumn, iPos); 00542 } 00543 } 00544 } 00545 00546 docListDestroy(in); 00547 *in = out; 00548 } 00549 00550 /* Trim the given doclist by discarding any docids without any remaining 00551 * positions. */ 00552 static void docListDiscardEmpty(DocList *in) { 00553 DocListReader r; 00554 DocList out; 00555 00556 /* TODO: It would be nice to implement this operation in place; that 00557 * could save a significant amount of memory in queries with long doclists. */ 00558 assert( in->iType>=DL_POSITIONS ); 00559 readerInit(&r, in); 00560 docListInit(&out, DL_POSITIONS, NULL, 0); 00561 00562 while( !atEnd(&r) ){ 00563 sqlite_int64 iDocid = readDocid(&r); 00564 int match = 0; 00565 int iPos, iColumn; 00566 while( (iPos = readPosition(&r, &iColumn)) != -1 ){ 00567 if( !match ){ 00568 docListAddDocid(&out, iDocid); 00569 match = 1; 00570 } 00571 docListAddPos(&out, iColumn, iPos); 00572 } 00573 } 00574 00575 docListDestroy(in); 00576 *in = out; 00577 } 00578 00579 /* Helper function for docListUpdate() and docListAccumulate(). 00580 ** Splices a doclist element into the doclist represented by r, 00581 ** leaving r pointing after the newly spliced element. 00582 */ 00583 static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid, 00584 const char *pSource, int nSource){ 00585 DocList *d = r->pDoclist; 00586 char *pTarget; 00587 int nTarget, found; 00588 00589 found = skipToDocid(r, iDocid); 00590 00591 /* Describe slice in d to place pSource/nSource. */ 00592 pTarget = r->p; 00593 if( found ){ 00594 skipDocument(r); 00595 nTarget = r->p-pTarget; 00596 }else{ 00597 nTarget = 0; 00598 } 00599 00600 /* The sense of the following is that there are three possibilities. 00601 ** If nTarget==nSource, we should not move any memory nor realloc. 00602 ** If nTarget>nSource, trim target and realloc. 00603 ** If nTarget<nSource, realloc then expand target. 00604 */ 00605 if( nTarget>nSource ){ 00606 memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget)); 00607 } 00608 if( nTarget!=nSource ){ 00609 int iDoclist = pTarget-d->pData; 00610 d->pData = realloc(d->pData, d->nData+nSource-nTarget); 00611 pTarget = d->pData+iDoclist; 00612 } 00613 if( nTarget<nSource ){ 00614 memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget)); 00615 } 00616 00617 memcpy(pTarget, pSource, nSource); 00618 d->nData += nSource-nTarget; 00619 r->p = pTarget+nSource; 00620 } 00621 00622 /* Insert/update pUpdate into the doclist. */ 00623 static void docListUpdate(DocList *d, DocList *pUpdate){ 00624 DocListReader reader; 00625 00626 assert( d!=NULL && pUpdate!=NULL ); 00627 assert( d->iType==pUpdate->iType); 00628 00629 readerInit(&reader, d); 00630 docListSpliceElement(&reader, firstDocid(pUpdate), 00631 pUpdate->pData, pUpdate->nData); 00632 } 00633 00634 /* Propagate elements from pUpdate to pAcc, overwriting elements with 00635 ** matching docids. 00636 */ 00637 static void docListAccumulate(DocList *pAcc, DocList *pUpdate){ 00638 DocListReader accReader, updateReader; 00639 00640 /* Handle edge cases where one doclist is empty. */ 00641 assert( pAcc!=NULL ); 00642 if( pUpdate==NULL || pUpdate->nData==0 ) return; 00643 if( pAcc->nData==0 ){ 00644 pAcc->pData = malloc(pUpdate->nData); 00645 memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData); 00646 pAcc->nData = pUpdate->nData; 00647 return; 00648 } 00649 00650 readerInit(&accReader, pAcc); 00651 readerInit(&updateReader, pUpdate); 00652 00653 while( !atEnd(&updateReader) ){ 00654 char *pSource = updateReader.p; 00655 sqlite_int64 iDocid = readDocid(&updateReader); 00656 skipPositionList(&updateReader); 00657 docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource); 00658 } 00659 } 00660 00661 /* 00662 ** Read the next docid off of pIn. Return 0 if we reach the end. 00663 * 00664 * TODO: This assumes that docids are never 0, but they may actually be 0 since 00665 * users can choose docids when inserting into a full-text table. Fix this. 00666 */ 00667 static sqlite_int64 nextDocid(DocListReader *pIn){ 00668 skipPositionList(pIn); 00669 return atEnd(pIn) ? 0 : readDocid(pIn); 00670 } 00671 00672 /* 00673 ** pLeft and pRight are two DocListReaders that are pointing to 00674 ** positions lists of the same document: iDocid. 00675 ** 00676 ** If there are no instances in pLeft or pRight where the position 00677 ** of pLeft is one less than the position of pRight, then this 00678 ** routine adds nothing to pOut. 00679 ** 00680 ** If there are one or more instances where positions from pLeft 00681 ** are exactly one less than positions from pRight, then add a new 00682 ** document record to pOut. If pOut wants to hold positions, then 00683 ** include the positions from pRight that are one more than a 00684 ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1. 00685 ** 00686 ** pLeft and pRight are left pointing at the next document record. 00687 */ 00688 static void mergePosList( 00689 DocListReader *pLeft, /* Left position list */ 00690 DocListReader *pRight, /* Right position list */ 00691 sqlite_int64 iDocid, /* The docid from pLeft and pRight */ 00692 DocList *pOut /* Write the merged document record here */ 00693 ){ 00694 int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol); 00695 int iRightCol, iRightPos = readPosition(pRight, &iRightCol); 00696 int match = 0; 00697 00698 /* Loop until we've reached the end of both position lists. */ 00699 while( iLeftPos!=-1 && iRightPos!=-1 ){ 00700 if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){ 00701 if( !match ){ 00702 docListAddDocid(pOut, iDocid); 00703 match = 1; 00704 } 00705 if( pOut->iType>=DL_POSITIONS ){ 00706 docListAddPos(pOut, iRightCol, iRightPos); 00707 } 00708 iLeftPos = readPosition(pLeft, &iLeftCol); 00709 iRightPos = readPosition(pRight, &iRightCol); 00710 }else if( iRightCol<iLeftCol || 00711 (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){ 00712 iRightPos = readPosition(pRight, &iRightCol); 00713 }else{ 00714 iLeftPos = readPosition(pLeft, &iLeftCol); 00715 } 00716 } 00717 if( iLeftPos>=0 ) skipPositionList(pLeft); 00718 if( iRightPos>=0 ) skipPositionList(pRight); 00719 } 00720 00721 /* We have two doclists: pLeft and pRight. 00722 ** Write the phrase intersection of these two doclists into pOut. 00723 ** 00724 ** A phrase intersection means that two documents only match 00725 ** if pLeft.iPos+1==pRight.iPos. 00726 ** 00727 ** The output pOut may or may not contain positions. If pOut 00728 ** does contain positions, they are the positions of pRight. 00729 */ 00730 static void docListPhraseMerge( 00731 DocList *pLeft, /* Doclist resulting from the words on the left */ 00732 DocList *pRight, /* Doclist for the next word to the right */ 00733 DocList *pOut /* Write the combined doclist here */ 00734 ){ 00735 DocListReader left, right; 00736 sqlite_int64 docidLeft, docidRight; 00737 00738 readerInit(&left, pLeft); 00739 readerInit(&right, pRight); 00740 docidLeft = nextDocid(&left); 00741 docidRight = nextDocid(&right); 00742 00743 while( docidLeft>0 && docidRight>0 ){ 00744 if( docidLeft<docidRight ){ 00745 docidLeft = nextDocid(&left); 00746 }else if( docidRight<docidLeft ){ 00747 docidRight = nextDocid(&right); 00748 }else{ 00749 mergePosList(&left, &right, docidLeft, pOut); 00750 docidLeft = nextDocid(&left); 00751 docidRight = nextDocid(&right); 00752 } 00753 } 00754 } 00755 00756 /* We have two doclists: pLeft and pRight. 00757 ** Write the intersection of these two doclists into pOut. 00758 ** Only docids are matched. Position information is ignored. 00759 ** 00760 ** The output pOut never holds positions. 00761 */ 00762 static void docListAndMerge( 00763 DocList *pLeft, /* Doclist resulting from the words on the left */ 00764 DocList *pRight, /* Doclist for the next word to the right */ 00765 DocList *pOut /* Write the combined doclist here */ 00766 ){ 00767 DocListReader left, right; 00768 sqlite_int64 docidLeft, docidRight; 00769 00770 assert( pOut->iType<DL_POSITIONS ); 00771 00772 readerInit(&left, pLeft); 00773 readerInit(&right, pRight); 00774 docidLeft = nextDocid(&left); 00775 docidRight = nextDocid(&right); 00776 00777 while( docidLeft>0 && docidRight>0 ){ 00778 if( docidLeft<docidRight ){ 00779 docidLeft = nextDocid(&left); 00780 }else if( docidRight<docidLeft ){ 00781 docidRight = nextDocid(&right); 00782 }else{ 00783 docListAddDocid(pOut, docidLeft); 00784 docidLeft = nextDocid(&left); 00785 docidRight = nextDocid(&right); 00786 } 00787 } 00788 } 00789 00790 /* We have two doclists: pLeft and pRight. 00791 ** Write the union of these two doclists into pOut. 00792 ** Only docids are matched. Position information is ignored. 00793 ** 00794 ** The output pOut never holds positions. 00795 */ 00796 static void docListOrMerge( 00797 DocList *pLeft, /* Doclist resulting from the words on the left */ 00798 DocList *pRight, /* Doclist for the next word to the right */ 00799 DocList *pOut /* Write the combined doclist here */ 00800 ){ 00801 DocListReader left, right; 00802 sqlite_int64 docidLeft, docidRight, priorLeft; 00803 00804 readerInit(&left, pLeft); 00805 readerInit(&right, pRight); 00806 docidLeft = nextDocid(&left); 00807 docidRight = nextDocid(&right); 00808 00809 while( docidLeft>0 && docidRight>0 ){ 00810 if( docidLeft<=docidRight ){ 00811 docListAddDocid(pOut, docidLeft); 00812 }else{ 00813 docListAddDocid(pOut, docidRight); 00814 } 00815 priorLeft = docidLeft; 00816 if( docidLeft<=docidRight ){ 00817 docidLeft = nextDocid(&left); 00818 } 00819 if( docidRight>0 && docidRight<=priorLeft ){ 00820 docidRight = nextDocid(&right); 00821 } 00822 } 00823 while( docidLeft>0 ){ 00824 docListAddDocid(pOut, docidLeft); 00825 docidLeft = nextDocid(&left); 00826 } 00827 while( docidRight>0 ){ 00828 docListAddDocid(pOut, docidRight); 00829 docidRight = nextDocid(&right); 00830 } 00831 } 00832 00833 /* We have two doclists: pLeft and pRight. 00834 ** Write into pOut all documents that occur in pLeft but not 00835 ** in pRight. 00836 ** 00837 ** Only docids are matched. Position information is ignored. 00838 ** 00839 ** The output pOut never holds positions. 00840 */ 00841 static void docListExceptMerge( 00842 DocList *pLeft, /* Doclist resulting from the words on the left */ 00843 DocList *pRight, /* Doclist for the next word to the right */ 00844 DocList *pOut /* Write the combined doclist here */ 00845 ){ 00846 DocListReader left, right; 00847 sqlite_int64 docidLeft, docidRight, priorLeft; 00848 00849 readerInit(&left, pLeft); 00850 readerInit(&right, pRight); 00851 docidLeft = nextDocid(&left); 00852 docidRight = nextDocid(&right); 00853 00854 while( docidLeft>0 && docidRight>0 ){ 00855 priorLeft = docidLeft; 00856 if( docidLeft<docidRight ){ 00857 docListAddDocid(pOut, docidLeft); 00858 } 00859 if( docidLeft<=docidRight ){ 00860 docidLeft = nextDocid(&left); 00861 } 00862 if( docidRight>0 && docidRight<=priorLeft ){ 00863 docidRight = nextDocid(&right); 00864 } 00865 } 00866 while( docidLeft>0 ){ 00867 docListAddDocid(pOut, docidLeft); 00868 docidLeft = nextDocid(&left); 00869 } 00870 } 00871 00872 static char *string_dup_n(const char *s, int n){ 00873 char *str = malloc(n + 1); 00874 memcpy(str, s, n); 00875 str[n] = '\0'; 00876 return str; 00877 } 00878 00879 /* Duplicate a string; the caller must free() the returned string. 00880 * (We don't use strdup() since it is not part of the standard C library and 00881 * may not be available everywhere.) */ 00882 static char *string_dup(const char *s){ 00883 return string_dup_n(s, strlen(s)); 00884 } 00885 00886 /* Format a string, replacing each occurrence of the % character with 00887 * zDb.zName. This may be more convenient than sqlite_mprintf() 00888 * when one string is used repeatedly in a format string. 00889 * The caller must free() the returned string. */ 00890 static char *string_format(const char *zFormat, 00891 const char *zDb, const char *zName){ 00892 const char *p; 00893 size_t len = 0; 00894 size_t nDb = strlen(zDb); 00895 size_t nName = strlen(zName); 00896 size_t nFullTableName = nDb+1+nName; 00897 char *result; 00898 char *r; 00899 00900 /* first compute length needed */ 00901 for(p = zFormat ; *p ; ++p){ 00902 len += (*p=='%' ? nFullTableName : 1); 00903 } 00904 len += 1; /* for null terminator */ 00905 00906 r = result = malloc(len); 00907 for(p = zFormat; *p; ++p){ 00908 if( *p=='%' ){ 00909 memcpy(r, zDb, nDb); 00910 r += nDb; 00911 *r++ = '.'; 00912 memcpy(r, zName, nName); 00913 r += nName; 00914 } else { 00915 *r++ = *p; 00916 } 00917 } 00918 *r++ = '\0'; 00919 assert( r == result + len ); 00920 return result; 00921 } 00922 00923 static int sql_exec(sqlite3 *db, const char *zDb, const char *zName, 00924 const char *zFormat){ 00925 char *zCommand = string_format(zFormat, zDb, zName); 00926 int rc; 00927 TRACE(("FTS1 sql: %s\n", zCommand)); 00928 rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); 00929 free(zCommand); 00930 return rc; 00931 } 00932 00933 static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName, 00934 sqlite3_stmt **ppStmt, const char *zFormat){ 00935 char *zCommand = string_format(zFormat, zDb, zName); 00936 int rc; 00937 TRACE(("FTS1 prepare: %s\n", zCommand)); 00938 rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); 00939 free(zCommand); 00940 return rc; 00941 } 00942 00943 /* end utility functions */ 00944 00945 /* Forward reference */ 00946 typedef struct fulltext_vtab fulltext_vtab; 00947 00948 /* A single term in a query is represented by an instances of 00949 ** the following structure. 00950 */ 00951 typedef struct QueryTerm { 00952 short int nPhrase; /* How many following terms are part of the same phrase */ 00953 short int iPhrase; /* This is the i-th term of a phrase. */ 00954 short int iColumn; /* Column of the index that must match this term */ 00955 signed char isOr; /* this term is preceded by "OR" */ 00956 signed char isNot; /* this term is preceded by "-" */ 00957 char *pTerm; /* text of the term. '\000' terminated. malloced */ 00958 int nTerm; /* Number of bytes in pTerm[] */ 00959 } QueryTerm; 00960 00961 00962 /* A query string is parsed into a Query structure. 00963 * 00964 * We could, in theory, allow query strings to be complicated 00965 * nested expressions with precedence determined by parentheses. 00966 * But none of the major search engines do this. (Perhaps the 00967 * feeling is that an parenthesized expression is two complex of 00968 * an idea for the average user to grasp.) Taking our lead from 00969 * the major search engines, we will allow queries to be a list 00970 * of terms (with an implied AND operator) or phrases in double-quotes, 00971 * with a single optional "-" before each non-phrase term to designate 00972 * negation and an optional OR connector. 00973 * 00974 * OR binds more tightly than the implied AND, which is what the 00975 * major search engines seem to do. So, for example: 00976 * 00977 * [one two OR three] ==> one AND (two OR three) 00978 * [one OR two three] ==> (one OR two) AND three 00979 * 00980 * A "-" before a term matches all entries that lack that term. 00981 * The "-" must occur immediately before the term with in intervening 00982 * space. This is how the search engines do it. 00983 * 00984 * A NOT term cannot be the right-hand operand of an OR. If this 00985 * occurs in the query string, the NOT is ignored: 00986 * 00987 * [one OR -two] ==> one OR two 00988 * 00989 */ 00990 typedef struct Query { 00991 fulltext_vtab *pFts; /* The full text index */ 00992 int nTerms; /* Number of terms in the query */ 00993 QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */ 00994 int nextIsOr; /* Set the isOr flag on the next inserted term */ 00995 int nextColumn; /* Next word parsed must be in this column */ 00996 int dfltColumn; /* The default column */ 00997 } Query; 00998 00999 01000 /* 01001 ** An instance of the following structure keeps track of generated 01002 ** matching-word offset information and snippets. 01003 */ 01004 typedef struct Snippet { 01005 int nMatch; /* Total number of matches */ 01006 int nAlloc; /* Space allocated for aMatch[] */ 01007 struct snippetMatch { /* One entry for each matching term */ 01008 char snStatus; /* Status flag for use while constructing snippets */ 01009 short int iCol; /* The column that contains the match */ 01010 short int iTerm; /* The index in Query.pTerms[] of the matching term */ 01011 short int nByte; /* Number of bytes in the term */ 01012 int iStart; /* The offset to the first character of the term */ 01013 } *aMatch; /* Points to space obtained from malloc */ 01014 char *zOffset; /* Text rendering of aMatch[] */ 01015 int nOffset; /* strlen(zOffset) */ 01016 char *zSnippet; /* Snippet text */ 01017 int nSnippet; /* strlen(zSnippet) */ 01018 } Snippet; 01019 01020 01021 typedef enum QueryType { 01022 QUERY_GENERIC, /* table scan */ 01023 QUERY_ROWID, /* lookup by rowid */ 01024 QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/ 01025 } QueryType; 01026 01027 /* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0 01028 ** before we start aggregating into larger segments. Lower CHUNK_MAX 01029 ** means that for a given input we have more individual segments per 01030 ** term, which means more rows in the table and a bigger index (due to 01031 ** both more rows and bigger rowids). But it also reduces the average 01032 ** cost of adding new elements to the segment 0 doclist, and it seems 01033 ** to reduce the number of pages read and written during inserts. 256 01034 ** was chosen by measuring insertion times for a certain input (first 01035 ** 10k documents of Enron corpus), though including query performance 01036 ** in the decision may argue for a larger value. 01037 */ 01038 #define CHUNK_MAX 256 01039 01040 typedef enum fulltext_statement { 01041 CONTENT_INSERT_STMT, 01042 CONTENT_SELECT_STMT, 01043 CONTENT_UPDATE_STMT, 01044 CONTENT_DELETE_STMT, 01045 01046 TERM_SELECT_STMT, 01047 TERM_SELECT_ALL_STMT, 01048 TERM_INSERT_STMT, 01049 TERM_UPDATE_STMT, 01050 TERM_DELETE_STMT, 01051 01052 MAX_STMT /* Always at end! */ 01053 } fulltext_statement; 01054 01055 /* These must exactly match the enum above. */ 01056 /* TODO(adam): Is there some risk that a statement (in particular, 01057 ** pTermSelectStmt) will be used in two cursors at once, e.g. if a 01058 ** query joins a virtual table to itself? If so perhaps we should 01059 ** move some of these to the cursor object. 01060 */ 01061 static const char *const fulltext_zStatement[MAX_STMT] = { 01062 /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */ 01063 /* CONTENT_SELECT */ "select * from %_content where rowid = ?", 01064 /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */ 01065 /* CONTENT_DELETE */ "delete from %_content where rowid = ?", 01066 01067 /* TERM_SELECT */ 01068 "select rowid, doclist from %_term where term = ? and segment = ?", 01069 /* TERM_SELECT_ALL */ 01070 "select doclist from %_term where term = ? order by segment", 01071 /* TERM_INSERT */ 01072 "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)", 01073 /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", 01074 /* TERM_DELETE */ "delete from %_term where rowid = ?", 01075 }; 01076 01077 /* 01078 ** A connection to a fulltext index is an instance of the following 01079 ** structure. The xCreate and xConnect methods create an instance 01080 ** of this structure and xDestroy and xDisconnect free that instance. 01081 ** All other methods receive a pointer to the structure as one of their 01082 ** arguments. 01083 */ 01084 struct fulltext_vtab { 01085 sqlite3_vtab base; /* Base class used by SQLite core */ 01086 sqlite3 *db; /* The database connection */ 01087 const char *zDb; /* logical database name */ 01088 const char *zName; /* virtual table name */ 01089 int nColumn; /* number of columns in virtual table */ 01090 char **azColumn; /* column names. malloced */ 01091 char **azContentColumn; /* column names in content table; malloced */ 01092 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ 01093 01094 /* Precompiled statements which we keep as long as the table is 01095 ** open. 01096 */ 01097 sqlite3_stmt *pFulltextStatements[MAX_STMT]; 01098 }; 01099 01100 /* 01101 ** When the core wants to do a query, it create a cursor using a 01102 ** call to xOpen. This structure is an instance of a cursor. It 01103 ** is destroyed by xClose. 01104 */ 01105 typedef struct fulltext_cursor { 01106 sqlite3_vtab_cursor base; /* Base class used by SQLite core */ 01107 QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */ 01108 sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */ 01109 int eof; /* True if at End Of Results */ 01110 Query q; /* Parsed query string */ 01111 Snippet snippet; /* Cached snippet for the current row */ 01112 int iColumn; /* Column being searched */ 01113 DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */ 01114 } fulltext_cursor; 01115 01116 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ 01117 return (fulltext_vtab *) c->base.pVtab; 01118 } 01119 01120 static const sqlite3_module fulltextModule; /* forward declaration */ 01121 01122 /* Append a list of strings separated by commas to a StringBuffer. */ 01123 static void appendList(StringBuffer *sb, int nString, char **azString){ 01124 int i; 01125 for(i=0; i<nString; ++i){ 01126 if( i>0 ) append(sb, ", "); 01127 append(sb, azString[i]); 01128 } 01129 } 01130 01131 /* Return a dynamically generated statement of the form 01132 * insert into %_content (rowid, ...) values (?, ...) 01133 */ 01134 static const char *contentInsertStatement(fulltext_vtab *v){ 01135 StringBuffer sb; 01136 int i; 01137 01138 initStringBuffer(&sb); 01139 append(&sb, "insert into %_content (rowid, "); 01140 appendList(&sb, v->nColumn, v->azContentColumn); 01141 append(&sb, ") values (?"); 01142 for(i=0; i<v->nColumn; ++i) 01143 append(&sb, ", ?"); 01144 append(&sb, ")"); 01145 return sb.s; 01146 } 01147 01148 /* Return a dynamically generated statement of the form 01149 * update %_content set [col_0] = ?, [col_1] = ?, ... 01150 * where rowid = ? 01151 */ 01152 static const char *contentUpdateStatement(fulltext_vtab *v){ 01153 StringBuffer sb; 01154 int i; 01155 01156 initStringBuffer(&sb); 01157 append(&sb, "update %_content set "); 01158 for(i=0; i<v->nColumn; ++i) { 01159 if( i>0 ){ 01160 append(&sb, ", "); 01161 } 01162 append(&sb, v->azContentColumn[i]); 01163 append(&sb, " = ?"); 01164 } 01165 append(&sb, " where rowid = ?"); 01166 return sb.s; 01167 } 01168 01169 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. 01170 ** If the indicated statement has never been prepared, it is prepared 01171 ** and cached, otherwise the cached version is reset. 01172 */ 01173 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, 01174 sqlite3_stmt **ppStmt){ 01175 assert( iStmt<MAX_STMT ); 01176 if( v->pFulltextStatements[iStmt]==NULL ){ 01177 const char *zStmt; 01178 int rc; 01179 switch( iStmt ){ 01180 case CONTENT_INSERT_STMT: 01181 zStmt = contentInsertStatement(v); break; 01182 case CONTENT_UPDATE_STMT: 01183 zStmt = contentUpdateStatement(v); break; 01184 default: 01185 zStmt = fulltext_zStatement[iStmt]; 01186 } 01187 rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt], 01188 zStmt); 01189 if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt); 01190 if( rc!=SQLITE_OK ) return rc; 01191 } else { 01192 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); 01193 if( rc!=SQLITE_OK ) return rc; 01194 } 01195 01196 *ppStmt = v->pFulltextStatements[iStmt]; 01197 return SQLITE_OK; 01198 } 01199 01200 /* Step the indicated statement, handling errors SQLITE_BUSY (by 01201 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring 01202 ** bindings to the new statement). 01203 ** TODO(adam): We should extend this function so that it can work with 01204 ** statements declared locally, not only globally cached statements. 01205 */ 01206 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, 01207 sqlite3_stmt **ppStmt){ 01208 int rc; 01209 sqlite3_stmt *s = *ppStmt; 01210 assert( iStmt<MAX_STMT ); 01211 assert( s==v->pFulltextStatements[iStmt] ); 01212 01213 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ 01214 if( rc==SQLITE_BUSY ) continue; 01215 if( rc!=SQLITE_ERROR ) return rc; 01216 01217 /* If an SQLITE_SCHEMA error has occured, then finalizing this 01218 * statement is going to delete the fulltext_vtab structure. If 01219 * the statement just executed is in the pFulltextStatements[] 01220 * array, it will be finalized twice. So remove it before 01221 * calling sqlite3_finalize(). 01222 */ 01223 v->pFulltextStatements[iStmt] = NULL; 01224 rc = sqlite3_finalize(s); 01225 break; 01226 } 01227 return rc; 01228 01229 err: 01230 sqlite3_finalize(s); 01231 return rc; 01232 } 01233 01234 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. 01235 ** Useful for statements like UPDATE, where we expect no results. 01236 */ 01237 static int sql_single_step_statement(fulltext_vtab *v, 01238 fulltext_statement iStmt, 01239 sqlite3_stmt **ppStmt){ 01240 int rc = sql_step_statement(v, iStmt, ppStmt); 01241 return (rc==SQLITE_DONE) ? SQLITE_OK : rc; 01242 } 01243 01244 /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */ 01245 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, 01246 sqlite3_value **pValues){ 01247 sqlite3_stmt *s; 01248 int i; 01249 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); 01250 if( rc!=SQLITE_OK ) return rc; 01251 01252 rc = sqlite3_bind_value(s, 1, rowid); 01253 if( rc!=SQLITE_OK ) return rc; 01254 01255 for(i=0; i<v->nColumn; ++i){ 01256 rc = sqlite3_bind_value(s, 2+i, pValues[i]); 01257 if( rc!=SQLITE_OK ) return rc; 01258 } 01259 01260 return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); 01261 } 01262 01263 /* update %_content set col0 = pValues[0], col1 = pValues[1], ... 01264 * where rowid = [iRowid] */ 01265 static int content_update(fulltext_vtab *v, sqlite3_value **pValues, 01266 sqlite_int64 iRowid){ 01267 sqlite3_stmt *s; 01268 int i; 01269 int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s); 01270 if( rc!=SQLITE_OK ) return rc; 01271 01272 for(i=0; i<v->nColumn; ++i){ 01273 rc = sqlite3_bind_value(s, 1+i, pValues[i]); 01274 if( rc!=SQLITE_OK ) return rc; 01275 } 01276 01277 rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid); 01278 if( rc!=SQLITE_OK ) return rc; 01279 01280 return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s); 01281 } 01282 01283 static void freeStringArray(int nString, const char **pString){ 01284 int i; 01285 01286 for (i=0 ; i < nString ; ++i) { 01287 if( pString[i]!=NULL ) free((void *) pString[i]); 01288 } 01289 free((void *) pString); 01290 } 01291 01292 /* select * from %_content where rowid = [iRow] 01293 * The caller must delete the returned array and all strings in it. 01294 * null fields will be NULL in the returned array. 01295 * 01296 * TODO: Perhaps we should return pointer/length strings here for consistency 01297 * with other code which uses pointer/length. */ 01298 static int content_select(fulltext_vtab *v, sqlite_int64 iRow, 01299 const char ***pValues){ 01300 sqlite3_stmt *s; 01301 const char **values; 01302 int i; 01303 int rc; 01304 01305 *pValues = NULL; 01306 01307 rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); 01308 if( rc!=SQLITE_OK ) return rc; 01309 01310 rc = sqlite3_bind_int64(s, 1, iRow); 01311 if( rc!=SQLITE_OK ) return rc; 01312 01313 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); 01314 if( rc!=SQLITE_ROW ) return rc; 01315 01316 values = (const char **) malloc(v->nColumn * sizeof(const char *)); 01317 for(i=0; i<v->nColumn; ++i){ 01318 if( sqlite3_column_type(s, i)==SQLITE_NULL ){ 01319 values[i] = NULL; 01320 }else{ 01321 values[i] = string_dup((char*)sqlite3_column_text(s, i)); 01322 } 01323 } 01324 01325 /* We expect only one row. We must execute another sqlite3_step() 01326 * to complete the iteration; otherwise the table will remain locked. */ 01327 rc = sqlite3_step(s); 01328 if( rc==SQLITE_DONE ){ 01329 *pValues = values; 01330 return SQLITE_OK; 01331 } 01332 01333 freeStringArray(v->nColumn, values); 01334 return rc; 01335 } 01336 01337 /* delete from %_content where rowid = [iRow ] */ 01338 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ 01339 sqlite3_stmt *s; 01340 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); 01341 if( rc!=SQLITE_OK ) return rc; 01342 01343 rc = sqlite3_bind_int64(s, 1, iRow); 01344 if( rc!=SQLITE_OK ) return rc; 01345 01346 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); 01347 } 01348 01349 /* select rowid, doclist from %_term 01350 * where term = [pTerm] and segment = [iSegment] 01351 * If found, returns SQLITE_ROW; the caller must free the 01352 * returned doclist. If no rows found, returns SQLITE_DONE. */ 01353 static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm, 01354 int iSegment, 01355 sqlite_int64 *rowid, DocList *out){ 01356 sqlite3_stmt *s; 01357 int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); 01358 if( rc!=SQLITE_OK ) return rc; 01359 01360 rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC); 01361 if( rc!=SQLITE_OK ) return rc; 01362 01363 rc = sqlite3_bind_int(s, 2, iSegment); 01364 if( rc!=SQLITE_OK ) return rc; 01365 01366 rc = sql_step_statement(v, TERM_SELECT_STMT, &s); 01367 if( rc!=SQLITE_ROW ) return rc; 01368 01369 *rowid = sqlite3_column_int64(s, 0); 01370 docListInit(out, DL_DEFAULT, 01371 sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); 01372 01373 /* We expect only one row. We must execute another sqlite3_step() 01374 * to complete the iteration; otherwise the table will remain locked. */ 01375 rc = sqlite3_step(s); 01376 return rc==SQLITE_DONE ? SQLITE_ROW : rc; 01377 } 01378 01379 /* Load the segment doclists for term pTerm and merge them in 01380 ** appropriate order into out. Returns SQLITE_OK if successful. If 01381 ** there are no segments for pTerm, successfully returns an empty 01382 ** doclist in out. 01383 ** 01384 ** Each document consists of 1 or more "columns". The number of 01385 ** columns is v->nColumn. If iColumn==v->nColumn, then return 01386 ** position information about all columns. If iColumn<v->nColumn, 01387 ** then only return position information about the iColumn-th column 01388 ** (where the first column is 0). 01389 */ 01390 static int term_select_all( 01391 fulltext_vtab *v, /* The fulltext index we are querying against */ 01392 int iColumn, /* If <nColumn, only look at the iColumn-th column */ 01393 const char *pTerm, /* The term whose posting lists we want */ 01394 int nTerm, /* Number of bytes in pTerm */ 01395 DocList *out /* Write the resulting doclist here */ 01396 ){ 01397 DocList doclist; 01398 sqlite3_stmt *s; 01399 int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s); 01400 if( rc!=SQLITE_OK ) return rc; 01401 01402 rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC); 01403 if( rc!=SQLITE_OK ) return rc; 01404 01405 docListInit(&doclist, DL_DEFAULT, 0, 0); 01406 01407 /* TODO(shess) Handle schema and busy errors. */ 01408 while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){ 01409 DocList old; 01410 01411 /* TODO(shess) If we processed doclists from oldest to newest, we 01412 ** could skip the malloc() involved with the following call. For 01413 ** now, I'd rather keep this logic similar to index_insert_term(). 01414 ** We could additionally drop elements when we see deletes, but 01415 ** that would require a distinct version of docListAccumulate(). 01416 */ 01417 docListInit(&old, DL_DEFAULT, 01418 sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0)); 01419 01420 if( iColumn<v->nColumn ){ /* querying a single column */ 01421 docListRestrictColumn(&old, iColumn); 01422 } 01423 01424 /* doclist contains the newer data, so write it over old. Then 01425 ** steal accumulated result for doclist. 01426 */ 01427 docListAccumulate(&old, &doclist); 01428 docListDestroy(&doclist); 01429 doclist = old; 01430 } 01431 if( rc!=SQLITE_DONE ){ 01432 docListDestroy(&doclist); 01433 return rc; 01434 } 01435 01436 docListDiscardEmpty(&doclist); 01437 *out = doclist; 01438 return SQLITE_OK; 01439 } 01440 01441 /* insert into %_term (rowid, term, segment, doclist) 01442 values ([piRowid], [pTerm], [iSegment], [doclist]) 01443 ** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid. 01444 ** 01445 ** NOTE(shess) piRowid is IN, with values of "space of int64" plus 01446 ** null, it is not used to pass data back to the caller. 01447 */ 01448 static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid, 01449 const char *pTerm, int nTerm, 01450 int iSegment, DocList *doclist){ 01451 sqlite3_stmt *s; 01452 int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); 01453 if( rc!=SQLITE_OK ) return rc; 01454 01455 if( piRowid==NULL ){ 01456 rc = sqlite3_bind_null(s, 1); 01457 }else{ 01458 rc = sqlite3_bind_int64(s, 1, *piRowid); 01459 } 01460 if( rc!=SQLITE_OK ) return rc; 01461 01462 rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC); 01463 if( rc!=SQLITE_OK ) return rc; 01464 01465 rc = sqlite3_bind_int(s, 3, iSegment); 01466 if( rc!=SQLITE_OK ) return rc; 01467 01468 rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC); 01469 if( rc!=SQLITE_OK ) return rc; 01470 01471 return sql_single_step_statement(v, TERM_INSERT_STMT, &s); 01472 } 01473 01474 /* update %_term set doclist = [doclist] where rowid = [rowid] */ 01475 static int term_update(fulltext_vtab *v, sqlite_int64 rowid, 01476 DocList *doclist){ 01477 sqlite3_stmt *s; 01478 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); 01479 if( rc!=SQLITE_OK ) return rc; 01480 01481 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC); 01482 if( rc!=SQLITE_OK ) return rc; 01483 01484 rc = sqlite3_bind_int64(s, 2, rowid); 01485 if( rc!=SQLITE_OK ) return rc; 01486 01487 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); 01488 } 01489 01490 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ 01491 sqlite3_stmt *s; 01492 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); 01493 if( rc!=SQLITE_OK ) return rc; 01494 01495 rc = sqlite3_bind_int64(s, 1, rowid); 01496 if( rc!=SQLITE_OK ) return rc; 01497 01498 return sql_single_step_statement(v, TERM_DELETE_STMT, &s); 01499 } 01500 01501 /* 01502 ** Free the memory used to contain a fulltext_vtab structure. 01503 */ 01504 static void fulltext_vtab_destroy(fulltext_vtab *v){ 01505 int iStmt, i; 01506 01507 TRACE(("FTS1 Destroy %p\n", v)); 01508 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ 01509 if( v->pFulltextStatements[iStmt]!=NULL ){ 01510 sqlite3_finalize(v->pFulltextStatements[iStmt]); 01511 v->pFulltextStatements[iStmt] = NULL; 01512 } 01513 } 01514 01515 if( v->pTokenizer!=NULL ){ 01516 v->pTokenizer->pModule->xDestroy(v->pTokenizer); 01517 v->pTokenizer = NULL; 01518 } 01519 01520 free(v->azColumn); 01521 for(i = 0; i < v->nColumn; ++i) { 01522 sqlite3_free(v->azContentColumn[i]); 01523 } 01524 free(v->azContentColumn); 01525 free(v); 01526 } 01527 01528 /* 01529 ** Token types for parsing the arguments to xConnect or xCreate. 01530 */ 01531 #define TOKEN_EOF 0 /* End of file */ 01532 #define TOKEN_SPACE 1 /* Any kind of whitespace */ 01533 #define TOKEN_ID 2 /* An identifier */ 01534 #define TOKEN_STRING 3 /* A string literal */ 01535 #define TOKEN_PUNCT 4 /* A single punctuation character */ 01536 01537 /* 01538 ** If X is a character that can be used in an identifier then 01539 ** IdChar(X) will be true. Otherwise it is false. 01540 ** 01541 ** For ASCII, any character with the high-order bit set is 01542 ** allowed in an identifier. For 7-bit characters, 01543 ** sqlite3IsIdChar[X] must be 1. 01544 ** 01545 ** Ticket #1066. the SQL standard does not allow '$' in the 01546 ** middle of identfiers. But many SQL implementations do. 01547 ** SQLite will allow '$' in identifiers for compatibility. 01548 ** But the feature is undocumented. 01549 */ 01550 static const char isIdChar[] = { 01551 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ 01552 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ 01553 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ 01554 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ 01555 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ 01556 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ 01557 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ 01558 }; 01559 #define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20])) 01560 01561 01562 /* 01563 ** Return the length of the token that begins at z[0]. 01564 ** Store the token type in *tokenType before returning. 01565 */ 01566 static int getToken(const char *z, int *tokenType){ 01567 int i, c; 01568 switch( *z ){ 01569 case 0: { 01570 *tokenType = TOKEN_EOF; 01571 return 0; 01572 } 01573 case ' ': case '\t': case '\n': case '\f': case '\r': { 01574 for(i=1; safe_isspace(z[i]); i++){} 01575 *tokenType = TOKEN_SPACE; 01576 return i; 01577 } 01578 case '`': 01579 case '\'': 01580 case '"': { 01581 int delim = z[0]; 01582 for(i=1; (c=z[i])!=0; i++){ 01583 if( c==delim ){ 01584 if( z[i+1]==delim ){ 01585 i++; 01586 }else{ 01587 break; 01588 } 01589 } 01590 } 01591 *tokenType = TOKEN_STRING; 01592 return i + (c!=0); 01593 } 01594 case '[': { 01595 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){} 01596 *tokenType = TOKEN_ID; 01597 return i; 01598 } 01599 default: { 01600 if( !IdChar(*z) ){ 01601 break; 01602 } 01603 for(i=1; IdChar(z[i]); i++){} 01604 *tokenType = TOKEN_ID; 01605 return i; 01606 } 01607 } 01608 *tokenType = TOKEN_PUNCT; 01609 return 1; 01610 } 01611 01612 /* 01613 ** A token extracted from a string is an instance of the following 01614 ** structure. 01615 */ 01616 typedef struct Token { 01617 const char *z; /* Pointer to token text. Not '\000' terminated */ 01618 short int n; /* Length of the token text in bytes. */ 01619 } Token; 01620 01621 /* 01622 ** Given a input string (which is really one of the argv[] parameters 01623 ** passed into xConnect or xCreate) split the string up into tokens. 01624 ** Return an array of pointers to '\000' terminated strings, one string 01625 ** for each non-whitespace token. 01626 ** 01627 ** The returned array is terminated by a single NULL pointer. 01628 ** 01629 ** Space to hold the returned array is obtained from a single 01630 ** malloc and should be freed by passing the return value to free(). 01631 ** The individual strings within the token list are all a part of 01632 ** the single memory allocation and will all be freed at once. 01633 */ 01634 static char **tokenizeString(const char *z, int *pnToken){ 01635 int nToken = 0; 01636 Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) ); 01637 int n = 1; 01638 int e, i; 01639 int totalSize = 0; 01640 char **azToken; 01641 char *zCopy; 01642 while( n>0 ){ 01643 n = getToken(z, &e); 01644 if( e!=TOKEN_SPACE ){ 01645 aToken[nToken].z = z; 01646 aToken[nToken].n = n; 01647 nToken++; 01648 totalSize += n+1; 01649 } 01650 z += n; 01651 } 01652 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize ); 01653 zCopy = (char*)&azToken[nToken]; 01654 nToken--; 01655 for(i=0; i<nToken; i++){ 01656 azToken[i] = zCopy; 01657 n = aToken[i].n; 01658 memcpy(zCopy, aToken[i].z, n); 01659 zCopy[n] = 0; 01660 zCopy += n+1; 01661 } 01662 azToken[nToken] = 0; 01663 free(aToken); 01664 *pnToken = nToken; 01665 return azToken; 01666 } 01667 01668 /* 01669 ** Convert an SQL-style quoted string into a normal string by removing 01670 ** the quote characters. The conversion is done in-place. If the 01671 ** input does not begin with a quote character, then this routine 01672 ** is a no-op. 01673 ** 01674 ** Examples: 01675 ** 01676 ** "abc" becomes abc 01677 ** 'xyz' becomes xyz 01678 ** [pqr] becomes pqr 01679 ** `mno` becomes mno 01680 */ 01681 static void dequoteString(char *z){ 01682 int quote; 01683 int i, j; 01684 if( z==0 ) return; 01685 quote = z[0]; 01686 switch( quote ){ 01687 case '\'': break; 01688 case '"': break; 01689 case '`': break; /* For MySQL compatibility */ 01690 case '[': quote = ']'; break; /* For MS SqlServer compatibility */ 01691 default: return; 01692 } 01693 for(i=1, j=0; z[i]; i++){ 01694 if( z[i]==quote ){ 01695 if( z[i+1]==quote ){ 01696 z[j++] = quote; 01697 i++; 01698 }else{ 01699 z[j++] = 0; 01700 break; 01701 } 01702 }else{ 01703 z[j++] = z[i]; 01704 } 01705 } 01706 } 01707 01708 /* 01709 ** The input azIn is a NULL-terminated list of tokens. Remove the first 01710 ** token and all punctuation tokens. Remove the quotes from 01711 ** around string literal tokens. 01712 ** 01713 ** Example: 01714 ** 01715 ** input: tokenize chinese ( 'simplifed' , 'mixed' ) 01716 ** output: chinese simplifed mixed 01717 ** 01718 ** Another example: 01719 ** 01720 ** input: delimiters ( '[' , ']' , '...' ) 01721 ** output: [ ] ... 01722 */ 01723 static void tokenListToIdList(char **azIn){ 01724 int i, j; 01725 if( azIn ){ 01726 for(i=0, j=-1; azIn[i]; i++){ 01727 if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){ 01728 dequoteString(azIn[i]); 01729 if( j>=0 ){ 01730 azIn[j] = azIn[i]; 01731 } 01732 j++; 01733 } 01734 } 01735 azIn[j] = 0; 01736 } 01737 } 01738 01739 01740 /* 01741 ** Find the first alphanumeric token in the string zIn. Null-terminate 01742 ** this token. Remove any quotation marks. And return a pointer to 01743 ** the result. 01744 */ 01745 static char *firstToken(char *zIn, char **pzTail){ 01746 int n, ttype; 01747 while(1){ 01748 n = getToken(zIn, &ttype); 01749 if( ttype==TOKEN_SPACE ){ 01750 zIn += n; 01751 }else if( ttype==TOKEN_EOF ){ 01752 *pzTail = zIn; 01753 return 0; 01754 }else{ 01755 zIn[n] = 0; 01756 *pzTail = &zIn[1]; 01757 dequoteString(zIn); 01758 return zIn; 01759 } 01760 } 01761 /*NOTREACHED*/ 01762 } 01763 01764 /* Return true if... 01765 ** 01766 ** * s begins with the string t, ignoring case 01767 ** * s is longer than t 01768 ** * The first character of s beyond t is not a alphanumeric 01769 ** 01770 ** Ignore leading space in *s. 01771 ** 01772 ** To put it another way, return true if the first token of 01773 ** s[] is t[]. 01774 */ 01775 static int startsWith(const char *s, const char *t){ 01776 while( safe_isspace(*s) ){ s++; } 01777 while( *t ){ 01778 if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0; 01779 } 01780 return *s!='_' && !safe_isalnum(*s); 01781 } 01782 01783 /* 01784 ** An instance of this structure defines the "spec" of a 01785 ** full text index. This structure is populated by parseSpec 01786 ** and use by fulltextConnect and fulltextCreate. 01787 */ 01788 typedef struct TableSpec { 01789 const char *zDb; /* Logical database name */ 01790 const char *zName; /* Name of the full-text index */ 01791 int nColumn; /* Number of columns to be indexed */ 01792 char **azColumn; /* Original names of columns to be indexed */ 01793 char **azContentColumn; /* Column names for %_content */ 01794 char **azTokenizer; /* Name of tokenizer and its arguments */ 01795 } TableSpec; 01796 01797 /* 01798 ** Reclaim all of the memory used by a TableSpec 01799 */ 01800 static void clearTableSpec(TableSpec *p) { 01801 free(p->azColumn); 01802 free(p->azContentColumn); 01803 free(p->azTokenizer); 01804 } 01805 01806 /* Parse a CREATE VIRTUAL TABLE statement, which looks like this: 01807 * 01808 * CREATE VIRTUAL TABLE email 01809 * USING fts1(subject, body, tokenize mytokenizer(myarg)) 01810 * 01811 * We return parsed information in a TableSpec structure. 01812 * 01813 */ 01814 static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, 01815 char**pzErr){ 01816 int i, n; 01817 char *z, *zDummy; 01818 char **azArg; 01819 const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */ 01820 01821 assert( argc>=3 ); 01822 /* Current interface: 01823 ** argv[0] - module name 01824 ** argv[1] - database name 01825 ** argv[2] - table name 01826 ** argv[3..] - columns, optionally followed by tokenizer specification 01827 ** and snippet delimiters specification. 01828 */ 01829 01830 /* Make a copy of the complete argv[][] array in a single allocation. 01831 ** The argv[][] array is read-only and transient. We can write to the 01832 ** copy in order to modify things and the copy is persistent. 01833 */ 01834 memset(pSpec, 0, sizeof(*pSpec)); 01835 for(i=n=0; i<argc; i++){ 01836 n += strlen(argv[i]) + 1; 01837 } 01838 azArg = malloc( sizeof(char*)*argc + n ); 01839 if( azArg==0 ){ 01840 return SQLITE_NOMEM; 01841 } 01842 z = (char*)&azArg[argc]; 01843 for(i=0; i<argc; i++){ 01844 azArg[i] = z; 01845 strcpy(z, argv[i]); 01846 z += strlen(z)+1; 01847 } 01848 01849 /* Identify the column names and the tokenizer and delimiter arguments 01850 ** in the argv[][] array. 01851 */ 01852 pSpec->zDb = azArg[1]; 01853 pSpec->zName = azArg[2]; 01854 pSpec->nColumn = 0; 01855 pSpec->azColumn = azArg; 01856 zTokenizer = "tokenize simple"; 01857 for(i=3; i<argc; ++i){ 01858 if( startsWith(azArg[i],"tokenize") ){ 01859 zTokenizer = azArg[i]; 01860 }else{ 01861 z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy); 01862 pSpec->nColumn++; 01863 } 01864 } 01865 if( pSpec->nColumn==0 ){ 01866 azArg[0] = "content"; 01867 pSpec->nColumn = 1; 01868 } 01869 01870 /* 01871 ** Construct the list of content column names. 01872 ** 01873 ** Each content column name will be of the form cNNAAAA 01874 ** where NN is the column number and AAAA is the sanitized 01875 ** column name. "sanitized" means that special characters are 01876 ** converted to "_". The cNN prefix guarantees that all column 01877 ** names are unique. 01878 ** 01879 ** The AAAA suffix is not strictly necessary. It is included 01880 ** for the convenience of people who might examine the generated 01881 ** %_content table and wonder what the columns are used for. 01882 */ 01883 pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) ); 01884 if( pSpec->azContentColumn==0 ){ 01885 clearTableSpec(pSpec); 01886 return SQLITE_NOMEM; 01887 } 01888 for(i=0; i<pSpec->nColumn; i++){ 01889 char *p; 01890 pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); 01891 for (p = pSpec->azContentColumn[i]; *p ; ++p) { 01892 if( !safe_isalnum(*p) ) *p = '_'; 01893 } 01894 } 01895 01896 /* 01897 ** Parse the tokenizer specification string. 01898 */ 01899 pSpec->azTokenizer = tokenizeString(zTokenizer, &n); 01900 tokenListToIdList(pSpec->azTokenizer); 01901 01902 return SQLITE_OK; 01903 } 01904 01905 /* 01906 ** Generate a CREATE TABLE statement that describes the schema of 01907 ** the virtual table. Return a pointer to this schema string. 01908 ** 01909 ** Space is obtained from sqlite3_mprintf() and should be freed 01910 ** using sqlite3_free(). 01911 */ 01912 static char *fulltextSchema( 01913 int nColumn, /* Number of columns */ 01914 const char *const* azColumn, /* List of columns */ 01915 const char *zTableName /* Name of the table */ 01916 ){ 01917 int i; 01918 char *zSchema, *zNext; 01919 const char *zSep = "("; 01920 zSchema = sqlite3_mprintf("CREATE TABLE x"); 01921 for(i=0; i<nColumn; i++){ 01922 zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]); 01923 sqlite3_free(zSchema); 01924 zSchema = zNext; 01925 zSep = ","; 01926 } 01927 zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName); 01928 sqlite3_free(zSchema); 01929 return zNext; 01930 } 01931 01932 /* 01933 ** Build a new sqlite3_vtab structure that will describe the 01934 ** fulltext index defined by spec. 01935 */ 01936 static int constructVtab( 01937 sqlite3 *db, /* The SQLite database connection */ 01938 TableSpec *spec, /* Parsed spec information from parseSpec() */ 01939 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */ 01940 char **pzErr /* Write any error message here */ 01941 ){ 01942 int rc; 01943 int n; 01944 fulltext_vtab *v = 0; 01945 const sqlite3_tokenizer_module *m = NULL; 01946 char *schema; 01947 01948 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); 01949 if( v==0 ) return SQLITE_NOMEM; 01950 memset(v, 0, sizeof(*v)); 01951 /* sqlite will initialize v->base */ 01952 v->db = db; 01953 v->zDb = spec->zDb; /* Freed when azColumn is freed */ 01954 v->zName = spec->zName; /* Freed when azColumn is freed */ 01955 v->nColumn = spec->nColumn; 01956 v->azContentColumn = spec->azContentColumn; 01957 spec->azContentColumn = 0; 01958 v->azColumn = spec->azColumn; 01959 spec->azColumn = 0; 01960 01961 if( spec->azTokenizer==0 ){ 01962 return SQLITE_NOMEM; 01963 } 01964 /* TODO(shess) For now, add new tokenizers as else if clauses. */ 01965 if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){ 01966 sqlite3Fts1SimpleTokenizerModule(&m); 01967 }else if( startsWith(spec->azTokenizer[0], "porter") ){ 01968 sqlite3Fts1PorterTokenizerModule(&m); 01969 }else{ 01970 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]); 01971 rc = SQLITE_ERROR; 01972 goto err; 01973 } 01974 for(n=0; spec->azTokenizer[n]; n++){} 01975 if( n ){ 01976 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1], 01977 &v->pTokenizer); 01978 }else{ 01979 rc = m->xCreate(0, 0, &v->pTokenizer); 01980 } 01981 if( rc!=SQLITE_OK ) goto err; 01982 v->pTokenizer->pModule = m; 01983 01984 /* TODO: verify the existence of backing tables foo_content, foo_term */ 01985 01986 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn, 01987 spec->zName); 01988 rc = sqlite3_declare_vtab(db, schema); 01989 sqlite3_free(schema); 01990 if( rc!=SQLITE_OK ) goto err; 01991 01992 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); 01993 01994 *ppVTab = &v->base; 01995 TRACE(("FTS1 Connect %p\n", v)); 01996 01997 return rc; 01998 01999 err: 02000 fulltext_vtab_destroy(v); 02001 return rc; 02002 } 02003 02004 static int fulltextConnect( 02005 sqlite3 *db, 02006 void *pAux, 02007 int argc, const char *const*argv, 02008 sqlite3_vtab **ppVTab, 02009 char **pzErr 02010 ){ 02011 TableSpec spec; 02012 int rc = parseSpec(&spec, argc, argv, pzErr); 02013 if( rc!=SQLITE_OK ) return rc; 02014 02015 rc = constructVtab(db, &spec, ppVTab, pzErr); 02016 clearTableSpec(&spec); 02017 return rc; 02018 } 02019 02020 /* The %_content table holds the text of each document, with 02021 ** the rowid used as the docid. 02022 ** 02023 ** The %_term table maps each term to a document list blob 02024 ** containing elements sorted by ascending docid, each element 02025 ** encoded as: 02026 ** 02027 ** docid varint-encoded 02028 ** token elements: 02029 ** position+1 varint-encoded as delta from previous position 02030 ** start offset varint-encoded as delta from previous start offset 02031 ** end offset varint-encoded as delta from start offset 02032 ** 02033 ** The sentinel position of 0 indicates the end of the token list. 02034 ** 02035 ** Additionally, doclist blobs are chunked into multiple segments, 02036 ** using segment to order the segments. New elements are added to 02037 ** the segment at segment 0, until it exceeds CHUNK_MAX. Then 02038 ** segment 0 is deleted, and the doclist is inserted at segment 1. 02039 ** If there is already a doclist at segment 1, the segment 0 doclist 02040 ** is merged with it, the segment 1 doclist is deleted, and the 02041 ** merged doclist is inserted at segment 2, repeating those 02042 ** operations until an insert succeeds. 02043 ** 02044 ** Since this structure doesn't allow us to update elements in place 02045 ** in case of deletion or update, these are simply written to 02046 ** segment 0 (with an empty token list in case of deletion), with 02047 ** docListAccumulate() taking care to retain lower-segment 02048 ** information in preference to higher-segment information. 02049 */ 02050 /* TODO(shess) Provide a VACUUM type operation which both removes 02051 ** deleted elements which are no longer necessary, and duplicated 02052 ** elements. I suspect this will probably not be necessary in 02053 ** practice, though. 02054 */ 02055 static int fulltextCreate(sqlite3 *db, void *pAux, 02056 int argc, const char * const *argv, 02057 sqlite3_vtab **ppVTab, char **pzErr){ 02058 int rc; 02059 TableSpec spec; 02060 StringBuffer schema; 02061 TRACE(("FTS1 Create\n")); 02062 02063 rc = parseSpec(&spec, argc, argv, pzErr); 02064 if( rc!=SQLITE_OK ) return rc; 02065 02066 initStringBuffer(&schema); 02067 append(&schema, "CREATE TABLE %_content("); 02068 appendList(&schema, spec.nColumn, spec.azContentColumn); 02069 append(&schema, ")"); 02070 rc = sql_exec(db, spec.zDb, spec.zName, schema.s); 02071 free(schema.s); 02072 if( rc!=SQLITE_OK ) goto out; 02073 02074 rc = sql_exec(db, spec.zDb, spec.zName, 02075 "create table %_term(term text, segment integer, doclist blob, " 02076 "primary key(term, segment));"); 02077 if( rc!=SQLITE_OK ) goto out; 02078 02079 rc = constructVtab(db, &spec, ppVTab, pzErr); 02080 02081 out: 02082 clearTableSpec(&spec); 02083 return rc; 02084 } 02085 02086 /* Decide how to handle an SQL query. */ 02087 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ 02088 int i; 02089 TRACE(("FTS1 BestIndex\n")); 02090 02091 for(i=0; i<pInfo->nConstraint; ++i){ 02092 const struct sqlite3_index_constraint *pConstraint; 02093 pConstraint = &pInfo->aConstraint[i]; 02094 if( pConstraint->usable ) { 02095 if( pConstraint->iColumn==-1 && 02096 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){ 02097 pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */ 02098 TRACE(("FTS1 QUERY_ROWID\n")); 02099 } else if( pConstraint->iColumn>=0 && 02100 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){ 02101 /* full-text search */ 02102 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn; 02103 TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn)); 02104 } else continue; 02105 02106 pInfo->aConstraintUsage[i].argvIndex = 1; 02107 pInfo->aConstraintUsage[i].omit = 1; 02108 02109 /* An arbitrary value for now. 02110 * TODO: Perhaps rowid matches should be considered cheaper than 02111 * full-text searches. */ 02112 pInfo->estimatedCost = 1.0; 02113 02114 return SQLITE_OK; 02115 } 02116 } 02117 pInfo->idxNum = QUERY_GENERIC; 02118 return SQLITE_OK; 02119 } 02120 02121 static int fulltextDisconnect(sqlite3_vtab *pVTab){ 02122 TRACE(("FTS1 Disconnect %p\n", pVTab)); 02123 fulltext_vtab_destroy((fulltext_vtab *)pVTab); 02124 return SQLITE_OK; 02125 } 02126 02127 static int fulltextDestroy(sqlite3_vtab *pVTab){ 02128 fulltext_vtab *v = (fulltext_vtab *)pVTab; 02129 int rc; 02130 02131 TRACE(("FTS1 Destroy %p\n", pVTab)); 02132 rc = sql_exec(v->db, v->zDb, v->zName, 02133 "drop table if exists %_content;" 02134 "drop table if exists %_term;" 02135 ); 02136 if( rc!=SQLITE_OK ) return rc; 02137 02138 fulltext_vtab_destroy((fulltext_vtab *)pVTab); 02139 return SQLITE_OK; 02140 } 02141 02142 static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ 02143 fulltext_cursor *c; 02144 02145 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); 02146 /* sqlite will initialize c->base */ 02147 *ppCursor = &c->base; 02148 TRACE(("FTS1 Open %p: %p\n", pVTab, c)); 02149 02150 return SQLITE_OK; 02151 } 02152 02153 02154 /* Free all of the dynamically allocated memory held by *q 02155 */ 02156 static void queryClear(Query *q){ 02157 int i; 02158 for(i = 0; i < q->nTerms; ++i){ 02159 free(q->pTerms[i].pTerm); 02160 } 02161 free(q->pTerms); 02162 memset(q, 0, sizeof(*q)); 02163 } 02164 02165 /* Free all of the dynamically allocated memory held by the 02166 ** Snippet 02167 */ 02168 static void snippetClear(Snippet *p){ 02169 free(p->aMatch); 02170 free(p->zOffset); 02171 free(p->zSnippet); 02172 memset(p, 0, sizeof(*p)); 02173 } 02174 /* 02175 ** Append a single entry to the p->aMatch[] log. 02176 */ 02177 static void snippetAppendMatch( 02178 Snippet *p, /* Append the entry to this snippet */ 02179 int iCol, int iTerm, /* The column and query term */ 02180 int iStart, int nByte /* Offset and size of the match */ 02181 ){ 02182 int i; 02183 struct snippetMatch *pMatch; 02184 if( p->nMatch+1>=p->nAlloc ){ 02185 p->nAlloc = p->nAlloc*2 + 10; 02186 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); 02187 if( p->aMatch==0 ){ 02188 p->nMatch = 0; 02189 p->nAlloc = 0; 02190 return; 02191 } 02192 } 02193 i = p->nMatch++; 02194 pMatch = &p->aMatch[i]; 02195 pMatch->iCol = iCol; 02196 pMatch->iTerm = iTerm; 02197 pMatch->iStart = iStart; 02198 pMatch->nByte = nByte; 02199 } 02200 02201 /* 02202 ** Sizing information for the circular buffer used in snippetOffsetsOfColumn() 02203 */ 02204 #define FTS1_ROTOR_SZ (32) 02205 #define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1) 02206 02207 /* 02208 ** Add entries to pSnippet->aMatch[] for every match that occurs against 02209 ** document zDoc[0..nDoc-1] which is stored in column iColumn. 02210 */ 02211 static void snippetOffsetsOfColumn( 02212 Query *pQuery, 02213 Snippet *pSnippet, 02214 int iColumn, 02215 const char *zDoc, 02216 int nDoc 02217 ){ 02218 const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */ 02219 sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */ 02220 sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */ 02221 fulltext_vtab *pVtab; /* The full text index */ 02222 int nColumn; /* Number of columns in the index */ 02223 const QueryTerm *aTerm; /* Query string terms */ 02224 int nTerm; /* Number of query string terms */ 02225 int i, j; /* Loop counters */ 02226 int rc; /* Return code */ 02227 unsigned int match, prevMatch; /* Phrase search bitmasks */ 02228 const char *zToken; /* Next token from the tokenizer */ 02229 int nToken; /* Size of zToken */ 02230 int iBegin, iEnd, iPos; /* Offsets of beginning and end */ 02231 02232 /* The following variables keep a circular buffer of the last 02233 ** few tokens */ 02234 unsigned int iRotor = 0; /* Index of current token */ 02235 int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */ 02236 int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */ 02237 02238 pVtab = pQuery->pFts; 02239 nColumn = pVtab->nColumn; 02240 pTokenizer = pVtab->pTokenizer; 02241 pTModule = pTokenizer->pModule; 02242 rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor); 02243 if( rc ) return; 02244 pTCursor->pTokenizer = pTokenizer; 02245 aTerm = pQuery->pTerms; 02246 nTerm = pQuery->nTerms; 02247 if( nTerm>=FTS1_ROTOR_SZ ){ 02248 nTerm = FTS1_ROTOR_SZ - 1; 02249 } 02250 prevMatch = 0; 02251 while(1){ 02252 rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos); 02253 if( rc ) break; 02254 iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin; 02255 iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin; 02256 match = 0; 02257 for(i=0; i<nTerm; i++){ 02258 int iCol; 02259 iCol = aTerm[i].iColumn; 02260 if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue; 02261 if( aTerm[i].nTerm!=nToken ) continue; 02262 if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue; 02263 if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue; 02264 match |= 1<<i; 02265 if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){ 02266 for(j=aTerm[i].iPhrase-1; j>=0; j--){ 02267 int k = (iRotor-j) & FTS1_ROTOR_MASK; 02268 snippetAppendMatch(pSnippet, iColumn, i-j, 02269 iRotorBegin[k], iRotorLen[k]); 02270 } 02271 } 02272 } 02273 prevMatch = match<<1; 02274 iRotor++; 02275 } 02276 pTModule->xClose(pTCursor); 02277 } 02278 02279 02280 /* 02281 ** Compute all offsets for the current row of the query. 02282 ** If the offsets have already been computed, this routine is a no-op. 02283 */ 02284 static void snippetAllOffsets(fulltext_cursor *p){ 02285 int nColumn; 02286 int iColumn, i; 02287 int iFirst, iLast; 02288 fulltext_vtab *pFts; 02289 02290 if( p->snippet.nMatch ) return; 02291 if( p->q.nTerms==0 ) return; 02292 pFts = p->q.pFts; 02293 nColumn = pFts->nColumn; 02294 iColumn = p->iCursorType - QUERY_FULLTEXT; 02295 if( iColumn<0 || iColumn>=nColumn ){ 02296 iFirst = 0; 02297 iLast = nColumn-1; 02298 }else{ 02299 iFirst = iColumn; 02300 iLast = iColumn; 02301 } 02302 for(i=iFirst; i<=iLast; i++){ 02303 const char *zDoc; 02304 int nDoc; 02305 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); 02306 nDoc = sqlite3_column_bytes(p->pStmt, i+1); 02307 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); 02308 } 02309 } 02310 02311 /* 02312 ** Convert the information in the aMatch[] array of the snippet 02313 ** into the string zOffset[0..nOffset-1]. 02314 */ 02315 static void snippetOffsetText(Snippet *p){ 02316 int i; 02317 int cnt = 0; 02318 StringBuffer sb; 02319 char zBuf[200]; 02320 if( p->zOffset ) return; 02321 initStringBuffer(&sb); 02322 for(i=0; i<p->nMatch; i++){ 02323 struct snippetMatch *pMatch = &p->aMatch[i]; 02324 zBuf[0] = ' '; 02325 sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d", 02326 pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte); 02327 append(&sb, zBuf); 02328 cnt++; 02329 } 02330 p->zOffset = sb.s; 02331 p->nOffset = sb.len; 02332 } 02333 02334 /* 02335 ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set 02336 ** of matching words some of which might be in zDoc. zDoc is column 02337 ** number iCol. 02338 ** 02339 ** iBreak is suggested spot in zDoc where we could begin or end an 02340 ** excerpt. Return a value similar to iBreak but possibly adjusted 02341 ** to be a little left or right so that the break point is better. 02342 */ 02343 static int wordBoundary( 02344 int iBreak, /* The suggested break point */ 02345 const char *zDoc, /* Document text */ 02346 int nDoc, /* Number of bytes in zDoc[] */ 02347 struct snippetMatch *aMatch, /* Matching words */ 02348 int nMatch, /* Number of entries in aMatch[] */ 02349 int iCol /* The column number for zDoc[] */ 02350 ){ 02351 int i; 02352 if( iBreak<=10 ){ 02353 return 0; 02354 } 02355 if( iBreak>=nDoc-10 ){ 02356 return nDoc; 02357 } 02358 for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){} 02359 while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; } 02360 if( i<nMatch ){ 02361 if( aMatch[i].iStart<iBreak+10 ){ 02362 return aMatch[i].iStart; 02363 } 02364 if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ 02365 return aMatch[i-1].iStart; 02366 } 02367 } 02368 for(i=1; i<=10; i++){ 02369 if( safe_isspace(zDoc[iBreak-i]) ){ 02370 return iBreak - i + 1; 02371 } 02372 if( safe_isspace(zDoc[iBreak+i]) ){ 02373 return iBreak + i + 1; 02374 } 02375 } 02376 return iBreak; 02377 } 02378 02379 /* 02380 ** If the StringBuffer does not end in white space, add a single 02381 ** space character to the end. 02382 */ 02383 static void appendWhiteSpace(StringBuffer *p){ 02384 if( p->len==0 ) return; 02385 if( safe_isspace(p->s[p->len-1]) ) return; 02386 append(p, " "); 02387 } 02388 02389 /* 02390 ** Remove white space from teh end of the StringBuffer 02391 */ 02392 static void trimWhiteSpace(StringBuffer *p){ 02393 while( p->len>0 && safe_isspace(p->s[p->len-1]) ){ 02394 p->len--; 02395 } 02396 } 02397 02398 02399 02400 /* 02401 ** Allowed values for Snippet.aMatch[].snStatus 02402 */ 02403 #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */ 02404 #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */ 02405 02406 /* 02407 ** Generate the text of a snippet. 02408 */ 02409 static void snippetText( 02410 fulltext_cursor *pCursor, /* The cursor we need the snippet for */ 02411 const char *zStartMark, /* Markup to appear before each match */ 02412 const char *zEndMark, /* Markup to appear after each match */ 02413 const char *zEllipsis /* Ellipsis mark */ 02414 ){ 02415 int i, j; 02416 struct snippetMatch *aMatch; 02417 int nMatch; 02418 int nDesired; 02419 StringBuffer sb; 02420 int tailCol; 02421 int tailOffset; 02422 int iCol; 02423 int nDoc; 02424 const char *zDoc; 02425 int iStart, iEnd; 02426 int tailEllipsis = 0; 02427 int iMatch; 02428 02429 02430 free(pCursor->snippet.zSnippet); 02431 pCursor->snippet.zSnippet = 0; 02432 aMatch = pCursor->snippet.aMatch; 02433 nMatch = pCursor->snippet.nMatch; 02434 initStringBuffer(&sb); 02435 02436 for(i=0; i<nMatch; i++){ 02437 aMatch[i].snStatus = SNIPPET_IGNORE; 02438 } 02439 nDesired = 0; 02440 for(i=0; i<pCursor->q.nTerms; i++){ 02441 for(j=0; j<nMatch; j++){ 02442 if( aMatch[j].iTerm==i ){ 02443 aMatch[j].snStatus = SNIPPET_DESIRED; 02444 nDesired++; 02445 break; 02446 } 02447 } 02448 } 02449 02450 iMatch = 0; 02451 tailCol = -1; 02452 tailOffset = 0; 02453 for(i=0; i<nMatch && nDesired>0; i++){ 02454 if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue; 02455 nDesired--; 02456 iCol = aMatch[i].iCol; 02457 zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1); 02458 nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1); 02459 iStart = aMatch[i].iStart - 40; 02460 iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol); 02461 if( iStart<=10 ){ 02462 iStart = 0; 02463 } 02464 if( iCol==tailCol && iStart<=tailOffset+20 ){ 02465 iStart = tailOffset; 02466 } 02467 if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){ 02468 trimWhiteSpace(&sb); 02469 appendWhiteSpace(&sb); 02470 append(&sb, zEllipsis); 02471 appendWhiteSpace(&sb); 02472 } 02473 iEnd = aMatch[i].iStart + aMatch[i].nByte + 40; 02474 iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol); 02475 if( iEnd>=nDoc-10 ){ 02476 iEnd = nDoc; 02477 tailEllipsis = 0; 02478 }else{ 02479 tailEllipsis = 1; 02480 } 02481 while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; } 02482 while( iStart<iEnd ){ 02483 while( iMatch<nMatch && aMatch[iMatch].iStart<iStart 02484 && aMatch[iMatch].iCol<=iCol ){ 02485 iMatch++; 02486 } 02487 if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd 02488 && aMatch[iMatch].iCol==iCol ){ 02489 nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart); 02490 iStart = aMatch[iMatch].iStart; 02491 append(&sb, zStartMark); 02492 nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte); 02493 append(&sb, zEndMark); 02494 iStart += aMatch[iMatch].nByte; 02495 for(j=iMatch+1; j<nMatch; j++){ 02496 if( aMatch[j].iTerm==aMatch[iMatch].iTerm 02497 && aMatch[j].snStatus==SNIPPET_DESIRED ){ 02498 nDesired--; 02499 aMatch[j].snStatus = SNIPPET_IGNORE; 02500 } 02501 } 02502 }else{ 02503 nappend(&sb, &zDoc[iStart], iEnd - iStart); 02504 iStart = iEnd; 02505 } 02506 } 02507 tailCol = iCol; 02508 tailOffset = iEnd; 02509 } 02510 trimWhiteSpace(&sb); 02511 if( tailEllipsis ){ 02512 appendWhiteSpace(&sb); 02513 append(&sb, zEllipsis); 02514 } 02515 pCursor->snippet.zSnippet = sb.s; 02516 pCursor->snippet.nSnippet = sb.len; 02517 } 02518 02519 02520 /* 02521 ** Close the cursor. For additional information see the documentation 02522 ** on the xClose method of the virtual table interface. 02523 */ 02524 static int fulltextClose(sqlite3_vtab_cursor *pCursor){ 02525 fulltext_cursor *c = (fulltext_cursor *) pCursor; 02526 TRACE(("FTS1 Close %p\n", c)); 02527 sqlite3_finalize(c->pStmt); 02528 queryClear(&c->q); 02529 snippetClear(&c->snippet); 02530 if( c->result.pDoclist!=NULL ){ 02531 docListDelete(c->result.pDoclist); 02532 } 02533 free(c); 02534 return SQLITE_OK; 02535 } 02536 02537 static int fulltextNext(sqlite3_vtab_cursor *pCursor){ 02538 fulltext_cursor *c = (fulltext_cursor *) pCursor; 02539 sqlite_int64 iDocid; 02540 int rc; 02541 02542 TRACE(("FTS1 Next %p\n", pCursor)); 02543 snippetClear(&c->snippet); 02544 if( c->iCursorType < QUERY_FULLTEXT ){ 02545 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 02546 rc = sqlite3_step(c->pStmt); 02547 switch( rc ){ 02548 case SQLITE_ROW: 02549 c->eof = 0; 02550 return SQLITE_OK; 02551 case SQLITE_DONE: 02552 c->eof = 1; 02553 return SQLITE_OK; 02554 default: 02555 c->eof = 1; 02556 return rc; 02557 } 02558 } else { /* full-text query */ 02559 rc = sqlite3_reset(c->pStmt); 02560 if( rc!=SQLITE_OK ) return rc; 02561 02562 iDocid = nextDocid(&c->result); 02563 if( iDocid==0 ){ 02564 c->eof = 1; 02565 return SQLITE_OK; 02566 } 02567 rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); 02568 if( rc!=SQLITE_OK ) return rc; 02569 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 02570 rc = sqlite3_step(c->pStmt); 02571 if( rc==SQLITE_ROW ){ /* the case we expect */ 02572 c->eof = 0; 02573 return SQLITE_OK; 02574 } 02575 /* an error occurred; abort */ 02576 return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 02577 } 02578 } 02579 02580 02581 /* Return a DocList corresponding to the query term *pTerm. If *pTerm 02582 ** is the first term of a phrase query, go ahead and evaluate the phrase 02583 ** query and return the doclist for the entire phrase query. 02584 ** 02585 ** The result is stored in pTerm->doclist. 02586 */ 02587 static int docListOfTerm( 02588 fulltext_vtab *v, /* The full text index */ 02589 int iColumn, /* column to restrict to. No restrition if >=nColumn */ 02590 QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */ 02591 DocList **ppResult /* Write the result here */ 02592 ){ 02593 DocList *pLeft, *pRight, *pNew; 02594 int i, rc; 02595 02596 pLeft = docListNew(DL_POSITIONS); 02597 rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft); 02598 if( rc ){ 02599 docListDelete(pLeft); 02600 return rc; 02601 } 02602 for(i=1; i<=pQTerm->nPhrase; i++){ 02603 pRight = docListNew(DL_POSITIONS); 02604 rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight); 02605 if( rc ){ 02606 docListDelete(pLeft); 02607 return rc; 02608 } 02609 pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS); 02610 docListPhraseMerge(pLeft, pRight, pNew); 02611 docListDelete(pLeft); 02612 docListDelete(pRight); 02613 pLeft = pNew; 02614 } 02615 *ppResult = pLeft; 02616 return SQLITE_OK; 02617 } 02618 02619 /* Add a new term pTerm[0..nTerm-1] to the query *q. 02620 */ 02621 static void queryAdd(Query *q, const char *pTerm, int nTerm){ 02622 QueryTerm *t; 02623 ++q->nTerms; 02624 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); 02625 if( q->pTerms==0 ){ 02626 q->nTerms = 0; 02627 return; 02628 } 02629 t = &q->pTerms[q->nTerms - 1]; 02630 memset(t, 0, sizeof(*t)); 02631 t->pTerm = malloc(nTerm+1); 02632 memcpy(t->pTerm, pTerm, nTerm); 02633 t->pTerm[nTerm] = 0; 02634 t->nTerm = nTerm; 02635 t->isOr = q->nextIsOr; 02636 q->nextIsOr = 0; 02637 t->iColumn = q->nextColumn; 02638 q->nextColumn = q->dfltColumn; 02639 } 02640 02641 /* 02642 ** Check to see if the string zToken[0...nToken-1] matches any 02643 ** column name in the virtual table. If it does, 02644 ** return the zero-indexed column number. If not, return -1. 02645 */ 02646 static int checkColumnSpecifier( 02647 fulltext_vtab *pVtab, /* The virtual table */ 02648 const char *zToken, /* Text of the token */ 02649 int nToken /* Number of characters in the token */ 02650 ){ 02651 int i; 02652 for(i=0; i<pVtab->nColumn; i++){ 02653 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0 02654 && pVtab->azColumn[i][nToken]==0 ){ 02655 return i; 02656 } 02657 } 02658 return -1; 02659 } 02660 02661 /* 02662 ** Parse the text at pSegment[0..nSegment-1]. Add additional terms 02663 ** to the query being assemblied in pQuery. 02664 ** 02665 ** inPhrase is true if pSegment[0..nSegement-1] is contained within 02666 ** double-quotes. If inPhrase is true, then the first term 02667 ** is marked with the number of terms in the phrase less one and 02668 ** OR and "-" syntax is ignored. If inPhrase is false, then every 02669 ** term found is marked with nPhrase=0 and OR and "-" syntax is significant. 02670 */ 02671 static int tokenizeSegment( 02672 sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */ 02673 const char *pSegment, int nSegment, /* Query expression being parsed */ 02674 int inPhrase, /* True if within "..." */ 02675 Query *pQuery /* Append results here */ 02676 ){ 02677 const sqlite3_tokenizer_module *pModule = pTokenizer->pModule; 02678 sqlite3_tokenizer_cursor *pCursor; 02679 int firstIndex = pQuery->nTerms; 02680 int iCol; 02681 int nTerm = 1; 02682 02683 int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor); 02684 if( rc!=SQLITE_OK ) return rc; 02685 pCursor->pTokenizer = pTokenizer; 02686 02687 while( 1 ){ 02688 const char *pToken; 02689 int nToken, iBegin, iEnd, iPos; 02690 02691 rc = pModule->xNext(pCursor, 02692 &pToken, &nToken, 02693 &iBegin, &iEnd, &iPos); 02694 if( rc!=SQLITE_OK ) break; 02695 if( !inPhrase && 02696 pSegment[iEnd]==':' && 02697 (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){ 02698 pQuery->nextColumn = iCol; 02699 continue; 02700 } 02701 if( !inPhrase && pQuery->nTerms>0 && nToken==2 02702 && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){ 02703 pQuery->nextIsOr = 1; 02704 continue; 02705 } 02706 queryAdd(pQuery, pToken, nToken); 02707 if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ 02708 pQuery->pTerms[pQuery->nTerms-1].isNot = 1; 02709 } 02710 pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm; 02711 if( inPhrase ){ 02712 nTerm++; 02713 } 02714 } 02715 02716 if( inPhrase && pQuery->nTerms>firstIndex ){ 02717 pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1; 02718 } 02719 02720 return pModule->xClose(pCursor); 02721 } 02722 02723 /* Parse a query string, yielding a Query object pQuery. 02724 ** 02725 ** The calling function will need to queryClear() to clean up 02726 ** the dynamically allocated memory held by pQuery. 02727 */ 02728 static int parseQuery( 02729 fulltext_vtab *v, /* The fulltext index */ 02730 const char *zInput, /* Input text of the query string */ 02731 int nInput, /* Size of the input text */ 02732 int dfltColumn, /* Default column of the index to match against */ 02733 Query *pQuery /* Write the parse results here. */ 02734 ){ 02735 int iInput, inPhrase = 0; 02736 02737 if( zInput==0 ) nInput = 0; 02738 if( nInput<0 ) nInput = strlen(zInput); 02739 pQuery->nTerms = 0; 02740 pQuery->pTerms = NULL; 02741 pQuery->nextIsOr = 0; 02742 pQuery->nextColumn = dfltColumn; 02743 pQuery->dfltColumn = dfltColumn; 02744 pQuery->pFts = v; 02745 02746 for(iInput=0; iInput<nInput; ++iInput){ 02747 int i; 02748 for(i=iInput; i<nInput && zInput[i]!='"'; ++i){} 02749 if( i>iInput ){ 02750 tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase, 02751 pQuery); 02752 } 02753 iInput = i; 02754 if( i<nInput ){ 02755 assert( zInput[i]=='"' ); 02756 inPhrase = !inPhrase; 02757 } 02758 } 02759 02760 if( inPhrase ){ 02761 /* unmatched quote */ 02762 queryClear(pQuery); 02763 return SQLITE_ERROR; 02764 } 02765 return SQLITE_OK; 02766 } 02767 02768 /* Perform a full-text query using the search expression in 02769 ** zInput[0..nInput-1]. Return a list of matching documents 02770 ** in pResult. 02771 ** 02772 ** Queries must match column iColumn. Or if iColumn>=nColumn 02773 ** they are allowed to match against any column. 02774 */ 02775 static int fulltextQuery( 02776 fulltext_vtab *v, /* The full text index */ 02777 int iColumn, /* Match against this column by default */ 02778 const char *zInput, /* The query string */ 02779 int nInput, /* Number of bytes in zInput[] */ 02780 DocList **pResult, /* Write the result doclist here */ 02781 Query *pQuery /* Put parsed query string here */ 02782 ){ 02783 int i, iNext, rc; 02784 DocList *pLeft = NULL; 02785 DocList *pRight, *pNew, *pOr; 02786 int nNot = 0; 02787 QueryTerm *aTerm; 02788 02789 rc = parseQuery(v, zInput, nInput, iColumn, pQuery); 02790 if( rc!=SQLITE_OK ) return rc; 02791 02792 /* Merge AND terms. */ 02793 aTerm = pQuery->pTerms; 02794 for(i = 0; i<pQuery->nTerms; i=iNext){ 02795 if( aTerm[i].isNot ){ 02796 /* Handle all NOT terms in a separate pass */ 02797 nNot++; 02798 iNext = i + aTerm[i].nPhrase+1; 02799 continue; 02800 } 02801 iNext = i + aTerm[i].nPhrase + 1; 02802 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight); 02803 if( rc ){ 02804 queryClear(pQuery); 02805 return rc; 02806 } 02807 while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){ 02808 rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr); 02809 iNext += aTerm[iNext].nPhrase + 1; 02810 if( rc ){ 02811 queryClear(pQuery); 02812 return rc; 02813 } 02814 pNew = docListNew(DL_DOCIDS); 02815 docListOrMerge(pRight, pOr, pNew); 02816 docListDelete(pRight); 02817 docListDelete(pOr); 02818 pRight = pNew; 02819 } 02820 if( pLeft==0 ){ 02821 pLeft = pRight; 02822 }else{ 02823 pNew = docListNew(DL_DOCIDS); 02824 docListAndMerge(pLeft, pRight, pNew); 02825 docListDelete(pRight); 02826 docListDelete(pLeft); 02827 pLeft = pNew; 02828 } 02829 } 02830 02831 if( nNot && pLeft==0 ){ 02832 /* We do not yet know how to handle a query of only NOT terms */ 02833 return SQLITE_ERROR; 02834 } 02835 02836 /* Do the EXCEPT terms */ 02837 for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){ 02838 if( !aTerm[i].isNot ) continue; 02839 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight); 02840 if( rc ){ 02841 queryClear(pQuery); 02842 docListDelete(pLeft); 02843 return rc; 02844 } 02845 pNew = docListNew(DL_DOCIDS); 02846 docListExceptMerge(pLeft, pRight, pNew); 02847 docListDelete(pRight); 02848 docListDelete(pLeft); 02849 pLeft = pNew; 02850 } 02851 02852 *pResult = pLeft; 02853 return rc; 02854 } 02855 02856 /* 02857 ** This is the xFilter interface for the virtual table. See 02858 ** the virtual table xFilter method documentation for additional 02859 ** information. 02860 ** 02861 ** If idxNum==QUERY_GENERIC then do a full table scan against 02862 ** the %_content table. 02863 ** 02864 ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry 02865 ** in the %_content table. 02866 ** 02867 ** If idxNum>=QUERY_FULLTEXT then use the full text index. The 02868 ** column on the left-hand side of the MATCH operator is column 02869 ** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand 02870 ** side of the MATCH operator. 02871 */ 02872 /* TODO(shess) Upgrade the cursor initialization and destruction to 02873 ** account for fulltextFilter() being called multiple times on the 02874 ** same cursor. The current solution is very fragile. Apply fix to 02875 ** fts2 as appropriate. 02876 */ 02877 static int fulltextFilter( 02878 sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */ 02879 int idxNum, const char *idxStr, /* Which indexing scheme to use */ 02880 int argc, sqlite3_value **argv /* Arguments for the indexing scheme */ 02881 ){ 02882 fulltext_cursor *c = (fulltext_cursor *) pCursor; 02883 fulltext_vtab *v = cursor_vtab(c); 02884 int rc; 02885 char *zSql; 02886 02887 TRACE(("FTS1 Filter %p\n",pCursor)); 02888 02889 zSql = sqlite3_mprintf("select rowid, * from %%_content %s", 02890 idxNum==QUERY_GENERIC ? "" : "where rowid=?"); 02891 sqlite3_finalize(c->pStmt); 02892 rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql); 02893 sqlite3_free(zSql); 02894 if( rc!=SQLITE_OK ) return rc; 02895 02896 c->iCursorType = idxNum; 02897 switch( idxNum ){ 02898 case QUERY_GENERIC: 02899 break; 02900 02901 case QUERY_ROWID: 02902 rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0])); 02903 if( rc!=SQLITE_OK ) return rc; 02904 break; 02905 02906 default: /* full-text search */ 02907 { 02908 const char *zQuery = (const char *)sqlite3_value_text(argv[0]); 02909 DocList *pResult; 02910 assert( idxNum<=QUERY_FULLTEXT+v->nColumn); 02911 assert( argc==1 ); 02912 queryClear(&c->q); 02913 rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q); 02914 if( rc!=SQLITE_OK ) return rc; 02915 if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist); 02916 readerInit(&c->result, pResult); 02917 break; 02918 } 02919 } 02920 02921 return fulltextNext(pCursor); 02922 } 02923 02924 /* This is the xEof method of the virtual table. The SQLite core 02925 ** calls this routine to find out if it has reached the end of 02926 ** a query's results set. 02927 */ 02928 static int fulltextEof(sqlite3_vtab_cursor *pCursor){ 02929 fulltext_cursor *c = (fulltext_cursor *) pCursor; 02930 return c->eof; 02931 } 02932 02933 /* This is the xColumn method of the virtual table. The SQLite 02934 ** core calls this method during a query when it needs the value 02935 ** of a column from the virtual table. This method needs to use 02936 ** one of the sqlite3_result_*() routines to store the requested 02937 ** value back in the pContext. 02938 */ 02939 static int fulltextColumn(sqlite3_vtab_cursor *pCursor, 02940 sqlite3_context *pContext, int idxCol){ 02941 fulltext_cursor *c = (fulltext_cursor *) pCursor; 02942 fulltext_vtab *v = cursor_vtab(c); 02943 02944 if( idxCol<v->nColumn ){ 02945 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1); 02946 sqlite3_result_value(pContext, pVal); 02947 }else if( idxCol==v->nColumn ){ 02948 /* The extra column whose name is the same as the table. 02949 ** Return a blob which is a pointer to the cursor 02950 */ 02951 sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT); 02952 } 02953 return SQLITE_OK; 02954 } 02955 02956 /* This is the xRowid method. The SQLite core calls this routine to 02957 ** retrive the rowid for the current row of the result set. The 02958 ** rowid should be written to *pRowid. 02959 */ 02960 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ 02961 fulltext_cursor *c = (fulltext_cursor *) pCursor; 02962 02963 *pRowid = sqlite3_column_int64(c->pStmt, 0); 02964 return SQLITE_OK; 02965 } 02966 02967 /* Add all terms in [zText] to the given hash table. If [iColumn] > 0, 02968 * we also store positions and offsets in the hash table using the given 02969 * column number. */ 02970 static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid, 02971 const char *zText, int iColumn){ 02972 sqlite3_tokenizer *pTokenizer = v->pTokenizer; 02973 sqlite3_tokenizer_cursor *pCursor; 02974 const char *pToken; 02975 int nTokenBytes; 02976 int iStartOffset, iEndOffset, iPosition; 02977 int rc; 02978 02979 rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); 02980 if( rc!=SQLITE_OK ) return rc; 02981 02982 pCursor->pTokenizer = pTokenizer; 02983 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, 02984 &pToken, &nTokenBytes, 02985 &iStartOffset, &iEndOffset, 02986 &iPosition) ){ 02987 DocList *p; 02988 02989 /* Positions can't be negative; we use -1 as a terminator internally. */ 02990 if( iPosition<0 ){ 02991 pTokenizer->pModule->xClose(pCursor); 02992 return SQLITE_ERROR; 02993 } 02994 02995 p = fts1HashFind(terms, pToken, nTokenBytes); 02996 if( p==NULL ){ 02997 p = docListNew(DL_DEFAULT); 02998 docListAddDocid(p, iDocid); 02999 fts1HashInsert(terms, pToken, nTokenBytes, p); 03000 } 03001 if( iColumn>=0 ){ 03002 docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset); 03003 } 03004 } 03005 03006 /* TODO(shess) Check return? Should this be able to cause errors at 03007 ** this point? Actually, same question about sqlite3_finalize(), 03008 ** though one could argue that failure there means that the data is 03009 ** not durable. *ponder* 03010 */ 03011 pTokenizer->pModule->xClose(pCursor); 03012 return rc; 03013 } 03014 03015 /* Update the %_terms table to map the term [pTerm] to the given rowid. */ 03016 static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm, 03017 DocList *d){ 03018 sqlite_int64 iIndexRow; 03019 DocList doclist; 03020 int iSegment = 0, rc; 03021 03022 rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist); 03023 if( rc==SQLITE_DONE ){ 03024 docListInit(&doclist, DL_DEFAULT, 0, 0); 03025 docListUpdate(&doclist, d); 03026 /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */ 03027 rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist); 03028 goto err; 03029 } 03030 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; 03031 03032 docListUpdate(&doclist, d); 03033 if( doclist.nData<=CHUNK_MAX ){ 03034 rc = term_update(v, iIndexRow, &doclist); 03035 goto err; 03036 } 03037 03038 /* Doclist doesn't fit, delete what's there, and accumulate 03039 ** forward. 03040 */ 03041 rc = term_delete(v, iIndexRow); 03042 if( rc!=SQLITE_OK ) goto err; 03043 03044 /* Try to insert the doclist into a higher segment bucket. On 03045 ** failure, accumulate existing doclist with the doclist from that 03046 ** bucket, and put results in the next bucket. 03047 */ 03048 iSegment++; 03049 while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment, 03050 &doclist))!=SQLITE_OK ){ 03051 sqlite_int64 iSegmentRow; 03052 DocList old; 03053 int rc2; 03054 03055 /* Retain old error in case the term_insert() error was really an 03056 ** error rather than a bounced insert. 03057 */ 03058 rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old); 03059 if( rc2!=SQLITE_ROW ) goto err; 03060 03061 rc = term_delete(v, iSegmentRow); 03062 if( rc!=SQLITE_OK ) goto err; 03063 03064 /* Reusing lowest-number deleted row keeps the index smaller. */ 03065 if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow; 03066 03067 /* doclist contains the newer data, so accumulate it over old. 03068 ** Then steal accumulated data for doclist. 03069 */ 03070 docListAccumulate(&old, &doclist); 03071 docListDestroy(&doclist); 03072 doclist = old; 03073 03074 iSegment++; 03075 } 03076 03077 err: 03078 docListDestroy(&doclist); 03079 return rc; 03080 } 03081 03082 /* Add doclists for all terms in [pValues] to the hash table [terms]. */ 03083 static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid, 03084 sqlite3_value **pValues){ 03085 int i; 03086 for(i = 0; i < v->nColumn ; ++i){ 03087 char *zText = (char*)sqlite3_value_text(pValues[i]); 03088 int rc = buildTerms(v, terms, iRowid, zText, i); 03089 if( rc!=SQLITE_OK ) return rc; 03090 } 03091 return SQLITE_OK; 03092 } 03093 03094 /* Add empty doclists for all terms in the given row's content to the hash 03095 * table [pTerms]. */ 03096 static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){ 03097 const char **pValues; 03098 int i; 03099 03100 int rc = content_select(v, iRowid, &pValues); 03101 if( rc!=SQLITE_OK ) return rc; 03102 03103 for(i = 0 ; i < v->nColumn; ++i) { 03104 rc = buildTerms(v, pTerms, iRowid, pValues[i], -1); 03105 if( rc!=SQLITE_OK ) break; 03106 } 03107 03108 freeStringArray(v->nColumn, pValues); 03109 return SQLITE_OK; 03110 } 03111 03112 /* Insert a row into the %_content table; set *piRowid to be the ID of the 03113 * new row. Fill [pTerms] with new doclists for the %_term table. */ 03114 static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid, 03115 sqlite3_value **pValues, 03116 sqlite_int64 *piRowid, fts1Hash *pTerms){ 03117 int rc; 03118 03119 rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */ 03120 if( rc!=SQLITE_OK ) return rc; 03121 *piRowid = sqlite3_last_insert_rowid(v->db); 03122 return insertTerms(v, pTerms, *piRowid, pValues); 03123 } 03124 03125 /* Delete a row from the %_content table; fill [pTerms] with empty doclists 03126 * to be written to the %_term table. */ 03127 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){ 03128 int rc = deleteTerms(v, pTerms, iRow); 03129 if( rc!=SQLITE_OK ) return rc; 03130 return content_delete(v, iRow); /* execute an SQL DELETE */ 03131 } 03132 03133 /* Update a row in the %_content table; fill [pTerms] with new doclists for the 03134 * %_term table. */ 03135 static int index_update(fulltext_vtab *v, sqlite_int64 iRow, 03136 sqlite3_value **pValues, fts1Hash *pTerms){ 03137 /* Generate an empty doclist for each term that previously appeared in this 03138 * row. */ 03139 int rc = deleteTerms(v, pTerms, iRow); 03140 if( rc!=SQLITE_OK ) return rc; 03141 03142 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */ 03143 if( rc!=SQLITE_OK ) return rc; 03144 03145 /* Now add positions for terms which appear in the updated row. */ 03146 return insertTerms(v, pTerms, iRow, pValues); 03147 } 03148 03149 /* This function implements the xUpdate callback; it is the top-level entry 03150 * point for inserting, deleting or updating a row in a full-text table. */ 03151 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, 03152 sqlite_int64 *pRowid){ 03153 fulltext_vtab *v = (fulltext_vtab *) pVtab; 03154 fts1Hash terms; /* maps term string -> PosList */ 03155 int rc; 03156 fts1HashElem *e; 03157 03158 TRACE(("FTS1 Update %p\n", pVtab)); 03159 03160 fts1HashInit(&terms, FTS1_HASH_STRING, 1); 03161 03162 if( nArg<2 ){ 03163 rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms); 03164 } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ 03165 /* An update: 03166 * ppArg[0] = old rowid 03167 * ppArg[1] = new rowid 03168 * ppArg[2..2+v->nColumn-1] = values 03169 * ppArg[2+v->nColumn] = value for magic column (we ignore this) 03170 */ 03171 sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]); 03172 if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER || 03173 sqlite3_value_int64(ppArg[1]) != rowid ){ 03174 rc = SQLITE_ERROR; /* we don't allow changing the rowid */ 03175 } else { 03176 assert( nArg==2+v->nColumn+1); 03177 rc = index_update(v, rowid, &ppArg[2], &terms); 03178 } 03179 } else { 03180 /* An insert: 03181 * ppArg[1] = requested rowid 03182 * ppArg[2..2+v->nColumn-1] = values 03183 * ppArg[2+v->nColumn] = value for magic column (we ignore this) 03184 */ 03185 assert( nArg==2+v->nColumn+1); 03186 rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms); 03187 } 03188 03189 if( rc==SQLITE_OK ){ 03190 /* Write updated doclists to disk. */ 03191 for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ 03192 DocList *p = fts1HashData(e); 03193 rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p); 03194 if( rc!=SQLITE_OK ) break; 03195 } 03196 } 03197 03198 /* clean up */ 03199 for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ 03200 DocList *p = fts1HashData(e); 03201 docListDelete(p); 03202 } 03203 fts1HashClear(&terms); 03204 03205 return rc; 03206 } 03207 03208 /* 03209 ** Implementation of the snippet() function for FTS1 03210 */ 03211 static void snippetFunc( 03212 sqlite3_context *pContext, 03213 int argc, 03214 sqlite3_value **argv 03215 ){ 03216 fulltext_cursor *pCursor; 03217 if( argc<1 ) return; 03218 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || 03219 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ 03220 sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1); 03221 }else{ 03222 const char *zStart = "<b>"; 03223 const char *zEnd = "</b>"; 03224 const char *zEllipsis = "<b>...</b>"; 03225 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); 03226 if( argc>=2 ){ 03227 zStart = (const char*)sqlite3_value_text(argv[1]); 03228 if( argc>=3 ){ 03229 zEnd = (const char*)sqlite3_value_text(argv[2]); 03230 if( argc>=4 ){ 03231 zEllipsis = (const char*)sqlite3_value_text(argv[3]); 03232 } 03233 } 03234 } 03235 snippetAllOffsets(pCursor); 03236 snippetText(pCursor, zStart, zEnd, zEllipsis); 03237 sqlite3_result_text(pContext, pCursor->snippet.zSnippet, 03238 pCursor->snippet.nSnippet, SQLITE_STATIC); 03239 } 03240 } 03241 03242 /* 03243 ** Implementation of the offsets() function for FTS1 03244 */ 03245 static void snippetOffsetsFunc( 03246 sqlite3_context *pContext, 03247 int argc, 03248 sqlite3_value **argv 03249 ){ 03250 fulltext_cursor *pCursor; 03251 if( argc<1 ) return; 03252 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || 03253 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ 03254 sqlite3_result_error(pContext, "illegal first argument to offsets",-1); 03255 }else{ 03256 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); 03257 snippetAllOffsets(pCursor); 03258 snippetOffsetText(&pCursor->snippet); 03259 sqlite3_result_text(pContext, 03260 pCursor->snippet.zOffset, pCursor->snippet.nOffset, 03261 SQLITE_STATIC); 03262 } 03263 } 03264 03265 /* 03266 ** This routine implements the xFindFunction method for the FTS1 03267 ** virtual table. 03268 */ 03269 static int fulltextFindFunction( 03270 sqlite3_vtab *pVtab, 03271 int nArg, 03272 const char *zName, 03273 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**), 03274 void **ppArg 03275 ){ 03276 if( strcmp(zName,"snippet")==0 ){ 03277 *pxFunc = snippetFunc; 03278 return 1; 03279 }else if( strcmp(zName,"offsets")==0 ){ 03280 *pxFunc = snippetOffsetsFunc; 03281 return 1; 03282 } 03283 return 0; 03284 } 03285 03286 /* 03287 ** Rename an fts1 table. 03288 */ 03289 static int fulltextRename( 03290 sqlite3_vtab *pVtab, 03291 const char *zName 03292 ){ 03293 fulltext_vtab *p = (fulltext_vtab *)pVtab; 03294 int rc = SQLITE_NOMEM; 03295 char *zSql = sqlite3_mprintf( 03296 "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';" 03297 "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';" 03298 , p->zDb, p->zName, zName 03299 , p->zDb, p->zName, zName 03300 ); 03301 if( zSql ){ 03302 rc = sqlite3_exec(p->db, zSql, 0, 0, 0); 03303 sqlite3_free(zSql); 03304 } 03305 return rc; 03306 } 03307 03308 static const sqlite3_module fulltextModule = { 03309 /* iVersion */ 0, 03310 /* xCreate */ fulltextCreate, 03311 /* xConnect */ fulltextConnect, 03312 /* xBestIndex */ fulltextBestIndex, 03313 /* xDisconnect */ fulltextDisconnect, 03314 /* xDestroy */ fulltextDestroy, 03315 /* xOpen */ fulltextOpen, 03316 /* xClose */ fulltextClose, 03317 /* xFilter */ fulltextFilter, 03318 /* xNext */ fulltextNext, 03319 /* xEof */ fulltextEof, 03320 /* xColumn */ fulltextColumn, 03321 /* xRowid */ fulltextRowid, 03322 /* xUpdate */ fulltextUpdate, 03323 /* xBegin */ 0, 03324 /* xSync */ 0, 03325 /* xCommit */ 0, 03326 /* xRollback */ 0, 03327 /* xFindFunction */ fulltextFindFunction, 03328 /* xRename */ fulltextRename, 03329 }; 03330 03331 int sqlite3Fts1Init(sqlite3 *db){ 03332 sqlite3_overload_function(db, "snippet", -1); 03333 sqlite3_overload_function(db, "offsets", -1); 03334 return sqlite3_create_module(db, "fts1", &fulltextModule, 0); 03335 } 03336 03337 #if !SQLITE_CORE 03338 int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, 03339 const sqlite3_api_routines *pApi){ 03340 SQLITE_EXTENSION_INIT2(pApi) 03341 return sqlite3Fts1Init(db); 03342 } 03343 #endif 03344 03345 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1