00001 /* 00002 ** The author disclaims copyright to this source code. 00003 ** 00004 ************************************************************************* 00005 ** Implementation of the "simple" full-text-search tokenizer. 00006 */ 00007 00008 /* 00009 ** The code in this file is only compiled if: 00010 ** 00011 ** * The FTS1 module is being built as an extension 00012 ** (in which case SQLITE_CORE is not defined), or 00013 ** 00014 ** * The FTS1 module is being built into the core of 00015 ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). 00016 */ 00017 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) 00018 00019 00020 #include <assert.h> 00021 #include <stdlib.h> 00022 #include <stdio.h> 00023 #include <string.h> 00024 #include <ctype.h> 00025 00026 #include "fts1_tokenizer.h" 00027 00028 typedef struct simple_tokenizer { 00029 sqlite3_tokenizer base; 00030 char delim[128]; /* flag ASCII delimiters */ 00031 } simple_tokenizer; 00032 00033 typedef struct simple_tokenizer_cursor { 00034 sqlite3_tokenizer_cursor base; 00035 const char *pInput; /* input we are tokenizing */ 00036 int nBytes; /* size of the input */ 00037 int iOffset; /* current position in pInput */ 00038 int iToken; /* index of next token to be returned */ 00039 char *pToken; /* storage for current token */ 00040 int nTokenAllocated; /* space allocated to zToken buffer */ 00041 } simple_tokenizer_cursor; 00042 00043 00044 /* Forward declaration */ 00045 static const sqlite3_tokenizer_module simpleTokenizerModule; 00046 00047 static int isDelim(simple_tokenizer *t, unsigned char c){ 00048 return c<0x80 && t->delim[c]; 00049 } 00050 00051 /* 00052 ** Create a new tokenizer instance. 00053 */ 00054 static int simpleCreate( 00055 int argc, const char * const *argv, 00056 sqlite3_tokenizer **ppTokenizer 00057 ){ 00058 simple_tokenizer *t; 00059 00060 t = (simple_tokenizer *) calloc(sizeof(*t), 1); 00061 if( t==NULL ) return SQLITE_NOMEM; 00062 00063 /* TODO(shess) Delimiters need to remain the same from run to run, 00064 ** else we need to reindex. One solution would be a meta-table to 00065 ** track such information in the database, then we'd only want this 00066 ** information on the initial create. 00067 */ 00068 if( argc>1 ){ 00069 int i, n = strlen(argv[1]); 00070 for(i=0; i<n; i++){ 00071 unsigned char ch = argv[1][i]; 00072 /* We explicitly don't support UTF-8 delimiters for now. */ 00073 if( ch>=0x80 ){ 00074 free(t); 00075 return SQLITE_ERROR; 00076 } 00077 t->delim[ch] = 1; 00078 } 00079 } else { 00080 /* Mark non-alphanumeric ASCII characters as delimiters */ 00081 int i; 00082 for(i=1; i<0x80; i++){ 00083 t->delim[i] = !isalnum(i); 00084 } 00085 } 00086 00087 *ppTokenizer = &t->base; 00088 return SQLITE_OK; 00089 } 00090 00091 /* 00092 ** Destroy a tokenizer 00093 */ 00094 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ 00095 free(pTokenizer); 00096 return SQLITE_OK; 00097 } 00098 00099 /* 00100 ** Prepare to begin tokenizing a particular string. The input 00101 ** string to be tokenized is pInput[0..nBytes-1]. A cursor 00102 ** used to incrementally tokenize this string is returned in 00103 ** *ppCursor. 00104 */ 00105 static int simpleOpen( 00106 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 00107 const char *pInput, int nBytes, /* String to be tokenized */ 00108 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 00109 ){ 00110 simple_tokenizer_cursor *c; 00111 00112 c = (simple_tokenizer_cursor *) malloc(sizeof(*c)); 00113 if( c==NULL ) return SQLITE_NOMEM; 00114 00115 c->pInput = pInput; 00116 if( pInput==0 ){ 00117 c->nBytes = 0; 00118 }else if( nBytes<0 ){ 00119 c->nBytes = (int)strlen(pInput); 00120 }else{ 00121 c->nBytes = nBytes; 00122 } 00123 c->iOffset = 0; /* start tokenizing at the beginning */ 00124 c->iToken = 0; 00125 c->pToken = NULL; /* no space allocated, yet. */ 00126 c->nTokenAllocated = 0; 00127 00128 *ppCursor = &c->base; 00129 return SQLITE_OK; 00130 } 00131 00132 /* 00133 ** Close a tokenization cursor previously opened by a call to 00134 ** simpleOpen() above. 00135 */ 00136 static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ 00137 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 00138 free(c->pToken); 00139 free(c); 00140 return SQLITE_OK; 00141 } 00142 00143 /* 00144 ** Extract the next token from a tokenization cursor. The cursor must 00145 ** have been opened by a prior call to simpleOpen(). 00146 */ 00147 static int simpleNext( 00148 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ 00149 const char **ppToken, /* OUT: *ppToken is the token text */ 00150 int *pnBytes, /* OUT: Number of bytes in token */ 00151 int *piStartOffset, /* OUT: Starting offset of token */ 00152 int *piEndOffset, /* OUT: Ending offset of token */ 00153 int *piPosition /* OUT: Position integer of token */ 00154 ){ 00155 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 00156 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; 00157 unsigned char *p = (unsigned char *)c->pInput; 00158 00159 while( c->iOffset<c->nBytes ){ 00160 int iStartOffset; 00161 00162 /* Scan past delimiter characters */ 00163 while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){ 00164 c->iOffset++; 00165 } 00166 00167 /* Count non-delimiter characters. */ 00168 iStartOffset = c->iOffset; 00169 while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){ 00170 c->iOffset++; 00171 } 00172 00173 if( c->iOffset>iStartOffset ){ 00174 int i, n = c->iOffset-iStartOffset; 00175 if( n>c->nTokenAllocated ){ 00176 c->nTokenAllocated = n+20; 00177 c->pToken = realloc(c->pToken, c->nTokenAllocated); 00178 if( c->pToken==NULL ) return SQLITE_NOMEM; 00179 } 00180 for(i=0; i<n; i++){ 00181 /* TODO(shess) This needs expansion to handle UTF-8 00182 ** case-insensitivity. 00183 */ 00184 unsigned char ch = p[iStartOffset+i]; 00185 c->pToken[i] = ch<0x80 ? tolower(ch) : ch; 00186 } 00187 *ppToken = c->pToken; 00188 *pnBytes = n; 00189 *piStartOffset = iStartOffset; 00190 *piEndOffset = c->iOffset; 00191 *piPosition = c->iToken++; 00192 00193 return SQLITE_OK; 00194 } 00195 } 00196 return SQLITE_DONE; 00197 } 00198 00199 /* 00200 ** The set of routines that implement the simple tokenizer 00201 */ 00202 static const sqlite3_tokenizer_module simpleTokenizerModule = { 00203 0, 00204 simpleCreate, 00205 simpleDestroy, 00206 simpleOpen, 00207 simpleClose, 00208 simpleNext, 00209 }; 00210 00211 /* 00212 ** Allocate a new simple tokenizer. Return a pointer to the new 00213 ** tokenizer in *ppModule 00214 */ 00215 void sqlite3Fts1SimpleTokenizerModule( 00216 sqlite3_tokenizer_module const**ppModule 00217 ){ 00218 *ppModule = &simpleTokenizerModule; 00219 } 00220 00221 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1