00001 /* 00002 ** 2006 Oct 10 00003 ** 00004 ** The author disclaims copyright to this source code. In place of 00005 ** a legal notice, here is a blessing: 00006 ** 00007 ** May you do good and not evil. 00008 ** May you find forgiveness for yourself and forgive others. 00009 ** May you share freely, never taking more than you give. 00010 ** 00011 ****************************************************************************** 00012 ** 00013 ** Implementation of the "simple" full-text-search tokenizer. 00014 */ 00015 00016 /* 00017 ** The code in this file is only compiled if: 00018 ** 00019 ** * The FTS3 module is being built as an extension 00020 ** (in which case SQLITE_CORE is not defined), or 00021 ** 00022 ** * The FTS3 module is being built into the core of 00023 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). 00024 */ 00025 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 00026 00027 00028 #include <assert.h> 00029 #include <stdlib.h> 00030 #include <stdio.h> 00031 #include <string.h> 00032 #include <ctype.h> 00033 00034 #include "fts3_tokenizer.h" 00035 00036 typedef struct simple_tokenizer { 00037 sqlite3_tokenizer base; 00038 char delim[128]; /* flag ASCII delimiters */ 00039 } simple_tokenizer; 00040 00041 typedef struct simple_tokenizer_cursor { 00042 sqlite3_tokenizer_cursor base; 00043 const char *pInput; /* input we are tokenizing */ 00044 int nBytes; /* size of the input */ 00045 int iOffset; /* current position in pInput */ 00046 int iToken; /* index of next token to be returned */ 00047 char *pToken; /* storage for current token */ 00048 int nTokenAllocated; /* space allocated to zToken buffer */ 00049 } simple_tokenizer_cursor; 00050 00051 00052 /* Forward declaration */ 00053 static const sqlite3_tokenizer_module simpleTokenizerModule; 00054 00055 static int simpleDelim(simple_tokenizer *t, unsigned char c){ 00056 return c<0x80 && t->delim[c]; 00057 } 00058 00059 /* 00060 ** Create a new tokenizer instance. 
00061 */ 00062 static int simpleCreate( 00063 int argc, const char * const *argv, 00064 sqlite3_tokenizer **ppTokenizer 00065 ){ 00066 simple_tokenizer *t; 00067 00068 t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t)); 00069 if( t==NULL ) return SQLITE_NOMEM; 00070 memset(t, 0, sizeof(*t)); 00071 00072 /* TODO(shess) Delimiters need to remain the same from run to run, 00073 ** else we need to reindex. One solution would be a meta-table to 00074 ** track such information in the database, then we'd only want this 00075 ** information on the initial create. 00076 */ 00077 if( argc>1 ){ 00078 int i, n = strlen(argv[1]); 00079 for(i=0; i<n; i++){ 00080 unsigned char ch = argv[1][i]; 00081 /* We explicitly don't support UTF-8 delimiters for now. */ 00082 if( ch>=0x80 ){ 00083 sqlite3_free(t); 00084 return SQLITE_ERROR; 00085 } 00086 t->delim[ch] = 1; 00087 } 00088 } else { 00089 /* Mark non-alphanumeric ASCII characters as delimiters */ 00090 int i; 00091 for(i=1; i<0x80; i++){ 00092 t->delim[i] = !isalnum(i); 00093 } 00094 } 00095 00096 *ppTokenizer = &t->base; 00097 return SQLITE_OK; 00098 } 00099 00100 /* 00101 ** Destroy a tokenizer 00102 */ 00103 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ 00104 sqlite3_free(pTokenizer); 00105 return SQLITE_OK; 00106 } 00107 00108 /* 00109 ** Prepare to begin tokenizing a particular string. The input 00110 ** string to be tokenized is pInput[0..nBytes-1]. A cursor 00111 ** used to incrementally tokenize this string is returned in 00112 ** *ppCursor. 
00113 */ 00114 static int simpleOpen( 00115 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 00116 const char *pInput, int nBytes, /* String to be tokenized */ 00117 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 00118 ){ 00119 simple_tokenizer_cursor *c; 00120 00121 c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); 00122 if( c==NULL ) return SQLITE_NOMEM; 00123 00124 c->pInput = pInput; 00125 if( pInput==0 ){ 00126 c->nBytes = 0; 00127 }else if( nBytes<0 ){ 00128 c->nBytes = (int)strlen(pInput); 00129 }else{ 00130 c->nBytes = nBytes; 00131 } 00132 c->iOffset = 0; /* start tokenizing at the beginning */ 00133 c->iToken = 0; 00134 c->pToken = NULL; /* no space allocated, yet. */ 00135 c->nTokenAllocated = 0; 00136 00137 *ppCursor = &c->base; 00138 return SQLITE_OK; 00139 } 00140 00141 /* 00142 ** Close a tokenization cursor previously opened by a call to 00143 ** simpleOpen() above. 00144 */ 00145 static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ 00146 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 00147 sqlite3_free(c->pToken); 00148 sqlite3_free(c); 00149 return SQLITE_OK; 00150 } 00151 00152 /* 00153 ** Extract the next token from a tokenization cursor. The cursor must 00154 ** have been opened by a prior call to simpleOpen(). 
00155 */ 00156 static int simpleNext( 00157 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ 00158 const char **ppToken, /* OUT: *ppToken is the token text */ 00159 int *pnBytes, /* OUT: Number of bytes in token */ 00160 int *piStartOffset, /* OUT: Starting offset of token */ 00161 int *piEndOffset, /* OUT: Ending offset of token */ 00162 int *piPosition /* OUT: Position integer of token */ 00163 ){ 00164 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 00165 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; 00166 unsigned char *p = (unsigned char *)c->pInput; 00167 00168 while( c->iOffset<c->nBytes ){ 00169 int iStartOffset; 00170 00171 /* Scan past delimiter characters */ 00172 while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){ 00173 c->iOffset++; 00174 } 00175 00176 /* Count non-delimiter characters. */ 00177 iStartOffset = c->iOffset; 00178 while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){ 00179 c->iOffset++; 00180 } 00181 00182 if( c->iOffset>iStartOffset ){ 00183 int i, n = c->iOffset-iStartOffset; 00184 if( n>c->nTokenAllocated ){ 00185 c->nTokenAllocated = n+20; 00186 c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated); 00187 if( c->pToken==NULL ) return SQLITE_NOMEM; 00188 } 00189 for(i=0; i<n; i++){ 00190 /* TODO(shess) This needs expansion to handle UTF-8 00191 ** case-insensitivity. 00192 */ 00193 unsigned char ch = p[iStartOffset+i]; 00194 c->pToken[i] = ch<0x80 ? 
tolower(ch) : ch; 00195 } 00196 *ppToken = c->pToken; 00197 *pnBytes = n; 00198 *piStartOffset = iStartOffset; 00199 *piEndOffset = c->iOffset; 00200 *piPosition = c->iToken++; 00201 00202 return SQLITE_OK; 00203 } 00204 } 00205 return SQLITE_DONE; 00206 } 00207 00208 /* 00209 ** The set of routines that implement the simple tokenizer 00210 */ 00211 static const sqlite3_tokenizer_module simpleTokenizerModule = { 00212 0, 00213 simpleCreate, 00214 simpleDestroy, 00215 simpleOpen, 00216 simpleClose, 00217 simpleNext, 00218 }; 00219 00220 /* 00221 ** Allocate a new simple tokenizer. Return a pointer to the new 00222 ** tokenizer in *ppModule 00223 */ 00224 void sqlite3Fts3SimpleTokenizerModule( 00225 sqlite3_tokenizer_module const**ppModule 00226 ){ 00227 *ppModule = &simpleTokenizerModule; 00228 } 00229 00230 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
/* ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1 */