00001 /* 00002 ** 2007 June 22 00003 ** 00004 ** The author disclaims copyright to this source code. In place of 00005 ** a legal notice, here is a blessing: 00006 ** 00007 ** May you do good and not evil. 00008 ** May you find forgiveness for yourself and forgive others. 00009 ** May you share freely, never taking more than you give. 00010 ** 00011 ************************************************************************* 00012 ** This file implements a tokenizer for fts3 based on the ICU library. 00013 ** 00014 ** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $ 00015 */ 00016 00017 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 00018 #ifdef SQLITE_ENABLE_ICU 00019 00020 #include <assert.h> 00021 #include <string.h> 00022 #include "fts3_tokenizer.h" 00023 00024 #include <unicode/ubrk.h> 00025 #include <unicode/ucol.h> 00026 #include <unicode/ustring.h> 00027 #include <unicode/utf16.h> 00028 00029 typedef struct IcuTokenizer IcuTokenizer; 00030 typedef struct IcuCursor IcuCursor; 00031 /* Tokenizer object: the generic base followed by the optional locale name. */ 00032 struct IcuTokenizer { 00033 sqlite3_tokenizer base; 00034 char *zLocale; /* Copy of argv[0] made by icuCreate() and passed to ubrk_open(); NULL if no argument was given */ 00035 }; 00036 /* Cursor object: state for tokenizing one input string. */ 00037 struct IcuCursor { 00038 sqlite3_tokenizer_cursor base; 00039 00040 UBreakIterator *pIter; /* ICU break-iterator object */ 00041 int nChar; /* Number of UChar elements stored in aChar[] */ 00042 UChar *aChar; /* Copy of input using utf-16 encoding */ 00043 int *aOffset; /* Offsets of each character in utf-8 input */ 00044 00045 int nBuffer; /* Size in bytes of the allocation at zBuffer */ 00046 char *zBuffer; /* Buffer holding the utf-8 text of the token returned by icuNext() */ 00047 00048 int iToken; /* Position value reported for the next token returned by icuNext() */ 00049 }; 00050 00051 /* 00052 ** Create a new tokenizer instance. 
00053 */ 00054 static int icuCreate( 00055 int argc, /* Number of entries in argv[] */ 00056 const char * const *argv, /* Tokenizer creation arguments */ 00057 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ 00058 ){ 00059 IcuTokenizer *p; 00060 int n = 0; 00061 00062 if( argc>0 ){ 00063 n = strlen(argv[0])+1; 00064 } 00065 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); 00066 if( !p ){ 00067 return SQLITE_NOMEM; 00068 } 00069 memset(p, 0, sizeof(IcuTokenizer)); 00070 00071 if( n ){ 00072 p->zLocale = (char *)&p[1]; 00073 memcpy(p->zLocale, argv[0], n); 00074 } 00075 00076 *ppTokenizer = (sqlite3_tokenizer *)p; 00077 00078 return SQLITE_OK; 00079 } 00080 00081 /* 00082 ** Destroy a tokenizer 00083 */ 00084 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ 00085 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 00086 sqlite3_free(p); 00087 return SQLITE_OK; 00088 } 00089 00090 /* 00091 ** Prepare to begin tokenizing a particular string. The input 00092 ** string to be tokenized is pInput[0..nBytes-1]. A cursor 00093 ** used to incrementally tokenize this string is returned in 00094 ** *ppCursor. 
00095 */ 00096 static int icuOpen( 00097 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 00098 const char *zInput, /* Input string */ 00099 int nInput, /* Length of zInput in bytes */ 00100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 00101 ){ 00102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 00103 IcuCursor *pCsr; 00104 00105 const int32_t opt = U_FOLD_CASE_DEFAULT; 00106 UErrorCode status = U_ZERO_ERROR; 00107 int nChar; 00108 00109 UChar32 c; 00110 int iInput = 0; 00111 int iOut = 0; 00112 00113 *ppCursor = 0; 00114 00115 if( nInput<0 ){ 00116 nInput = strlen(zInput); 00117 } 00118 nChar = nInput+1; 00119 pCsr = (IcuCursor *)sqlite3_malloc( 00120 sizeof(IcuCursor) + /* IcuCursor */ 00121 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ 00122 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ 00123 ); 00124 if( !pCsr ){ 00125 return SQLITE_NOMEM; 00126 } 00127 memset(pCsr, 0, sizeof(IcuCursor)); 00128 pCsr->aChar = (UChar *)&pCsr[1]; 00129 pCsr->aOffset = (int *)&pCsr->aChar[nChar]; 00130 00131 pCsr->aOffset[iOut] = iInput; 00132 U8_NEXT(zInput, iInput, nInput, c); 00133 while( c>0 ){ 00134 int isError = 0; 00135 c = u_foldCase(c, opt); 00136 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); 00137 if( isError ){ 00138 sqlite3_free(pCsr); 00139 return SQLITE_ERROR; 00140 } 00141 pCsr->aOffset[iOut] = iInput; 00142 00143 if( iInput<nInput ){ 00144 U8_NEXT(zInput, iInput, nInput, c); 00145 }else{ 00146 c = 0; 00147 } 00148 } 00149 00150 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); 00151 if( !U_SUCCESS(status) ){ 00152 sqlite3_free(pCsr); 00153 return SQLITE_ERROR; 00154 } 00155 pCsr->nChar = iOut; 00156 00157 ubrk_first(pCsr->pIter); 00158 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; 00159 return SQLITE_OK; 00160 } 00161 00162 /* 00163 ** Close a tokenization cursor previously opened by a call to icuOpen(). 
00164 */ 00165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ 00166 IcuCursor *pCsr = (IcuCursor *)pCursor; 00167 ubrk_close(pCsr->pIter); 00168 sqlite3_free(pCsr->zBuffer); 00169 sqlite3_free(pCsr); 00170 return SQLITE_OK; 00171 } 00172 00173 /* 00174 ** Extract the next token from a tokenization cursor. 00175 */ 00176 static int icuNext( 00177 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ 00178 const char **ppToken, /* OUT: *ppToken is the token text */ 00179 int *pnBytes, /* OUT: Number of bytes in token */ 00180 int *piStartOffset, /* OUT: Starting offset of token */ 00181 int *piEndOffset, /* OUT: Ending offset of token */ 00182 int *piPosition /* OUT: Position integer of token */ 00183 ){ 00184 IcuCursor *pCsr = (IcuCursor *)pCursor; 00185 00186 int iStart = 0; 00187 int iEnd = 0; 00188 int nByte = 0; 00189 00190 while( iStart==iEnd ){ 00191 UChar32 c; 00192 00193 iStart = ubrk_current(pCsr->pIter); 00194 iEnd = ubrk_next(pCsr->pIter); 00195 if( iEnd==UBRK_DONE ){ 00196 return SQLITE_DONE; 00197 } 00198 00199 while( iStart<iEnd ){ 00200 int iWhite = iStart; 00201 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); 00202 if( u_isspace(c) ){ 00203 iStart = iWhite; 00204 }else{ 00205 break; 00206 } 00207 } 00208 assert(iStart<=iEnd); 00209 } 00210 00211 do { 00212 UErrorCode status = U_ZERO_ERROR; 00213 if( nByte ){ 00214 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); 00215 if( !zNew ){ 00216 return SQLITE_NOMEM; 00217 } 00218 pCsr->zBuffer = zNew; 00219 pCsr->nBuffer = nByte; 00220 } 00221 00222 u_strToUTF8( 00223 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ 00224 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ 00225 &status /* Output success/failure */ 00226 ); 00227 } while( nByte>pCsr->nBuffer ); 00228 00229 *ppToken = pCsr->zBuffer; 00230 *pnBytes = nByte; 00231 *piStartOffset = pCsr->aOffset[iStart]; 00232 *piEndOffset = pCsr->aOffset[iEnd]; 00233 *piPosition = pCsr->iToken++; 00234 00235 return SQLITE_OK; 
00236 } 00237 00238 /* 00239 ** The set of routines that implement the simple tokenizer 00240 */ 00241 static const sqlite3_tokenizer_module icuTokenizerModule = { 00242 0, /* iVersion */ 00243 icuCreate, /* xCreate */ 00244 icuDestroy, /* xCreate */ 00245 icuOpen, /* xOpen */ 00246 icuClose, /* xClose */ 00247 icuNext, /* xNext */ 00248 }; 00249 00250 /* 00251 ** Set *ppModule to point at the implementation of the ICU tokenizer. 00252 */ 00253 void sqlite3Fts3IcuTokenizerModule( 00254 sqlite3_tokenizer_module const**ppModule 00255 ){ 00256 *ppModule = &icuTokenizerModule; 00257 } 00258 00259 #endif /* defined(SQLITE_ENABLE_ICU) */ 00260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1