00001 /* 00002 ** 2007 June 22 00003 ** 00004 ** The author disclaims copyright to this source code. In place of 00005 ** a legal notice, here is a blessing: 00006 ** 00007 ** May you do good and not evil. 00008 ** May you find forgiveness for yourself and forgive others. 00009 ** May you share freely, never taking more than you give. 00010 ** 00011 ************************************************************************* 00012 ** This file implements a tokenizer for fts2 based on the ICU library. 00013 ** 00014 ** $Id: fts2_icu.c,v 1.2 2008/07/22 22:20:50 shess Exp $ 00015 */ 00016 00017 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) 00018 #ifdef SQLITE_ENABLE_ICU 00019 00020 #include <assert.h> 00021 #include <string.h> 00022 #include "fts2_tokenizer.h" 00023 00024 #include <unicode/ubrk.h> 00025 #include <unicode/ucol.h> 00026 #include <unicode/ustring.h> 00027 #include <unicode/utf16.h> 00028 00029 typedef struct IcuTokenizer IcuTokenizer; 00030 typedef struct IcuCursor IcuCursor; 00031 00032 struct IcuTokenizer { 00033 sqlite3_tokenizer base; 00034 char *zLocale; 00035 }; 00036 00037 struct IcuCursor { 00038 sqlite3_tokenizer_cursor base; 00039 00040 UBreakIterator *pIter; /* ICU break-iterator object */ 00041 int nChar; /* Number of UChar elements in pInput */ 00042 UChar *aChar; /* Copy of input using utf-16 encoding */ 00043 int *aOffset; /* Offsets of each character in utf-8 input */ 00044 00045 int nBuffer; 00046 char *zBuffer; 00047 00048 int iToken; 00049 }; 00050 00051 /* 00052 ** Create a new tokenizer instance. 
00053 */ 00054 static int icuCreate( 00055 int argc, /* Number of entries in argv[] */ 00056 const char * const *argv, /* Tokenizer creation arguments */ 00057 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ 00058 ){ 00059 IcuTokenizer *p; 00060 int n = 0; 00061 00062 if( argc>0 ){ 00063 n = strlen(argv[0])+1; 00064 } 00065 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); 00066 if( !p ){ 00067 return SQLITE_NOMEM; 00068 } 00069 memset(p, 0, sizeof(IcuTokenizer)); 00070 00071 if( n ){ 00072 p->zLocale = (char *)&p[1]; 00073 memcpy(p->zLocale, argv[0], n); 00074 } 00075 00076 *ppTokenizer = (sqlite3_tokenizer *)p; 00077 00078 return SQLITE_OK; 00079 } 00080 00081 /* 00082 ** Destroy a tokenizer 00083 */ 00084 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ 00085 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 00086 sqlite3_free(p); 00087 return SQLITE_OK; 00088 } 00089 00090 /* 00091 ** Prepare to begin tokenizing a particular string. The input 00092 ** string to be tokenized is pInput[0..nBytes-1]. A cursor 00093 ** used to incrementally tokenize this string is returned in 00094 ** *ppCursor. 
00095 */ 00096 static int icuOpen( 00097 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 00098 const char *zInput, /* Input string */ 00099 int nInput, /* Length of zInput in bytes */ 00100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 00101 ){ 00102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 00103 IcuCursor *pCsr; 00104 00105 const int32_t opt = U_FOLD_CASE_DEFAULT; 00106 UErrorCode status = U_ZERO_ERROR; 00107 int nChar; 00108 00109 UChar32 c; 00110 int iInput = 0; 00111 int iOut = 0; 00112 00113 *ppCursor = 0; 00114 00115 if( -1 == nInput ) nInput = strlen(nInput); 00116 nChar = nInput+1; 00117 pCsr = (IcuCursor *)sqlite3_malloc( 00118 sizeof(IcuCursor) + /* IcuCursor */ 00119 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ 00120 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ 00121 ); 00122 if( !pCsr ){ 00123 return SQLITE_NOMEM; 00124 } 00125 memset(pCsr, 0, sizeof(IcuCursor)); 00126 pCsr->aChar = (UChar *)&pCsr[1]; 00127 pCsr->aOffset = (int *)&pCsr->aChar[nChar]; 00128 00129 pCsr->aOffset[iOut] = iInput; 00130 U8_NEXT(zInput, iInput, nInput, c); 00131 while( c>0 ){ 00132 int isError = 0; 00133 c = u_foldCase(c, opt); 00134 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); 00135 if( isError ){ 00136 sqlite3_free(pCsr); 00137 return SQLITE_ERROR; 00138 } 00139 pCsr->aOffset[iOut] = iInput; 00140 00141 if( iInput<nInput ){ 00142 U8_NEXT(zInput, iInput, nInput, c); 00143 }else{ 00144 c = 0; 00145 } 00146 } 00147 00148 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); 00149 if( !U_SUCCESS(status) ){ 00150 sqlite3_free(pCsr); 00151 return SQLITE_ERROR; 00152 } 00153 pCsr->nChar = iOut; 00154 00155 ubrk_first(pCsr->pIter); 00156 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; 00157 return SQLITE_OK; 00158 } 00159 00160 /* 00161 ** Close a tokenization cursor previously opened by a call to icuOpen(). 
00162 */ 00163 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ 00164 IcuCursor *pCsr = (IcuCursor *)pCursor; 00165 ubrk_close(pCsr->pIter); 00166 sqlite3_free(pCsr->zBuffer); 00167 sqlite3_free(pCsr); 00168 return SQLITE_OK; 00169 } 00170 00171 /* 00172 ** Extract the next token from a tokenization cursor. 00173 */ 00174 static int icuNext( 00175 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ 00176 const char **ppToken, /* OUT: *ppToken is the token text */ 00177 int *pnBytes, /* OUT: Number of bytes in token */ 00178 int *piStartOffset, /* OUT: Starting offset of token */ 00179 int *piEndOffset, /* OUT: Ending offset of token */ 00180 int *piPosition /* OUT: Position integer of token */ 00181 ){ 00182 IcuCursor *pCsr = (IcuCursor *)pCursor; 00183 00184 int iStart = 0; 00185 int iEnd = 0; 00186 int nByte = 0; 00187 00188 while( iStart==iEnd ){ 00189 UChar32 c; 00190 00191 iStart = ubrk_current(pCsr->pIter); 00192 iEnd = ubrk_next(pCsr->pIter); 00193 if( iEnd==UBRK_DONE ){ 00194 return SQLITE_DONE; 00195 } 00196 00197 while( iStart<iEnd ){ 00198 int iWhite = iStart; 00199 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); 00200 if( u_isspace(c) ){ 00201 iStart = iWhite; 00202 }else{ 00203 break; 00204 } 00205 } 00206 assert(iStart<=iEnd); 00207 } 00208 00209 do { 00210 UErrorCode status = U_ZERO_ERROR; 00211 if( nByte ){ 00212 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); 00213 if( !zNew ){ 00214 return SQLITE_NOMEM; 00215 } 00216 pCsr->zBuffer = zNew; 00217 pCsr->nBuffer = nByte; 00218 } 00219 00220 u_strToUTF8( 00221 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ 00222 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ 00223 &status /* Output success/failure */ 00224 ); 00225 } while( nByte>pCsr->nBuffer ); 00226 00227 *ppToken = pCsr->zBuffer; 00228 *pnBytes = nByte; 00229 *piStartOffset = pCsr->aOffset[iStart]; 00230 *piEndOffset = pCsr->aOffset[iEnd]; 00231 *piPosition = pCsr->iToken++; 00232 00233 return SQLITE_OK; 
00234 } 00235 00236 /* 00237 ** The set of routines that implement the simple tokenizer 00238 */ 00239 static const sqlite3_tokenizer_module icuTokenizerModule = { 00240 0, /* iVersion */ 00241 icuCreate, /* xCreate */ 00242 icuDestroy, /* xCreate */ 00243 icuOpen, /* xOpen */ 00244 icuClose, /* xClose */ 00245 icuNext, /* xNext */ 00246 }; 00247 00248 /* 00249 ** Set *ppModule to point at the implementation of the ICU tokenizer. 00250 */ 00251 void sqlite3Fts2IcuTokenizerModule( 00252 sqlite3_tokenizer_module const**ppModule 00253 ){ 00254 *ppModule = &icuTokenizerModule; 00255 } 00256 00257 #endif /* defined(SQLITE_ENABLE_ICU) */ 00258 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
/* ContextLogger2 — ContextLogger2 Logger Daemon Internals — Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1 */