fts2_icu.c

Go to the documentation of this file.
00001 /*
00002 ** 2007 June 22
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 *************************************************************************
00012 ** This file implements a tokenizer for fts2 based on the ICU library.
00013 ** 
00014 ** $Id: fts2_icu.c,v 1.2 2008/07/22 22:20:50 shess Exp $
00015 */
00016 
00017 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
00018 #ifdef SQLITE_ENABLE_ICU
00019 
00020 #include <assert.h>
00021 #include <string.h>
00022 #include "fts2_tokenizer.h"
00023 
00024 #include <unicode/ubrk.h>
00025 #include <unicode/ucol.h>
00026 #include <unicode/ustring.h>
00027 #include <unicode/utf16.h>
00028 
00029 typedef struct IcuTokenizer IcuTokenizer;
00030 typedef struct IcuCursor IcuCursor;
00031 
00032 struct IcuTokenizer {
00033   sqlite3_tokenizer base;
00034   char *zLocale;
00035 };
00036 
00037 struct IcuCursor {
00038   sqlite3_tokenizer_cursor base;
00039 
00040   UBreakIterator *pIter;      /* ICU break-iterator object */
00041   int nChar;                  /* Number of UChar elements in pInput */
00042   UChar *aChar;               /* Copy of input using utf-16 encoding */
00043   int *aOffset;               /* Offsets of each character in utf-8 input */
00044 
00045   int nBuffer;
00046   char *zBuffer;
00047 
00048   int iToken;
00049 };
00050 
00051 /*
00052 ** Create a new tokenizer instance.
00053 */
00054 static int icuCreate(
00055   int argc,                            /* Number of entries in argv[] */
00056   const char * const *argv,            /* Tokenizer creation arguments */
00057   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
00058 ){
00059   IcuTokenizer *p;
00060   int n = 0;
00061 
00062   if( argc>0 ){
00063     n = strlen(argv[0])+1;
00064   }
00065   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
00066   if( !p ){
00067     return SQLITE_NOMEM;
00068   }
00069   memset(p, 0, sizeof(IcuTokenizer));
00070 
00071   if( n ){
00072     p->zLocale = (char *)&p[1];
00073     memcpy(p->zLocale, argv[0], n);
00074   }
00075 
00076   *ppTokenizer = (sqlite3_tokenizer *)p;
00077 
00078   return SQLITE_OK;
00079 }
00080 
00081 /*
00082 ** Destroy a tokenizer
00083 */
00084 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
00085   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
00086   sqlite3_free(p);
00087   return SQLITE_OK;
00088 }
00089 
00090 /*
00091 ** Prepare to begin tokenizing a particular string.  The input
00092 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
00093 ** used to incrementally tokenize this string is returned in 
00094 ** *ppCursor.
00095 */
00096 static int icuOpen(
00097   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
00098   const char *zInput,                    /* Input string */
00099   int nInput,                            /* Length of zInput in bytes */
00100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
00101 ){
00102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
00103   IcuCursor *pCsr;
00104 
00105   const int32_t opt = U_FOLD_CASE_DEFAULT;
00106   UErrorCode status = U_ZERO_ERROR;
00107   int nChar;
00108 
00109   UChar32 c;
00110   int iInput = 0;
00111   int iOut = 0;
00112 
00113   *ppCursor = 0;
00114 
00115   if( -1 == nInput ) nInput = strlen(nInput);
00116   nChar = nInput+1;
00117   pCsr = (IcuCursor *)sqlite3_malloc(
00118       sizeof(IcuCursor) +                /* IcuCursor */
00119       nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
00120       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
00121   );
00122   if( !pCsr ){
00123     return SQLITE_NOMEM;
00124   }
00125   memset(pCsr, 0, sizeof(IcuCursor));
00126   pCsr->aChar = (UChar *)&pCsr[1];
00127   pCsr->aOffset = (int *)&pCsr->aChar[nChar];
00128 
00129   pCsr->aOffset[iOut] = iInput;
00130   U8_NEXT(zInput, iInput, nInput, c); 
00131   while( c>0 ){
00132     int isError = 0;
00133     c = u_foldCase(c, opt);
00134     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
00135     if( isError ){
00136       sqlite3_free(pCsr);
00137       return SQLITE_ERROR;
00138     }
00139     pCsr->aOffset[iOut] = iInput;
00140 
00141     if( iInput<nInput ){
00142       U8_NEXT(zInput, iInput, nInput, c);
00143     }else{
00144       c = 0;
00145     }
00146   }
00147 
00148   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
00149   if( !U_SUCCESS(status) ){
00150     sqlite3_free(pCsr);
00151     return SQLITE_ERROR;
00152   }
00153   pCsr->nChar = iOut;
00154 
00155   ubrk_first(pCsr->pIter);
00156   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
00157   return SQLITE_OK;
00158 }
00159 
00160 /*
00161 ** Close a tokenization cursor previously opened by a call to icuOpen().
00162 */
00163 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
00164   IcuCursor *pCsr = (IcuCursor *)pCursor;
00165   ubrk_close(pCsr->pIter);
00166   sqlite3_free(pCsr->zBuffer);
00167   sqlite3_free(pCsr);
00168   return SQLITE_OK;
00169 }
00170 
00171 /*
00172 ** Extract the next token from a tokenization cursor.
00173 */
00174 static int icuNext(
00175   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
00176   const char **ppToken,               /* OUT: *ppToken is the token text */
00177   int *pnBytes,                       /* OUT: Number of bytes in token */
00178   int *piStartOffset,                 /* OUT: Starting offset of token */
00179   int *piEndOffset,                   /* OUT: Ending offset of token */
00180   int *piPosition                     /* OUT: Position integer of token */
00181 ){
00182   IcuCursor *pCsr = (IcuCursor *)pCursor;
00183 
00184   int iStart = 0;
00185   int iEnd = 0;
00186   int nByte = 0;
00187 
00188   while( iStart==iEnd ){
00189     UChar32 c;
00190 
00191     iStart = ubrk_current(pCsr->pIter);
00192     iEnd = ubrk_next(pCsr->pIter);
00193     if( iEnd==UBRK_DONE ){
00194       return SQLITE_DONE;
00195     }
00196 
00197     while( iStart<iEnd ){
00198       int iWhite = iStart;
00199       U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
00200       if( u_isspace(c) ){
00201         iStart = iWhite;
00202       }else{
00203         break;
00204       }
00205     }
00206     assert(iStart<=iEnd);
00207   }
00208 
00209   do {
00210     UErrorCode status = U_ZERO_ERROR;
00211     if( nByte ){
00212       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
00213       if( !zNew ){
00214         return SQLITE_NOMEM;
00215       }
00216       pCsr->zBuffer = zNew;
00217       pCsr->nBuffer = nByte;
00218     }
00219 
00220     u_strToUTF8(
00221         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
00222         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
00223         &status                                  /* Output success/failure */
00224     );
00225   } while( nByte>pCsr->nBuffer );
00226 
00227   *ppToken = pCsr->zBuffer;
00228   *pnBytes = nByte;
00229   *piStartOffset = pCsr->aOffset[iStart];
00230   *piEndOffset = pCsr->aOffset[iEnd];
00231   *piPosition = pCsr->iToken++;
00232 
00233   return SQLITE_OK;
00234 }
00235 
00236 /*
00237 ** The set of routines that implement the ICU tokenizer
00238 */
00239 static const sqlite3_tokenizer_module icuTokenizerModule = {
00240   0,                           /* iVersion */
00241   icuCreate,                   /* xCreate  */
00242   icuDestroy,                  /* xDestroy */
00243   icuOpen,                     /* xOpen    */
00244   icuClose,                    /* xClose   */
00245   icuNext,                     /* xNext    */
00246 };
00247 
00248 /*
00249 ** Set *ppModule to point at the implementation of the ICU tokenizer.
00250 */
00251 void sqlite3Fts2IcuTokenizerModule(
00252   sqlite3_tokenizer_module const**ppModule
00253 ){
00254   *ppModule = &icuTokenizerModule;
00255 }
00256 
00257 #endif /* defined(SQLITE_ENABLE_ICU) */
00258 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1