fts3_icu.c

Go to the documentation of this file.
00001 /*
00002 ** 2007 June 22
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 *************************************************************************
00012 ** This file implements a tokenizer for fts3 based on the ICU library.
00013 ** 
00014 ** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $
00015 */
00016 
00017 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
00018 #ifdef SQLITE_ENABLE_ICU
00019 
00020 #include <assert.h>
00021 #include <string.h>
00022 #include "fts3_tokenizer.h"
00023 
00024 #include <unicode/ubrk.h>
00025 #include <unicode/ucol.h>
00026 #include <unicode/ustring.h>
00027 #include <unicode/utf16.h>
00028 
00029 typedef struct IcuTokenizer IcuTokenizer;   /* Tokenizer instance (base object + locale) */
00030 typedef struct IcuCursor IcuCursor;         /* Cursor used to tokenize one input string */
00031 
00032 struct IcuTokenizer {         /* Allocated by icuCreate(), released by icuDestroy() */
00033   sqlite3_tokenizer base;     /* Base object; first member so the struct can be cast to sqlite3_tokenizer* */
00034   char *zLocale;              /* Copy of argv[0] stored right after the struct and passed to ubrk_open(); NULL if argc==0 */
00035 };
00036 
00037 struct IcuCursor {            /* Allocated by icuOpen(), released by icuClose() */
00038   sqlite3_tokenizer_cursor base;  /* Base object; first member so the struct can be cast to sqlite3_tokenizer_cursor* */
00039 
00040   UBreakIterator *pIter;      /* ICU break-iterator object */
00041   int nChar;                  /* Number of UChar elements used in aChar[] */
00042   UChar *aChar;               /* Case-folded copy of input using utf-16 encoding */
00043   int *aOffset;               /* aOffset[i] is the byte offset in the utf-8 input at which aChar[i] starts */
00044 
00045   int nBuffer;                /* Size in bytes of the allocation at zBuffer */
00046   char *zBuffer;              /* utf-8 text of the token most recently returned by icuNext() */
00047 
00048   int iToken;                 /* Position reported for the next token; incremented by icuNext() */
00049 };
00050 
00051 /*
00052 ** Create a new tokenizer instance.
00053 */
00054 static int icuCreate(
00055   int argc,                            /* Number of entries in argv[] */
00056   const char * const *argv,            /* Tokenizer creation arguments */
00057   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
00058 ){
00059   IcuTokenizer *p;
00060   int n = 0;
00061 
00062   if( argc>0 ){
00063     n = strlen(argv[0])+1;
00064   }
00065   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
00066   if( !p ){
00067     return SQLITE_NOMEM;
00068   }
00069   memset(p, 0, sizeof(IcuTokenizer));
00070 
00071   if( n ){
00072     p->zLocale = (char *)&p[1];
00073     memcpy(p->zLocale, argv[0], n);
00074   }
00075 
00076   *ppTokenizer = (sqlite3_tokenizer *)p;
00077 
00078   return SQLITE_OK;
00079 }
00080 
00081 /*
00082 ** Destroy a tokenizer
00083 */
00084 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
00085   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
00086   sqlite3_free(p);
00087   return SQLITE_OK;
00088 }
00089 
00090 /*
00091 ** Prepare to begin tokenizing a particular string.  The input
00092 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
00093 ** used to incrementally tokenize this string is returned in 
00094 ** *ppCursor.
00095 */
00096 static int icuOpen(
00097   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
00098   const char *zInput,                    /* Input string */
00099   int nInput,                            /* Length of zInput in bytes */
00100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
00101 ){
00102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
00103   IcuCursor *pCsr;
00104 
00105   const int32_t opt = U_FOLD_CASE_DEFAULT;
00106   UErrorCode status = U_ZERO_ERROR;
00107   int nChar;
00108 
00109   UChar32 c;
00110   int iInput = 0;
00111   int iOut = 0;
00112 
00113   *ppCursor = 0;
00114 
00115   if( nInput<0 ){
00116     nInput = strlen(zInput);
00117   }
00118   nChar = nInput+1;
00119   pCsr = (IcuCursor *)sqlite3_malloc(
00120       sizeof(IcuCursor) +                /* IcuCursor */
00121       nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
00122       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
00123   );
00124   if( !pCsr ){
00125     return SQLITE_NOMEM;
00126   }
00127   memset(pCsr, 0, sizeof(IcuCursor));
00128   pCsr->aChar = (UChar *)&pCsr[1];
00129   pCsr->aOffset = (int *)&pCsr->aChar[nChar];
00130 
00131   pCsr->aOffset[iOut] = iInput;
00132   U8_NEXT(zInput, iInput, nInput, c); 
00133   while( c>0 ){
00134     int isError = 0;
00135     c = u_foldCase(c, opt);
00136     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
00137     if( isError ){
00138       sqlite3_free(pCsr);
00139       return SQLITE_ERROR;
00140     }
00141     pCsr->aOffset[iOut] = iInput;
00142 
00143     if( iInput<nInput ){
00144       U8_NEXT(zInput, iInput, nInput, c);
00145     }else{
00146       c = 0;
00147     }
00148   }
00149 
00150   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
00151   if( !U_SUCCESS(status) ){
00152     sqlite3_free(pCsr);
00153     return SQLITE_ERROR;
00154   }
00155   pCsr->nChar = iOut;
00156 
00157   ubrk_first(pCsr->pIter);
00158   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
00159   return SQLITE_OK;
00160 }
00161 
00162 /*
00163 ** Close a tokenization cursor previously opened by a call to icuOpen().
00164 */
00165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
00166   IcuCursor *pCsr = (IcuCursor *)pCursor;
00167   ubrk_close(pCsr->pIter);
00168   sqlite3_free(pCsr->zBuffer);
00169   sqlite3_free(pCsr);
00170   return SQLITE_OK;
00171 }
00172 
00173 /*
00174 ** Extract the next token from a tokenization cursor.
00175 */
00176 static int icuNext(
00177   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
00178   const char **ppToken,               /* OUT: *ppToken is the token text */
00179   int *pnBytes,                       /* OUT: Number of bytes in token */
00180   int *piStartOffset,                 /* OUT: Starting offset of token */
00181   int *piEndOffset,                   /* OUT: Ending offset of token */
00182   int *piPosition                     /* OUT: Position integer of token */
00183 ){
00184   IcuCursor *pCsr = (IcuCursor *)pCursor;
00185 
00186   int iStart = 0;
00187   int iEnd = 0;
00188   int nByte = 0;
00189 
00190   while( iStart==iEnd ){
00191     UChar32 c;
00192 
00193     iStart = ubrk_current(pCsr->pIter);
00194     iEnd = ubrk_next(pCsr->pIter);
00195     if( iEnd==UBRK_DONE ){
00196       return SQLITE_DONE;
00197     }
00198 
00199     while( iStart<iEnd ){
00200       int iWhite = iStart;
00201       U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
00202       if( u_isspace(c) ){
00203         iStart = iWhite;
00204       }else{
00205         break;
00206       }
00207     }
00208     assert(iStart<=iEnd);
00209   }
00210 
00211   do {
00212     UErrorCode status = U_ZERO_ERROR;
00213     if( nByte ){
00214       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
00215       if( !zNew ){
00216         return SQLITE_NOMEM;
00217       }
00218       pCsr->zBuffer = zNew;
00219       pCsr->nBuffer = nByte;
00220     }
00221 
00222     u_strToUTF8(
00223         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
00224         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
00225         &status                                  /* Output success/failure */
00226     );
00227   } while( nByte>pCsr->nBuffer );
00228 
00229   *ppToken = pCsr->zBuffer;
00230   *pnBytes = nByte;
00231   *piStartOffset = pCsr->aOffset[iStart];
00232   *piEndOffset = pCsr->aOffset[iEnd];
00233   *piPosition = pCsr->iToken++;
00234 
00235   return SQLITE_OK;
00236 }
00237 
00238 /*
00239 ** The set of routines that implement the ICU tokenizer
00240 */
00241 static const sqlite3_tokenizer_module icuTokenizerModule = {
00242   0,                           /* iVersion */
00243   icuCreate,                   /* xCreate  */
00244   icuDestroy,                  /* xDestroy */
00245   icuOpen,                     /* xOpen    */
00246   icuClose,                    /* xClose   */
00247   icuNext,                     /* xNext    */
00248 };
00249 
00250 /*
00251 ** Set *ppModule to point at the implementation of the ICU tokenizer.
00252 */
00253 void sqlite3Fts3IcuTokenizerModule(
00254   sqlite3_tokenizer_module const**ppModule
00255 ){
00256   *ppModule = &icuTokenizerModule;
00257 }
00258 
00259 #endif /* defined(SQLITE_ENABLE_ICU) */
00260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1