icu.c

Go to the documentation of this file.
00001 /*
00002 ** 2007 May 6
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 *************************************************************************
00012 ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
00013 **
00014 ** This file implements an integration between the ICU library 
00015 ** ("International Components for Unicode", an open-source library 
00016 ** for handling unicode data) and SQLite. The integration uses 
00017 ** ICU to provide the following to SQLite:
00018 **
00019 **   * An implementation of the SQL regexp() function (and hence REGEXP
00020 **     operator) using the ICU uregex_XX() APIs.
00021 **
00022 **   * Implementations of the SQL scalar upper() and lower() functions
00023 **     for case mapping.
00024 **
00025 **   * Integration of ICU and SQLite collation sequences.
00026 **
00027 **   * An implementation of the LIKE operator that uses ICU to 
00028 **     provide case-independent matching.
00029 */
00030 
00031 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
00032 
00033 /* Include ICU headers */
00034 #include <unicode/utypes.h>
00035 #include <unicode/uregex.h>
00036 #include <unicode/ustring.h>
00037 #include <unicode/ucol.h>
00038 
00039 #include <assert.h>
00040 
00041 #ifndef SQLITE_CORE
00042   #include "sqlite3ext.h"
00043   SQLITE_EXTENSION_INIT1
00044 #else
00045   #include "sqlite3.h"
00046 #endif
00047 
00048 /*
00049 ** Maximum length (in bytes) of the pattern in a LIKE or GLOB
00050 ** operator.
00051 */
00052 #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
00053 # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
00054 #endif
00055 
/*
** Release memory through sqlite3_free().  This wrapper is guaranteed to
** be a real function (never a macro), so its address can be handed to
** SQLite as a destructor callback.
*/
static void xFree(void *p){
  sqlite3_free(p);
}
00062 
00063 /*
00064 ** Compare two UTF-8 strings for equality where the first string is
00065 ** a "LIKE" expression. Return true (1) if they are the same and 
00066 ** false (0) if they are different.
00067 */
00068 static int icuLikeCompare(
00069   const uint8_t *zPattern,   /* LIKE pattern */
00070   const uint8_t *zString,    /* The UTF-8 string to compare against */
00071   const UChar32 uEsc         /* The escape character */
00072 ){
00073   static const int MATCH_ONE = (UChar32)'_';
00074   static const int MATCH_ALL = (UChar32)'%';
00075 
00076   int iPattern = 0;       /* Current byte index in zPattern */
00077   int iString = 0;        /* Current byte index in zString */
00078 
00079   int prevEscape = 0;     /* True if the previous character was uEsc */
00080 
00081   while( zPattern[iPattern]!=0 ){
00082 
00083     /* Read (and consume) the next character from the input pattern. */
00084     UChar32 uPattern;
00085     U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
00086     assert(uPattern!=0);
00087 
00088     /* There are now 4 possibilities:
00089     **
00090     **     1. uPattern is an unescaped match-all character "%",
00091     **     2. uPattern is an unescaped match-one character "_",
00092     **     3. uPattern is an unescaped escape character, or
00093     **     4. uPattern is to be handled as an ordinary character
00094     */
00095     if( !prevEscape && uPattern==MATCH_ALL ){
00096       /* Case 1. */
00097       uint8_t c;
00098 
00099       /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
00100       ** MATCH_ALL. For each MATCH_ONE, skip one character in the 
00101       ** test string.
00102       */
00103       while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
00104         if( c==MATCH_ONE ){
00105           if( zString[iString]==0 ) return 0;
00106           U8_FWD_1_UNSAFE(zString, iString);
00107         }
00108         iPattern++;
00109       }
00110 
00111       if( zPattern[iPattern]==0 ) return 1;
00112 
00113       while( zString[iString] ){
00114         if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
00115           return 1;
00116         }
00117         U8_FWD_1_UNSAFE(zString, iString);
00118       }
00119       return 0;
00120 
00121     }else if( !prevEscape && uPattern==MATCH_ONE ){
00122       /* Case 2. */
00123       if( zString[iString]==0 ) return 0;
00124       U8_FWD_1_UNSAFE(zString, iString);
00125 
00126     }else if( !prevEscape && uPattern==uEsc){
00127       /* Case 3. */
00128       prevEscape = 1;
00129 
00130     }else{
00131       /* Case 4. */
00132       UChar32 uString;
00133       U8_NEXT_UNSAFE(zString, iString, uString);
00134       uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
00135       uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
00136       if( uString!=uPattern ){
00137         return 0;
00138       }
00139       prevEscape = 0;
00140     }
00141   }
00142 
00143   return zString[iString]==0;
00144 }
00145 
00146 /*
00147 ** Implementation of the like() SQL function.  This function implements
00148 ** the build-in LIKE operator.  The first argument to the function is the
00149 ** pattern and the second argument is the string.  So, the SQL statements:
00150 **
00151 **       A LIKE B
00152 **
00153 ** is implemented as like(B, A). If there is an escape character E, 
00154 **
00155 **       A LIKE B ESCAPE E
00156 **
00157 ** is mapped to like(B, A, E).
00158 */
00159 static void icuLikeFunc(
00160   sqlite3_context *context, 
00161   int argc, 
00162   sqlite3_value **argv
00163 ){
00164   const unsigned char *zA = sqlite3_value_text(argv[0]);
00165   const unsigned char *zB = sqlite3_value_text(argv[1]);
00166   UChar32 uEsc = 0;
00167 
00168   /* Limit the length of the LIKE or GLOB pattern to avoid problems
00169   ** of deep recursion and N*N behavior in patternCompare().
00170   */
00171   if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
00172     sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
00173     return;
00174   }
00175 
00176 
00177   if( argc==3 ){
00178     /* The escape character string must consist of a single UTF-8 character.
00179     ** Otherwise, return an error.
00180     */
00181     int nE= sqlite3_value_bytes(argv[2]);
00182     const unsigned char *zE = sqlite3_value_text(argv[2]);
00183     int i = 0;
00184     if( zE==0 ) return;
00185     U8_NEXT(zE, i, nE, uEsc);
00186     if( i!=nE){
00187       sqlite3_result_error(context, 
00188           "ESCAPE expression must be a single character", -1);
00189       return;
00190     }
00191   }
00192 
00193   if( zA && zB ){
00194     sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
00195   }
00196 }
00197 
00198 /*
00199 ** This function is called when an ICU function called from within
00200 ** the implementation of an SQL scalar function returns an error.
00201 **
00202 ** The scalar function context passed as the first argument is 
00203 ** loaded with an error message based on the following two args.
00204 */
00205 static void icuFunctionError(
00206   sqlite3_context *pCtx,       /* SQLite scalar function context */
00207   const char *zName,           /* Name of ICU function that failed */
00208   UErrorCode e                 /* Error code returned by ICU function */
00209 ){
00210   char zBuf[128];
00211   sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
00212   zBuf[127] = '\0';
00213   sqlite3_result_error(pCtx, zBuf, -1);
00214 }
00215 
00216 /*
00217 ** Function to delete compiled regexp objects. Registered as
00218 ** a destructor function with sqlite3_set_auxdata().
00219 */
00220 static void icuRegexpDelete(void *p){
00221   URegularExpression *pExpr = (URegularExpression *)p;
00222   uregex_close(pExpr);
00223 }
00224 
00225 /*
00226 ** Implementation of SQLite REGEXP operator. This scalar function takes
00227 ** two arguments. The first is a regular expression pattern to compile
00228 ** the second is a string to match against that pattern. If either 
00229 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
00230 ** is 1 if the string matches the pattern, or 0 otherwise.
00231 **
00232 ** SQLite maps the regexp() function to the regexp() operator such
00233 ** that the following two are equivalent:
00234 **
00235 **     zString REGEXP zPattern
00236 **     regexp(zPattern, zString)
00237 **
00238 ** Uses the following ICU regexp APIs:
00239 **
00240 **     uregex_open()
00241 **     uregex_matches()
00242 **     uregex_close()
00243 */
00244 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
00245   UErrorCode status = U_ZERO_ERROR;
00246   URegularExpression *pExpr;
00247   UBool res;
00248   const UChar *zString = sqlite3_value_text16(apArg[1]);
00249 
00250   /* If the left hand side of the regexp operator is NULL, 
00251   ** then the result is also NULL. 
00252   */
00253   if( !zString ){
00254     return;
00255   }
00256 
00257   pExpr = sqlite3_get_auxdata(p, 0);
00258   if( !pExpr ){
00259     const UChar *zPattern = sqlite3_value_text16(apArg[0]);
00260     if( !zPattern ){
00261       return;
00262     }
00263     pExpr = uregex_open(zPattern, -1, 0, 0, &status);
00264 
00265     if( U_SUCCESS(status) ){
00266       sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
00267     }else{
00268       assert(!pExpr);
00269       icuFunctionError(p, "uregex_open", status);
00270       return;
00271     }
00272   }
00273 
00274   /* Configure the text that the regular expression operates on. */
00275   uregex_setText(pExpr, zString, -1, &status);
00276   if( !U_SUCCESS(status) ){
00277     icuFunctionError(p, "uregex_setText", status);
00278     return;
00279   }
00280 
00281   /* Attempt the match */
00282   res = uregex_matches(pExpr, 0, &status);
00283   if( !U_SUCCESS(status) ){
00284     icuFunctionError(p, "uregex_matches", status);
00285     return;
00286   }
00287 
00288   /* Set the text that the regular expression operates on to a NULL
00289   ** pointer. This is not really necessary, but it is tidier than 
00290   ** leaving the regular expression object configured with an invalid
00291   ** pointer after this function returns.
00292   */
00293   uregex_setText(pExpr, 0, 0, &status);
00294 
00295   /* Return 1 or 0. */
00296   sqlite3_result_int(p, res ? 1 : 0);
00297 }
00298 
00299 /*
00300 ** Implementations of scalar functions for case mapping - upper() and 
00301 ** lower(). Function upper() converts its input to upper-case (ABC).
00302 ** Function lower() converts to lower-case (abc).
00303 **
00304 ** ICU provides two types of case mapping, "general" case mapping and
00305 ** "language specific". Refer to ICU documentation for the differences
00306 ** between the two.
00307 **
00308 ** To utilise "general" case mapping, the upper() or lower() scalar 
00309 ** functions are invoked with one argument:
00310 **
00311 **     upper('ABC') -> 'abc'
00312 **     lower('abc') -> 'ABC'
00313 **
00314 ** To access ICU "language specific" case mapping, upper() or lower()
00315 ** should be invoked with two arguments. The second argument is the name
00316 ** of the locale to use. Passing an empty string ("") or SQL NULL value
00317 ** as the second argument is the same as invoking the 1 argument version
00318 ** of upper() or lower().
00319 **
00320 **     lower('I', 'en_us') -> 'i'
00321 **     lower('I', 'tr_tr') -> 'ı' (small dotless i)
00322 **
00323 ** http://www.icu-project.org/userguide/posix.html#case_mappings
00324 */
00325 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
00326   const UChar *zInput;
00327   UChar *zOutput;
00328   int nInput;
00329   int nOutput;
00330 
00331   UErrorCode status = U_ZERO_ERROR;
00332   const char *zLocale = 0;
00333 
00334   assert(nArg==1 || nArg==2);
00335   if( nArg==2 ){
00336     zLocale = (const char *)sqlite3_value_text(apArg[1]);
00337   }
00338 
00339   zInput = sqlite3_value_text16(apArg[0]);
00340   if( !zInput ){
00341     return;
00342   }
00343   nInput = sqlite3_value_bytes16(apArg[0]);
00344 
00345   nOutput = nInput * 2 + 2;
00346   zOutput = sqlite3_malloc(nOutput);
00347   if( !zOutput ){
00348     return;
00349   }
00350 
00351   if( sqlite3_user_data(p) ){
00352     u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
00353   }else{
00354     u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
00355   }
00356 
00357   if( !U_SUCCESS(status) ){
00358     icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
00359     return;
00360   }
00361 
00362   sqlite3_result_text16(p, zOutput, -1, xFree);
00363 }
00364 
00365 /*
00366 ** Collation sequence destructor function. The pCtx argument points to
00367 ** a UCollator structure previously allocated using ucol_open().
00368 */
00369 static void icuCollationDel(void *pCtx){
00370   UCollator *p = (UCollator *)pCtx;
00371   ucol_close(p);
00372 }
00373 
00374 /*
00375 ** Collation sequence comparison function. The pCtx argument points to
00376 ** a UCollator structure previously allocated using ucol_open().
00377 */
00378 static int icuCollationColl(
00379   void *pCtx,
00380   int nLeft,
00381   const void *zLeft,
00382   int nRight,
00383   const void *zRight
00384 ){
00385   UCollationResult res;
00386   UCollator *p = (UCollator *)pCtx;
00387   res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
00388   switch( res ){
00389     case UCOL_LESS:    return -1;
00390     case UCOL_GREATER: return +1;
00391     case UCOL_EQUAL:   return 0;
00392   }
00393   assert(!"Unexpected return value from ucol_strcoll()");
00394   return 0;
00395 }
00396 
00397 /*
00398 ** Implementation of the scalar function icu_load_collation().
00399 **
00400 ** This scalar function is used to add ICU collation based collation 
00401 ** types to an SQLite database connection. It is intended to be called
00402 ** as follows:
00403 **
00404 **     SELECT icu_load_collation(<locale>, <collation-name>);
00405 **
00406 ** Where <locale> is a string containing an ICU locale identifier (i.e.
00407 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
00408 ** collation sequence to create.
00409 */
00410 static void icuLoadCollation(
00411   sqlite3_context *p, 
00412   int nArg, 
00413   sqlite3_value **apArg
00414 ){
00415   sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
00416   UErrorCode status = U_ZERO_ERROR;
00417   const char *zLocale;      /* Locale identifier - (eg. "jp_JP") */
00418   const char *zName;        /* SQL Collation sequence name (eg. "japanese") */
00419   UCollator *pUCollator;    /* ICU library collation object */
00420   int rc;                   /* Return code from sqlite3_create_collation_x() */
00421 
00422   assert(nArg==2);
00423   zLocale = (const char *)sqlite3_value_text(apArg[0]);
00424   zName = (const char *)sqlite3_value_text(apArg[1]);
00425 
00426   if( !zLocale || !zName ){
00427     return;
00428   }
00429 
00430   pUCollator = ucol_open(zLocale, &status);
00431   if( !U_SUCCESS(status) ){
00432     icuFunctionError(p, "ucol_open", status);
00433     return;
00434   }
00435   assert(p);
00436 
00437   rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, 
00438       icuCollationColl, icuCollationDel
00439   );
00440   if( rc!=SQLITE_OK ){
00441     ucol_close(pUCollator);
00442     sqlite3_result_error(p, "Error registering collation function", -1);
00443   }
00444 }
00445 
00446 /*
00447 ** Register the ICU extension functions with database db.
00448 */
00449 int sqlite3IcuInit(sqlite3 *db){
00450   struct IcuScalar {
00451     const char *zName;                        /* Function name */
00452     int nArg;                                 /* Number of arguments */
00453     int enc;                                  /* Optimal text encoding */
00454     void *pContext;                           /* sqlite3_user_data() context */
00455     void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
00456   } scalars[] = {
00457     {"regexp",-1, SQLITE_ANY,          0, icuRegexpFunc},
00458 
00459     {"lower",  1, SQLITE_UTF16,        0, icuCaseFunc16},
00460     {"lower",  2, SQLITE_UTF16,        0, icuCaseFunc16},
00461     {"upper",  1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
00462     {"upper",  2, SQLITE_UTF16, (void*)1, icuCaseFunc16},
00463 
00464     {"lower",  1, SQLITE_UTF8,         0, icuCaseFunc16},
00465     {"lower",  2, SQLITE_UTF8,         0, icuCaseFunc16},
00466     {"upper",  1, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
00467     {"upper",  2, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
00468 
00469     {"like",   2, SQLITE_UTF8,         0, icuLikeFunc},
00470     {"like",   3, SQLITE_UTF8,         0, icuLikeFunc},
00471 
00472     {"icu_load_collation",  2, SQLITE_UTF8, (void*)db, icuLoadCollation},
00473   };
00474 
00475   int rc = SQLITE_OK;
00476   int i;
00477 
00478   for(i=0; rc==SQLITE_OK && i<(sizeof(scalars)/sizeof(struct IcuScalar)); i++){
00479     struct IcuScalar *p = &scalars[i];
00480     rc = sqlite3_create_function(
00481         db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0
00482     );
00483   }
00484 
00485   return rc;
00486 }
00487 
00488 #if !SQLITE_CORE
00489 int sqlite3_extension_init(
00490   sqlite3 *db, 
00491   char **pzErrMsg,
00492   const sqlite3_api_routines *pApi
00493 ){
00494   SQLITE_EXTENSION_INIT2(pApi)
00495   return sqlite3IcuInit(db);
00496 }
00497 #endif
00498 
00499 #endif

ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:54 2011 by Doxygen 1.6.1