utf.c

Go to the documentation of this file.
00001 /*
00002 ** 2004 April 13
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 *************************************************************************
00012 ** This file contains routines used to translate between UTF-8, 
00013 ** UTF-16, UTF-16BE, and UTF-16LE.
00014 **
00015 ** $Id: utf.c,v 1.66 2008/11/07 03:29:34 drh Exp $
00016 **
00017 ** Notes on UTF-8:
00018 **
00019 **   Byte-0    Byte-1    Byte-2    Byte-3    Value
00020 **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
00021 **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
00022 **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
00023 **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
00024 **
00025 **
00026 ** Notes on UTF-16:  (with wwww+1==uuuuu)
00027 **
00028 **      Word-0               Word-1          Value
00029 **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
00030 **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
00031 **
00032 **
00033 ** BOM or Byte Order Mark:
00034 **     0xff 0xfe   little-endian utf-16 follows
00035 **     0xfe 0xff   big-endian utf-16 follows
00036 **
00037 */
00038 #include "sqliteInt.h"
00039 #include <assert.h>
00040 #include "vdbeInt.h"
00041 
/*
** Integer constant with the value 1.  The SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros (defined outside this file) refer to it;
** presumably they inspect its in-memory byte layout to detect the host
** byte order -- confirm against their definitions.
*/
const int sqlite3one = 1;
00047 
/*
** Decoding table for the lead byte of a multi-byte UTF-8 character.
**
** READ_UTF8 indexes this table with (leadByte - 0xC0), so entry N
** belongs to lead byte 0xC0+N.  Each entry is the lead byte with its
** length-marker prefix bits stripped, i.e. the payload bits that the
** lead byte contributes to the decoded value.
*/
static const unsigned char sqlite3UtfTrans1[] = {
  /* Lead bytes 0xC0..0xDF (110yyyyy): low 5 bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* Lead bytes 0xE0..0xEF (1110zzzz): low 4 bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* Lead bytes 0xF0..0xF7 (11110uuu): low 3 bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xF8..0xFB: low 2 bits;  0xFC..0xFD: low 1 bit;  0xFE, 0xFF: zero */
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
00062 
00063 
/*
** Append the UTF-8 encoding of the code point c to the byte buffer that
** zOut points into, advancing zOut past the bytes written:
**
**     c <  0x80      1 byte
**     c <  0x800     2 bytes
**     c <  0x10000   3 bytes
**     otherwise      4 bytes (only the low 21 bits of c are encoded)
**
** zOut must be a modifiable pointer lvalue.  Both arguments are
** evaluated more than once, so they must be free of side effects.
** No bounds checking is done; the caller must guarantee room for up to
** four bytes.
**
** Every use of an argument is parenthesized so that an expression such
** as "x + 1" may be passed for c, and the body is wrapped in
** do{}while(0) so that the macro behaves as a single statement (for
** example as the unbraced body of an if/else).
*/
#define WRITE_UTF8(zOut, c) do {                                  \
  if( (c)<0x00080 ){                                              \
    *(zOut)++ = (unsigned char)((c)&0xFF);                        \
  }else if( (c)<0x00800 ){                                        \
    *(zOut)++ = (unsigned char)(0xC0 + (((c)>>6)&0x1F));          \
    *(zOut)++ = (unsigned char)(0x80 + ((c)&0x3F));               \
  }else if( (c)<0x10000 ){                                        \
    *(zOut)++ = (unsigned char)(0xE0 + (((c)>>12)&0x0F));         \
    *(zOut)++ = (unsigned char)(0x80 + (((c)>>6)&0x3F));          \
    *(zOut)++ = (unsigned char)(0x80 + ((c)&0x3F));               \
  }else{                                                          \
    *(zOut)++ = (unsigned char)(0xF0 + (((c)>>18)&0x07));         \
    *(zOut)++ = (unsigned char)(0x80 + (((c)>>12)&0x3F));         \
    *(zOut)++ = (unsigned char)(0x80 + (((c)>>6)&0x3F));          \
    *(zOut)++ = (unsigned char)(0x80 + ((c)&0x3F));               \
  }                                                               \
} while(0)
00083 
/*
** Append the little-endian UTF-16 encoding of code point c to the byte
** buffer that zOut points into, advancing zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit word (2 bytes, low byte
** first).  Larger values are written as a surrogate pair (4 bytes): the
** lead word is 0xD800 + ((c-0x10000)>>10) and the trail word is
** 0xDC00 + (c&0x3FF), each stored low byte first.
**
** zOut must be a modifiable pointer lvalue.  Both arguments are
** evaluated more than once, so they must be free of side effects.
** No bounds checking is done.  Arguments are parenthesized and the
** body is wrapped in do{}while(0) so the macro is safe with expression
** arguments and as a single statement.
*/
#define WRITE_UTF16LE(zOut, c) do {                                         \
  if( (c)<=0xFFFF ){                                                        \
    *(zOut)++ = (unsigned char)((c)&0x00FF);                                \
    *(zOut)++ = (unsigned char)(((c)>>8)&0x00FF);                           \
  }else{                                                                    \
    *(zOut)++ = (unsigned char)((((c)>>10)&0x003F)                          \
                                + ((((c)-0x10000)>>10)&0x00C0));            \
    *(zOut)++ = (unsigned char)(0x00D8 + ((((c)-0x10000)>>18)&0x03));       \
    *(zOut)++ = (unsigned char)((c)&0x00FF);                                \
    *(zOut)++ = (unsigned char)(0x00DC + (((c)>>8)&0x03));                  \
  }                                                                         \
} while(0)
00095 
/*
** Append the big-endian UTF-16 encoding of code point c to the byte
** buffer that zOut points into, advancing zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit word (2 bytes, high
** byte first).  Larger values are written as a surrogate pair
** (4 bytes): the lead word is 0xD800 + ((c-0x10000)>>10) and the trail
** word is 0xDC00 + (c&0x3FF), each stored high byte first.
**
** zOut must be a modifiable pointer lvalue.  Both arguments are
** evaluated more than once, so they must be free of side effects.
** No bounds checking is done.  Arguments are parenthesized and the
** body is wrapped in do{}while(0) so the macro is safe with expression
** arguments and as a single statement.
*/
#define WRITE_UTF16BE(zOut, c) do {                                         \
  if( (c)<=0xFFFF ){                                                        \
    *(zOut)++ = (unsigned char)(((c)>>8)&0x00FF);                           \
    *(zOut)++ = (unsigned char)((c)&0x00FF);                                \
  }else{                                                                    \
    *(zOut)++ = (unsigned char)(0x00D8 + ((((c)-0x10000)>>18)&0x03));       \
    *(zOut)++ = (unsigned char)((((c)>>10)&0x003F)                          \
                                + ((((c)-0x10000)>>10)&0x00C0));            \
    *(zOut)++ = (unsigned char)(0x00DC + (((c)>>8)&0x03));                  \
    *(zOut)++ = (unsigned char)((c)&0x00FF);                                \
  }                                                                         \
} while(0)
00107 
/*
** Read one character from the little-endian UTF-16 text at zIn, store
** its code point in c, and advance zIn past the bytes consumed (2, or
** 4 when a surrogate pair is combined).
**
** Each byte is converted to unsigned char before it is used, so the
** macro yields the same result whether zIn points to signed or
** unsigned chars; without the conversion, bytes >= 0x80 read through a
** signed char pointer would be sign-extended and corrupt the value.
**
** Any first word in the range 0xD800..0xDFFF is treated as the lead
** word of a surrogate pair and the following word is consumed
** unconditionally.  No check is made that the second word is a trail
** surrogate or that it lies within the input buffer; callers must
** ensure two further bytes are readable.  The combined value is always
** >= 0x10000, so no further range check is required.
**
** zIn and c must be modifiable lvalues; both are evaluated more than
** once.
*/
#define READ_UTF16LE(zIn, c) do {                                          \
  (c) = (unsigned char)(*(zIn)++);                                         \
  (c) += ((unsigned char)(*(zIn)++))<<8;                                   \
  if( (c)>=0xD800 && (c)<0xE000 ){                                         \
    int c2 = (unsigned char)(*(zIn)++);                                    \
    c2 += ((unsigned char)(*(zIn)++))<<8;                                  \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10)                                 \
          + ((((c)&0x03C0)+0x0040)<<10);                                   \
  }                                                                        \
} while(0)
00118 
/*
** Read one character from the big-endian UTF-16 text at zIn, store its
** code point in c, and advance zIn past the bytes consumed (2, or 4
** when a surrogate pair is combined).
**
** Each byte is converted to unsigned char before it is used, so the
** macro yields the same result whether zIn points to signed or
** unsigned chars; without the conversion, bytes >= 0x80 read through a
** signed char pointer would be sign-extended and corrupt the value.
**
** Any first word in the range 0xD800..0xDFFF is treated as the lead
** word of a surrogate pair and the following word is consumed
** unconditionally.  No check is made that the second word is a trail
** surrogate or that it lies within the input buffer; callers must
** ensure two further bytes are readable.  The combined value is always
** >= 0x10000, so no further range check is required.
**
** zIn and c must be modifiable lvalues; both are evaluated more than
** once.
*/
#define READ_UTF16BE(zIn, c) do {                                          \
  (c) = ((unsigned char)(*(zIn)++))<<8;                                    \
  (c) += (unsigned char)(*(zIn)++);                                        \
  if( (c)>=0xD800 && (c)<0xE000 ){                                         \
    int c2 = ((unsigned char)(*(zIn)++))<<8;                               \
    c2 += (unsigned char)(*(zIn)++);                                       \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10)                                 \
          + ((((c)&0x03C0)+0x0040)<<10);                                   \
  }                                                                        \
} while(0)
00129 
00130 /*
00131 ** Translate a single UTF-8 character.  Return the unicode value.
00132 **
00133 ** During translation, assume that the byte that zTerm points
00134 ** to is a 0x00.
00135 **
00136 ** Write a pointer to the next unread byte back into *pzNext.
00137 **
00138 ** Notes On Invalid UTF-8:
00139 **
00140 **  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
00141 **     be encoded as a multi-byte character.  Any multi-byte character that
00142 **     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
00143 **
00144 **  *  This routine never allows a UTF16 surrogate value to be encoded.
00145 **     If a multi-byte character attempts to encode a value between
00146 **     0xd800 and 0xe000 then it is rendered as 0xfffd.
00147 **
00148 **  *  Bytes in the range of 0x80 through 0xbf which occur as the first
00149 **     byte of a character are interpreted as single-byte characters
00150 **     and rendered as themselves even though they are technically
00151 **     invalid characters.
00152 **
00153 **  *  This routine accepts an infinite number of different UTF8 encodings
00154 **     for unicode values 0x80 and greater.  It does not change over-length
00155 **     encodings to 0xfffd as some systems recommend.
00156 */
/*
** Decode one UTF-8 character starting at zIn, store its value in c and
** advance zIn past the bytes consumed.  Continuation bytes (10xxxxxx)
** are consumed until a non-continuation byte or zTerm is reached; the
** byte at zTerm itself is never read.
**
** A result below 0x80 from a multi-byte sequence, a result in the
** surrogate range 0xD800..0xDFFF, and the values 0xFFFE and 0xFFFF are
** all replaced by 0xFFFD.
**
** zIn and c must be modifiable lvalues; all arguments are evaluated
** more than once.  The macro relies on the sqlite3UtfTrans1[] table.
** The number of continuation bytes accepted is unbounded, so c should
** be an unsigned type to keep the repeated left shift well defined.
**
** The body is wrapped in do{}while(0) so that the two statements it
** contains always execute as a unit, even as the unbraced body of an
** if or a loop.
*/
#define READ_UTF8(zIn, zTerm, c) do {                            \
  (c) = *((zIn)++);                                              \
  if( (c)>=0xc0 ){                                               \
    (c) = sqlite3UtfTrans1[(c)-0xc0];                            \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){            \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                      \
    }                                                            \
    if( (c)<0x80                                                 \
        || ((c)&0xFFFFF800)==0xD800                              \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }          \
  }                                                              \
} while(0)
/*
** Decode the single UTF-8 character that begins at z using READ_UTF8,
** treating zTerm as the end of input, and return its value.  The
** address of the first byte following the character is written to
** *pzNext.
**
** The value is accumulated in an unsigned variable: READ_UTF8 shifts
** the accumulator left by 6 bits for every continuation byte without
** limit, which would overflow a signed int (undefined behavior) on
** input containing a long run of continuation bytes.
*/
int sqlite3Utf8Read(
  const unsigned char *z,         /* First byte of UTF-8 character */
  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
){
  unsigned int c;
  READ_UTF8(z, zTerm, c);
  *pzNext = z;
  return (int)c;
}
00178 
00179 
00180 
00181 
00182 /*
00183 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
00184 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
00185 */ 
00186 /* #define TRANSLATE_TRACE 1 */
00187 
00188 #ifndef SQLITE_OMIT_UTF16
00189 /*
00190 ** This routine transforms the internal text encoding used by pMem to
00191 ** desiredEnc. It is an error if the string is already of the desired
00192 ** encoding, or if *pMem does not contain a string value.
00193 */
00194 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
00195   int len;                    /* Maximum length of output string in bytes */
00196   unsigned char *zOut;                  /* Output buffer */
00197   unsigned char *zIn;                   /* Input iterator */
00198   unsigned char *zTerm;                 /* End of input */
00199   unsigned char *z;                     /* Output iterator */
00200   unsigned int c;
00201 
00202   assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
00203   assert( pMem->flags&MEM_Str );
00204   assert( pMem->enc!=desiredEnc );
00205   assert( pMem->enc!=0 );
00206   assert( pMem->n>=0 );
00207 
00208 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
00209   {
00210     char zBuf[100];
00211     sqlite3VdbeMemPrettyPrint(pMem, zBuf);
00212     fprintf(stderr, "INPUT:  %s\n", zBuf);
00213   }
00214 #endif
00215 
00216   /* If the translation is between UTF-16 little and big endian, then 
00217   ** all that is required is to swap the byte order. This case is handled
00218   ** differently from the others.
00219   */
00220   if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
00221     u8 temp;
00222     int rc;
00223     rc = sqlite3VdbeMemMakeWriteable(pMem);
00224     if( rc!=SQLITE_OK ){
00225       assert( rc==SQLITE_NOMEM );
00226       return SQLITE_NOMEM;
00227     }
00228     zIn = (u8*)pMem->z;
00229     zTerm = &zIn[pMem->n&~1];
00230     while( zIn<zTerm ){
00231       temp = *zIn;
00232       *zIn = *(zIn+1);
00233       zIn++;
00234       *zIn++ = temp;
00235     }
00236     pMem->enc = desiredEnc;
00237     goto translate_out;
00238   }
00239 
00240   /* Set len to the maximum number of bytes required in the output buffer. */
00241   if( desiredEnc==SQLITE_UTF8 ){
00242     /* When converting from UTF-16, the maximum growth results from
00243     ** translating a 2-byte character to a 4-byte UTF-8 character.
00244     ** A single byte is required for the output string
00245     ** nul-terminator.
00246     */
00247     pMem->n &= ~1;
00248     len = pMem->n * 2 + 1;
00249   }else{
00250     /* When converting from UTF-8 to UTF-16 the maximum growth is caused
00251     ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
00252     ** character. Two bytes are required in the output buffer for the
00253     ** nul-terminator.
00254     */
00255     len = pMem->n * 2 + 2;
00256   }
00257 
00258   /* Set zIn to point at the start of the input buffer and zTerm to point 1
00259   ** byte past the end.
00260   **
00261   ** Variable zOut is set to point at the output buffer, space obtained
00262   ** from sqlite3_malloc().
00263   */
00264   zIn = (u8*)pMem->z;
00265   zTerm = &zIn[pMem->n];
00266   zOut = sqlite3DbMallocRaw(pMem->db, len);
00267   if( !zOut ){
00268     return SQLITE_NOMEM;
00269   }
00270   z = zOut;
00271 
00272   if( pMem->enc==SQLITE_UTF8 ){
00273     if( desiredEnc==SQLITE_UTF16LE ){
00274       /* UTF-8 -> UTF-16 Little-endian */
00275       while( zIn<zTerm ){
00276         /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
00277         READ_UTF8(zIn, zTerm, c);
00278         WRITE_UTF16LE(z, c);
00279       }
00280     }else{
00281       assert( desiredEnc==SQLITE_UTF16BE );
00282       /* UTF-8 -> UTF-16 Big-endian */
00283       while( zIn<zTerm ){
00284         /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
00285         READ_UTF8(zIn, zTerm, c);
00286         WRITE_UTF16BE(z, c);
00287       }
00288     }
00289     pMem->n = z - zOut;
00290     *z++ = 0;
00291   }else{
00292     assert( desiredEnc==SQLITE_UTF8 );
00293     if( pMem->enc==SQLITE_UTF16LE ){
00294       /* UTF-16 Little-endian -> UTF-8 */
00295       while( zIn<zTerm ){
00296         READ_UTF16LE(zIn, c); 
00297         WRITE_UTF8(z, c);
00298       }
00299     }else{
00300       /* UTF-16 Big-endian -> UTF-8 */
00301       while( zIn<zTerm ){
00302         READ_UTF16BE(zIn, c); 
00303         WRITE_UTF8(z, c);
00304       }
00305     }
00306     pMem->n = z - zOut;
00307   }
00308   *z = 0;
00309   assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
00310 
00311   sqlite3VdbeMemRelease(pMem);
00312   pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
00313   pMem->enc = desiredEnc;
00314   pMem->flags |= (MEM_Term|MEM_Dyn);
00315   pMem->z = (char*)zOut;
00316   pMem->zMalloc = pMem->z;
00317 
00318 translate_out:
00319 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
00320   {
00321     char zBuf[100];
00322     sqlite3VdbeMemPrettyPrint(pMem, zBuf);
00323     fprintf(stderr, "OUTPUT: %s\n", zBuf);
00324   }
00325 #endif
00326   return SQLITE_OK;
00327 }
00328 
00329 /*
00330 ** This routine checks for a byte-order mark at the beginning of the 
00331 ** UTF-16 string stored in *pMem. If one is present, it is removed and
00332 ** the encoding of the Mem adjusted. This routine does not do any
00333 ** byte-swapping, it just sets Mem.enc appropriately.
00334 **
00335 ** The allocation (static, dynamic etc.) and encoding of the Mem may be
00336 ** changed by this function.
00337 */
00338 int sqlite3VdbeMemHandleBom(Mem *pMem){
00339   int rc = SQLITE_OK;
00340   u8 bom = 0;
00341 
00342   if( pMem->n<0 || pMem->n>1 ){
00343     u8 b1 = *(u8 *)pMem->z;
00344     u8 b2 = *(((u8 *)pMem->z) + 1);
00345     if( b1==0xFE && b2==0xFF ){
00346       bom = SQLITE_UTF16BE;
00347     }
00348     if( b1==0xFF && b2==0xFE ){
00349       bom = SQLITE_UTF16LE;
00350     }
00351   }
00352   
00353   if( bom ){
00354     rc = sqlite3VdbeMemMakeWriteable(pMem);
00355     if( rc==SQLITE_OK ){
00356       pMem->n -= 2;
00357       memmove(pMem->z, &pMem->z[2], pMem->n);
00358       pMem->z[pMem->n] = '\0';
00359       pMem->z[pMem->n+1] = '\0';
00360       pMem->flags |= MEM_Term;
00361       pMem->enc = bom;
00362     }
00363   }
00364   return rc;
00365 }
00366 #endif /* SQLITE_OMIT_UTF16 */
00367 
00368 /*
00369 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
00370 ** return the number of unicode characters in pZ up to (but not including)
00371 ** the first 0x00 byte. If nByte is not less than zero, return the
00372 ** number of unicode characters in the first nByte of pZ (or up to 
00373 ** the first 0x00, whichever comes first).
00374 */
00375 int sqlite3Utf8CharLen(const char *zIn, int nByte){
00376   int r = 0;
00377   const u8 *z = (const u8*)zIn;
00378   const u8 *zTerm;
00379   if( nByte>=0 ){
00380     zTerm = &z[nByte];
00381   }else{
00382     zTerm = (const u8*)(-1);
00383   }
00384   assert( z<=zTerm );
00385   while( *z!=0 && z<zTerm ){
00386     SQLITE_SKIP_UTF8(z);
00387     r++;
00388   }
00389   return r;
00390 }
00391 
00392 /* This test function is not currently used by the automated test-suite. 
00393 ** Hence it is only available in debug builds.
00394 */
00395 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
00396 /*
00397 ** Translate UTF-8 to UTF-8.
00398 **
00399 ** This has the effect of making sure that the string is well-formed
00400 ** UTF-8.  Miscoded characters are removed.
00401 **
00402 ** The translation is done in-place (since it is impossible for the
00403 ** correct UTF-8 encoding to be longer than a malformed encoding).
00404 */
00405 int sqlite3Utf8To8(unsigned char *zIn){
00406   unsigned char *zOut = zIn;
00407   unsigned char *zStart = zIn;
00408   unsigned char *zTerm = &zIn[strlen((char *)zIn)];
00409   u32 c;
00410 
00411   while( zIn[0] ){
00412     c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
00413     if( c!=0xfffd ){
00414       WRITE_UTF8(zOut, c);
00415     }
00416   }
00417   *zOut = 0;
00418   return zOut - zStart;
00419 }
00420 #endif
00421 
00422 #ifndef SQLITE_OMIT_UTF16
00423 /*
00424 ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
00425 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
00426 ** be freed by the calling function.
00427 **
00428 ** NULL is returned if there is an allocation error.
00429 */
00430 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){
00431   Mem m;
00432   memset(&m, 0, sizeof(m));
00433   m.db = db;
00434   sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC);
00435   sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
00436   if( db->mallocFailed ){
00437     sqlite3VdbeMemRelease(&m);
00438     m.z = 0;
00439   }
00440   assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
00441   assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
00442   return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z);
00443 }
00444 
00445 /*
00446 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
00447 ** return the number of bytes up to (but not including), the first pair
00448 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
00449 ** then return the number of bytes in the first nChar unicode characters
00450 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
00451 */
00452 int sqlite3Utf16ByteLen(const void *zIn, int nChar){
00453   unsigned int c = 1;
00454   char const *z = zIn;
00455   int n = 0;
00456   if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
00457     /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here
00458     ** and in other parts of this file means that at one branch will
00459     ** not be covered by coverage testing on any single host. But coverage
00460     ** will be complete if the tests are run on both a little-endian and 
00461     ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE
00462     ** macros are constant at compile time the compiler can determine
00463     ** which branch will be followed. It is therefore assumed that no runtime
00464     ** penalty is paid for this "if" statement.
00465     */
00466     while( c && ((nChar<0) || n<nChar) ){
00467       READ_UTF16BE(z, c);
00468       n++;
00469     }
00470   }else{
00471     while( c && ((nChar<0) || n<nChar) ){
00472       READ_UTF16LE(z, c);
00473       n++;
00474     }
00475   }
00476   return (z-(char const *)zIn)-((c==0)?2:0);
00477 }
00478 
00479 #if defined(SQLITE_TEST)
00480 /*
00481 ** This routine is called from the TCL test function "translate_selftest".
00482 ** It checks that the primitives for serializing and deserializing
00483 ** characters in each encoding are inverses of each other.
00484 */
00485 void sqlite3UtfSelfTest(void){
00486   unsigned int i, t;
00487   unsigned char zBuf[20];
00488   unsigned char *z;
00489   unsigned char *zTerm;
00490   int n;
00491   unsigned int c;
00492 
00493   for(i=0; i<0x00110000; i++){
00494     z = zBuf;
00495     WRITE_UTF8(z, i);
00496     n = z-zBuf;
00497     z[0] = 0;
00498     zTerm = z;
00499     z = zBuf;
00500     c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);
00501     t = i;
00502     if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
00503     if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
00504     assert( c==t );
00505     assert( (z-zBuf)==n );
00506   }
00507   for(i=0; i<0x00110000; i++){
00508     if( i>=0xD800 && i<0xE000 ) continue;
00509     z = zBuf;
00510     WRITE_UTF16LE(z, i);
00511     n = z-zBuf;
00512     z[0] = 0;
00513     z = zBuf;
00514     READ_UTF16LE(z, c);
00515     assert( c==i );
00516     assert( (z-zBuf)==n );
00517   }
00518   for(i=0; i<0x00110000; i++){
00519     if( i>=0xD800 && i<0xE000 ) continue;
00520     z = zBuf;
00521     WRITE_UTF16BE(z, i);
00522     n = z-zBuf;
00523     z[0] = 0;
00524     z = zBuf;
00525     READ_UTF16BE(z, c);
00526     assert( c==i );
00527     assert( (z-zBuf)==n );
00528   }
00529 }
00530 #endif /* SQLITE_TEST */
00531 #endif /* SQLITE_OMIT_UTF16 */
ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:57 2011 by Doxygen 1.6.1