00001 /* 00002 ** 2004 April 13 00003 ** 00004 ** The author disclaims copyright to this source code. In place of 00005 ** a legal notice, here is a blessing: 00006 ** 00007 ** May you do good and not evil. 00008 ** May you find forgiveness for yourself and forgive others. 00009 ** May you share freely, never taking more than you give. 00010 ** 00011 ************************************************************************* 00012 ** This file contains routines used to translate between UTF-8, 00013 ** UTF-16, UTF-16BE, and UTF-16LE. 00014 ** 00015 ** $Id: utf.c,v 1.66 2008/11/07 03:29:34 drh Exp $ 00016 ** 00017 ** Notes on UTF-8: 00018 ** 00019 ** Byte-0 Byte-1 Byte-2 Byte-3 Value 00020 ** 0xxxxxxx 00000000 00000000 0xxxxxxx 00021 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx 00022 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx 00023 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx 00024 ** 00025 ** 00026 ** Notes on UTF-16: (with wwww+1==uuuuu) 00027 ** 00028 ** Word-0 Word-1 Value 00029 ** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx 00030 ** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx 00031 ** 00032 ** 00033 ** BOM or Byte Order Mark: 00034 ** 0xff 0xfe little-endian utf-16 follows 00035 ** 0xfe 0xff big-endian utf-16 follows 00036 ** 00037 */ 00038 #include "sqliteInt.h" 00039 #include <assert.h> 00040 #include "vdbeInt.h" 00041 00042 /* 00043 ** The following constant value is used by the SQLITE_BIGENDIAN and 00044 ** SQLITE_LITTLEENDIAN macros. 00045 */ 00046 const int sqlite3one = 1; 00047 00048 /* 00049 ** This lookup table is used to help decode the first byte of 00050 ** a multi-byte UTF8 character. 
00051 */ 00052 static const unsigned char sqlite3UtfTrans1[] = { 00053 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 00054 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 00055 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 00056 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 00057 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 00058 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 00059 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 00060 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 00061 }; 00062 00063 00064 #define WRITE_UTF8(zOut, c) { \ 00065 if( c<0x00080 ){ \ 00066 *zOut++ = (c&0xFF); \ 00067 } \ 00068 else if( c<0x00800 ){ \ 00069 *zOut++ = 0xC0 + ((c>>6)&0x1F); \ 00070 *zOut++ = 0x80 + (c & 0x3F); \ 00071 } \ 00072 else if( c<0x10000 ){ \ 00073 *zOut++ = 0xE0 + ((c>>12)&0x0F); \ 00074 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 00075 *zOut++ = 0x80 + (c & 0x3F); \ 00076 }else{ \ 00077 *zOut++ = 0xF0 + ((c>>18) & 0x07); \ 00078 *zOut++ = 0x80 + ((c>>12) & 0x3F); \ 00079 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 00080 *zOut++ = 0x80 + (c & 0x3F); \ 00081 } \ 00082 } 00083 00084 #define WRITE_UTF16LE(zOut, c) { \ 00085 if( c<=0xFFFF ){ \ 00086 *zOut++ = (c&0x00FF); \ 00087 *zOut++ = ((c>>8)&0x00FF); \ 00088 }else{ \ 00089 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 00090 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 00091 *zOut++ = (c&0x00FF); \ 00092 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 00093 } \ 00094 } 00095 00096 #define WRITE_UTF16BE(zOut, c) { \ 00097 if( c<=0xFFFF ){ \ 00098 *zOut++ = ((c>>8)&0x00FF); \ 00099 *zOut++ = (c&0x00FF); \ 00100 }else{ \ 00101 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 00102 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 00103 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 00104 *zOut++ = (c&0x00FF); \ 00105 } \ 00106 } 00107 00108 #define READ_UTF16LE(zIn, c){ \ 00109 c = (*zIn++); \ 00110 c += ((*zIn++)<<8); \ 00111 if( c>=0xD800 && c<0xE000 ){ \ 00112 int c2 = (*zIn++); \ 00113 c2 += 
((*zIn++)<<8); \ 00114 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 00115 if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ 00116 } \ 00117 } 00118 00119 #define READ_UTF16BE(zIn, c){ \ 00120 c = ((*zIn++)<<8); \ 00121 c += (*zIn++); \ 00122 if( c>=0xD800 && c<0xE000 ){ \ 00123 int c2 = ((*zIn++)<<8); \ 00124 c2 += (*zIn++); \ 00125 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 00126 if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ 00127 } \ 00128 } 00129 00130 /* 00131 ** Translate a single UTF-8 character. Return the unicode value. 00132 ** 00133 ** During translation, assume that the byte that zTerm points 00134 ** is a 0x00. 00135 ** 00136 ** Write a pointer to the next unread byte back into *pzNext. 00137 ** 00138 ** Notes On Invalid UTF-8: 00139 ** 00140 ** * This routine never allows a 7-bit character (0x00 through 0x7f) to 00141 ** be encoded as a multi-byte character. Any multi-byte character that 00142 ** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd. 00143 ** 00144 ** * This routine never allows a UTF16 surrogate value to be encoded. 00145 ** If a multi-byte character attempts to encode a value between 00146 ** 0xd800 and 0xe000 then it is rendered as 0xfffd. 00147 ** 00148 ** * Bytes in the range of 0x80 through 0xbf which occur as the first 00149 ** byte of a character are interpreted as single-byte characters 00150 ** and rendered as themselves even though they are technically 00151 ** invalid characters. 00152 ** 00153 ** * This routine accepts an infinite number of different UTF8 encodings 00154 ** for unicode values 0x80 and greater. It do not change over-length 00155 ** encodings to 0xfffd as some systems recommend. 
00156 */ 00157 #define READ_UTF8(zIn, zTerm, c) \ 00158 c = *(zIn++); \ 00159 if( c>=0xc0 ){ \ 00160 c = sqlite3UtfTrans1[c-0xc0]; \ 00161 while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \ 00162 c = (c<<6) + (0x3f & *(zIn++)); \ 00163 } \ 00164 if( c<0x80 \ 00165 || (c&0xFFFFF800)==0xD800 \ 00166 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ 00167 } 00168 int sqlite3Utf8Read( 00169 const unsigned char *z, /* First byte of UTF-8 character */ 00170 const unsigned char *zTerm, /* Pretend this byte is 0x00 */ 00171 const unsigned char **pzNext /* Write first byte past UTF-8 char here */ 00172 ){ 00173 int c; 00174 READ_UTF8(z, zTerm, c); 00175 *pzNext = z; 00176 return c; 00177 } 00178 00179 00180 00181 00182 /* 00183 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is 00184 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). 00185 */ 00186 /* #define TRANSLATE_TRACE 1 */ 00187 00188 #ifndef SQLITE_OMIT_UTF16 00189 /* 00190 ** This routine transforms the internal text encoding used by pMem to 00191 ** desiredEnc. It is an error if the string is already of the desired 00192 ** encoding, or if *pMem does not contain a string value. 
00193 */ 00194 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ 00195 int len; /* Maximum length of output string in bytes */ 00196 unsigned char *zOut; /* Output buffer */ 00197 unsigned char *zIn; /* Input iterator */ 00198 unsigned char *zTerm; /* End of input */ 00199 unsigned char *z; /* Output iterator */ 00200 unsigned int c; 00201 00202 assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) ); 00203 assert( pMem->flags&MEM_Str ); 00204 assert( pMem->enc!=desiredEnc ); 00205 assert( pMem->enc!=0 ); 00206 assert( pMem->n>=0 ); 00207 00208 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 00209 { 00210 char zBuf[100]; 00211 sqlite3VdbeMemPrettyPrint(pMem, zBuf); 00212 fprintf(stderr, "INPUT: %s\n", zBuf); 00213 } 00214 #endif 00215 00216 /* If the translation is between UTF-16 little and big endian, then 00217 ** all that is required is to swap the byte order. This case is handled 00218 ** differently from the others. 00219 */ 00220 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ 00221 u8 temp; 00222 int rc; 00223 rc = sqlite3VdbeMemMakeWriteable(pMem); 00224 if( rc!=SQLITE_OK ){ 00225 assert( rc==SQLITE_NOMEM ); 00226 return SQLITE_NOMEM; 00227 } 00228 zIn = (u8*)pMem->z; 00229 zTerm = &zIn[pMem->n&~1]; 00230 while( zIn<zTerm ){ 00231 temp = *zIn; 00232 *zIn = *(zIn+1); 00233 zIn++; 00234 *zIn++ = temp; 00235 } 00236 pMem->enc = desiredEnc; 00237 goto translate_out; 00238 } 00239 00240 /* Set len to the maximum number of bytes required in the output buffer. */ 00241 if( desiredEnc==SQLITE_UTF8 ){ 00242 /* When converting from UTF-16, the maximum growth results from 00243 ** translating a 2-byte character to a 4-byte UTF-8 character. 00244 ** A single byte is required for the output string 00245 ** nul-terminator. 
00246 */ 00247 pMem->n &= ~1; 00248 len = pMem->n * 2 + 1; 00249 }else{ 00250 /* When converting from UTF-8 to UTF-16 the maximum growth is caused 00251 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 00252 ** character. Two bytes are required in the output buffer for the 00253 ** nul-terminator. 00254 */ 00255 len = pMem->n * 2 + 2; 00256 } 00257 00258 /* Set zIn to point at the start of the input buffer and zTerm to point 1 00259 ** byte past the end. 00260 ** 00261 ** Variable zOut is set to point at the output buffer, space obtained 00262 ** from sqlite3_malloc(). 00263 */ 00264 zIn = (u8*)pMem->z; 00265 zTerm = &zIn[pMem->n]; 00266 zOut = sqlite3DbMallocRaw(pMem->db, len); 00267 if( !zOut ){ 00268 return SQLITE_NOMEM; 00269 } 00270 z = zOut; 00271 00272 if( pMem->enc==SQLITE_UTF8 ){ 00273 if( desiredEnc==SQLITE_UTF16LE ){ 00274 /* UTF-8 -> UTF-16 Little-endian */ 00275 while( zIn<zTerm ){ 00276 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */ 00277 READ_UTF8(zIn, zTerm, c); 00278 WRITE_UTF16LE(z, c); 00279 } 00280 }else{ 00281 assert( desiredEnc==SQLITE_UTF16BE ); 00282 /* UTF-8 -> UTF-16 Big-endian */ 00283 while( zIn<zTerm ){ 00284 /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */ 00285 READ_UTF8(zIn, zTerm, c); 00286 WRITE_UTF16BE(z, c); 00287 } 00288 } 00289 pMem->n = z - zOut; 00290 *z++ = 0; 00291 }else{ 00292 assert( desiredEnc==SQLITE_UTF8 ); 00293 if( pMem->enc==SQLITE_UTF16LE ){ 00294 /* UTF-16 Little-endian -> UTF-8 */ 00295 while( zIn<zTerm ){ 00296 READ_UTF16LE(zIn, c); 00297 WRITE_UTF8(z, c); 00298 } 00299 }else{ 00300 /* UTF-16 Big-endian -> UTF-8 */ 00301 while( zIn<zTerm ){ 00302 READ_UTF16BE(zIn, c); 00303 WRITE_UTF8(z, c); 00304 } 00305 } 00306 pMem->n = z - zOut; 00307 } 00308 *z = 0; 00309 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); 00310 00311 sqlite3VdbeMemRelease(pMem); 00312 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem); 00313 pMem->enc = desiredEnc; 00314 pMem->flags |= 
(MEM_Term|MEM_Dyn); 00315 pMem->z = (char*)zOut; 00316 pMem->zMalloc = pMem->z; 00317 00318 translate_out: 00319 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 00320 { 00321 char zBuf[100]; 00322 sqlite3VdbeMemPrettyPrint(pMem, zBuf); 00323 fprintf(stderr, "OUTPUT: %s\n", zBuf); 00324 } 00325 #endif 00326 return SQLITE_OK; 00327 } 00328 00329 /* 00330 ** This routine checks for a byte-order mark at the beginning of the 00331 ** UTF-16 string stored in *pMem. If one is present, it is removed and 00332 ** the encoding of the Mem adjusted. This routine does not do any 00333 ** byte-swapping, it just sets Mem.enc appropriately. 00334 ** 00335 ** The allocation (static, dynamic etc.) and encoding of the Mem may be 00336 ** changed by this function. 00337 */ 00338 int sqlite3VdbeMemHandleBom(Mem *pMem){ 00339 int rc = SQLITE_OK; 00340 u8 bom = 0; 00341 00342 if( pMem->n<0 || pMem->n>1 ){ 00343 u8 b1 = *(u8 *)pMem->z; 00344 u8 b2 = *(((u8 *)pMem->z) + 1); 00345 if( b1==0xFE && b2==0xFF ){ 00346 bom = SQLITE_UTF16BE; 00347 } 00348 if( b1==0xFF && b2==0xFE ){ 00349 bom = SQLITE_UTF16LE; 00350 } 00351 } 00352 00353 if( bom ){ 00354 rc = sqlite3VdbeMemMakeWriteable(pMem); 00355 if( rc==SQLITE_OK ){ 00356 pMem->n -= 2; 00357 memmove(pMem->z, &pMem->z[2], pMem->n); 00358 pMem->z[pMem->n] = '\0'; 00359 pMem->z[pMem->n+1] = '\0'; 00360 pMem->flags |= MEM_Term; 00361 pMem->enc = bom; 00362 } 00363 } 00364 return rc; 00365 } 00366 #endif /* SQLITE_OMIT_UTF16 */ 00367 00368 /* 00369 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, 00370 ** return the number of unicode characters in pZ up to (but not including) 00371 ** the first 0x00 byte. If nByte is not less than zero, return the 00372 ** number of unicode characters in the first nByte of pZ (or up to 00373 ** the first 0x00, whichever comes first). 
00374 */ 00375 int sqlite3Utf8CharLen(const char *zIn, int nByte){ 00376 int r = 0; 00377 const u8 *z = (const u8*)zIn; 00378 const u8 *zTerm; 00379 if( nByte>=0 ){ 00380 zTerm = &z[nByte]; 00381 }else{ 00382 zTerm = (const u8*)(-1); 00383 } 00384 assert( z<=zTerm ); 00385 while( *z!=0 && z<zTerm ){ 00386 SQLITE_SKIP_UTF8(z); 00387 r++; 00388 } 00389 return r; 00390 } 00391 00392 /* This test function is not currently used by the automated test-suite. 00393 ** Hence it is only available in debug builds. 00394 */ 00395 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) 00396 /* 00397 ** Translate UTF-8 to UTF-8. 00398 ** 00399 ** This has the effect of making sure that the string is well-formed 00400 ** UTF-8. Miscoded characters are removed. 00401 ** 00402 ** The translation is done in-place (since it is impossible for the 00403 ** correct UTF-8 encoding to be longer than a malformed encoding). 00404 */ 00405 int sqlite3Utf8To8(unsigned char *zIn){ 00406 unsigned char *zOut = zIn; 00407 unsigned char *zStart = zIn; 00408 unsigned char *zTerm = &zIn[strlen((char *)zIn)]; 00409 u32 c; 00410 00411 while( zIn[0] ){ 00412 c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); 00413 if( c!=0xfffd ){ 00414 WRITE_UTF8(zOut, c); 00415 } 00416 } 00417 *zOut = 0; 00418 return zOut - zStart; 00419 } 00420 #endif 00421 00422 #ifndef SQLITE_OMIT_UTF16 00423 /* 00424 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. 00425 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must 00426 ** be freed by the calling function. 00427 ** 00428 ** NULL is returned if there is an allocation error. 
00429 */ 00430 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){ 00431 Mem m; 00432 memset(&m, 0, sizeof(m)); 00433 m.db = db; 00434 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC); 00435 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); 00436 if( db->mallocFailed ){ 00437 sqlite3VdbeMemRelease(&m); 00438 m.z = 0; 00439 } 00440 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); 00441 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); 00442 return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z); 00443 } 00444 00445 /* 00446 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero, 00447 ** return the number of bytes up to (but not including), the first pair 00448 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero, 00449 ** then return the number of bytes in the first nChar unicode characters 00450 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). 00451 */ 00452 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ 00453 unsigned int c = 1; 00454 char const *z = zIn; 00455 int n = 0; 00456 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ 00457 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here 00458 ** and in other parts of this file means that at one branch will 00459 ** not be covered by coverage testing on any single host. But coverage 00460 ** will be complete if the tests are run on both a little-endian and 00461 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE 00462 ** macros are constant at compile time the compiler can determine 00463 ** which branch will be followed. It is therefore assumed that no runtime 00464 ** penalty is paid for this "if" statement. 
00465 */ 00466 while( c && ((nChar<0) || n<nChar) ){ 00467 READ_UTF16BE(z, c); 00468 n++; 00469 } 00470 }else{ 00471 while( c && ((nChar<0) || n<nChar) ){ 00472 READ_UTF16LE(z, c); 00473 n++; 00474 } 00475 } 00476 return (z-(char const *)zIn)-((c==0)?2:0); 00477 } 00478 00479 #if defined(SQLITE_TEST) 00480 /* 00481 ** This routine is called from the TCL test function "translate_selftest". 00482 ** It checks that the primitives for serializing and deserializing 00483 ** characters in each encoding are inverses of each other. 00484 */ 00485 void sqlite3UtfSelfTest(void){ 00486 unsigned int i, t; 00487 unsigned char zBuf[20]; 00488 unsigned char *z; 00489 unsigned char *zTerm; 00490 int n; 00491 unsigned int c; 00492 00493 for(i=0; i<0x00110000; i++){ 00494 z = zBuf; 00495 WRITE_UTF8(z, i); 00496 n = z-zBuf; 00497 z[0] = 0; 00498 zTerm = z; 00499 z = zBuf; 00500 c = sqlite3Utf8Read(z, zTerm, (const u8**)&z); 00501 t = i; 00502 if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; 00503 if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; 00504 assert( c==t ); 00505 assert( (z-zBuf)==n ); 00506 } 00507 for(i=0; i<0x00110000; i++){ 00508 if( i>=0xD800 && i<0xE000 ) continue; 00509 z = zBuf; 00510 WRITE_UTF16LE(z, i); 00511 n = z-zBuf; 00512 z[0] = 0; 00513 z = zBuf; 00514 READ_UTF16LE(z, c); 00515 assert( c==i ); 00516 assert( (z-zBuf)==n ); 00517 } 00518 for(i=0; i<0x00110000; i++){ 00519 if( i>=0xD800 && i<0xE000 ) continue; 00520 z = zBuf; 00521 WRITE_UTF16BE(z, i); 00522 n = z-zBuf; 00523 z[0] = 0; 00524 z = zBuf; 00525 READ_UTF16BE(z, c); 00526 assert( c==i ); 00527 assert( (z-zBuf)==n ); 00528 } 00529 } 00530 #endif /* SQLITE_TEST */ 00531 #endif /* SQLITE_OMIT_UTF16 */
/* ContextLogger2 -- ContextLogger2 Logger Daemon Internals -- Generated on Mon May 2 13:49:57 2011 by Doxygen 1.6.1 */