fts2_tokenizer.c

Go to the documentation of this file.
00001 /*
00002 ** 2007 June 22
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 ******************************************************************************
00012 **
00013 ** This is part of an SQLite module implementing full-text search.
00014 ** This particular file implements the generic tokenizer interface.
00015 */
00016 
00017 /*
00018 ** The code in this file is only compiled if:
00019 **
00020 **     * The FTS2 module is being built as an extension
00021 **       (in which case SQLITE_CORE is not defined), or
00022 **
00023 **     * The FTS2 module is being built into the core of
00024 **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
00025 */
00026 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
00027 
00028 
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1

#include "fts2_hash.h"
#include "fts2_tokenizer.h"
#include <assert.h>
#include <string.h>
00036 
00037 /*
00038 ** Implementation of the SQL scalar function for accessing the underlying 
00039 ** hash table. This function may be called as follows:
00040 **
00041 **   SELECT <function-name>(<key-name>);
00042 **   SELECT <function-name>(<key-name>, <pointer>);
00043 **
00044 ** where <function-name> is the name passed as the second argument
00045 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
00046 **
00047 ** If the <pointer> argument is specified, it must be a blob value
00048 ** containing a pointer to be stored as the hash data corresponding
00049 ** to the string <key-name>. If <pointer> is not specified, then
00050 ** the string <key-name> must already exist in the has table. Otherwise,
00051 ** an error is returned.
00052 **
00053 ** Whether or not the <pointer> argument is specified, the value returned
00054 ** is a blob containing the pointer stored as the hash data corresponding
00055 ** to string <key-name> (after the hash-table is updated, if applicable).
00056 */
00057 static void scalarFunc(
00058   sqlite3_context *context,
00059   int argc,
00060   sqlite3_value **argv
00061 ){
00062   fts2Hash *pHash;
00063   void *pPtr = 0;
00064   const unsigned char *zName;
00065   int nName;
00066 
00067   assert( argc==1 || argc==2 );
00068 
00069   pHash = (fts2Hash *)sqlite3_user_data(context);
00070 
00071   zName = sqlite3_value_text(argv[0]);
00072   nName = sqlite3_value_bytes(argv[0])+1;
00073 
00074   if( argc==2 ){
00075     void *pOld;
00076     int n = sqlite3_value_bytes(argv[1]);
00077     if( n!=sizeof(pPtr) ){
00078       sqlite3_result_error(context, "argument type mismatch", -1);
00079       return;
00080     }
00081     pPtr = *(void **)sqlite3_value_blob(argv[1]);
00082     pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
00083     if( pOld==pPtr ){
00084       sqlite3_result_error(context, "out of memory", -1);
00085       return;
00086     }
00087   }else{
00088     pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
00089     if( !pPtr ){
00090       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
00091       sqlite3_result_error(context, zErr, -1);
00092       sqlite3_free(zErr);
00093       return;
00094     }
00095   }
00096 
00097   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
00098 }
00099 
00100 #ifdef SQLITE_TEST
00101 
00102 #include <tcl.h>
00103 #include <string.h>
00104 
00105 /*
00106 ** Implementation of a special SQL scalar function for testing tokenizers 
00107 ** designed to be used in concert with the Tcl testing framework. This
00108 ** function must be called with two arguments:
00109 **
00110 **   SELECT <function-name>(<key-name>, <input-string>);
00111 **   SELECT <function-name>(<key-name>, <pointer>);
00112 **
00113 ** where <function-name> is the name passed as the second argument
00114 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
00115 ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
00116 **
00117 ** The return value is a string that may be interpreted as a Tcl
00118 ** list. For each token in the <input-string>, three elements are
00119 ** added to the returned list. The first is the token position, the 
00120 ** second is the token text (folded, stemmed, etc.) and the third is the
00121 ** substring of <input-string> associated with the token. For example, 
00122 ** using the built-in "simple" tokenizer:
00123 **
00124 **   SELECT fts_tokenizer_test('simple', 'I don't see how');
00125 **
00126 ** will return the string:
00127 **
00128 **   "{0 i I 1 dont don't 2 see see 3 how how}"
00129 **   
00130 */
00131 static void testFunc(
00132   sqlite3_context *context,
00133   int argc,
00134   sqlite3_value **argv
00135 ){
00136   fts2Hash *pHash;
00137   sqlite3_tokenizer_module *p;
00138   sqlite3_tokenizer *pTokenizer = 0;
00139   sqlite3_tokenizer_cursor *pCsr = 0;
00140 
00141   const char *zErr = 0;
00142 
00143   const char *zName;
00144   int nName;
00145   const char *zInput;
00146   int nInput;
00147 
00148   const char *zArg = 0;
00149 
00150   const char *zToken;
00151   int nToken;
00152   int iStart;
00153   int iEnd;
00154   int iPos;
00155 
00156   Tcl_Obj *pRet;
00157 
00158   assert( argc==2 || argc==3 );
00159 
00160   nName = sqlite3_value_bytes(argv[0]);
00161   zName = (const char *)sqlite3_value_text(argv[0]);
00162   nInput = sqlite3_value_bytes(argv[argc-1]);
00163   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
00164 
00165   if( argc==3 ){
00166     zArg = (const char *)sqlite3_value_text(argv[1]);
00167   }
00168 
00169   pHash = (fts2Hash *)sqlite3_user_data(context);
00170   p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
00171 
00172   if( !p ){
00173     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
00174     sqlite3_result_error(context, zErr, -1);
00175     sqlite3_free(zErr);
00176     return;
00177   }
00178 
00179   pRet = Tcl_NewObj();
00180   Tcl_IncrRefCount(pRet);
00181 
00182   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
00183     zErr = "error in xCreate()";
00184     goto finish;
00185   }
00186   pTokenizer->pModule = p;
00187   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
00188     zErr = "error in xOpen()";
00189     goto finish;
00190   }
00191   pCsr->pTokenizer = pTokenizer;
00192 
00193   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
00194     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
00195     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
00196     zToken = &zInput[iStart];
00197     nToken = iEnd-iStart;
00198     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
00199   }
00200 
00201   if( SQLITE_OK!=p->xClose(pCsr) ){
00202     zErr = "error in xClose()";
00203     goto finish;
00204   }
00205   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
00206     zErr = "error in xDestroy()";
00207     goto finish;
00208   }
00209 
00210 finish:
00211   if( zErr ){
00212     sqlite3_result_error(context, zErr, -1);
00213   }else{
00214     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
00215   }
00216   Tcl_DecrRefCount(pRet);
00217 }
00218 
00219 static
00220 int registerTokenizer(
00221   sqlite3 *db, 
00222   char *zName, 
00223   const sqlite3_tokenizer_module *p
00224 ){
00225   int rc;
00226   sqlite3_stmt *pStmt;
00227   const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
00228 
00229   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
00230   if( rc!=SQLITE_OK ){
00231     return rc;
00232   }
00233 
00234   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
00235   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
00236   sqlite3_step(pStmt);
00237 
00238   return sqlite3_finalize(pStmt);
00239 }
00240 
00241 static
00242 int queryTokenizer(
00243   sqlite3 *db, 
00244   char *zName,  
00245   const sqlite3_tokenizer_module **pp
00246 ){
00247   int rc;
00248   sqlite3_stmt *pStmt;
00249   const char zSql[] = "SELECT fts2_tokenizer(?)";
00250 
00251   *pp = 0;
00252   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
00253   if( rc!=SQLITE_OK ){
00254     return rc;
00255   }
00256 
00257   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
00258   if( SQLITE_ROW==sqlite3_step(pStmt) ){
00259     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
00260       memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
00261     }
00262   }
00263 
00264   return sqlite3_finalize(pStmt);
00265 }
00266 
00267 void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
00268 
00269 /*
00270 ** Implementation of the scalar function fts2_tokenizer_internal_test().
00271 ** This function is used for testing only, it is not included in the
00272 ** build unless SQLITE_TEST is defined.
00273 **
00274 ** The purpose of this is to test that the fts2_tokenizer() function
00275 ** can be used as designed by the C-code in the queryTokenizer and
00276 ** registerTokenizer() functions above. These two functions are repeated
00277 ** in the README.tokenizer file as an example, so it is important to
00278 ** test them.
00279 **
00280 ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
00281 ** function with no arguments. An assert() will fail if a problem is
00282 ** detected. i.e.:
00283 **
00284 **     SELECT fts2_tokenizer_internal_test();
00285 **
00286 */
00287 static void intTestFunc(
00288   sqlite3_context *context,
00289   int argc,
00290   sqlite3_value **argv
00291 ){
00292   int rc;
00293   const sqlite3_tokenizer_module *p1;
00294   const sqlite3_tokenizer_module *p2;
00295   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
00296 
00297   /* Test the query function */
00298   sqlite3Fts2SimpleTokenizerModule(&p1);
00299   rc = queryTokenizer(db, "simple", &p2);
00300   assert( rc==SQLITE_OK );
00301   assert( p1==p2 );
00302   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
00303   assert( rc==SQLITE_ERROR );
00304   assert( p2==0 );
00305   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
00306 
00307   /* Test the storage function */
00308   rc = registerTokenizer(db, "nosuchtokenizer", p1);
00309   assert( rc==SQLITE_OK );
00310   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
00311   assert( rc==SQLITE_OK );
00312   assert( p2==p1 );
00313 
00314   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
00315 }
00316 
00317 #endif
00318 
00319 /*
00320 ** Set up SQL objects in database db used to access the contents of
00321 ** the hash table pointed to by argument pHash. The hash table must
00322 ** been initialised to use string keys, and to take a private copy 
00323 ** of the key when a value is inserted. i.e. by a call similar to:
00324 **
00325 **    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
00326 **
00327 ** This function adds a scalar function (see header comment above
00328 ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
00329 ** defined at compilation time, a temporary virtual table (see header 
00330 ** comment above struct HashTableVtab) to the database schema. Both 
00331 ** provide read/write access to the contents of *pHash.
00332 **
00333 ** The third argument to this function, zName, is used as the name
00334 ** of both the scalar and, if created, the virtual table.
00335 */
00336 int sqlite3Fts2InitHashTable(
00337   sqlite3 *db, 
00338   fts2Hash *pHash, 
00339   const char *zName
00340 ){
00341   int rc = SQLITE_OK;
00342   void *p = (void *)pHash;
00343   const int any = SQLITE_ANY;
00344   char *zTest = 0;
00345   char *zTest2 = 0;
00346 
00347 #ifdef SQLITE_TEST
00348   void *pdb = (void *)db;
00349   zTest = sqlite3_mprintf("%s_test", zName);
00350   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
00351   if( !zTest || !zTest2 ){
00352     rc = SQLITE_NOMEM;
00353   }
00354 #endif
00355 
00356   if( rc!=SQLITE_OK
00357    || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
00358    || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
00359 #ifdef SQLITE_TEST
00360    || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
00361    || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
00362    || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
00363 #endif
00364   );
00365 
00366   sqlite3_free(zTest);
00367   sqlite3_free(zTest2);
00368   return rc;
00369 }
00370 
00371 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1