fts2_tokenizer.h

Go to the documentation of this file.
00001 /*
00002 ** 2006 July 10
00003 **
00004 ** The author disclaims copyright to this source code.
00005 **
00006 *************************************************************************
00007 ** Defines the interface to tokenizers used by fulltext-search.  There
00008 ** are three basic components:
00009 **
00010 ** sqlite3_tokenizer_module is a singleton defining the tokenizer
00011 ** interface functions.  This is essentially the class structure for
00012 ** tokenizers.
00013 **
00014 ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
00015 ** including customization information defined at creation time.
00016 **
00017 ** sqlite3_tokenizer_cursor is created by a tokenizer to produce
00018 ** tokens from a particular input.
00019 */
00020 #ifndef _FTS2_TOKENIZER_H_
00021 #define _FTS2_TOKENIZER_H_
00022 
00023 /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
00024 ** If tokenizers are to be allowed to call sqlite3_*() functions, then
00025 ** we will need a way to register the API consistently.
00026 */
00027 #include "sqlite3.h"
00028 
00029 /*
00030 ** Structures used by the tokenizer interface. When a new tokenizer
00031 ** implementation is registered, the caller provides a pointer to
00032 ** an sqlite3_tokenizer_module containing pointers to the callback
00033 ** functions that make up an implementation.
00034 **
00035 ** When an fts2 table is created, it passes any arguments passed to
00036 ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
00037 ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
00038 ** implementation. The xCreate() function in turn returns an 
00039 ** sqlite3_tokenizer structure representing the specific tokenizer to
00040 ** be used for the fts2 table (customized by the tokenizer clause arguments).
00041 **
00042 ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
00043 ** method is called. It returns an sqlite3_tokenizer_cursor object
00044 ** that may be used to tokenize a specific input buffer based on
00045 ** the tokenization rules supplied by a specific sqlite3_tokenizer
00046 ** object.
00047 */
00048 typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;  /* Callback table for a tokenizer implementation */
00049 typedef struct sqlite3_tokenizer sqlite3_tokenizer;                /* One configured tokenizer, produced by xCreate() */
00050 typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;  /* Tokenizes one input buffer, produced by xOpen() */
00051 
00052 struct sqlite3_tokenizer_module {
00053   /* Table of callback pointers that make up one tokenizer implementation. */
00054   /*
00055   ** Structure version. Should always be set to 0.
00056   */
00057   int iVersion;
00058 
00059   /*
00060   ** Create a new tokenizer. The values in the argv[] array are the
00061   ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
00062   ** TABLE statement that created the fts2 table. For example, if
00063   ** the following SQL is executed:
00064   **
00065   **   CREATE .. USING fts2( ... , tokenizer <tokenizer-name> arg1 arg2)
00066   **
00067   ** then argc is set to 2, and the argv[] array contains pointers
00068   ** to the strings "arg1" and "arg2".
00069   **
00070   ** This method should return either SQLITE_OK (0), or an SQLite error
00071   ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
00072   ** to point at the newly created tokenizer structure. The generic
00073   ** sqlite3_tokenizer.pModule variable should not be initialised by
00074   ** this callback. The caller will do so.
00075   */
00076   int (*xCreate)(
00077     int argc,                           /* Size of argv array */
00078     const char *const*argv,             /* Tokenizer argument strings */
00079     sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
00080   );
00081 
00082   /*
00083   ** Destroy an existing tokenizer. The fts2 module calls this method
00084   ** exactly once for each successful call to xCreate().
00085   */
00086   int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
00087 
00088   /*
00089   ** Create a tokenizer cursor to tokenize the input buffer pInput. The
00090   ** caller must keep that buffer valid until the cursor is closed with
00091   ** the xClose() method. The new cursor is returned through *ppCursor.
00092   */
00093   int (*xOpen)(
00094     sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
00095     const char *pInput, int nBytes,      /* Input buffer */
00096     sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
00097   );
00098 
00099   /*
00100   ** Destroy an existing tokenizer cursor. The fts2 module calls this
00101   ** method exactly once for each successful call to xOpen().
00102   */
00103   int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
00104 
00105   /*
00106   ** Retrieve the next token from the tokenizer cursor pCursor. This
00107   ** method should either return SQLITE_OK and set the values of the
00108   ** "OUT" variables identified below, or SQLITE_DONE to indicate that
00109   ** the end of the buffer has been reached, or an SQLite error code.
00110   **
00111   ** *ppToken should be set to point at a buffer containing the
00112   ** normalized version of the token (i.e. after any case-folding and/or
00113   ** stemming has been performed). *pnBytes should be set to the length
00114   ** of this buffer in bytes. The input text that generated the token is
00115   ** identified by the byte offsets returned in *piStartOffset and
00116   ** *piEndOffset.
00117   **
00118   ** The buffer *ppToken is set to point at is managed by the tokenizer
00119   ** implementation. It is only required to be valid until the next call
00120   ** to xNext() or xClose().
00121   */
00122   /* TODO(shess) current implementation requires the pInput buffer
00123   ** given to xOpen() to be nul-terminated.  This should either be fixed,
00124   ** or pInput/nBytes should be converted to zInput.
00125   */
00126   int (*xNext)(
00127     sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
00128     const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
00129     int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
00130     int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
00131     int *piPosition      /* OUT: Number of tokens returned before this one */
00132   );
00133 };
00134 
00135 struct sqlite3_tokenizer {  /* Generic part of a tokenizer object created by xCreate() */
00136   const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer; set by the caller of xCreate(), not by xCreate() itself */
00137   /* Tokenizer implementations will typically add additional fields */
00138 };
00139 
00140 struct sqlite3_tokenizer_cursor {  /* Generic part of a cursor object created by xOpen() */
00141   sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. NOTE(review): not stated here whether xOpen() or its caller sets this - confirm */
00142   /* Tokenizer implementations will typically add additional fields */
00143 };
00144 
00145 #endif /* _FTS2_TOKENIZER_H_ */

ContextLogger2—ContextLogger2 Logger Daemon Internals—Generated on Mon May 2 13:49:53 2011 by Doxygen 1.6.1