CLucene - a full-featured C++ search engine
API Documentation
00001 /*------------------------------------------------------------------------------ 00002 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team 00003 * 00004 * Distributable under the terms of either the Apache License (Version 2.0) or 00005 * the GNU Lesser General Public License, as specified in the COPYING file. 00006 ------------------------------------------------------------------------------*/ 00007 #ifndef _lucene_analysis_Analyzers_ 00008 #define _lucene_analysis_Analyzers_ 00009 00010 #if defined(_LUCENE_PRAGMA_ONCE) 00011 # pragma once 00012 #endif 00013 00014 #include "CLucene/util/Reader.h" 00015 #include "AnalysisHeader.h" 00016 #include "CLucene/util/Misc.h" 00017 #include "CLucene/util/VoidMapSetDefinitions.h" 00018 00019 CL_NS_DEF(analysis) 00020 00021 00022 class CharTokenizer:public Tokenizer { 00023 private: 00024 int32_t offset, bufferIndex, dataLen; 00025 TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; 00026 const TCHAR* ioBuffer; 00027 protected: 00028 00033 virtual bool isTokenChar(const TCHAR c) const = 0; 00034 00038 virtual TCHAR normalize(const TCHAR c) const; 00039 00040 public: 00041 CharTokenizer(CL_NS(util)::Reader* in); 00042 virtual ~CharTokenizer(); 00043 bool next(Token* token); 00044 }; 00045 00046 00053 class LetterTokenizer:public CharTokenizer { 00054 public: 00055 // Construct a new LetterTokenizer. 
00056 LetterTokenizer(CL_NS(util)::Reader* in); 00057 virtual ~LetterTokenizer(); 00058 protected: 00060 bool isTokenChar(const TCHAR c) const; 00061 }; 00062 00063 00064 00075 class LowerCaseTokenizer:public LetterTokenizer { 00076 public: 00078 LowerCaseTokenizer(CL_NS(util)::Reader* in); 00079 virtual ~LowerCaseTokenizer(); 00080 protected: 00082 TCHAR normalize(const TCHAR chr) const; 00083 }; 00084 00085 00088 class WhitespaceTokenizer: public CharTokenizer { 00089 public: 00091 WhitespaceTokenizer(CL_NS(util)::Reader* in); 00092 virtual ~WhitespaceTokenizer(); 00093 protected: 00096 bool isTokenChar(const TCHAR c) const; 00097 }; 00098 00099 00101 class WhitespaceAnalyzer: public Analyzer { 00102 public: 00103 WhitespaceAnalyzer(); 00104 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00105 virtual ~WhitespaceAnalyzer(); 00106 }; 00107 00109 class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer { 00110 public: 00111 SimpleAnalyzer(); 00112 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00113 virtual ~SimpleAnalyzer(); 00114 }; 00115 00116 00117 00121 class LowerCaseFilter: public TokenFilter { 00122 public: 00123 LowerCaseFilter(TokenStream* in, bool deleteTokenStream); 00124 virtual ~LowerCaseFilter(); 00125 bool next(Token* token); 00126 }; 00127 00128 00132 class StopFilter: public TokenFilter { 00133 private: 00134 //bvk: i found this to work faster with a non-hash table. the number of items 00135 //in the stop table is not like to make it worth having hashing. 00136 //ish: implement a radix/patricia tree for this? 00137 CLTCSetList* stopWords; 00138 bool deleteStopTable; 00139 00140 bool enablePositionIncrements; 00141 const bool ignoreCase; 00142 public: 00143 static bool ENABLE_POSITION_INCREMENTS_DEFAULT; 00144 00145 // Constructs a filter which removes words from the input 00146 // TokenStream that are named in the array of words. 
00147 StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** _stopWords, const bool _ignoreCase = false); 00148 00149 virtual ~StopFilter(); 00150 00154 StopFilter(TokenStream* in, bool deleteTokenStream, CLTCSetList* stopTable, bool _deleteStopTable=false); 00155 00162 static void fillStopTable(CLTCSetList* stopTable, 00163 const TCHAR** stopWords, const bool _ignoreCase = false); 00164 00168 bool next(Token* token); 00169 00170 00174 static bool getEnablePositionIncrementsDefault(); 00175 00187 static void setEnablePositionIncrementsDefault(const bool defaultValue); 00188 00192 bool getEnablePositionIncrements() const; 00193 00202 void setEnablePositionIncrements(const bool enable); 00203 00204 }; 00205 00210 class WordlistLoader { 00211 public: 00221 static CLTCSetList* getWordSet(const char* wordfilePath, const char* enc = NULL, CLTCSetList* stopTable = NULL); 00222 00232 static CLTCSetList* getWordSet(CL_NS(util)::Reader* reader, CLTCSetList* stopTable = NULL, const bool bDeleteReader = false); 00233 }; 00234 00235 00237 class StopAnalyzer: public Analyzer { 00238 CLTCSetList* stopTable; 00239 00240 public: 00242 StopAnalyzer(); 00243 virtual ~StopAnalyzer(); 00244 00246 StopAnalyzer( const TCHAR** stopWords ); 00247 00251 StopAnalyzer(const char* stopwordsFile, const char* enc = NULL); 00252 00256 StopAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool _bDeleteReader = false); 00257 00259 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00260 00263 static const TCHAR* ENGLISH_STOP_WORDS[]; 00264 }; 00265 00266 00267 00288 class PerFieldAnalyzerWrapper : public Analyzer { 00289 private: 00290 Analyzer* defaultAnalyzer; 00291 00292 typedef CL_NS(util)::CLHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar, 00293 CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > AnalyzerMapType; 00294 AnalyzerMapType* analyzerMap; 00295 public: 00302 
PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer); 00303 virtual ~PerFieldAnalyzerWrapper(); 00304 00311 void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer); 00312 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00313 }; 00314 00315 00323 class ISOLatin1AccentFilter: public TokenFilter { 00324 public: 00325 ISOLatin1AccentFilter(TokenStream* input, bool deleteTs); 00326 00330 bool next(Token* token); 00331 00332 virtual ~ISOLatin1AccentFilter(); 00333 }; 00334 00335 00339 class KeywordTokenizer: public Tokenizer { 00340 private: 00341 LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256); 00342 bool done; 00343 int bufferSize; 00344 public: 00345 KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1); 00346 virtual ~KeywordTokenizer(); 00347 bool next(Token* token); 00348 }; 00349 00354 class KeywordAnalyzer: public Analyzer { 00355 public: 00356 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00357 virtual ~KeywordAnalyzer(); 00358 }; 00359 00360 00365 class LengthFilter: public TokenFilter { 00366 private: 00367 size_t _min; 00368 size_t _max; 00369 public: 00374 LengthFilter(TokenStream* in, const size_t _min, const size_t _max); 00375 00379 bool next(Token* token); 00380 }; 00381 00382 00383 CL_NS_END 00384 #endif