CLucene - a full-featured C++ search engine
API Documentation
00001 /*------------------------------------------------------------------------------ 00002 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team 00003 * 00004 * Distributable under the terms of either the Apache License (Version 2.0) or 00005 * the GNU Lesser General Public License, as specified in the COPYING file. 00006 ------------------------------------------------------------------------------*/ 00007 #ifndef _lucene_analysis_Analyzers_ 00008 #define _lucene_analysis_Analyzers_ 00009 00010 #if defined(_LUCENE_PRAGMA_ONCE) 00011 # pragma once 00012 #endif 00013 00014 #include "CLucene/util/Reader.h" 00015 #include "AnalysisHeader.h" 00016 #include "CLucene/util/Misc.h" 00017 #include "CLucene/util/VoidMapSetDefinitions.h" 00018 00019 CL_NS_DEF(analysis) 00020 00021 00022 class CharTokenizer:public Tokenizer { 00023 private: 00024 int32_t offset, bufferIndex, dataLen; 00025 TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; 00026 const TCHAR* ioBuffer; 00027 protected: 00028 00033 virtual bool isTokenChar(const TCHAR c) const = 0; 00034 00038 virtual TCHAR normalize(const TCHAR c) const; 00039 00040 public: 00041 CharTokenizer(CL_NS(util)::Reader* in); 00042 virtual ~CharTokenizer(); 00043 bool next(Token* token); 00044 }; 00045 00046 00053 class LetterTokenizer:public CharTokenizer { 00054 public: 00055 // Construct a new LetterTokenizer. 
00056 LetterTokenizer(CL_NS(util)::Reader* in); 00057 virtual ~LetterTokenizer(); 00058 protected: 00060 bool isTokenChar(const TCHAR c) const; 00061 }; 00062 00063 00064 00075 class LowerCaseTokenizer:public LetterTokenizer { 00076 public: 00078 LowerCaseTokenizer(CL_NS(util)::Reader* in); 00079 virtual ~LowerCaseTokenizer(); 00080 protected: 00082 TCHAR normalize(const TCHAR chr) const; 00083 }; 00084 00085 00088 class WhitespaceTokenizer: public CharTokenizer { 00089 public: 00091 WhitespaceTokenizer(CL_NS(util)::Reader* in); 00092 virtual ~WhitespaceTokenizer(); 00093 protected: 00096 bool isTokenChar(const TCHAR c) const; 00097 }; 00098 00099 00101 class WhitespaceAnalyzer: public Analyzer { 00102 public: 00103 WhitespaceAnalyzer(); 00104 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00105 virtual ~WhitespaceAnalyzer(); 00106 }; 00107 00109 class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer { 00110 public: 00111 SimpleAnalyzer(); 00112 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00113 virtual ~SimpleAnalyzer(); 00114 }; 00115 00116 00117 00121 class LowerCaseFilter: public TokenFilter { 00122 public: 00123 LowerCaseFilter(TokenStream* in, bool deleteTokenStream); 00124 virtual ~LowerCaseFilter(); 00125 bool next(Token* token); 00126 }; 00127 00128 00132 class StopFilter: public TokenFilter { 00133 private: 00134 //bvk: i found this to work faster with a non-hash table. the number of items 00135 //in the stop table is not like to make it worth having hashing. 00136 //ish: implement a radix/patricia tree for this? 00137 CLTCSetList* stopWords; 00138 bool deleteStopTable; 00139 00140 bool enablePositionIncrements; 00141 const bool ignoreCase; 00142 public: 00143 static bool ENABLE_POSITION_INCREMENTS_DEFAULT; 00144 00145 // Constructs a filter which removes words from the input 00146 // TokenStream that are named in the array of words. 
00147 StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** _stopWords, const bool _ignoreCase = false); 00148 00149 virtual ~StopFilter(); 00150 00154 StopFilter(TokenStream* in, bool deleteTokenStream, CLTCSetList* stopTable, bool _deleteStopTable=false); 00155 00162 static void fillStopTable(CLTCSetList* stopTable, 00163 const TCHAR** stopWords, const bool _ignoreCase = false); 00164 00168 bool next(Token* token); 00169 00170 00174 static bool getEnablePositionIncrementsDefault(); 00175 00187 static void setEnablePositionIncrementsDefault(const bool defaultValue); 00188 00192 bool getEnablePositionIncrements() const; 00193 00202 void setEnablePositionIncrements(const bool enable); 00203 00204 }; 00205 00210 class WordlistLoader { 00211 public: 00221 static CLTCSetList* getWordSet(const char* wordfilePath, const char* enc = NULL, CLTCSetList* stopTable = NULL); 00222 00232 static CLTCSetList* getWordSet(CL_NS(util)::Reader* reader, CLTCSetList* stopTable = NULL, const bool bDeleteReader = false); 00233 }; 00234 00235 00237 class StopAnalyzer: public Analyzer { 00238 CLTCSetList* stopTable; 00239 00240 public: 00242 StopAnalyzer(); 00243 virtual ~StopAnalyzer(); 00244 00246 StopAnalyzer( const TCHAR** stopWords ); 00247 00251 StopAnalyzer(const char* stopwordsFile, const char* enc = NULL); 00252 00256 StopAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool _bDeleteReader = false); 00257 00259 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00260 00263 static const TCHAR* ENGLISH_STOP_WORDS[]; 00264 }; 00265 00266 00267 00288 class PerFieldAnalyzerWrapper : public Analyzer { 00289 private: 00290 Analyzer* defaultAnalyzer; 00291 00292 typedef CL_NS(util)::CLHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar, 00293 CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > AnalyzerMapType; 00294 AnalyzerMapType* analyzerMap; 00295 public: 00302 
PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer); 00303 virtual ~PerFieldAnalyzerWrapper(); 00304 00311 void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer); 00312 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00313 }; 00314 00315 00323 class ISOLatin1AccentFilter: public TokenFilter { 00324 public: 00325 ISOLatin1AccentFilter(TokenStream* input, bool deleteTs); 00326 00330 bool next(Token* token); 00331 00332 virtual ~ISOLatin1AccentFilter(); 00333 }; 00334 00335 00339 class KeywordTokenizer: public Tokenizer { 00340 private: 00341 LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256); 00342 bool done; 00343 int bufferSize; 00344 public: 00345 KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1); 00346 virtual ~KeywordTokenizer(); 00347 bool next(Token* token); 00348 }; 00349 00354 class KeywordAnalyzer: public Analyzer { 00355 public: 00356 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); 00357 virtual ~KeywordAnalyzer(); 00358 }; 00359 00360 00365 class LengthFilter: public TokenFilter { 00366 private: 00367 size_t _min; 00368 size_t _max; 00369 public: 00374 LengthFilter(TokenStream* in, const size_t _min, const size_t _max); 00375 00379 bool next(Token* token); 00380 }; 00381 00382 00383 CL_NS_END 00384 #endif