CLucene - a full-featured C++ search engine
API Documentation
00001 /*------------------------------------------------------------------------------ 00002 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team 00003 * 00004 * Distributable under the terms of either the Apache License (Version 2.0) or 00005 * the GNU Lesser General Public License, as specified in the COPYING file. 00006 ------------------------------------------------------------------------------*/ 00007 #ifndef _lucene_analysis_standard_StandardTokenizer 00008 #define _lucene_analysis_standard_StandardTokenizer 00009 00010 00011 #include "../AnalysisHeader.h" //required for Tokenizer 00012 #include "StandardTokenizerConstants.h" 00013 CL_CLASS_DEF(analysis,Token) 00014 CL_CLASS_DEF(util,Reader) 00015 CL_CLASS_DEF(util,StringBuffer) 00016 CL_CLASS_DEF(util,FastCharStream) 00017 00018 CL_NS_DEF2(analysis,standard) 00019 00036 class StandardTokenizer: public Tokenizer { 00037 private: 00038 int32_t rdPos; 00039 int32_t tokenStart; 00040 00041 // Advance by one character, incrementing rdPos and returning the character. 00042 int readChar(); 00043 // Retreat by one character, decrementing rdPos. 00044 void unReadChar(); 00045 00046 // createToken centralizes token creation for auditing purposes. 00047 //Token* createToken(CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode); 00048 inline bool setToken(Token* t, CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode); 00049 00050 bool ReadDotted(CL_NS(util)::StringBuffer* str, TokenTypes forcedType,Token* t); 00051 00052 public: 00053 CL_NS(util)::FastCharStream* rd; 00054 00055 // Constructs a tokenizer for this Reader. 00056 StandardTokenizer(CL_NS(util)::Reader* reader); 00057 00058 ~StandardTokenizer(); 00059 00063 bool next(Token* token); 00064 00065 // Reads for number like "1"/"1234.567", or IP address like "192.168.1.2". 
00066 bool ReadNumber(const TCHAR* previousNumber, const TCHAR prev, Token* t); 00067 00068 bool ReadAlphaNum(const TCHAR prev, Token* t); 00069 00070 // Reads for apostrophe-containing word. 00071 bool ReadApostrophe(CL_NS(util)::StringBuffer* str, Token* t); 00072 00073 // Reads for something@... it may be a COMPANY name or a EMAIL address 00074 bool ReadAt(CL_NS(util)::StringBuffer* str, Token* t); 00075 00076 // Reads for COMPANY name like AT&T. 00077 bool ReadCompany(CL_NS(util)::StringBuffer* str, Token* t); 00078 00079 // Reads CJK characters 00080 bool ReadCJK(const TCHAR prev, Token* t); 00081 }; 00082 00083 CL_NS_END2 00084 #endif