CLucene - a full-featured C++ search engine
API Documentation
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_Config_
#define _lucene_Config_


////////////////////////////////////////////////////////////////////
//   These settings should be set up in the compiler,
//   but are put here for reference as to what could be defined.
////////////////////////////////////////////////////////////////////
//
//define this if you want debugging code to be enabled
//#define _DEBUG
//
//define this if you want condition debugging to be enabled
#if defined(_DEBUG) && !defined(_CL__CND_DEBUG)
//#define _CL__CND_DEBUG
#endif
//
//define this to print out lots of information about merges, etc.
//requires _CL__CND_DEBUG to be defined
//#define _CL_DEBUG_INFO stdout
//
//to disable namespaces, define this
//#define DISABLE_NAMESPACE
//
//disable hashmap/set usage; just use map and set.
//this has been shown to be quicker than the hash equivalents in some implementations
#ifndef LUCENE_DISABLE_HASHING
#define LUCENE_DISABLE_HASHING
#endif
//
////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////
//   These options can be set depending on the particular needs of
//   your application.
////////////////////////////////////////////////////////////////////
//
//define this to force the build into ASCII mode
//#define _ASCII
//
//define this to force the build into UCS2 mode
//#define _UCS2
//
//if a wide character is being converted to an ASCII character and it
//cannot fit, this character is used instead. Required.
#define LUCENE_OOR_CHAR(c) ((char)(((unsigned short)c)&0xFF))
//
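// Illustrative note (not part of the original header): LUCENE_OOR_CHAR keeps only
// the low byte of the offending wide character. For example:
//   LUCENE_OOR_CHAR(0x0041) yields 0x41 ('A')  - fits in ASCII, unchanged
//   LUCENE_OOR_CHAR(0x0152) yields 0x52 ('R')  - U+0152 does not fit, low byte kept
// Distinct wide characters that share a low byte therefore map to the same
// narrow character.
//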
//define if you would like to force CLucene to use the internal
//character functions.
//Tests may display unpredictable behaviour if this is not defined.
#define LUCENE_USE_INTERNAL_CHAR_FUNCTIONS
//
//define this to enable mmap support in the FSDirectory IndexInput
//todo: only available for Windows so far... need to add MMapInput.cpp to the project
//EXPERIMENTAL
//#define LUCENE_FS_MMAP
//
//LOCK_DIR implementation:
//define this to set an exact directory for the lock dir (not recommended);
//all other methods of getting the temporary directory will be ignored
//#define LUCENE_LOCK_DIR "/tmp"
//
//define this to try to load the lock dir from this environment variable
#define LUCENE_LOCK_DIR_ENV_1 "TEMP"
//define this if you want to look up this environment variable if the first one fails
#define LUCENE_LOCK_DIR_ENV_2 "TMP"
//define this if you want a fallback directory; if not defined, then
//the lock directory will be the index directory
#define LUCENE_LOCK_DIR_ENV_FALLBACK "/tmp"
//
////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////
//   The following are search query options.
//   The NO_* options can make CLucene faster and/or smaller;
//   special queries sometimes require longer search times or may
//   not be required.
////////////////////////////////////////////////////////////////////
//
//Define this to remove fuzzy queries and sloppy scoring
//#define NO_FUZZY_QUERY
//
//Define to remove wildcard queries (t*m or te?m matching term)
//#define NO_WILDCARD_QUERY
//
//Define to remove prefix term queries (ter* matching term or terms)
//#define NO_PREFIX_QUERY
//
//Define to remove range queries (exclusive and inclusive)
//#define NO_RANGE_QUERY
//
//These must always be defined, but they can be adjusted if required.
//The general wildcard string is '*' and the wildcard char is '?'.
//Both are required.
#define LUCENE_WILDCARDTERMENUM_WILDCARD_STRING '*'
#define LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR '?'
//
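// Illustrative example (not part of the original header): like the settings at
// the top of this file, the NO_* options above are normally supplied to the
// compiler rather than edited here; for example, with GCC:
//   g++ -DNO_FUZZY_QUERY -DNO_WILDCARD_QUERY -c ...
// would build CLucene without fuzzy and wildcard query support.
//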
////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////
//   memory handling configurations
////////////////////////////////////////////////////////////////////
//
/***
* If this is defined, CLucene's configurations are changed
* to use less memory, but it may run slower.
* todo: I don't think this actually changes speed much, just memory
*/
#define LUCENE_OPTIMIZE_FOR_MEMORY

//
//define this if you want to enable reference counting. This is
//not necessary or useful in most cases except when implementing wrappers
//which have reference counting. If a wrapper wraps a StringReader,
//for example, it expects that the wrapped StringReader will not
//be deleted. However, when the StringReader is added to a Field,
//the Field usually takes ownership of the StringReader and deletes it on completion.
//If reference counting is enabled, the wrapper can add a reference to any class,
//and when _CLDECDELETE is called, the reference is decremented and the object is
//only deleted if the refcount is zero.
//#define LUCENE_ENABLE_REFCOUNT


////////////////////////////////////////////////////////////////////
//   These options allow you to remove certain implementations
//   from CLucene so that they can be implemented in the client
//   application.
////////////////////////////////////////////////////////////////////
//
//define this to your own setting if you would like to implement your own
//thread locking code. It should have the same sort of functions as
//mutex_default. If not defined, CLucene will try to use POSIX mutexes, Win32
//critical sections, or a timer-based mutex hack.
//#define _LUCENE_THREADMUTEX CL_NS(util)::mutex_default
//
//define this if you want to implement the _Cnd_OutDebug routine yourself;
//you can then easily customise in your own application how debug messages are handled
//#define _CND_DEBUG_DONTIMPLEMENT_OUTDEBUG
//
//define this if you want to implement your own namespace macros
//#define _LUCENE_DONTIMPLEMENT_NS_MACROS
//
//define this if you do not want CLucene to include any standard libraries;
//this could be useful if you want to use alternate libraries
//#define LUCENE_DISABLE_INCLUDES
//
////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////
//   These options will be changed depending on your compiler/platform,
//   but can also be changed here if required.
////////////////////////////////////////////////////////////////////
//
//if you want to define your own default file encoding, specify it
//here - normally defined in the platform-specific headers
//#define PLATFORM_DEFAULT_READER_ENCODING CL_NS(util)::FileReader::ENCODING_ASCII
//
////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////
//   These options should not be changed, but you can experiment with
//   them to optimize performance.
////////////////////////////////////////////////////////////////////
//
//some defaults, wouldn't usually need to be changed
//Buffer size for input/output streams. Required.
#define LUCENE_STREAM_BUFFER_SIZE 1024
//
// DSR:2004.08.19:
// Formerly, StringBuffer used 1024 as the default size of its internal buffer.
// However, StringBuffer is used primarily for token- and term-oriented
// processing, e.g. in StandardTokenizer. I've calculated that the average
// token (as produced by StandardTokenizer) in all .txt files distributed in
// the Project Gutenberg CD Image (August 2003 release) has only 6 characters.
// Although most languages are likely to have a longer average word length than
// English due to the popularity of "non-atomized" conjugation and declension
// mechanisms, 1024 is still vastly excessive.
// I made two changes intended to deliver better overall performance:
//   a) Switched to a default StringBuffer character capacity of 32. Though 32
//      is longer than the average token, the high cost of realloc makes a
//      slightly liberal default size optimal. I chose the default size of 32
//      after fairly extensive experimentation on the Gutenberg e-texts. The
//      results are summarized in the following table:
//      ------------------------------------------------------------------------
//      LUCENE_DEFAULT_TOKEN_BUFFER_SIZE value | % faster than default size 1024
//      ------------------------------------------------------------------------
//                 8   :   4%
//                16   :   7%
//                32   :   6%
//                64   :   3%
//      A default size of 32 is actually slightly slower than 16, but I was
//      experimenting on English text; I expect that 32 will maintain decent
//      performance in languages such as German, and in technical documents
//      with long tokens.
//
//   b) To offset the switch to a smaller default buffer size, I implemented a
//      more aggressive growth strategy. A StringBuffer now [at least] doubles
//      the size of its internal buffer every time it needs to grow, rather
//      than [at least] increasing by LUCENE_DEFAULT_TOKEN_BUFFER_SIZE no
//      matter how many times it has already grown.
//Required.
#define LUCENE_DEFAULT_TOKEN_BUFFER_SIZE 32
//todo: should implement a similar strategy in analysis/token
//
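// Illustrative note (not part of the original header) on the growth strategy
// described above: with a 32-character starting capacity that at least doubles
// on each reallocation, appending a 300-character token costs only four
// reallocs (32 -> 64 -> 128 -> 256 -> 512), whereas growing by a fixed 32
// characters each time would cost nine (32 -> 64 -> 96 -> ... -> 320).
//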
//Expert: The fraction of {@link TermDocs} entries stored in skip tables,
//used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
//smaller indices, greater acceleration, but fewer accelerable cases, while
//smaller values result in bigger indices, less acceleration and more
//accelerable cases. More detailed experiments would be useful here.
#define LUCENE_DEFAULT_TERMDOCS_SKIP_INTERVAL 16
//
//Size of the TermScore cache. Required.
#define LUCENE_SCORE_CACHE_SIZE 32
//
//analysis options
//maximum length that the CharTokenizer uses. Required.
//By adjusting this value, you can greatly improve the performance of searching
//and especially indexing. The default is 255, but smaller numbers will decrease
//the amount of memory used as well as increase the speed.
#define LUCENE_MAX_WORD_LEN 255
//Maximum length of a token word.
//Should be the same as or more than LUCENE_MAX_WORD_LEN.
//If not defined, then there is no token limit, but it may be slower;
//if defined, it will be faster (up to 15% in some cases), but will use more memory.
#ifndef LUCENE_OPTIMIZE_FOR_MEMORY
#define LUCENE_TOKEN_WORD_LENGTH LUCENE_MAX_WORD_LEN
#endif
//
//maximum field length. Some optimisation can be done if a maximum field
//length is given... The smaller the better.
#define LUCENE_MAX_FIELD_LEN 100
//
//The initial value set to BooleanQuery::maxClauseCount. Default is 1024.
#define LUCENE_BOOLEANQUERY_MAXCLAUSECOUNT 1024
//
//bvk: 12.3.2005
//==============================================================================
//The tokenizer was previously changed to optionally use a fixed word length.
//I have implemented this in the Term class as well. It seems that predefining
//the text length instead of using new TCHAR[x] in the constructor greatly
//improves performance (by 20-30% for certain operations).
//Maximum length of a term text.
//Should be the same as or more than LUCENE_MAX_WORD_LEN.
//If not defined, then there is no term text limit, but it may be slower;
//if defined, it will be faster (up to 30% in some cases), but will use more memory.
#ifndef LUCENE_OPTIMIZE_FOR_MEMORY
#define LUCENE_TERM_TEXT_LENGTH LUCENE_MAX_WORD_LEN
#endif
//
//Size of the CharTokenizer buffer. Required.
#define LUCENE_IO_BUFFER_SIZE 1024
//
//the minimum amount the segment term enum should grow by. Must be at least 1.
#define LUCENE_SEGMENTTERMENUM_GROWSIZE 8
//
////////////////////////////////////////////////////////////////////

#endif
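As a quick orientation aid, here is a minimal sketch (not part of the CLucene sources) of how client code might report which of the options above ended up active in a given build. The include path "CLucene/CLConfig.h" is an assumption; adjust it to your source layout.

// Prints a few of the configuration values that CLConfig.h leaves defined.
#include "CLucene/CLConfig.h"   // path assumed; adjust to your include layout
#include <cstdio>

void printCLuceneBuildConfig() {
#ifdef _UCS2
    std::printf("character mode          : UCS2 (wide characters)\n");
#elif defined(_ASCII)
    std::printf("character mode          : ASCII\n");
#else
    std::printf("character mode          : platform default\n");
#endif
#ifdef LUCENE_TERM_TEXT_LENGTH
    std::printf("fixed term text length  : %d\n", (int)LUCENE_TERM_TEXT_LENGTH);
#else
    std::printf("fixed term text length  : not fixed (LUCENE_OPTIMIZE_FOR_MEMORY)\n");
#endif
    std::printf("stream buffer size      : %d\n", (int)LUCENE_STREAM_BUFFER_SIZE);
    std::printf("token buffer size       : %d\n", (int)LUCENE_DEFAULT_TOKEN_BUFFER_SIZE);
    std::printf("max word length         : %d\n", (int)LUCENE_MAX_WORD_LEN);
    std::printf("BooleanQuery max clauses: %d\n", (int)LUCENE_BOOLEANQUERY_MAXCLAUSECOUNT);
}

int main() {
    printCLuceneBuildConfig();
    return 0;
}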