CLucene - a full-featured C++ search engine
API Documentation
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_Config_
#define _lucene_Config_


////////////////////////////////////////////////////////////////////
//   These settings should be set up in the compiler,
//   but are put here for reference as to what could be defined.
////////////////////////////////////////////////////////////////////
//
//define this if you want debugging code to be enabled
//#define _DEBUG
//
//define this if you want condition debugging to be enabled
#if defined(_DEBUG) && !defined(_CL__CND_DEBUG)
//#define _CL__CND_DEBUG
#endif
//
//define this to print out lots of information about merges, etc.
//requires _CL__CND_DEBUG to be defined
//#define _CL_DEBUG_INFO stdout
//
//to disable namespaces, define this
//#define DISABLE_NAMESPACE
//
//disable hashmap/set usage; just use map and set.
//this has been shown to be quicker than the hash equivalents in some implementations
#ifndef LUCENE_DISABLE_HASHING
#define LUCENE_DISABLE_HASHING
#endif
//
////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////
//   These options can be set depending on the particular needs of
//   your application.
////////////////////////////////////////////////////////////////////
//
//define this to force the build into ASCII mode
//#define _ASCII
//
//define this to force the build into UCS2 mode
//#define _UCS2
//
//if a wide character is being converted to an ASCII character and it
//cannot fit, this character is used instead. Required.
#define LUCENE_OOR_CHAR(c) ((char)(((unsigned short)c)&0xFF))
//
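// Illustrative note (not part of the original header): LUCENE_OOR_CHAR keeps only
// the low byte of the offending wide character. For example:
//   LUCENE_OOR_CHAR(0x0041) yields 0x41 ('A')  - fits in ASCII, unchanged
//   LUCENE_OOR_CHAR(0x0152) yields 0x52 ('R')  - U+0152 does not fit, low byte kept
// Distinct wide characters that share a low byte therefore map to the same
// narrow character.
//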
//define if you would like to force CLucene to use the internal
//character functions.
//Tests may display unpredictable behaviour if this is not defined.
#define LUCENE_USE_INTERNAL_CHAR_FUNCTIONS
//
//define this to enable mmap support in the FSDirectory IndexInput
//todo: only available for Windows so far... need to add MMapInput.cpp to the project
//EXPERIMENTAL
//#define LUCENE_FS_MMAP
//
//LOCK_DIR implementation:
//define this to set an exact directory for the lock dir (not recommended);
//all other methods of getting the temporary directory will be ignored
//#define LUCENE_LOCK_DIR "/tmp"
//
//define this to try to load the lock dir from this environment variable
#define LUCENE_LOCK_DIR_ENV_1 "TEMP"
//define this if you want to look up this environment variable if the first one fails
#define LUCENE_LOCK_DIR_ENV_2 "TMP"
//define this if you want a fallback directory; if not defined, then
//the lock directory will be the index directory
#define LUCENE_LOCK_DIR_ENV_FALLBACK "/tmp"
//
////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////
//   The following are search query options.
//   The NO_* options can make CLucene faster and/or smaller;
//   special queries sometimes require longer search times or may
//   not be required.
////////////////////////////////////////////////////////////////////
//
//Define this to remove fuzzy queries and sloppy scoring
//#define NO_FUZZY_QUERY
//
//Define to remove wildcard queries (t*m or te?m matching term)
//#define NO_WILDCARD_QUERY
//
//Define to remove prefix term queries (ter* matching term or terms)
//#define NO_PREFIX_QUERY
//
//Define to remove range queries (exclusive and inclusive)
//#define NO_RANGE_QUERY
//
//These must always be defined, but they can be adjusted if required.
//The general wildcard string is '*' and the wildcard char is '?'.
//Both are required.
#define LUCENE_WILDCARDTERMENUM_WILDCARD_STRING '*'
#define LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR '?'
//
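// Illustrative example (not part of the original header): like the settings at
// the top of this file, the NO_* options above are normally supplied to the
// compiler rather than edited here; for example, with GCC:
//   g++ -DNO_FUZZY_QUERY -DNO_WILDCARD_QUERY -c ...
// would build CLucene without fuzzy and wildcard query support.
//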
////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////
//   memory handling configurations
////////////////////////////////////////////////////////////////////
//
/***
* If this is defined, CLucene's configurations are changed
* to use less memory, but it may run slower.
* todo: I don't think this actually changes speed much, just memory
*/
#define LUCENE_OPTIMIZE_FOR_MEMORY

//
//define this if you want to enable reference counting. This is
//not necessary or useful in most cases except when implementing wrappers
//which have reference counting. If a wrapper wraps a StringReader,
//for example, it expects that the wrapped StringReader will not
//be deleted. However, when the StringReader is added to a Field,
//the Field usually takes ownership of the StringReader and deletes it on completion.
//If reference counting is enabled, the wrapper can add a reference to any class,
//and when _CLDECDELETE is called, the reference is decremented and the object is
//only deleted if the refcount is zero.
//#define LUCENE_ENABLE_REFCOUNT


////////////////////////////////////////////////////////////////////
//   These options allow you to remove certain implementations
//   from CLucene so that they can be implemented in the client
//   application.
////////////////////////////////////////////////////////////////////
//
//define this to your own setting if you would like to implement your own
//thread locking code. It should have the same sort of functions as
//mutex_default. If not defined, CLucene will try to use POSIX mutexes, Win32
//critical sections, or a timer-based mutex hack.
//#define _LUCENE_THREADMUTEX CL_NS(util)::mutex_default
//
//define this if you want to implement the _Cnd_OutDebug routine yourself;
//you can then easily customise in your own application how debug messages are handled
//#define _CND_DEBUG_DONTIMPLEMENT_OUTDEBUG
//
//define this if you want to implement your own namespace macros
//#define _LUCENE_DONTIMPLEMENT_NS_MACROS
//
//define this if you do not want CLucene to include any standard libraries;
//this could be useful if you want to use alternate libraries
//#define LUCENE_DISABLE_INCLUDES
//
////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////
//   These options will be changed depending on your compiler/platform,
//   but can also be changed here if required.
////////////////////////////////////////////////////////////////////
//
//if you want to define your own default file encoding, specify it
//here - normally defined in the platform-specific headers
//#define PLATFORM_DEFAULT_READER_ENCODING CL_NS(util)::FileReader::ENCODING_ASCII
//
////////////////////////////////////////////////////////////////////



////////////////////////////////////////////////////////////////////
//   These options should not be changed, but you can experiment with
//   them to optimize performance.
////////////////////////////////////////////////////////////////////
//
//some defaults, wouldn't usually need to be changed
//Buffer size for input/output streams. Required.
#define LUCENE_STREAM_BUFFER_SIZE 1024
//
// DSR:2004.08.19:
// Formerly, StringBuffer used 1024 as the default size of its internal buffer.
// However, StringBuffer is used primarily for token- and term-oriented
// processing, e.g. in StandardTokenizer. I've calculated that the average
// token (as produced by StandardTokenizer) in all .txt files distributed in
// the Project Gutenberg CD Image (August 2003 release) has only 6 characters.
// Although most languages are likely to have a longer average word length than
// English due to the popularity of "non-atomized" conjugation and declension
// mechanisms, 1024 is still vastly excessive.
// I made two changes intended to deliver better overall performance:
//   a) Switched to a default StringBuffer character capacity of 32. Though 32
//      is longer than the average token, the high cost of realloc makes a
//      slightly liberal default size optimal. I chose the default size of 32
//      after fairly extensive experimentation on the Gutenberg e-texts. The
//      results are summarized in the following table:
//      ------------------------------------------------------------------------
//      LUCENE_DEFAULT_TOKEN_BUFFER_SIZE value | % faster than default size 1024
//      ------------------------------------------------------------------------
//                 8   :   4%
//                16   :   7%
//                32   :   6%
//                64   :   3%
//      A default size of 32 is actually slightly slower than 16, but I was
//      experimenting on English text; I expect that 32 will maintain decent
//      performance in languages such as German, and in technical documents
//      with long tokens.
//
//   b) To offset the switch to a smaller default buffer size, I implemented a
//      more aggressive growth strategy. A StringBuffer now [at least] doubles
//      the size of its internal buffer every time it needs to grow, rather
//      than [at least] increasing by LUCENE_DEFAULT_TOKEN_BUFFER_SIZE no
//      matter how many times it has already grown.
//Required.
#define LUCENE_DEFAULT_TOKEN_BUFFER_SIZE 32
//todo: should implement a similar strategy in analysis/token
//
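// Illustrative note (not part of the original header) on the growth strategy
// described above: with a 32-character starting capacity that at least doubles
// on each reallocation, appending a 300-character token costs only four
// reallocs (32 -> 64 -> 128 -> 256 -> 512), whereas growing by a fixed 32
// characters each time would cost nine (32 -> 64 -> 96 -> ... -> 320).
//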
//Expert: The fraction of {@link TermDocs} entries stored in skip tables,
//used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
//smaller indices, greater acceleration, but fewer accelerable cases, while
//smaller values result in bigger indices, less acceleration and more
//accelerable cases. More detailed experiments would be useful here.
#define LUCENE_DEFAULT_TERMDOCS_SKIP_INTERVAL 16
//
//Size of the TermScore cache. Required.
#define LUCENE_SCORE_CACHE_SIZE 32
//
//analysis options
//maximum length that the CharTokenizer uses. Required.
//By adjusting this value, you can greatly improve the performance of searching
//and especially indexing. The default is 255, but smaller numbers will decrease
//the amount of memory used as well as increase the speed.
#define LUCENE_MAX_WORD_LEN 255
//Maximum length of a token word.
//Should be the same as or more than LUCENE_MAX_WORD_LEN.
//If not defined, then there is no token limit, but it may be slower;
//if defined, it will be faster (up to 15% in some cases), but will use more memory.
#ifndef LUCENE_OPTIMIZE_FOR_MEMORY
#define LUCENE_TOKEN_WORD_LENGTH LUCENE_MAX_WORD_LEN
#endif
//
//maximum field length. Some optimisation can be done if a maximum field
//length is given... The smaller the better.
#define LUCENE_MAX_FIELD_LEN 100
//
//The initial value set to BooleanQuery::maxClauseCount. Default is 1024.
#define LUCENE_BOOLEANQUERY_MAXCLAUSECOUNT 1024
//
//bvk: 12.3.2005
//==============================================================================
//The tokenizer was previously changed to optionally use a fixed word length.
//I have implemented this in the Term class as well. It seems that predefining
//the text length instead of using new TCHAR[x] in the constructor greatly
//improves performance (by 20-30% for certain operations).
//Maximum length of a term text.
//Should be the same as or more than LUCENE_MAX_WORD_LEN.
//If not defined, then there is no term text limit, but it may be slower;
//if defined, it will be faster (up to 30% in some cases), but will use more memory.
#ifndef LUCENE_OPTIMIZE_FOR_MEMORY
#define LUCENE_TERM_TEXT_LENGTH LUCENE_MAX_WORD_LEN
#endif
//
//Size of the CharTokenizer buffer. Required.
#define LUCENE_IO_BUFFER_SIZE 1024
//
//the minimum amount the segment term enum should grow by. Must be at least 1.
#define LUCENE_SEGMENTTERMENUM_GROWSIZE 8
//
////////////////////////////////////////////////////////////////////

#endif
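As a quick orientation aid, here is a minimal sketch (not part of the CLucene sources) of how client code might report which of the options above ended up active in a given build. The include path "CLucene/CLConfig.h" is an assumption; adjust it to your source layout.

// Prints a few of the configuration values that CLConfig.h leaves defined.
#include "CLucene/CLConfig.h"   // path assumed; adjust to your include layout
#include <cstdio>

void printCLuceneBuildConfig() {
#ifdef _UCS2
    std::printf("character mode          : UCS2 (wide characters)\n");
#elif defined(_ASCII)
    std::printf("character mode          : ASCII\n");
#else
    std::printf("character mode          : platform default\n");
#endif
#ifdef LUCENE_TERM_TEXT_LENGTH
    std::printf("fixed term text length  : %d\n", (int)LUCENE_TERM_TEXT_LENGTH);
#else
    std::printf("fixed term text length  : not fixed (LUCENE_OPTIMIZE_FOR_MEMORY)\n");
#endif
    std::printf("stream buffer size      : %d\n", (int)LUCENE_STREAM_BUFFER_SIZE);
    std::printf("token buffer size       : %d\n", (int)LUCENE_DEFAULT_TOKEN_BUFFER_SIZE);
    std::printf("max word length         : %d\n", (int)LUCENE_MAX_WORD_LEN);
    std::printf("BooleanQuery max clauses: %d\n", (int)LUCENE_BOOLEANQUERY_MAXCLAUSECOUNT);
}

int main() {
    printCLuceneBuildConfig();
    return 0;
}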