From 8200bb78d2b6504b88b2a82bcf8c0124aa023916 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 26 May 2016 18:20:09 +0200 Subject: [PATCH] Use structured comments in recoll.conf and use them to generate the docbook and man page texts --- src/doc/man/recoll.conf.5 | 774 +++++++----- src/doc/user/Makefile | 2 +- src/doc/user/recoll.conf.xml | 588 ++++++++++ src/doc/user/usermanual.html | 2142 ++++++++++++++++++---------------- src/doc/user/usermanual.xml | 876 +------------- src/sampleconf/recoll.conf | 261 +++-- 6 files changed, 2376 insertions(+), 2267 deletions(-) create mode 100644 src/doc/user/recoll.conf.xml diff --git a/src/doc/man/recoll.conf.5 b/src/doc/man/recoll.conf.5 index 7cdc8216..9123005a 100644 --- a/src/doc/man/recoll.conf.5 +++ b/src/doc/man/recoll.conf.5 @@ -54,315 +54,565 @@ Where values are lists, white space is used for separation, and elements with embedded spaces can be quoted with double-quotes. .SH OPTIONS .TP -.BI "topdirs = " directories -Specifies the list of directories to index (recursively). +.BI "topdirs = "string +Space-separated list of files or +directories to recursively index. Default to ~ (indexes +$HOME). You can use symbolic links in the list, they will be followed, +independantly of the value of the followLinks variable. .TP -.BI "skippedNames = " patterns -A space-separated list of patterns for names of files or directories that -should be completely ignored. The list defined in the default file is: -.sp -.nf -*~ #* bin CVS Cache caughtspam tmp +.BI "skippedNames = "string +Files and directories which should be ignored. +White space separated list of wildcard patterns (simple ones, not paths, +must contain no / ), which will be tested against file and directory +names. The list in the default configuration does not exclude hidden +directories (names beginning with a dot), which means that it may index +quite a few things that you do not want. 
On the other hand, email user +agents like Thunderbird usually store messages in hidden directories, and +you probably want this indexed. One possible solution is to have '.*' in +\&'skippedNames', and add things like '~/.thunderbird' '~/.evolution' to +\&'topdirs'. Not even the file names are indexed for patterns in this +list, see the 'noContentSuffixes' variable for an alternative approach +which indexes the file names. Can be redefined for any +subtree. +.TP +.BI "noContentSuffixes = "string +List of name endings (not necessarily dot-separated suffixes) for +which we don't try MIME type identification, and don't uncompress or +index content. Only the names will be indexed. This +complements the now obsoleted recoll_noindex list from the mimemap file, +which will go away in a future release (the move from mimemap to +recoll.conf allows editing the list through the GUI). This is different +from skippedNames because these are name ending matches only (not +wildcard patterns), and the file name itself gets indexed normally. This +can be redefined for subdirectories. +.TP +.BI "skippedPaths = "string +Paths we should not go into. Space-separated list of +wildcard expressions for filesystem paths. Can contain files and +directories. The database and configuration directories will +automatically be added. The expressions are matched using 'fnmatch(3)' +with the FNM_PATHNAME flag set by default. This means that '/' characters +must be matched explicitly. You can set 'skippedPathsFnmPathname' to 0 +to disable the use of FNM_PATHNAME (meaning that '/*/dir3' will match +\&'/dir1/dir2/dir3'). The default value contains the usual mount point for +removable media to remind you that it is a bad idea to have Recoll work +on these (esp. with the monitor: media gets indexed on mount, all data +gets erased on unmount). Explicitly adding '/media/xxx' to the topdirs +will override this.
+.TP +.BI "skippedPathsFnmPathname = "bool +Set to 0 to +override use of FNM_PATHNAME for matching skipped +paths. +.TP +.BI "daemSkippedPaths = "string +skippedPaths equivalent specific to +real time indexing. This enables having parts of the tree +which are initially indexed but not monitored. If daemSkippedPaths is +not set, the daemon uses skippedPaths. +.TP +.BI "zipSkippedNames = "string +Space-separated list of wildcard expressions for names that should +be ignored inside zip archives. This is used directly by +the zip handler, and has a function similar to skippedNames, but works +independantly. Can be redefined for subdirectories. Supported by recoll +1.20 and newer. See +https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members -.fi -The list can be redefined for subdirectories, but is only actually changed -for the top level ones in -.I topdirs .TP -.BI "skippedPaths = " patterns -A space-separated list of patterns for paths the indexer should not descend -into. Together with topdirs, this allows pruning the indexed tree to one's -content. -.B daemSkippedPaths -can be used to define a specific value for the real time indexing monitor. +.BI "followLinks = "bool +Follow symbolic links during +indexing. The default is to ignore symbolic links to avoid +multiple indexing of linked files. No effort is made to avoid duplication +when this option is set to true. This option can be set individually for +each of the 'topdirs' members by using sections. It can not be changed +below the 'topdirs' level. Links in the 'topdirs' list itself are always +followed. .TP -.BI "skippedPathsFnmPathname = " 0/1 -The values in the *skippedPaths variables are matched by default with -fnmatch(3), with the FNM_PATHNAME and FNM_LEADING_DIR flags. This means -that '/' characters must be matched explicitly. You can set -skippedPathsFnmPathname to 0 to disable the use of FNM_PATHNAME (meaning -that /*/dir3 will match /dir1/dir2/dir3). 
+.BI "indexedmimetypes = "string +Restrictive list of +indexed mime types. Normally not set (in which case all +supported types are indexed). If it is set, +only the types from the list will have their contents indexed. The names +will be indexed anyway if indexallfilenames is set (default). MIME +type names should be taken from the mimemap file. Can be redefined for +subtrees. .TP -.BI "followLinks = " boolean -Specifies if the indexer should follow -symbolic links while walking the file tree. The default is -to ignore symbolic links to avoid multiple indexing of -linked files. No effort is made to avoid duplication when -this option is set to true. This option can be set -individually for each of the -.I topdirs -members by using sections. It can not be changed below the -.I topdirs -level. +.BI "excludedmimetypes = "string +List of excluded MIME +types. Lets you exclude some types from indexing. Can be +redefined for subtrees. .TP -.BI "indexedmimetypes = " list -Recoll normally indexes any file which it knows how to read. This list lets -you restrict the indexed mime types to what you specify. If the variable is -unspecified or the list empty (the default), all supported types are -processed. +.BI "compressedfilemaxkbs = "int +Size limit for compressed +files. We need to decompress these in a +temporary directory for identification, which can be wasteful in some +cases. Limit the waste. Negative means no limit. 0 results in no +processing of any compressed file. Default 50 MB. .TP -.BI "compressedfilemaxkbs = " value -Size limit for compressed (.gz or .bz2) files. These need to be -decompressed in a temporary directory for identification, which can be very -wasteful if 'uninteresting' big compressed files are present. Negative -means no limit, 0 means no processing of any compressed file. Defaults -to \-1. +.BI "textfilemaxmbs = "int +Size limit for text +files. Mostly for skipping monster +logs. Default 20 MB. 
.TP -.BI "textfilemaxmbs = " value -Maximum size for text files. Very big text files are often uninteresting -logs. Set to \-1 to disable (default 20MB). +.BI "indexallfilenames = "bool +Index the file names of +unprocessed files. Index the names of files the contents of +which we don't index because of an excluded or unsupported MIME +type. .TP -.BI "textfilepagekbs = " value -If this is set to other than \-1, text files will be indexed as multiple -documents of the given page size. This may be useful if you do want to -index very big text files as it will both reduce memory usage at index time -and help with loading data to the preview window. A size of a few megabytes -would seem reasonable (default: 1000 : 1MB). +.BI "usesystemfilecommand = "bool +Use a system command +for file MIME type guessing as a final step in file type +identification. This is generally useful, but will usually +cause the indexing of many bogus 'text' files. See 'systemfilecommand' +for the command used. .TP -.BI "membermaxkbs = " "value in kilobytes" -This defines the maximum size for an archive member (zip, tar or rar at -the moment). Bigger entries will be skipped. Current default: 50000 (50 MB). +.BI "systemfilecommand = "string +Command used to guess +MIME types if the internal methods fail. This should be a +"file -i" workalike. The file path will be added as a last parameter to +the command line. 'xdg-mime' works better than the traditional 'file' +command, and is now the configured default (with a hard-coded fallback to +\&'file'). .TP -.BI "indexallfilenames = " boolean -Recoll indexes file names into a special section of the database to allow -specific file names searches using wild cards. This parameter decides if -file name indexing is performed only for files with mime types that would -qualify them for full text indexing, or for all files inside -the selected subtrees, independent of mime type. +.BI "processwebqueue = "bool +Decide if we process the +Web queue.
The queue is a directory where the Recoll Web +browser plugins create the copies of visited pages. .TP -.BI "usesystemfilecommand = " boolean -Decide if we use the -.B "file \-i" -system command as a final step for determining the mime type for a file -(the main procedure uses suffix associations as defined in the -.B mimemap -file). This can be useful for files with suffixless names, but it will -also cause the indexing of many bogus "text" files. -.TP -.BI "processbeaglequeue = " 0/1 -If this is set, process the directory where Beagle Web browser plugins copy -visited pages for indexing. Of course, Beagle MUST NOT be running, else -things will behave strangely. -.TP -.BI "beaglequeuedir = " directory path -The path to the Beagle indexing queue. This is hard-coded in the Beagle -plugin as ~/.beagle/ToIndex so there should be no need to change it. -.TP -.BI "indexStripChars = " 0/1 -Decide if we strip characters of diacritics and convert them to lower-case -before terms are indexed. If we don't, searches sensitive to case and -diacritics can be performed, but the index will be bigger, and some -marginal weirdness may sometimes occur. The default is a stripped index -(indexStripChars = 1) for now. When using multiple indexes for a search, +.BI "textfilepagekbs = "int +Page size for text +files. If this is set, text/plain files will be divided +into documents of approximately this size. Will reduce memory usage at +index time and help with loading data in the preview window at query +time. Particularly useful with very big files, such as application or +system logs. Also see textfilemaxmbs and +compressedfilemaxkbs. +.TP +.BI "membermaxkbs = "int +Size limit for archive +members. This is passed to the filters in the environment +as RECOLL_FILTER_MAXMEMBERKB. +.TP +.BI "indexStripChars = "bool +Decide if we store +character case and diacritics in the index. 
If we do, +searches sensitive to case and diacritics can be performed, but the index +will be bigger, and some marginal weirdness may sometimes occur. The +default is a stripped index. When using multiple indexes for a search, this parameter must be defined identically for all. Changing the value implies an index reset. -.TP -.BI "maxTermExpand = " value -Maximum expansion count for a single term (e.g.: when using wildcards). The -default of 10000 is reasonable and will avoid queries that appear frozen -while the engine is walking the term list. -.TP -.BI "maxXapianClauses = " value -Maximum number of elementary clauses we can add to a single Xapian -query. In some cases, the result of term expansion can be multiplicative, -and we want to avoid using excessive memory. The default of 100 000 should -be both high enough in most cases and compatible with current typical -hardware configurations. -.TP -.BI "nonumbers = " 0/1 -If this set to true, no terms will be generated for numbers. For example -"123", "1.5e6", 192.168.1.4, would not be indexed ("value123" would still -be). Numbers are often quite interesting to search for, and this should -probably not be set except for special situations, ie, scientific documents -with huge amounts of numbers in them. This can only be set for a whole -index, not for a subtree. .TP -.BI "nocjk = " boolean -If this set to true, specific east asian (Chinese Korean Japanese) -characters/word splitting is turned off. This will save a small amount of -cpu if you have no CJK documents. If your document base does include such -text but you are not interested in searching it, setting -.I nocjk -may be a significant time and space saver. +.BI "nonumbers = "bool +Decides if terms will be +generated for numbers. For example "123", "1.5e6", +192.168.1.4, would not be indexed if nonumbers is set ("value123" would +still be). 
Numbers are often quite interesting to search for, and this +should probably not be set except for special situations, ie, scientific +documents with huge amounts of numbers in them, where setting nonumbers +will reduce the index size. This can only be set for a whole index, not +for a subtree. .TP -.BI "cjkngramlen = " value -This lets you adjust the size of n-grams used for indexing CJK text. The -default value of 2 is probably appropriate in most cases. A value of 3 -would allow more precision and efficiency on longer words, but the index -will be approximately twice as large. +.BI "dehyphenate = "bool +Determines if we index +'coworker' also when the input is 'co-worker'. This is new +in version 1.22, and on by default. Setting the variable to off allows +restoring the previous behaviour. .TP -.BI "indexstemminglanguages = " languages -A list of languages for which the stem expansion databases will be -built. See recollindex(1) for possible values. +.BI "nocjk = "bool +Decides if specific East Asian +(Chinese Korean Japanese) characters/word splitting is turned +off. This will save a small amount of CPU if you have no CJK +documents. If your document base does include such text but you are not +interested in searching it, setting nocjk may be a +significant time and space saver. .TP -.BI "defaultcharset = " charset -The name of the character set used for files that do not contain a -character set definition (ie: plain text files). This can be redefined for -any subdirectory. -.TP -.BI "unac_except_trans = " "list of utf-8 groups" -This is a list of characters, encoded in UTF-8, which should be handled -specially when converting text to unaccented lowercase. For example, in -Swedish, the letter "a with diaeresis" has full alphabet citizenship and -should not be turned into an a. -.br -Each element in the space-separated list has the special character as first -element and the translation following. 
The handling of both the lowercase -and upper-case versions of a character should be specified, as appartenance -to the list will turn-off both standard accent and case processing. -.br -Note that the translation is not limited to a single character. -.br -This parameter cannot be redefined for subdirectories, it is global, -because there is no way to do otherwise when querying. If you have document -sets which would need different values, you will have to index and query -them separately. +.BI "cjkngramlen = "int +This lets you adjust the size of +n-grams used for indexing CJK text. The default value of 2 is +probably appropriate in most cases. A value of 3 would allow more precision +and efficiency on longer words, but the index will be approximately twice +as large. .TP -.BI "maildefcharset = " character set name -This can be used to define the default character set specifically for email -messages which don't specify it. This is mainly useful for readpst (libpst) -dumps, which are utf-8 but do not say so. +.BI "indexstemminglanguages = "string +Languages for which to create stemming expansion +data. Stemmer names can be found by executing 'recollindex +-l', or this can also be set from a list in the GUI. .TP -.BI "localfields = " "fieldname = value:..." -This allows setting fields for all documents under a given -directory. Typical usage would be to set an "rclaptg" field, to be used in -mimeview to select a specific viewer. If several fields are to be set, they -should be separated with a colon (':') character (which there is currently -no way to escape). Ie: localfields= rclaptg=gnus:other = val, then select -specifier viewer with mimetype|tag=... in mimeview. +.BI "defaultcharset = "string +Default character +set. This is used for files which do not contain a +character set definition (e.g.: text/plain). Values found inside files, +e.g. a 'charset' tag in HTML documents, will override it. 
If this is not +set, the default character set is the one defined by the NLS environment +($LC_ALL, $LC_CTYPE, $LANG), or ultimately iso-8859-1 (cp-1252 in fact). +If for some reason you want a general default which does not match your +LANG and is not 8859-1, use this variable. This can be redefined for any +sub-directory. .TP -.BI "dbdir = " directory -The name of the Xapian database directory. It will be created if needed -when the database is initialized. If this is not an absolute pathname, it -will be taken relative to the configuration directory. +.BI "unac_except_trans = "string +A list of characters, +encoded in UTF-8, which should be handled specially +when converting text to unaccented lowercase. For +example, in Swedish, the letter a with diaeresis has full alphabet +citizenship and should not be turned into an a. +Each element in the space-separated list has the special character as +first element and the translation following. The handling of both the +lowercase and upper-case versions of a character should be specified, as +appartenance to the list will turn-off both standard accent and case +processing. The value is global and affects both indexing and querying. +Examples: +Swedish: +unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl åå Åå +. German: +unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl +In French, you probably want to decompose oe and ae and nobody would type +a German ß +unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl +. The default for all until someone protests follows. These decompositions +are not performed by unac, but it is unlikely that someone would type the +composed forms in a search. +unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl .TP -.BI "idxstatusfile = " "file path" -The name of the scratch file where the indexer process updates its -status. Default: idxstatus.txt inside the configuration directory. 
+.BI "maildefcharset = "string +Overrides the default +character set for email messages which don't specify +one. This is mainly useful for readpst (libpst) dumps, +which are utf-8 but do not say so. .TP -.BI "maxfsoccuppc = " percentnumber -Maximum file system occupation before we -stop indexing. The value is a percentage, corresponding to -what the "Capacity" df output column shows. The default +.BI "localfields = "string +Set fields on all files +(usually of a specific fs area). Syntax is the usual: +name = value ; attr1 = val1 ; [...] +value is empty so this needs an initial semi-colon. This is useful, e.g., +for setting the rclaptg field for application selection inside +mimeview. +.TP +.BI "testmodifusemtime = "bool +Use mtime instead of +ctime to test if a file has been modified. The time is used +in addition to the size, which is always used. +Setting this can reduce re-indexing on systems where extended attributes +are used (by some other application), but not indexed, because changing +extended attributes only affects ctime. +Notes: +- This may prevent detection of change in some marginal file rename cases +(the target would need to have the same size and mtime). +- You should probably also set noxattrfields to 1 in this case, except if +you still prefer to perform xattr indexing, for example if the local +file update pattern makes it of value (as in general, there is a risk +for pure extended attributes updates without file modification to go +undetected). Perform a full index reset after changing this. + +.TP +.BI "noxattrfields = "bool +Disable extended attributes +conversion to metadata fields. This probably needs to be +set if testmodifusemtime is set. +.TP +.BI "metadatacmds = "string +Define commands to +gather external metadata, e.g. tmsu tags. +There can be several entries, separated by semi-colons, each defining +which field name the data goes into and the command to use. Don't forget the +initial semi-colon. 
All the field names must be different. You can use +aliases in the "field" file if necessary. +As a not too pretty hack conceded to convenience, any field name +beginning with "rclmulti" will be taken as an indication that the command +returns multiple field values inside a text blob formatted as a recoll +configuration file ("fieldname = fieldvalue" lines). The rclmultixx name +will be ignored, and field names and values will be parsed from the data. +Example: metadatacmds = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf %f + +.TP +.BI "cachedir = "dfn +Top directory for Recoll data. Recoll data +directories are normally located relative to the configuration directory +(e.g. ~/.recoll/xapiandb, ~/.recoll/mboxcache). If 'cachedir' is set, the +directories are stored under the specified value instead (e.g. if +cachedir is ~/.cache/recoll, the default dbdir would be +~/.cache/recoll/xapiandb). This affects dbdir, webcachedir, +mboxcachedir, aspellDicDir, which can still be individually specified to +override cachedir. Note that if you have multiple configurations, each +must have a different cachedir, there is no automatic computation of a +subpath under cachedir. +.TP +.BI "maxfsoccuppc = "int +Maximum file system occupation +over which we stop indexing. The value is a percentage, +corresponding to what the "Capacity" df output column shows. The default value is 0, meaning no checking. .TP -.BI "mboxcachedir = " "directory path" -The directory where mbox message offsets cache files are held. This is -normally $RECOLL_CONFDIR/mboxcache, but it may be useful to share a -directory between different configurations. +.BI "xapiandb = "dfn +Xapian database directory +location. This will be created on first indexing. If the +value is not an absolute path, it will be interpreted as relative to +cachedir if set, or the configuration directory (-c argument or +$RECOLL_CONFDIR). 
If nothing is specified, the default is then +~/.recoll/xapiandb/ .TP -.BI "mboxcacheminmbs = " "value in megabytes" -The minimum mbox file size over which we cache the offsets. There is really no sense in caching offsets for small files. The default is 5 MB. +.BI "idxstatusfile = "fn +Name of the scratch file where the indexer process updates its +status. Default: idxstatus.txt inside the configuration +directory. .TP -.BI "webcachedir = " "directory path" -This is only used by the Beagle web browser plugin indexing code, and -defines where the cache for visited pages will live. Default: +.BI "mboxcachedir = "dfn +Directory location for storing mbox message offsets cache +files. This is normally 'mboxcache' under cachedir if set, +or else under the configuration directory, but it may be useful to share +a directory between different configurations. +.TP +.BI "mboxcacheminmbs = "int +Minimum mbox file size over which we cache the offsets. There is really no sense in caching offsets for small files. The +default is 5 MB. +.TP +.BI "webcachedir = "dfn +Directory where we store the archived web pages. This is only used by the web history indexing code +Default: cachedir/webcache if cachedir is set, else $RECOLL_CONFDIR/webcache .TP -.BI "webcachemaxmbs = " "value in megabytes" -This is only used by the Beagle web browser plugin indexing code, and -defines the maximum size for the web page cache. Default: 40 MB. +.BI "webcachemaxmbs = "int +Maximum size in MB of the Web archive. This is only used by the web history indexing code. +Default: 40 MB. +Reducing the size will not physically truncate the file. .TP -.BI "idxflushmb = " megabytes -Threshold (megabytes of new text data) -where we flush from memory to disk index. Setting this can -help control memory usage. A value of 0 means no explicit -flushing, letting Xapian use its own default, which is -flushing every 10000 documents (or XAPIAN_FLUSH_THRESHOLD), meaning that -memory usage depends on average document size. 
The default value is 10. +.BI "webqueuedir = "fn +The path to the Web indexing queue. This is +hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no +need or possibility to change it. .TP -.BI "autodiacsens = " 0/1 -IF the index is not stripped, decide if we automatically trigger diacritics -sensitivity if the search term has accented characters (not in -unac_except_trans). Else you need to use the query language and the D -modifier to specify diacritics sensitivity. Default is no. +.BI "aspellDicDir = "dfn +Aspell dictionary storage directory location. The +aspell dictionary (aspdict.(lang).rws) is normally stored in the +directory specified by cachedir if set, or under the configuration +directory. .TP -.BI "autocasesens = " 0/1 -IF the index is not stripped, decide if we automatically trigger character -case sensitivity if the search term has upper-case characters in any but -the first position. Else you need to use the query language and the C -modifier to specify character-case sensitivity. Default is yes. +.BI "filtersdir = "dfn +Directory location for executable input handlers. If +RECOLL_FILTERSDIR is set in the environment, we use it instead. Defaults +to $prefix/share/recoll/filters. Can be redefined for +subdirectories. .TP -.BI "loglevel = " value -Verbosity level for recoll and recollindex. A value of 4 lists quite a lot of -debug/information messages. 3 lists only errors. -.B daemloglevel -can be used to specify a different value for the real-time indexing daemon. +.BI "iconsdir = "dfn +Directory location for icons. The only reason to +change this would be if you want to change the icons displayed in the +result list. Defaults to $prefix/share/recoll/images .TP -.BI "logfilename = " file -Where should the messages go. 'stderr' can be used as a special value. -.B daemlogfilename -can be used to specify a different value for the real-time indexing daemon. 
+.BI "idxflushmb = "int +Threshold (megabytes of new data) where we flush from memory to +disk index. Setting this allows some control over memory +usage by the indexer process. A value of 0 means no explicit flushing, +which lets Xapian perform its own thing, meaning flushing every +$XAPIAN_FLUSH_THRESHOLD documents created, modified or deleted: as memory +usage depends on average document size, not only document count, the +Xapian approach is not very useful, and you should let Recoll manage +the flushes. The default value of idxflushmb is 10 MB, and may be a bit +low. If you are looking for maximum speed, you may want to experiment +with values between 20 and +80. In my experience, values beyond 100 are always counterproductive. If +you find otherwise, please drop me a note. .TP -.BI "mondelaypatterns = " "list of patterns" -This allows specify wildcard path patterns (processed with fnmatch(3) with -0 flag), to match files which change too often and for which a delay should -be observed before re-indexing. This is a space-separated list, each entry -being a pattern and a time in seconds, separated by a colon. You can use -double quotes if a path entry contains white space. Example: -.sp -mondelaypatterns = *.log:20 "this one has spaces*:10" -.TP -.BI "monixinterval = " "value in seconds -Minimum interval (seconds) for processing the indexing queue. The real time -monitor does not process each event when it comes in, but will wait this -time for the queue to accumulate to diminish overhead and in order to -aggregate multiple events to the same file. Default 30 S. .TP -.BI "monauxinterval = " "value in seconds -Period (in seconds) at which the real time monitor will regenerate the -auxiliary databases (spelling, stemming) if needed. The default is one -hour.
+.BI "filtermaxmbytes = "int +Maximum virtual memory space for filter processes +(setrlimit(RLIMIT_AS)), in megabytes. Note that this +includes any mapped libs (there is no reliable Linux way to limit the +data space only), so we need to be a bit generous here. Anything over +2000 will be ignored on 32-bit machines. .TP -.BI "monioniceclass, monioniceclassdata" -These allow defining the ionice class and data used by the indexer (default -class 3, no data). +.BI "thrQSizes = "string +Stage input queues configuration. There are three +internal queues in the indexing pipeline stages (file data extraction, +terms generation, index update). This parameter defines the queue depths +for each stage (three integer values). If a value of -1 is given for a +given stage, no queue is used, and the thread will go on performing the +next stage. In practice, deep queues have not been shown to increase +performance. Default: a value of 0 for the first queue tells Recoll to +perform autoconfiguration based on the detected number of CPUs (no need +for the two other values in this case). Use thrQSizes = -1 -1 -1 to +disable multithreading entirely. .TP -.BI "filtermaxseconds = " "value in seconds" -Maximum filter execution time, after which it is aborted. Some postscript -programs just loop... +.BI "thrTCounts = "string +Number of threads used for each indexing stage. The +three stages are: file data extraction, terms generation, index +update. The use of the counts is also controlled by some special values +in thrQSizes: if the first queue depth is 0, all counts are ignored +(autoconfigured); if a value of -1 is used for a queue depth, the +corresponding thread count is ignored. It makes no sense to use a value +other than 1 for the last stage because updating the Xapian index is +necessarily single-threaded (and protected by a mutex). .TP -.BI "filtersdir = " directory -A directory to search for the external filter scripts used to index some -types of files.
The value should not be changed, except if you want to -modify one of the default scripts. The value can be redefined for any -subdirectory. +.BI "loglevel = "int +Log file verbosity 1-6. A value of 2 will print +only errors and warnings. 3 will print information like document updates, +4 is quite verbose and 6 very verbose. .TP -.BI "iconsdir = " directory -The name of the directory where -.B recoll -result list icons are stored. You can change this if you want different -images. +.BI "logfilename = "fn +Log file destination. Use 'stderr' (default) to write to the +console. .TP -.BI "idxabsmlen = " value -Recoll stores an abstract for each indexed file inside the database. The -text can come from an actual 'abstract' section in the document or will -just be the beginning of the document. It is stored in the index so that it -can be displayed inside the result lists without decoding the original -file. The -.I idxabsmlen -parameter defines the size of the stored abstract. The default value is 250 -bytes. The search interface gives you the choice to display this stored +.BI "idxloglevel = "int +Override loglevel for the indexer. +.TP +.BI "idxlogfilename = "fn +Override logfilename for the indexer. +.TP +.BI "daemloglevel = "int +Override loglevel for the indexer in real time +mode. The default is to use the idx... values if set, else +the log... values. +.TP +.BI "daemlogfilename = "fn +Override logfilename for the indexer in real time +mode. The default is to use the idx... values if set, else +the log... values. +.TP +.BI "idxrundir = "dfn +Indexing process current directory. The input +handlers sometimes leave temporary files in the current directory, so it +makes sense to have recollindex chdir to some temporary directory. If the +value is empty, the current directory is not changed. If the +value is (literal) tmp, we use the temporary directory as set by the +environment (RECOLL_TMPDIR else TMPDIR else /tmp). 
If the value is an +absolute path to a directory, we go there. +.TP +.BI "checkneedretryindexscript = "fn +Script used to heuristically check if we need to retry indexing +files which previously failed. The default script checks +the modified dates on /usr/bin and /usr/local/bin. A relative path will +be looked up in the filters dirs, then in the path. Use an absolute path +to do otherwise. +.TP +.BI "recollhelperpath = "string +Additional places to search for helper executables. This is only used on Windows for now. +.TP +.BI "idxabsmlen = "int +Length of abstracts we store while indexing. Recoll stores an abstract for each indexed file. +The text can come from an actual 'abstract' section in the +document or will just be the beginning of the document. It is stored in +the index so that it can be displayed inside the result lists without +decoding the original file. The idxabsmlen parameter +defines the size of the stored abstract. The default value is 250 +bytes. The search interface gives you the choice to display this stored text or a synthetic abstract built by extracting text around the search terms. If you always prefer the synthetic abstract, you can reduce this value and save a little space. .TP -.BI "aspellLanguage = " lang -Language definitions to use when creating the aspell dictionary. The value -must match a set of aspell language definition files. You can type "aspell -config" to see where these are installed (look for data-dir). The default -if the variable is not set is to use your desktop national language -environment to guess the value. +.BI "idxmetastoredlen = "int +Truncation length of stored metadata fields. This +does not affect indexing (the whole field is processed anyway), just the +amount of data stored in the index for the purpose of displaying fields +inside result lists or previews. The default value is 150 bytes which +may be too low if you have custom fields. 
.TP -.BI "noaspell = " boolean -If this is set, the aspell dictionary generation is turned off. Useful for -cases where you don't need the functionality or when it is unusable because -aspell crashes during dictionary generation. +.BI "aspellLanguage = "string +Language definitions to use when creating the aspell +dictionary. The value must match a set of aspell language +definition files. You can type "aspell dicts" to see a list. The default +if this is not set is to use the NLS environment to guess the +value. .TP -.BI "mhmboxquirks = " flags -This allows definining location-related quirks for the mailbox -handler. Currently only the tbird flag is defined, and it should be set for -directories which hold Thunderbird data, as their folder format is weird. +.BI "aspellAddCreateParam = "string +Additional option and parameter to aspell dictionary creation +command. Some aspell packages may need an additional option +(e.g. on Debian Jessie: --local-data-dir=/usr/lib/aspell). See Debian bug +772415. +.TP +.BI "aspellKeepStderr = "bool +Set this to have a look at aspell dictionary creation +errors. There are always many, so this is mostly for +debugging. +.TP +.BI "noaspell = "bool +Disable aspell use. The aspell dictionary generation +takes time, and some combinations of aspell version, language, and local +terms, result in aspell crashing, so it sometimes makes sense to just +disable the thing. +.TP +.BI "monauxinterval = "int +Auxiliary database update interval. The real time +indexer only updates the auxiliary databases (stemdb, aspell) +periodically, because it would be too costly to do it for every document +change. The default period is one hour. +.TP +.BI "monixinterval = "int +Minimum interval (seconds) between processings of the indexing +queue. The real time indexer does not process each event +when it comes in, but lets the queue accumulate, to diminish overhead and +to aggregate multiple events affecting the same file. Default 30 +seconds. 
+.TP +.BI "mondelaypatterns = "string +Timing parameters for the real time indexing. Definitions for files which get a longer delay before reindexing +is allowed. This is for fast-changing files, that should only be +reindexed once in a while. A list of wildcardPattern:seconds pairs. The +patterns are matched with fnmatch(pattern, path, 0). You can quote entries +containing white space with double quotes (quote the whole entry, not the +pattern). The default is empty. +Example: mondelaypatterns = *.log:20 "*with spaces.*:30" +.TP +.BI "monioniceclass = "int +ionice class for the real time indexing process. On platforms where this is supported. The default value is +3. +.TP +.BI "monioniceclassdata = "string +ionice class parameter for the real time indexing process. On platforms where this is supported. The default is +empty. +.TP +.BI "autodiacsens = "bool +auto-trigger diacritics sensitivity (raw index only). If the index is not stripped, decide if we automatically trigger +diacritics sensitivity if the search term has accented characters (not in +unac_except_trans). Else you need to use the query language and the "D" +modifier to specify diacritics sensitivity. Default is no. +.TP +.BI "autocasesens = "bool +auto-trigger case sensitivity (raw index only). If +the index is not stripped (see indexStripChars), decide if we +automatically trigger character case sensitivity if the search term has +upper-case characters in any but the first position. Else you need to use +the query language and the "C" modifier to specify character-case +sensitivity. Default is yes. +.TP +.BI "maxTermExpand = "int +Maximum query expansion count +for a single term (e.g.: when using wildcards). This only +affects queries, not indexing. We used to not limit this at all (except +for filenames where the limit was too low at 1000), but it is +unreasonable with a big index. Default 10000. +.TP +.BI "maxXapianClauses = "int +Maximum number of clauses +we add to a single Xapian query. 
This only affects queries, +not indexing. In some cases, the result of term expansion can be +multiplicative, and we want to avoid eating all the memory. Default +50000. +.TP +.BI "snippetMaxPosWalk = "int +Maximum number of positions we walk while populating a snippet for +the result list. The default of 1,000,000 may be +insufficient for very big documents, the consequence would be snippets +with possibly meaning-altering missing words. +.TP +.BI "pdfocr = "bool +Attempt OCR of PDF files with no text content if both tesseract and +pdftoppm are installed. The default is off because OCR is so +very slow. +.TP +.BI "pdfattach = "bool +Enable PDF attachment extraction by executing pdftk (if +available). This is +normally disabled, because it does slow down PDF indexing a bit even if +not one attachment is ever found. +.TP +.BI "mhmboxquirks = "string +Enable thunderbird/mozilla-seamonkey mbox format quirks Set this for the directory where the email mbox files are +stored. .SH SEE ALSO .PP diff --git a/src/doc/user/Makefile b/src/doc/user/Makefile index 4d1e51e6..5d1860e5 100644 --- a/src/doc/user/Makefile +++ b/src/doc/user/Makefile @@ -25,7 +25,7 @@ webh: make -C webhelp usermanual.html: usermanual.xml - xsltproc ${commonoptions} \ + xsltproc --xinclude ${commonoptions} \ -o tmpfile.html "${XSLDIR}/html/docbook.xsl" $< -tidy -indent tmpfile.html > usermanual.html rm -f tmpfile.html diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml new file mode 100644 index 00000000..a522f5ff --- /dev/null +++ b/src/doc/user/recoll.conf.xml @@ -0,0 +1,588 @@ + + +Recoll main configuration file, recoll.conf + +Parameters affecting what documents we index + +topdirs +Space-separated list of files or +directories to recursively index. Default to ~ (indexes +$HOME). You can use symbolic links in the list, they will be followed, +independantly of the value of the followLinks variable. + +skippedNames +Files and directories which should be ignored. 
+White space separated list of wildcard patterns (simple ones, not paths, +must contain no / ), which will be tested against file and directory +names. The list in the default configuration does not exclude hidden +directories (names beginning with a dot), which means that it may index +quite a few things that you do not want. On the other hand, email user +agents like Thunderbird usually store messages in hidden directories, and +you probably want this indexed. One possible solution is to have '.*' in +'skippedNames', and add things like '~/.thunderbird' '~/.evolution' to +'topdirs'. Not even the file names are indexed for patterns in this +list, see the 'noContentSuffixes' variable for an alternative approach +which indexes the file names. Can be redefined for any +subtree. + +noContentSuffixes +List of name endings (not necessarily dot-separated suffixes) for +which we don't try MIME type identification, and don't uncompress or +index content. Only the names will be indexed. This +complements the now obsoleted recoll_noindex list from the mimemap file, +which will go away in a future release (the move from mimemap to +recoll.conf allows editing the list through the GUI). This is different +from skippedNames because these are name ending matches only (not +wildcard patterns), and the file name itself gets indexed normally. This +can be redefined for subdirectories. + +skippedPaths +Paths we should not go into. Space-separated list of +wildcard expressions for filesystem paths. Can contain files and +directories. The database and configuration directories will +automatically be added. The expressions are matched using 'fnmatch(3)' +with the FNM_PATHNAME flag set by default. This means that '/' characters +must be matched explicitly. You can set 'skippedPathsFnmPathname' to 0 +to disable the use of FNM_PATHNAME (meaning that '/*/dir3' will match +'/dir1/dir2/dir3'). 
The default value contains the usual mount point for +removable media to remind you that it is a bad idea to have Recoll work +on these (esp. with the monitor: media gets indexed on mount, all data +gets erased on unmount). Explicitely adding '/media/xxx' to the topdirs +will override this. + +skippedPathsFnmPathname +Set to 0 to +override use of FNM_PATHNAME for matching skipped +paths. + +daemSkippedPaths +skippedPaths equivalent specific to +real time indexing. This enables having parts of the tree +which are initially indexed but not monitored. If daemSkippedPaths is +not set, the daemon uses skippedPaths. + +zipSkippedNames +Space-separated list of wildcard expressions for names that should +be ignored inside zip archives. This is used directly by +the zip handler, and has a function similar to skippedNames, but works +independantly. Can be redefined for subdirectories. Supported by recoll +1.20 and newer. See +https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members + + +followLinks +Follow symbolic links during +indexing. The default is to ignore symbolic links to avoid +multiple indexing of linked files. No effort is made to avoid duplication +when this option is set to true. This option can be set individually for +each of the 'topdirs' members by using sections. It can not be changed +below the 'topdirs' level. Links in the 'topdirs' list itself are always +followed. + +indexedmimetypes +Restrictive list of +indexed mime types. Normally not set (in which case all +supported types are indexed). If it is set, +only the types from the list will have their contents indexed. The names +will be indexed anyway if indexallfilenames is set (default). MIME +type names should be taken from the mimemap file. Can be redefined for +subtrees. + +excludedmimetypes +List of excluded MIME +types. Lets you exclude some types from indexing. Can be +redefined for subtrees. + +compressedfilemaxkbs +Size limit for compressed +files. 
We need to decompress these in a +temporary directory for identification, which can be wasteful in some +cases. Limit the waste. Negative means no limit. 0 results in no +processing of any compressed file. Default 50 MB. + +textfilemaxmbs +Size limit for text +files. Mostly for skipping monster +logs. Default 20 MB. + +indexallfilenames +Index the file names of +unprocessed files Index the names of files the contents of +which we don't index because of an excluded or unsupported MIME +type. + +usesystemfilecommand +Use a system command +for file MIME type guessing as a final step in file type +identification This is generally useful, but will usually +cause the indexing of many bogus 'text' files. See 'systemfilecommand' +for the command used. + +systemfilecommand +Command used to guess +MIME types if the internal methods fails This should be a +"file -i" workalike. The file path will be added as a last parameter to +the command line. 'xdg-mime' works better than the traditional 'file' +command, and is now the configured default (with a hard-coded fallback to +'file') + +processwebqueue +Decide if we process the +Web queue. The queue is a directory where the Recoll Web +browser plugins create the copies of visited pages. + +textfilepagekbs +Page size for text +files. If this is set, text/plain files will be divided +into documents of approximately this size. Will reduce memory usage at +index time and help with loading data in the preview window at query +time. Particularly useful with very big files, such as application or +system logs. Also see textfilemaxmbs and +compressedfilemaxkbs. + +membermaxkbs +Size limit for archive +members. This is passed to the filters in the environment +as RECOLL_FILTER_MAXMEMBERKB. + + +Parameters affecting how we generate terms + +indexStripChars +Decide if we store +character case and diacritics in the index. 
If we do, +searches sensitive to case and diacritics can be performed, but the index +will be bigger, and some marginal weirdness may sometimes occur. The +default is a stripped index. When using multiple indexes for a search, +this parameter must be defined identically for all. Changing the value +implies an index reset. + +nonumbers +Decides if terms will be +generated for numbers. For example "123", "1.5e6", +192.168.1.4, would not be indexed if nonumbers is set ("value123" would +still be). Numbers are often quite interesting to search for, and this +should probably not be set except for special situations, ie, scientific +documents with huge amounts of numbers in them, where setting nonumbers +will reduce the index size. This can only be set for a whole index, not +for a subtree. + +dehyphenate +Determines if we index +'coworker' also when the input is 'co-worker'. This is new +in version 1.22, and on by default. Setting the variable to off allows +restoring the previous behaviour. + +nocjk +Decides if specific East Asian +(Chinese Korean Japanese) characters/word splitting is turned +off. This will save a small amount of CPU if you have no CJK +documents. If your document base does include such text but you are not +interested in searching it, setting nocjk may be a +significant time and space saver. + +cjkngramlen +This lets you adjust the size of +n-grams used for indexing CJK text. The default value of 2 is +probably appropriate in most cases. A value of 3 would allow more precision +and efficiency on longer words, but the index will be approximately twice +as large. + +indexstemminglanguages +Languages for which to create stemming expansion +data. Stemmer names can be found by executing 'recollindex +-l', or this can also be set from a list in the GUI. + +defaultcharset +Default character +set. This is used for files which do not contain a +character set definition (e.g.: text/plain). Values found inside files, +e.g. 
a 'charset' tag in HTML documents, will override it. If this is not +set, the default character set is the one defined by the NLS environment +($LC_ALL, $LC_CTYPE, $LANG), or ultimately iso-8859-1 (cp-1252 in fact). +If for some reason you want a general default which does not match your +LANG and is not 8859-1, use this variable. This can be redefined for any +sub-directory. + +unac_except_trans +A list of characters, +encoded in UTF-8, which should be handled specially +when converting text to unaccented lowercase. For +example, in Swedish, the letter a with diaeresis has full alphabet +citizenship and should not be turned into an a. +Each element in the space-separated list has the special character as +first element and the translation following. The handling of both the +lowercase and upper-case versions of a character should be specified, as +appartenance to the list will turn-off both standard accent and case +processing. The value is global and affects both indexing and querying. +Examples: +Swedish: +unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl åå Åå +. German: +unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl +In French, you probably want to decompose oe and ae and nobody would type +a German ß +unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl +. The default for all until someone protests follows. These decompositions +are not performed by unac, but it is unlikely that someone would type the +composed forms in a search. +unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl + +maildefcharset +Overrides the default +character set for email messages which don't specify +one. This is mainly useful for readpst (libpst) dumps, +which are utf-8 but do not say so. + +localfields +Set fields on all files +(usually of a specific fs area). Syntax is the usual: +name = value ; attr1 = val1 ; [...] +value is empty so this needs an initial semi-colon. 
This is useful, e.g., +for setting the rclaptg field for application selection inside +mimeview. + +testmodifusemtime +Use mtime instead of +ctime to test if a file has been modified. The time is used +in addition to the size, which is always used. +Setting this can reduce re-indexing on systems where extended attributes +are used (by some other application), but not indexed, because changing +extended attributes only affects ctime. +Notes: +- This may prevent detection of change in some marginal file rename cases +(the target would need to have the same size and mtime). +- You should probably also set noxattrfields to 1 in this case, except if +you still prefer to perform xattr indexing, for example if the local +file update pattern makes it of value (as in general, there is a risk +for pure extended attributes updates without file modification to go +undetected). Perform a full index reset after changing this. + + +noxattrfields +Disable extended attributes +conversion to metadata fields. This probably needs to be +set if testmodifusemtime is set. + +metadatacmds +Define commands to +gather external metadata, e.g. tmsu tags. +There can be several entries, separated by semi-colons, each defining +which field name the data goes into and the command to use. Don't forget the +initial semi-colon. All the field names must be different. You can use +aliases in the "field" file if necessary. +As a not too pretty hack conceded to convenience, any field name +beginning with "rclmulti" will be taken as an indication that the command +returns multiple field values inside a text blob formatted as a recoll +configuration file ("fieldname = fieldvalue" lines). The rclmultixx name +will be ignored, and field names and values will be parsed from the data. +Example: metadatacmds = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf %f + + + +Parameters affecting where and how we store things + +cachedir +Top directory for Recoll data. 
Recoll data +directories are normally located relative to the configuration directory +(e.g. ~/.recoll/xapiandb, ~/.recoll/mboxcache). If 'cachedir' is set, the +directories are stored under the specified value instead (e.g. if +cachedir is ~/.cache/recoll, the default dbdir would be +~/.cache/recoll/xapiandb). This affects dbdir, webcachedir, +mboxcachedir, aspellDicDir, which can still be individually specified to +override cachedir. Note that if you have multiple configurations, each +must have a different cachedir, there is no automatic computation of a +subpath under cachedir. + +maxfsoccuppc +Maximum file system occupation +over which we stop indexing. The value is a percentage, +corresponding to what the "Capacity" df output column shows. The default +value is 0, meaning no checking. + +xapiandb +Xapian database directory +location. This will be created on first indexing. If the +value is not an absolute path, it will be interpreted as relative to +cachedir if set, or the configuration directory (-c argument or +$RECOLL_CONFDIR). If nothing is specified, the default is then +~/.recoll/xapiandb/ + +idxstatusfile +Name of the scratch file where the indexer process updates its +status. Default: idxstatus.txt inside the configuration +directory. + +mboxcachedir +Directory location for storing mbox message offsets cache +files. This is normally 'mboxcache' under cachedir if set, +or else under the configuration directory, but it may be useful to share +a directory between different configurations. + +mboxcacheminmbs +Minimum mbox file size over which we cache the offsets. There is really no sense in caching offsets for small files. The +default is 5 MB. + +webcachedir +Directory where we store the archived web pages. This is only used by the web history indexing code +Default: cachedir/webcache if cachedir is set, else +$RECOLL_CONFDIR/webcache + +webcachemaxmbs +Maximum size in MB of the Web archive. This is only used by the web history indexing code. 
+Default: 40 MB. +Reducing the size will not physically truncate the file. + +webqueuedir +The path to the Web indexing queue. This is +hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no +need or possibility to change it. + +aspellDicDir +Aspell dictionary storage directory location. The +aspell dictionary (aspdict.(lang).rws) is normally stored in the +directory specified by cachedir if set, or under the configuration +directory. + +filtersdir +Directory location for executable input handlers. If +RECOLL_FILTERSDIR is set in the environment, we use it instead. Defaults +to $prefix/share/recoll/filters. Can be redefined for +subdirectories. + +iconsdir +Directory location for icons. The only reason to +change this would be if you want to change the icons displayed in the +result list. Defaults to $prefix/share/recoll/images. + + +Parameters affecting indexing performance and resource usage + +idxflushmb +Threshold (megabytes of new data) where we flush from memory to +disk index. Setting this allows some control over memory +usage by the indexer process. A value of 0 means no explicit flushing, +which lets Xapian perform its own thing, meaning flushing every +$XAPIAN_FLUSH_THRESHOLD documents created, modified or deleted: as memory +usage depends on average document size, not only document count, the +Xapian approach is not very useful, and you should let Recoll manage +the flushes. The default value of idxflushmb is 10 MB, and may be a bit +low. If you are looking for maximum speed, you may want to experiment +with values between 20 and +80. In my experience, values beyond 100 are always counterproductive. If +you find otherwise, please drop me a note. + +filtermaxseconds +Maximum external filter execution time in +seconds. Default 1200 (20 min). Set to 0 for no limit. 
This +is mainly to avoid infinite loops in postscript files +(loop.ps). + +filtermaxmbytes +Maximum virtual memory space for filter processes +(setrlimit(RLIMIT_AS)), in megabytes. Note that this +includes any mapped libs (there is no reliable Linux way to limit the +data space only), so we need to be a bit generous here. Anything over +2000 will be ignored on 32-bit machines. + +thrQSizes +Stage input queues configuration. There are three +internal queues in the indexing pipeline stages (file data extraction, +terms generation, index update). This parameter defines the queue depths +for each stage (three integer values). If a value of -1 is given for a +given stage, no queue is used, and the thread will go on performing the +next stage. In practice, deep queues have not been shown to increase +performance. Default: a value of 0 for the first queue tells Recoll to +perform autoconfiguration based on the detected number of CPUs (no need +for the two other values in this case). Use thrQSizes = -1 -1 -1 to +disable multithreading entirely. + +thrTCounts +Number of threads used for each indexing stage. The +three stages are: file data extraction, terms generation, index +update. The use of the counts is also controlled by some special values +in thrQSizes: if the first queue depth is 0, all counts are ignored +(autoconfigured); if a value of -1 is used for a queue depth, the +corresponding thread count is ignored. It makes no sense to use a value +other than 1 for the last stage because updating the Xapian index is +necessarily single-threaded (and protected by a mutex). + + +Miscellaneous parameters + +loglevel +Log file verbosity 1-6. A value of 2 will print +only errors and warnings. 3 will print information like document updates, +4 is quite verbose and 6 very verbose. + +logfilename +Log file destination. Use 'stderr' (default) to write to the +console. + +idxloglevel +Override loglevel for the indexer. + +idxlogfilename +Override logfilename for the indexer. 
+ +daemloglevel +Override loglevel for the indexer in real time +mode. The default is to use the idx... values if set, else +the log... values. + +daemlogfilename +Override logfilename for the indexer in real time +mode. The default is to use the idx... values if set, else +the log... values. + +idxrundir +Indexing process current directory. The input +handlers sometimes leave temporary files in the current directory, so it +makes sense to have recollindex chdir to some temporary directory. If the +value is empty, the current directory is not changed. If the +value is (literal) tmp, we use the temporary directory as set by the +environment (RECOLL_TMPDIR else TMPDIR else /tmp). If the value is an +absolute path to a directory, we go there. + +checkneedretryindexscript +Script used to heuristically check if we need to retry indexing +files which previously failed. The default script checks +the modified dates on /usr/bin and /usr/local/bin. A relative path will +be looked up in the filters dirs, then in the path. Use an absolute path +to do otherwise. + +recollhelperpath +Additional places to search for helper executables. This is only used on Windows for now. + +idxabsmlen +Length of abstracts we store while indexing. Recoll stores an abstract for each indexed file. +The text can come from an actual 'abstract' section in the +document or will just be the beginning of the document. It is stored in +the index so that it can be displayed inside the result lists without +decoding the original file. The idxabsmlen parameter +defines the size of the stored abstract. The default value is 250 +bytes. The search interface gives you the choice to display this stored +text or a synthetic abstract built by extracting text around the search +terms. If you always prefer the synthetic abstract, you can reduce this +value and save a little space. + +idxmetastoredlen +Truncation length of stored metadata fields. 
This +does not affect indexing (the whole field is processed anyway), just the +amount of data stored in the index for the purpose of displaying fields +inside result lists or previews. The default value is 150 bytes which +may be too low if you have custom fields. + +aspellLanguage +Language definitions to use when creating the aspell +dictionary. The value must match a set of aspell language +definition files. You can type "aspell dicts" to see a list. The default +if this is not set is to use the NLS environment to guess the +value. + +aspellAddCreateParam +Additional option and parameter to aspell dictionary creation +command. Some aspell packages may need an additional option +(e.g. on Debian Jessie: --local-data-dir=/usr/lib/aspell). See Debian bug +772415. + +aspellKeepStderr +Set this to have a look at aspell dictionary creation +errors. There are always many, so this is mostly for +debugging. + +noaspell +Disable aspell use. The aspell dictionary generation +takes time, and some combinations of aspell version, language, and local +terms, result in aspell crashing, so it sometimes makes sense to just +disable the thing. + +monauxinterval +Auxiliary database update interval. The real time +indexer only updates the auxiliary databases (stemdb, aspell) +periodically, because it would be too costly to do it for every document +change. The default period is one hour. + +monixinterval +Minimum interval (seconds) between processings of the indexing +queue. The real time indexer does not process each event +when it comes in, but lets the queue accumulate, to diminish overhead and +to aggregate multiple events affecting the same file. Default 30 +seconds. + +mondelaypatterns +Timing parameters for the real time indexing. Definitions for files which get a longer delay before reindexing +is allowed. This is for fast-changing files, that should only be +reindexed once in a while. A list of wildcardPattern:seconds pairs. 
The +patterns are matched with fnmatch(pattern, path, 0). You can quote entries +containing white space with double quotes (quote the whole entry, not the +pattern). The default is empty. +Example: mondelaypatterns = *.log:20 "*with spaces.*:30" + +monioniceclass +ionice class for the real time indexing process. On platforms where this is supported. The default value is +3. + +monioniceclassdata +ionice class parameter for the real time indexing process. On platforms where this is supported. The default is +empty. + + +Query-time parameters (no impact on the index) + +autodiacsens +auto-trigger diacritics sensitivity (raw index only). If the index is not stripped, decide if we automatically trigger +diacritics sensitivity if the search term has accented characters (not in +unac_except_trans). Else you need to use the query language and the "D" +modifier to specify diacritics sensitivity. Default is no. + +autocasesens +auto-trigger case sensitivity (raw index only). If +the index is not stripped (see indexStripChars), decide if we +automatically trigger character case sensitivity if the search term has +upper-case characters in any but the first position. Else you need to use +the query language and the "C" modifier to specify character-case +sensitivity. Default is yes. + +maxTermExpand +Maximum query expansion count +for a single term (e.g.: when using wildcards). This only +affects queries, not indexing. We used to not limit this at all (except +for filenames where the limit was too low at 1000), but it is +unreasonable with a big index. Default 10000. + +maxXapianClauses +Maximum number of clauses +we add to a single Xapian query. This only affects queries, +not indexing. In some cases, the result of term expansion can be +multiplicative, and we want to avoid eating all the memory. Default +50000. + +snippetMaxPosWalk +Maximum number of positions we walk while populating a snippet for +the result list. 
The default of 1,000,000 may be +insufficient for very big documents, the consequence would be snippets +with possibly meaning-altering missing words. + + +Parameters for the PDF input script + +pdfocr +Attempt OCR of PDF files with no text content if both tesseract and +pdftoppm are installed. The default is off because OCR is so +very slow. + +pdfattach +Enable PDF attachment extraction by executing pdftk (if +available). This is +normally disabled, because it does slow down PDF indexing a bit even if +not one attachment is ever found. + + +Parameters set for specific locations + +mhmboxquirks +Enable thunderbird/mozilla-seamonkey mbox format quirks Set this for the directory where the email mbox files are +stored. + + diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 8b54e0db..e5795671 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -20,8 +20,8 @@ alink="#0000FF">
-

Recoll user manual

+

Recoll user manual

@@ -109,13 +109,13 @@ alink="#0000FF"> multiple indexes
2.1.3. Document types
+ "#idp50836576">Document types
2.1.4. Indexing failures
+ "#idp50856176">Indexing failures
2.1.5. Recovery
+ "#idp50863632">Recovery @@ -150,6 +150,10 @@ alink="#0000FF"> diacritics sensitivity
2.3.3. Indexing thread + usage configuration GUI
+ +
2.3.4. The index configuration GUI
@@ -444,7 +448,7 @@ alink="#0000FF"> variables
5.4.2. The main + "#RCL.INSTALL.CONFIG.RECOLLCONF">Recoll main configuration file, recoll.conf
5.4.3.
-

2.1.3. Document types

+

2.1.3. Document types

@@ -1065,8 +1069,8 @@ indexedmimetypes = application/pdf indexedmimetypes, can be set either by editing the main - configuration file ( + main configuration file (recoll.conf), or from the GUI index configuration tool.

@@ -1075,8 +1079,8 @@ indexedmimetypes = application/pdf
-

2.1.4. Indexing +

2.1.4. Indexing failures

@@ -1116,8 +1120,8 @@ indexedmimetypes = application/pdf
-

2.1.5. Recovery

+

2.1.5. Recovery

@@ -1183,7 +1187,7 @@ recoll -c ~/.indexes-email

Using multiple configuration directories and + "5.4.2. Recoll main configuration file, recoll.conf"> configuration options allows you to tailor multiple configurations and indexes to handle whatever subset of the available data you wish to @@ -1197,7 +1201,7 @@ recoll -c ~/.indexes-email parameter in the configuration file (see the + "5.4.2. Recoll main configuration file, recoll.conf"> configuration section). This method would mainly be of use if you wanted to keep the configuration directory in its default location, but desired @@ -1315,7 +1319,7 @@ recoll -c ~/.indexes-email are processed. These variables can be set either by editing the text files or by using the dialogs in the + "2.3.4. The index configuration GUI">dialogs in the recoll GUI.

@@ -1435,7 +1439,7 @@ recoll -c ~/.indexes-email other constraints. Most of the relevant parameters are described in the + "5.4.2.2. Parameters affecting how we generate terms"> linked section.

@@ -1505,12 +1509,126 @@ recoll -c ~/.indexes-email multiplicative expansion may become unmanageable.

+
+
+
+
+

2.3.3. Indexing + thread usage configuration GUI

+
+
+
+ +

The Recoll indexing + process recollindex can use + multiple threads to speed up indexing on multiprocessor + systems. The work done to index files is divided into + several stages and some of the stages can be executed by + multiple threads. The stages are:

+ +
+
    +
  1. File system walking: this is + always performed by the main thread.
  2. + +
  3. File conversion and data + extraction.
  4. + +
  5. Text processing (splitting, + stemming, etc.)
  6. + +
  7. Xapian index update.
  8. +
+
+ +

You can also read a longer document about the + transformation of Recoll + indexing to multithreading.

+ +

The threads configuration is controlled by two + configuration file parameters.

+ +
+
+
thrQSizes
+ +
+

This variable defines the job input queues + configuration. There are three possible queues for + stages 2, 3 and 4, and this parameter should give + the queue depth for each stage (three integer + values). If a value of -1 is used for a given + stage, no queue is used, and the thread will go on + performing the next stage. In practice, deep queues + have not been shown to increase performance. A + value of 0 for the first queue tells Recoll to perform + autoconfiguration (no need for anything else in + this case, thrTCounts is not used) - this is the + default configuration.

+
+ +
thrTCounts
+ +
+

This defines the number of threads used for each + stage. If a value of -1 is used for one of the + queue depths, the corresponding thread count is + ignored. It makes no sense to use a value other + than 1 for the last stage because updating the + Xapian index is + necessarily single-threaded (and protected by a + mutex).

+
+
+
+ +

The following example would use three queues (of depth + 2), and 4 threads for converting source documents, 2 for + processing their text, and one to update the index. This + was tested to be the best configuration on the test + system (quadri-processor with multiple disks).

+
+thrQSizes = 2 2 2
+thrTCounts =  4 2 1
+
+ +

The following example would use a single queue, and + the complete processing for each document would be + performed by a single thread (several documents will + still be processed in parallel in most cases). The + threads will use mutual exclusion when entering the index + update stage. In practice the performance would be close + to the previous case in general, but worse in certain + cases (e.g. a Zip archive would be processed purely + sequentially), so the previous approach is preferred. + YMMV... The last 2 values for thrTCounts are ignored.

+
+thrQSizes = 2 -1 -1
+thrTCounts =  6 1 1
+
+ +

The following example would disable multithreading. + Indexing will be performed by a single thread.

+
+thrQSizes = -1 -1 -1
+
+
+

2.3.3. The + id="RCL.INDEXING.CONFIG.GUI">2.3.4. The index configuration GUI

@@ -2084,7 +2202,7 @@ fs.inotify.max_user_watches=32768 "varname">mondelaypatterns
parameter in the configuration + "5.4.2.5. Miscellaneous parameters">configuration section.

@@ -4021,7 +4139,7 @@ export RECOLL_EXTRA_DBS=/some/place/xapiandb:/some/other/db stemming databases which were built during indexing (this is set in the + "5.4.2. Recoll main configuration file, recoll.conf"> main configuration file), or later added with recollindex -s (See the recollindex manual). @@ -8077,338 +8195,279 @@ thesame = "some string with spaces"

5.4.2. The + "RCL.INSTALL.CONFIG.RECOLLCONF">5.4.2. Recoll main configuration file, recoll.conf

-

recoll.conf is the main - configuration file. It defines things like what to index - (top directories and things to ignore), and the default - character set to use for document types which do not - specify it internally.

- -

The default configuration will index your home - directory. If this is not appropriate, start recoll to create a - blank configuration, click Cancel, and edit the configuration file - before restarting the command. This will start the - initial indexing, which may take some time.

- -

Most of the following parameters can be changed from - the Index Configuration - menu in the recoll interface. Some - can only be set by editing the configuration file.

-

5.4.2.1. Parameters - affecting what documents we index:

+ "RCL.INSTALL.CONFIG.RECOLLCONF.WHATDOCS" id= + "RCL.INSTALL.CONFIG.RECOLLCONF.WHATDOCS">5.4.2.1. Parameters + affecting what documents we index
-
-
-
topdirs
+
+
topdirs
-
-

Specifies the list of directories or files to - index (recursively for directories). You can use - symbolic links as elements of this list. See the - followLinks option - about following symbolic links found under the - top elements (not followed by default).

-
+
+

Space-separated list of files or directories to + recursively index. Default to ~ (indexes $HOME). + You can use symbolic links in the list, they will + be followed, independently of the value of the + followLinks variable.

+
-
skippedNames
+
skippedNames
-
-

A space-separated list of wilcard patterns for - names of files or directories that should be - completely ignored. The list defined in the - default file is:

-
-skippedNames = #* bin CVS  Cache cache* caughtspam  tmp .thumbnails .svn \
-               *~ .beagle .git .hg .bzr loop.ps .xsession-errors \
-               .recoll* xapiandb recollrc recoll.conf 
-
+
+

Files and directories which should be ignored. + White space separated list of wildcard patterns + (simple ones, not paths, must contain no / ), which + will be tested against file and directory names. + The list in the default configuration does not + exclude hidden directories (names beginning with a + dot), which means that it may index quite a few + things that you do not want. On the other hand, + email user agents like Thunderbird usually store + messages in hidden directories, and you probably + want this indexed. One possible solution is to have + '.*' in 'skippedNames', and add things like + '~/.thunderbird' '~/.evolution' to 'topdirs'. Not + even the file names are indexed for patterns in + this list, see the 'noContentSuffixes' variable for + an alternative approach which indexes the file + names. Can be redefined for any subtree.

+
-

The list can be redefined at any sub-directory - in the indexed area.

+
noContentSuffixes
-

The top-level directories are not affected by - this list (that is, a directory in topdirs might match and would - still be indexed).

+
+

List of name endings (not necessarily + dot-separated suffixes) for which we don't try MIME + type identification, and don't uncompress or index + content. Only the names will be indexed. This + complements the now obsoleted recoll_noindex list + from the mimemap file, which will go away in a + future release (the move from mimemap to + recoll.conf allows editing the list through the + GUI). This is different from skippedNames because + these are name ending matches only (not wildcard + patterns), and the file name itself gets indexed + normally. This can be redefined for + subdirectories.

+
-

The list in the default configuration does not - exclude hidden directories (names beginning with - a dot), which means that it may index quite a few - things that you do not want. On the other hand, - email user agents like thunderbird usually store - messages in hidden directories, and you probably - want this indexed. One possible solution is to - have .* in - skippedNames, and - add things like ~/.thunderbird or ~/.evolution in topdirs.

+
skippedPaths
-

Not even the file names are indexed for - patterns in this list. See the noContentSuffixes variable for - an alternative approach which indexes the file - names.

- +
+

Paths we should not go into. Space-separated + list of wildcard expressions for filesystem paths. + Can contain files and directories. The database and + configuration directories will automatically be + added. The expressions are matched using + 'fnmatch(3)' with the FNM_PATHNAME flag set by + default. This means that '/' characters must be + matched explicitly. You can set + 'skippedPathsFnmPathname' to 0 to disable the use + of FNM_PATHNAME (meaning that '/*/dir3' will match + '/dir1/dir2/dir3'). The default value contains the + usual mount point for removable media to remind you + that it is a bad idea to have Recoll work on these + (esp. with the monitor: media gets indexed on + mount, all data gets erased on unmount). + Explicitly adding '/media/xxx' to the topdirs will + override this.

+
-
noContentSuffixes
+
+ skippedPathsFnmPathname
-
-

This is a list of file name endings (not - wildcard expressions, nor dot-delimited - suffixes). Only the names of matching files will - be indexed (no attempt at MIME type - identification, no decompression, no content - indexing). This can be redefined for - subdirectories, and edited from the GUI. The - default value is:

-
-noContentSuffixes = .md5 .map \
-       .o .lib .dll .a .sys .exe .com \
-       .mpp .mpt .vsd \
-           .img .img.gz .img.bz2 .img.xz .image .image.gz .image.bz2 .image.xz \
-       .dat .bak .rdf .log.gz .log .db .msf .pid \
-       ,v ~ #
-
-
+
+

Set to 0 to override use of FNM_PATHNAME for + matching skipped paths.

+
-
skippedPaths and daemSkippedPaths
+
daemSkippedPaths
-
-

A space-separated list of patterns for - paths of - files or directories that should be skipped. - There is no default in the sample configuration - file, but the code always adds the configuration - and database directories in there.

+
+

skippedPaths equivalent specific to real time + indexing. This enables having parts of the tree + which are initially indexed but not monitored. If + daemSkippedPaths is not set, the daemon uses + skippedPaths.

+
-

skippedPaths is - used both by batch and real time indexing. - daemSkippedPaths can - be used to specify things that should be indexed - at startup, but not monitored.

+
zipSkippedNames
-

Example of use for skipping text files only in - a specific directory:

-
-skippedPaths = ~/somedir/*.txt
-              
-
- +
+

Space-separated list of wildcard expressions for + names that should be ignored inside zip archives. + This is used directly by the zip handler, and has a + function similar to skippedNames, but works + independently. Can be redefined for subdirectories. + Supported by recoll 1.20 and newer. See + https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members

+
-
- skippedPathsFnmPathname
+
followLinks
-
-

The values in the *skippedPaths variables are - matched by default with fnmatch(3), with the - FNM_PATHNAME flag. This means that '/' characters - must be matched explicitely. You can set - skippedPathsFnmPathname to 0 to - disable the use of FNM_PATHNAME (meaning that - /*/dir3 will match /dir1/dir2/dir3).

-
+
+

Follow symbolic links during indexing. The + default is to ignore symbolic links to avoid + multiple indexing of linked files. No effort is + made to avoid duplication when this option is set + to true. This option can be set individually for + each of the 'topdirs' members by using sections. It + can not be changed below the 'topdirs' level. Links + in the 'topdirs' list itself are always + followed.

+
-
zipSkippedNames
+
indexedmimetypes
-
-

A space-separated list of patterns for names - of files or directories that should be ignored - inside zip archives. This is used directly by the - zip handler, and has a function similar to - skippedNames, but works independantly. Can be - redefined for filesystem subdirectories. For - versions up to 1.19, you will need to update the - Zip handler and install a supplementary Python - module. The details are described on the Recoll wiki.

-
+
+

Restrictive list of indexed mime types. Normally + not set (in which case all supported types are + indexed). If it is set, only the types from the + list will have their contents indexed. The names + will be indexed anyway if indexallfilenames is set + (default). MIME type names should be taken from the + mimemap file. Can be redefined for subtrees.

+
-
followLinks
+
excludedmimetypes
-
-

Specifies if the indexer should follow - symbolic links while walking the file tree. The - default is to ignore symbolic links to avoid - multiple indexing of linked files. No effort is - made to avoid duplication when this option is set - to true. This option can be set individually for - each of the topdirs - members by using sections. It can not be changed - below the topdirs - level.

-
+
+

List of excluded MIME types. Lets you exclude + some types from indexing. Can be redefined for + subtrees.

+
-
indexedmimetypes
+
compressedfilemaxkbs
-
-

Recoll - normally indexes any file which it knows how to - read. This list lets you restrict the indexed - MIME types to what you specify. If the variable - is unspecified or the list empty (the default), - all supported types are processed. Can be - redefined for subdirectories.

-
+
+

Size limit for compressed files. We need to + decompress these in a temporary directory for + identification, which can be wasteful in some + cases. Limit the waste. Negative means no limit. 0 + results in no processing of any compressed file. + Default 50 MB.

+
-
excludedmimetypes
+
textfilemaxmbs
-
-

This list lets you exclude some MIME types - from indexing. Can be redefined for - subdirectories.

-
+
+

Size limit for text files. Mostly for skipping + monster logs. Default 20 MB.

+
-
compressedfilemaxkbs
+
indexallfilenames
-
-

Size limit for compressed (.gz or .bz2) files. - These need to be decompressed in a temporary - directory for identification, which can be very - wasteful if 'uninteresting' big compressed files - are present. Negative means no limit, 0 means no - processing of any compressed file. Defaults to - -1.

-
+
+

Index the file names of unprocessed files. Index + the names of files the contents of which we don't + index because of an excluded or unsupported MIME + type.

+
-
textfilemaxmbs
+
usesystemfilecommand
-
-

Maximum size for text files. Very big text - files are often uninteresting logs. Set to -1 to - disable (default 20MB).

-
+
+

Use a system command for file MIME type guessing + as a final step in file type identification. This is + generally useful, but will usually cause the + indexing of many bogus 'text' files. See + 'systemfilecommand' for the command used.

+
-
textfilepagekbs
+
systemfilecommand
-
-

If set to other than -1, text files will be - indexed as multiple documents of the given page - size. This may be useful if you do want to index - very big text files as it will both reduce memory - usage at index time and help with loading data to - the preview window. A size of a few megabytes - would seem reasonable (default: 1MB).

-
+
+

Command used to guess MIME types if the internal + methods fail. This should be a "file -i" workalike. + The file path will be added as a last parameter to + the command line. 'xdg-mime' works better than the + traditional 'file' command, and is now the + configured default (with a hard-coded fallback to + 'file').

+
-
membermaxkbs
+
processwebqueue
-
-

This defines the maximum size in kilobytes for - an archive member (zip, tar or rar at the - moment). Bigger entries will be skipped.

-
+
+

Decide if we process the Web queue. The queue is + a directory where the Recoll Web browser plugins + create the copies of visited pages.

+
-
indexallfilenames
+
textfilepagekbs
-
-

Recoll - indexes file names in a special section of the - database to allow specific file names searches - using wild cards. This parameter decides if file - name indexing is performed only for files with - MIME types that would qualify them for full text - indexing, or for all files inside the selected - subtrees, independently of MIME type.

-
+
+

Page size for text files. If this is set, + text/plain files will be divided into documents of + approximately this size. Will reduce memory usage + at index time and help with loading data in the + preview window at query time. Particularly useful + with very big files, such as application or system + logs. Also see textfilemaxmbs and + compressedfilemaxkbs.

+
-
usesystemfilecommand
+
membermaxkbs
-
-

Decide if we execute a system command - (file - -i by default) as a - final step for determining the MIME type for a - file (the main procedure uses suffix associations - as defined in the mimemap file). This can be - useful for files with suffix-less names, but it - will also cause the indexing of many bogus "text" - files.

-
- -
systemfilecommand
- -
-

Command to use for mime for mime type - determination if usesystefilecommand is set. - Recent versions of xdg-mime - sometimes work better than file.

-
- -
processwebqueue
- -
-

If this is set, process the directory where - Web browser plugins copy visited pages for - indexing.

-
- -
webqueuedir
- -
-

The path to the web indexing queue. This is - hard-coded in the Firefox plugin as ~/.recollweb/ToIndex so there - should be no need to change it.

-
-
-
+
+

Size limit for archive members. This is passed + to the filters in the environment as + RECOLL_FILTER_MAXMEMBERKB.

+
+
@@ -8418,317 +8477,233 @@ skippedPaths = ~/somedir/*.txt

5.4.2.2. Parameters - affecting how we generate terms:

+ affecting how we generate terms
-

Changing some of these parameters will imply a full - reindex. Also, when using multiple indexes, it may not - make sense to search indexes that don't share the - values for these parameters, because they usually - affect both search and index operations.

+
+
indexStripChars
-
-
-
indexStripChars
+
+

Decide if we store character case and diacritics + in the index. If we do, searches sensitive to case + and diacritics can be performed, but the index will + be bigger, and some marginal weirdness may + sometimes occur. The default is a stripped index. + When using multiple indexes for a search, this + parameter must be defined identically for all. + Changing the value implies an index reset.

+
-
-

Decide if we strip characters of diacritics - and convert them to lower-case before terms are - indexed. If we don't, searches sensitive to case - and diacritics can be performed, but the index - will be bigger, and some marginal weirdness may - sometimes occur. The default is a stripped index - (indexStripChars = - 1) for now. When using multiple indexes - for a search, this parameter must be defined - identically for all. Changing the value implies - an index reset.

-
+
nonumbers
-
maxTermExpand
+
+

Decides if terms will be generated for numbers. + For example "123", "1.5e6", 192.168.1.4, would not + be indexed if nonumbers is set ("value123" would + still be). Numbers are often quite interesting to + search for, and this should probably not be set + except for special situations, ie, scientific + documents with huge amounts of numbers in them, + where setting nonumbers will reduce the index size. + This can only be set for a whole index, not for a + subtree.

+
-
-

Maximum expansion count for a single term - (e.g.: when using wildcards). The default of - 10000 is reasonable and will avoid queries that - appear frozen while the engine is walking the - term list.

-
+
dehyphenate
-
maxXapianClauses
+
+

Determines if we index 'coworker' also when the + input is 'co-worker'. This is new in version 1.22, + and on by default. Setting the variable to off + allows restoring the previous behaviour.

+
-
-

Maximum number of elementary clauses we can - add to a single Xapian query. In some cases, the - result of term expansion can be multiplicative, - and we want to avoid using excessive memory. The - default of 100 000 should be both high enough in - most cases and compatible with current typical - hardware configurations.

-
+
nocjk
-
nonumbers
+
+

Decides if specific East Asian (Chinese Korean + Japanese) characters/word splitting is turned off. + This will save a small amount of CPU if you have no + CJK documents. If your document base does include + such text but you are not interested in searching + it, setting nocjk may be a significant time and + space saver.

+
-
-

If this set to true, no terms will be - generated for numbers. For example "123", - "1.5e6", 192.168.1.4, would not be indexed - ("value123" would still be). Numbers are often - quite interesting to search for, and this should - probably not be set except for special - situations, ie, scientific documents with huge - amounts of numbers in them. This can only be set - for a whole index, not for a subtree.

-
+
cjkngramlen
-
dehyphenate
+
+

This lets you adjust the size of n-grams used + for indexing CJK text. The default value of 2 is + probably appropriate in most cases. A value of 3 + would allow more precision and efficiency on longer + words, but the index will be approximately twice as + large.

+
-
-

Determines if, given an input of co-worker, we add a term for - coworker. This - possibility is new in version 1.22, and on by - default. Setting the variable to off allows - restoring the previous behaviour.

-
+
+ indexstemminglanguages
-
nocjk
+
+

Languages for which to create stemming expansion + data. Stemmer names can be found by executing + 'recollindex -l', or this can also be set from a + list in the GUI.

+
-
-

If this set to true, specific east asian - (Chinese Korean Japanese) characters/word - splitting is turned off. This will save a small - amount of cpu if you have no CJK documents. If - your document base does include such text but you - are not interested in searching it, setting - nocjk may be a - significant time and space saver.

-
+
defaultcharset
-
cjkngramlen
+
+

Default character set. This is used for files + which do not contain a character set definition + (e.g.: text/plain). Values found inside files, e.g. + a 'charset' tag in HTML documents, will override + it. If this is not set, the default character set + is the one defined by the NLS environment ($LC_ALL, + $LC_CTYPE, $LANG), or ultimately iso-8859-1 + (cp-1252 in fact). If for some reason you want a + general default which does not match your LANG and + is not 8859-1, use this variable. This can be + redefined for any sub-directory.

+
-
-

This lets you adjust the size of n-grams used - for indexing CJK text. The default value of 2 is - probably appropriate in most cases. A value of 3 - would allow more precision and efficiency on - longer words, but the index will be approximately - twice as large.

-
+
unac_except_trans
-
indexstemminglanguages
+
+

A list of characters, encoded in UTF-8, which + should be handled specially when converting text to + unaccented lowercase. For example, in Swedish, the + letter a with diaeresis has full alphabet + citizenship and should not be turned into an a. + Each element in the space-separated list has the + special character as first element and the + translation following. The handling of both the + lowercase and upper-case versions of a character + should be specified, as membership in the list + will turn off both standard accent and case + processing. The value is global and affects both + indexing and querying. Examples: Swedish: + unac_except_trans = ää Ää + öö Öö üü Üü + ßss œoe Œoe æae Æae + ffff fifi flfl åå + Åå . German: unac_except_trans = + ää Ää öö Öö + üü Üü ßss œoe + Œoe æae Æae ffff fifi + flfl In French, you probably want to + decompose oe and ae and nobody would type a German + ß unac_except_trans = ßss œoe + Œoe æae Æae ffff fifi + flfl . The default for all until someone + protests follows. These decompositions are not + performed by unac, but it is unlikely that someone + would type the composed forms in a search. + unac_except_trans = ßss œoe Œoe + æae Æae ffff fifi + flfl

+
-
-

A list of languages for which the stem - expansion databases will be built. See - recollindex(1) or - use the recollindex - -l command for - possible values. You can add a stem expansion - database for a different language by using - recollindex - -s, but it will be - deleted during the next indexing. Only languages - listed in the configuration file are - permanent.

-
+
maildefcharset
-
defaultcharset
+
+

Overrides the default character set for email + messages which don't specify one. This is mainly + useful for readpst (libpst) dumps, which are utf-8 + but do not say so.

+
-
-

The name of the character set used for files - that do not contain a character set definition - (ie: plain text files). This can be redefined for - any sub-directory. If it is not set at all, the - character set used is the one defined by the nls - environment ( LC_ALL, - LC_CTYPE, LANG), or iso8859-1 if nothing is set.

-
+
localfields
-
unac_except_trans
+
+

Set fields on all files (usually of a specific + fs area). Syntax is the usual: name = value ; attr1 + = val1 ; [...] value is empty so this needs an + initial semi-colon. This is useful, e.g., for + setting the rclaptg field for application selection + inside mimeview.

+
-
-

This is a list of characters, encoded in - UTF-8, which should be handled specially when - converting text to unaccented lowercase. For - example, in Swedish, the letter a with diaeresis has full - alphabet citizenship and should not be turned - into an a. Each - element in the space-separated list has the - special character as first element and the - translation following. The handling of both the - lowercase and upper-case versions of a character - should be specified, as appartenance to the list - will turn-off both standard accent and case - processing. Example for Swedish:

-
-unac_except_trans =  åå Åå ää Ää öö Öö
-            
-
+
testmodifusemtime
-

Note that the translation is not limited to a - single character, you could very well have - something like üue in the list.

+
+

Use mtime instead of ctime to test if a file has + been modified. The time is used in addition to the + size, which is always used. Setting this can reduce + re-indexing on systems where extended attributes + are used (by some other application), but not + indexed, because changing extended attributes only + affects ctime. Notes: - This may prevent detection + of change in some marginal file rename cases (the + target would need to have the same size and mtime). + - You should probably also set noxattrfields to 1 + in this case, except if you still prefer to perform + xattr indexing, for example if the local file + update pattern makes it of value (as in general, + there is a risk for pure extended attributes + updates without file modification to go + undetected). Perform a full index reset after + changing this.

+
-

The default value set for unac_except_trans can't be - listed here because I have trouble with SGML and - UTF-8, but it only contains ligature - decompositions: german ss, oe, ae, fi, fl.

+
noxattrfields
-

This parameter can't be defined for - subdirectories, it is global, because there is no - way to do otherwise when querying. If you have - document sets which would need different values, - you will have to index and query them - separately.

- +
+

Disable extended attributes conversion to + metadata fields. This probably needs to be set if + testmodifusemtime is set.

+
-
maildefcharset
+
metadatacmds
-
-

This can be used to define the default - character set specifically for email messages - which don't specify it. This is mainly useful for - readpst (libpst) dumps, which are utf-8 but do - not say so.

-
- -
localfields
- -
-

This allows setting fields for all documents - under a given directory. Typical usage would be - to set an "rclaptg" field, to be used in - mimeview to select - a specific viewer. If several fields are to be - set, they should be separated with a semi-colon - (';') character, which there is currently no way - to escape. Also note the initial semi-colon. - Example: localfields= - ;rclaptg=gnus;other = val, then select - specifier viewer with mimetype|tag=... in mimeview.

-
- -
testmodifusemtime
- -
-

If true, use mtime instead of default ctime to - determine if a file has been modified (in - addition to size, which is always used). Setting - this can reduce re-indexing on systems where - extended attributes are modified (by some other - application), but not indexed (changing extended - attributes only affects ctime). Notes:

- -
-
    -
  • -

    This may prevent detection of change in - some marginal file rename cases (the target - would need to have the same size and - mtime).

    -
  • - -
  • -

    You should probably also set - noxattrfields to 1 in this case, except if - you still prefer to perform xattr indexing, - for example if the local file update - pattern makes it of value (as in general, - there is a risk for pure extended - attributes updates without file - modification to go undetected).

    -
  • -
-
- -

Perform a full index reset after changing the - value of this parameter.

-
- -
noxattrfields
- -
-

Recoll versions 1.19 and later automatically - translate file extended attributes into document - fields (to be processed according to the - parameters from the fields file). Setting this - variable to 1 will disable the behaviour.

-
- -
metadatacmds
- -
-

This allows executing external commands for - each file and storing the output in Recoll document fields. This - could be used for example to index external tag - data. The value is a list of field names and - commands, don't forget an initial semi-colon. - Example:

-
-[/some/area/of/the/fs]
-metadatacmds = ; tags = tmsu tags %f; otherfield = somecmd -xx %f
-                
-
- -

As a specially disgusting hack brought by - Recoll 1.19.7, - if a "field name" begins with rclmulti, the data returned by - the command is expected to contain multiple field - values, in configuration file format. This allows - setting several fields by executing a single - command. Example:

-
-metadatacmds = ; rclmulti1 = somecmd %f
-                
-
- -

If somecmd - returns data in the form of:

-
-field1 = value1
-field2 = value for field2
-                
-
- -

field1 and - field2 will be set - inside the document metadata.

-
-
-
+
+

Define commands to gather external metadata, + e.g. tmsu tags. There can be several entries, + separated by semi-colons, each defining which field + name the data goes into and the command to use. + Don't forget the initial semi-colon. All the field + names must be different. You can use aliases in the + "fields" file if necessary. As a not too pretty hack + conceded to convenience, any field name beginning + with "rclmulti" will be taken as an indication that + the command returns multiple field values inside a + text blob formatted as a recoll configuration file + ("fieldname = fieldvalue" lines). The rclmultixx + name will be ignored, and field names and values + will be parsed from the data. Example: metadatacmds + = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf + %f

+
+
@@ -8736,142 +8711,161 @@ field2 = value for field2

5.4.2.3. Parameters - affecting where and how we store things:

+ "RCL.INSTALL.CONFIG.RECOLLCONF.STORE" id= + "RCL.INSTALL.CONFIG.RECOLLCONF.STORE">5.4.2.3. Parameters + affecting where and how we store things
-
-
-
cachedir
+
+
cachedir
-
-

When not explicitly specified, the - Recoll data - directories are stored relative to the - configuration directory. If cachedir is set, the directories - are stored under the specified value instead - (e.g. if cachedir is - set to ~/.cache/recoll, the default - dbdir would be - ~/.cache/recoll/xapiandb - instead of ~/.recoll/xapiandb ). This - affects the default values for dbdir, webcachedir, mboxcachedir, and aspellDicDir, which can still be - individually specified to override cachedir. Note that if you have - multiple configurations, each must have a - different cachedir.

-
+
+

Top directory for Recoll data. Recoll data + directories are normally located relative to the + configuration directory (e.g. ~/.recoll/xapiandb, + ~/.recoll/mboxcache). If 'cachedir' is set, the + directories are stored under the specified value + instead (e.g. if cachedir is ~/.cache/recoll, the + default dbdir would be ~/.cache/recoll/xapiandb). + This affects dbdir, webcachedir, mboxcachedir, + aspellDicDir, which can still be individually + specified to override cachedir. Note that if you + have multiple configurations, each must have a + different cachedir, there is no automatic + computation of a subpath under cachedir.

+
-
dbdir
+
maxfsoccuppc
-
-

The name of the Xapian data directory. It will - be created if needed when the index is - initialized. If this is not an absolute path, it - will be interpreted relative to the configuration - directory. The value can have embedded spaces but - starting or trailing spaces will be trimmed. You - cannot use quotes here.

-
+
+

Maximum file system occupation over which we + stop indexing. The value is a percentage, + corresponding to what the "Capacity" df output + column shows. The default value is 0, meaning no + checking.

+
-
idxstatusfile
+
dbdir
-
-

The name of the scratch file where the indexer - process updates its status. Default: idxstatus.txt inside the - configuration directory.

-
+
+

Xapian database directory location. This will be + created on first indexing. If the value is not an + absolute path, it will be interpreted as relative + to cachedir if set, or the configuration directory + (-c argument or $RECOLL_CONFDIR). If nothing is + specified, the default is then + ~/.recoll/xapiandb/

+
-
maxfsoccuppc
+
idxstatusfile
-
-

Maximum file system occupation before we stop - indexing. The value is a percentage, - corresponding to what the "Capacity" df output - column shows. The default value is 0, meaning no - checking.

-
+
+

Name of the scratch file where the indexer + process updates its status. Default: idxstatus.txt + inside the configuration directory.

+
-
mboxcachedir
+
mboxcachedir
-
-

The directory where mbox message offsets cache - files are held. This is normally - $RECOLL_CONFDIR/mboxcache, but it may be useful - to share a directory between different - configurations.

-
+
+

Directory location for storing mbox message + offsets cache files. This is normally 'mboxcache' + under cachedir if set, or else under the + configuration directory, but it may be useful to + share a directory between different + configurations.

+
-
mboxcacheminmbs
+
mboxcacheminmbs
-
-

The minimum mbox file size over which we cache - the offsets. There is really no sense in caching - offsets for small files. The default is 5 MB.

-
+
+

Minimum mbox file size over which we cache the + offsets. There is really no sense in caching + offsets for small files. The default is 5 MB.

+
-
webcachedir
+
webcachedir
-
-

This is only used by the web browser plugin - indexing code, and defines where the cache for - visited pages will live. Default: $RECOLL_CONFDIR/webcache

-
+
+

Directory where we store the archived web pages. + This is only used by the web history indexing code. + Default: cachedir/webcache if cachedir is set, else + $RECOLL_CONFDIR/webcache

+
-
webcachemaxmbs
+
webcachemaxmbs
-
-

This is only used by the web browser plugin - indexing code, and defines the maximum size for - the web page cache. Default: 40 MB. Quite - unfortunately, this is only taken into account - when creating the cache file. You need to delete - the file for a change to be taken into - account.

-
+
+

Maximum size in MB of the Web archive. This is + only used by the web history indexing code. + Default: 40 MB. Reducing the size will not + physically truncate the file.

+
-
idxflushmb
+
webqueuedir
-
-

Threshold (megabytes of new text data) where - we flush from memory to disk index. Setting this - can help control memory usage. A value of 0 means - no explicit flushing, letting Xapian use its own - default, which is flushing every 10000 (or - XAPIAN_FLUSH_THRESHOLD) documents, which gives - little memory usage control, as memory usage also - depends on average document size. The default - value is 10, and it is probably a bit low. If - your system usually has free memory, you can try - higher values between 20 and 80. In my - experience, values beyond 100 are always - counterproductive.

-
-
-
+
+

The path to the Web indexing queue. This is + hard-coded in the plugin as ~/.recollweb/ToIndex so + there should be no need or possibility to change + it.

+
+ +
aspellDicDir
+ +
+

Aspell dictionary storage directory location. + The aspell dictionary (aspdict.(lang).rws) is + normally stored in the directory specified by + cachedir if set, or under the configuration + directory.

+
+ +
filtersdir
+ +
+

Directory location for executable input + handlers. If RECOLL_FILTERSDIR is set in the + environment, we use it instead. Defaults to + $prefix/share/recoll/filters. Can be redefined for + subdirectories.

+
+ +
iconsdir
+ +
+

Directory location for icons. The only reason to + change this would be if you want to change the + icons displayed in the result list. Defaults to + $prefix/share/recoll/images

+
+
@@ -8879,117 +8873,102 @@ field2 = value for field2

5.4.2.4. Parameters - affecting multithread processing

+ "RCL.INSTALL.CONFIG.RECOLLCONF.PERFS" id= + "RCL.INSTALL.CONFIG.RECOLLCONF.PERFS">5.4.2.4. Parameters + affecting indexing performance and resource + usage
-

The Recoll indexing - process recollindex can use - multiple threads to speed up indexing on multiprocessor - systems. The work done to index files is divided in - several stages and some of the stages can be executed - by multiple threads. The stages are:

+
+
idxflushmb
-
-
    -
  1. File system walking: this is - always performed by the main thread.
  2. +
    +

    Threshold (megabytes of new data) where we flush + from memory to disk index. Setting this allows some + control over memory usage by the indexer process. A + value of 0 means no explicit flushing, which lets + Xapian perform its own thing, meaning flushing + every $XAPIAN_FLUSH_THRESHOLD documents created, + modified or deleted: as memory usage depends on + average document size, not only document count, the + Xapian approach is not very useful, and you + should let Recoll manage the flushes. The default + value of idxflushmb is 10 MB, and may be a bit low. + If you are looking for maximum speed, you may want + to experiment with values between 20 and 80. In my + experience, values beyond 100 are always + counterproductive. If you find otherwise, please + drop me a note.

    +
    -
  3. File conversion and data - extraction.
  4. +
    filtermaxseconds
    -
  5. Text processing (splitting, - stemming, etc.)
  6. +
    +

    Maximum external filter execution time in + seconds. Default 1200 (20mn). Set to 0 for no + limit. This is mainly to avoid infinite loops in + postscript files (loop.ps)

    +
    -
  7. Xapian index update.
  8. -
-
+
filtermaxmbytes
-

You can also read a longer document about the - transformation of Recoll indexing to - multithreading.

+
+

Maximum virtual memory space for filter + processes (setrlimit(RLIMIT_AS)), in megabytes. + Note that this includes any mapped libs (there is + no reliable Linux way to limit the data space + only), so we need to be a bit generous here. + Anything over 2000 will be ignored on 32 bits + machines.

+
-

The threads configuration is controlled by two - configuration file parameters.

+
thrQSizes
-
-
-
thrQSizes
+
+

Stage input queues configuration. There are + three internal queues in the indexing pipeline + stages (file data extraction, terms generation, + index update). This parameter defines the queue + depths for each stage (three integer values). If a + value of -1 is given for a given stage, no queue is + used, and the thread will go on performing the next + stage. In practice, deep queues have not been shown + to increase performance. Default: a value of 0 for + the first queue tells Recoll to perform + autoconfiguration based on the detected number of + CPUs (no need for the two other values in this + case). Use thrQSizes = -1 -1 -1 to disable + multithreading entirely.

+
-
-

This variable defines the job input queues - configuration. There are three possible queues - for stages 2, 3 and 4, and this parameter should - give the queue depth for each stage (three - integer values). If a value of -1 is used for a - given stage, no queue is used, and the thread - will go on performing the next stage. In - practise, deep queues have not been shown to - increase performance. A value of 0 for the first - queue tells Recoll to perform - autoconfiguration (no need for the two other - values in this case) - this is the default - configuration.

-
+
thrTCounts
-
thrTCounts
- -
-

This defines the number of threads used for - each stage. If a value of -1 is used for one of - the queue depths, the corresponding thread count - is ignored. It makes no sense to use a value - other than 1 for the last stage because updating - the Xapian index - is necessarily single-threaded (and protected by - a mutex).

-
-
-
- -

The following example would use three queues (of - depth 2), and 4 threads for converting source - documents, 2 for processing their text, and one to - update the index. This was tested to be the best - configuration on the test system (quadri-processor with - multiple disks).

-
-thrQSizes = 2 2 2
-thrTCounts =  4 2 1
-
- -

The following example would use a single queue, and - the complete processing for each document would be - performed by a single thread (several documents will - still be processed in parallel in most cases). The - threads will use mutual exclusion when entering the - index update stage. In practise the performance would - be close to the precedent case in general, but worse in - certain cases (e.g. a Zip archive would be performed - purely sequentially), so the previous approach is - preferred. YMMV... The 2 last values for thrTCounts are - ignored.

-
-thrQSizes = 2 -1 -1
-thrTCounts =  6 1 1
-
- -

The following example would disable multithreading. - Indexing will be performed by a single thread.

-
-thrQSizes = -1 -1 -1
-
+
+

Number of threads used for each indexing stage. + The three stages are: file data extraction, terms + generation, index update. The use of the counts is + also controlled by some special values in + thrQSizes: if the first queue depth is 0, all + counts are ignored (autoconfigured); if a value of + -1 is used for a queue depth, the corresponding + thread count is ignored. It makes no sense to use a + value other than 1 for the last stage because + updating the Xapian index is necessarily + single-threaded (and protected by a mutex).

+
+
@@ -8999,259 +8978,400 @@ thrQSizes = -1 -1 -1

5.4.2.5. Miscellaneous - parameters:

+ parameters
-
-
-
autodiacsens
+
+
loglevel
-
-

IF the index is not stripped, decide if we - automatically trigger diacritics sensitivity if - the search term has accented characters (not in - unac_except_trans). - Else you need to use the query language and the - D modifier to - specify diacritics sensitivity. Default is - no.

-
+
+

Log file verbosity 1-6. A value of 2 will print + only errors and warnings. 3 will print information + like document updates, 4 is quite verbose and 6 + very verbose.

+
-
autocasesens
+
logfilename
-
-

IF the index is not stripped, decide if we - automatically trigger character case sensitivity - if the search term has upper-case characters in - any but the first position. Else you need to use - the query language and the C modifier to specify - character-case sensitivity. Default is yes.

-
+
+

Log file destination. Use 'stderr' (default) to + write to the console.

+
-
loglevel,daemloglevel
+
idxloglevel
-
-

Verbosity level for recoll and recollindex. A - value of 4 lists quite a lot of debug/information - messages. 2 only lists errors. The daemversion is specific to the - indexing monitor daemon.

-
+
+

Override loglevel for the indexer.

+
-
logfilename, - daemlogfilename
+
idxlogfilename
-
-

Where the messages should go. 'stderr' can be - used as a special value, and is the default. The - daemversion is - specific to the indexing monitor daemon.

-
+
+

Override logfilename for the indexer.

+
-
checkneedretryindexscript
+
daemloglevel
-
-

This defines the name for a command executed - by recollindex - when starting indexing. If the exit status of the - command is 0, recollindex - retries to index all files which previously could - not be indexed because of data extraction errors. - The default value is a script which checks if any - of the common bin - directories have changed (indicating that a - helper program may have been installed).

-
+
+

Override loglevel for the indexer in real time + mode. The default is to use the idx... values if + set, else the log... values.

+
-
mondelaypatterns
+
daemlogfilename
-
-

This allows specify wildcard path patterns - (processed with fnmatch(3) with 0 flag), to match - files which change too often and for which a - delay should be observed before re-indexing. This - is a space-separated list, each entry being a - pattern and a time in seconds, separated by a - colon. You can use double quotes if a path entry - contains white space. Example:

-
-mondelaypatterns = *.log:20 "this one has spaces*:10"
-              
-
-
+
+

Override logfilename for the indexer in real + time mode. The default is to use the idx... values + if set, else the log... values.

+
-
monixinterval
+
idxrundir
-
-

Minimum interval (seconds) for processing the - indexing queue. The real time monitor does not - process each event when it comes in, but will - wait this time for the queue to accumulate to - diminish overhead and in order to aggregate - multiple events to the same file. Default 30 - S.

-
+
+

Indexing process current directory. The input + handlers sometimes leave temporary files in the + current directory, so it makes sense to have + recollindex chdir to some temporary directory. If + the value is empty, the current directory is not + changed. If the value is (literal) tmp, we use the + temporary directory as set by the environment + (RECOLL_TMPDIR else TMPDIR else /tmp). If the value + is an absolute path to a directory, we go + there.

+
-
monauxinterval
+
+ checkneedretryindexscript
-
-

Period (in seconds) at which the real time - monitor will regenerate the auxiliary databases - (spelling, stemming) if needed. The default is - one hour.

-
+
+

Script used to heuristically check if we need to + retry indexing files which previously failed. The + default script checks the modified dates on + /usr/bin and /usr/local/bin. A relative path will + be looked up in the filters dirs, then in the path. + Use an absolute path to do otherwise.

+
-
monioniceclass, - monioniceclassdata
+
recollhelperpath
-
-

These allow defining the ionice class and data used - by the indexer (default class 3, no data).

-
+
+

Additional places to search for helper + executables. This is only used on Windows for + now.

+
-
filtermaxseconds
+
idxabsmlen
-
-

Maximum handler execution time, after which it - is aborted. Some postscript programs just - loop...

-
+
+

Length of abstracts we store while indexing. + Recoll stores an abstract for each indexed file. + The text can come from an actual 'abstract' section + in the document or will just be the beginning of + the document. It is stored in the index so that it + can be displayed inside the result lists without + decoding the original file. The idxabsmlen + parameter defines the size of the stored abstract. + The default value is 250 bytes. The search + interface gives you the choice to display this + stored text or a synthetic abstract built by + extracting text around the search terms. If you + always prefer the synthetic abstract, you can + reduce this value and save a little space.

+
-
filtermaxmbytes
+
idxmetastoredlen
-
-

Recoll 1.20.7 - and later. Maximum handler memory utilisation. - This uses setrlimit(RLIMIT_AS) on most systems - (total virtual memory space size limit). Some - programs may start with 500 MBytes of mapped - shared libraries, so take this into account when - choosing a value. The default is a liberal - 2000MB.

-
+
+

Truncation length of stored metadata fields. + This does not affect indexing (the whole field is + processed anyway), just the amount of data stored + in the index for the purpose of displaying fields + inside result lists or previews. The default value + is 150 bytes which may be too low if you have + custom fields.

+
-
filtersdir
+
aspellLanguage
-
-

A directory to search for the external input - handler scripts used to index some types of - files. The value should not be changed, except if - you want to modify one of the default scripts. - The value can be redefined for any - sub-directory.

-
+
+

Language definitions to use when creating the + aspell dictionary. The value must match a set of + aspell language definition files. You can type + "aspell dicts" to see a list. The default if this is + not set is to use the NLS environment to guess the + value.

+
-
iconsdir
+
aspellAddCreateParam
-
-

The name of the directory where recoll result - list icons are stored. You can change this if you - want different images.

-
+
+

Additional option and parameter to aspell + dictionary creation command. Some aspell packages + may need an additional option (e.g. on Debian + Jessie: --local-data-dir=/usr/lib/aspell). See + Debian bug 772415.

+
-
idxabsmlen
+
aspellKeepStderr
-
-

Recoll stores - an abstract for each indexed file inside the - database. The text can come from an actual - 'abstract' section in the document or will just - be the beginning of the document. It is stored in - the index so that it can be displayed inside the - result lists without decoding the original file. - The idxabsmlen - parameter defines the size of the stored - abstract. The default value is 250 bytes. The - search interface gives you the choice to display - this stored text or a synthetic abstract built by - extracting text around the search terms. If you - always prefer the synthetic abstract, you can - reduce this value and save a little space.

-
+
+

Set this to have a look at aspell dictionary + creation errors. There are always many, so this is + mostly for debugging.

+
-
idxmetastoredlen
+
noaspell
-
-

Maximum stored length for metadata fields. - This does not affect indexing (the whole field is - processed anyway), just the amount of data stored - in the index for the purpose of displaying fields - inside result lists or previews. The default - value is 150 bytes which may be too low if you - have custom fields.

-
+
+

Disable aspell use. The aspell dictionary + generation takes time, and some combinations of + aspell version, language, and local terms, result + in aspell crashing, so it sometimes makes sense to + just disable the thing.

+
-
aspellLanguage
+
monauxinterval
-
-

Language definitions to use when creating the - aspell dictionary. The value must match a set of - aspell language definition files. You can type - "aspell config" to see where these are installed - (look for data-dir). The default if the variable - is not set is to use your desktop national - language environment to guess the value.

-
+
+

Auxiliary database update interval. The real + time indexer only updates the auxiliary databases + (stemdb, aspell) periodically, because it would be + too costly to do it for every document change. The + default period is one hour.

+
-
noaspell
+
monixinterval
-
-

If this is set, the aspell dictionary - generation is turned off. Useful for cases where - you don't need the functionality or when it is - unusable because aspell crashes during dictionary - generation.

-
+
+

Minimum interval (seconds) between processings + of the indexing queue. The real time indexer does + not process each event when it comes in, but lets + the queue accumulate, to diminish overhead and to + aggregate multiple events affecting the same file. + Default 30 S.

+
-
mhmboxquirks
+
mondelaypatterns
-
-

This allows definining location-related quirks - for the mailbox handler. Currently only the - tbird flag is - defined, and it should be set for directories - which hold Thunderbird data, as their - folder format is weird. Example:

-
-[/path/to/my/mozilla/mail] 
-mhmboxquirks = tbird
-
+
+

Timing parameters for the real time indexing. + Definitions for files which get a longer delay + before reindexing is allowed. This is for + fast-changing files, that should only be reindexed + once in a while. A list of wildcardPattern:seconds + pairs. The patterns are matched with + fnmatch(pattern, path, 0). You can quote entries + containing white space with double quotes (quote + the whole entry, not the pattern). The default is + empty. Example: mondelaypatterns = *.log:20 "*with + spaces.*:30"

+
-

It should be noted that later Recoll versions have - improved automatic detection of Thunderbird folders, so that - this should not be needed at all in most - cases.

- -
+
monioniceclass
+ +
+

ionice class for the real time indexing process, + on platforms where this is supported. The default + value is 3.

+
+ +
+ monioniceclassdata
+ +
+

ionice class parameter for the real time + indexing process. On platforms where this is + supported. The default is empty.

+
+
+
+ +
+
+
+
+

5.4.2.6. Query-time + parameters (no impact on the index)

+
+
+ +
+
autodiacsens
+ +
+

Auto-trigger diacritics sensitivity (raw index + only). If the index is not stripped, decide if we + automatically trigger diacritics sensitivity if the + search term has accented characters (not in + unac_except_trans). Else you need to use the query + language and the "D" modifier to specify diacritics + sensitivity. Default is no.

+
+ +
autocasesens
+ +
+

Auto-trigger case sensitivity (raw index only). + If the index is not stripped (see indexStripChars), + decide if we automatically trigger character case + sensitivity if the search term has upper-case + characters in any but the first position. Else you + need to use the query language and the "C" modifier + to specify character-case sensitivity. Default is + yes.

+
+ +
maxTermExpand
+ +
+

Maximum query expansion count for a single term + (e.g.: when using wildcards). This only affects + queries, not indexing. We used to not limit this at + all (except for filenames where the limit was too + low at 1000), but it is unreasonable with a big + index. Default 10000.

+
+ +
maxXapianClauses
+ +
+

Maximum number of clauses we add to a single + Xapian query. This only affects queries, not + indexing. In some cases, the result of term + expansion can be multiplicative, and we want to + avoid eating all the memory. Default 50000.

+
+ +
snippetMaxPosWalk
+ +
+

Maximum number of positions we walk while + populating a snippet for the result list. The + default of 1,000,000 may be insufficient for very + big documents; the consequence would be snippets + with possibly meaning-altering missing words.

+
+
+
+ +
+
+
+
+

5.4.2.7. Parameters + for the PDF input script

+
+
+
+ +
+
pdfocr
+ +
+

Attempt OCR of PDF files with no text content if + both tesseract and pdftoppm are installed. The + default is off because OCR is so very slow.

+
+ +
pdfattach
+ +
+

Enable PDF attachment extraction by executing + pdftk (if available). This is normally disabled, + because it does slow down PDF indexing a bit even + if not one attachment is ever found.

+
+
+
+ +
+
+
+
+

5.4.2.8. Parameters + set for specific locations

+
+
+
+ +
+
mhmboxquirks
+ +
+

Enable thunderbird/mozilla-seamonkey mbox format + quirks. Set this for the directory where the email + mbox files are stored.

+
+
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index f196efb9..68eea15a 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -5651,880 +5651,10 @@ thesame = "some string with spaces" - - The main configuration file, recoll.conf + + - recoll.conf is the main - configuration file. It defines things like - what to index (top directories and things to ignore), and the - default character set to use for document types which do not - specify it internally. - - The default configuration will index your home - directory. If this is not appropriate, start - recoll to create a blank - configuration, click Cancel, and edit - the configuration file before restarting the command. This - will start the initial indexing, which may take some time. - - Most of the following parameters can be changed from the - Index Configuration menu in the - recoll interface. Some can only be set by - editing the configuration file. - - - Parameters affecting what documents we index: - - - - - topdirs - Specifies the list of directories or files to - index (recursively for directories). You can use symbolic links - as elements of this list. See the - followLinks option about following symbolic links - found under the top elements (not followed by default). - - - - skippedNames - - A space-separated list of wilcard patterns for - names of files or directories that should be completely - ignored. The list defined in the default file is: - -skippedNames = #* bin CVS Cache cache* caughtspam tmp .thumbnails .svn \ - *~ .beagle .git .hg .bzr loop.ps .xsession-errors \ - .recoll* xapiandb recollrc recoll.conf - - The list can be redefined at any sub-directory in the - indexed area. - The top-level directories are not affected by this - list (that is, a directory in topdirs - might match and would still be indexed). 
- The list in the default configuration does not - exclude hidden directories (names beginning with a - dot), which means that it may index quite a few things - that you do not want. On the other hand, email user - agents like thunderbird - usually store messages in hidden directories, and you - probably want this indexed. One possible solution is to - have .* in - skippedNames, and add things like - ~/.thunderbird or - ~/.evolution in - topdirs. - - Not even the file names are indexed for patterns - in this list. See the - noContentSuffixes variable for an alternative - approach which indexes the file names. - - - - noContentSuffixes - This is a list of file name endings (not - wildcard expressions, nor dot-delimited suffixes). Only the - names of matching files will be indexed (no attempt at MIME - type identification, no decompression, no content - indexing). This can be redefined for - subdirectories, and edited from the GUI. The default value is: - -noContentSuffixes = .md5 .map \ - .o .lib .dll .a .sys .exe .com \ - .mpp .mpt .vsd \ - .img .img.gz .img.bz2 .img.xz .image .image.gz .image.bz2 .image.xz \ - .dat .bak .rdf .log.gz .log .db .msf .pid \ - ,v ~ # - - - - - skippedPaths and - daemSkippedPaths - - A space-separated list of patterns for - paths of files or directories that should be skipped. - There is no default in the sample configuration file, - but the code always adds the configuration and database - directories in there. - skippedPaths is used both by - batch and real time - indexing. daemSkippedPaths can be - used to specify things that should be indexed at - startup, but not monitored. - Example of use for skipping text files only in a - specific directory: - -skippedPaths = ~/somedir/*.txt - - - - - - skippedPathsFnmPathname - The values in the - *skippedPaths variables are matched by - default with fnmatch(3), with the - FNM_PATHNAME flag. This means that '/' - characters must be matched explicitely. 
You can set - skippedPathsFnmPathname to 0 to disable - the use of FNM_PATHNAME (meaning that /*/dir3 will match - /dir1/dir2/dir3). - - - - - - zipSkippedNames - A space-separated list of patterns for - names of files or directories that should be ignored - inside zip archives. This is used directly by the zip - handler, and has a function similar to skippedNames, but - works independantly. Can be redefined for filesystem - subdirectories. For versions up to 1.19, you will need - to update the Zip handler and install a supplementary - Python module. The details are - described on - the &RCL; wiki. - - - - - followLinks - Specifies if the indexer should follow - symbolic links while walking the file tree. The default is - to ignore symbolic links to avoid multiple indexing of - linked files. No effort is made to avoid duplication when - this option is set to true. This option can be set - individually for each of the topdirs - members by using sections. It can not be changed below the - topdirs level. - - - - indexedmimetypes - &RCL; normally indexes any file which it - knows how to read. This list lets you restrict the indexed - MIME types to what you specify. If the variable is - unspecified or the list empty (the default), all supported - types are processed. Can be redefined for subdirectories. - - - - excludedmimetypes - This list lets you exclude some MIME types from - indexing. Can be redefined for subdirectories. - - - - compressedfilemaxkbs - Size limit for compressed (.gz or .bz2) - files. These need to be decompressed in a temporary - directory for identification, which can be very wasteful - if 'uninteresting' big compressed files are present. - Negative means no limit, 0 means no processing of any - compressed file. Defaults to -1. - - - - textfilemaxmbs - Maximum size for text files. Very big text - files are often uninteresting logs. Set to -1 to disable - (default 20MB). 
- - - - textfilepagekbs - If set to other than -1, text files will be - indexed as multiple documents of the given page size. This may - be useful if you do want to index very big text files as it - will both reduce memory usage at index time and help with - loading data to the preview window. A size of a few megabytes - would seem reasonable (default: 1MB). - - - - membermaxkbs - This defines the maximum size in kilobytes for - an archive member (zip, tar or rar at the moment). Bigger - entries will be skipped. - - - - indexallfilenames - &RCL; indexes file names in a special - section of the database to allow specific file names - searches using wild cards. This parameter decides if - file name indexing is performed only for files with MIME - types that would qualify them for full text indexing, or - for all files inside the selected subtrees, independently of - MIME type. - - - - usesystemfilecommand - Decide if we execute a system command - (file by default) - as a final step for determining the MIME type for a file - (the main procedure uses suffix associations as defined in - the mimemap file). This can be useful - for files with suffix-less names, but it will also cause - the indexing of many bogus "text" files. - - - - systemfilecommand - Command to use for mime for mime type - determination if usesystefilecommand is - set. Recent versions of xdg-mime sometimes - work better than file. - - - - processwebqueue - If this is set, process the directory where - Web browser plugins copy visited pages for indexing. - - - - webqueuedir - The path to the web indexing queue. This is - hard-coded in the Firefox plugin as - ~/.recollweb/ToIndex so there should be no - need to change it. - - - - - - - - Parameters affecting how we generate terms: - - Changing some of these parameters will imply a full - reindex. 
Also, when using multiple indexes, it may not make sense - to search indexes that don't share the values for these parameters, - because they usually affect both search and index operations. - - - - indexStripChars - Decide if we strip characters of diacritics and - convert them to lower-case before terms are indexed. If we - don't, searches sensitive to case and diacritics can be - performed, but the index will be bigger, and some marginal - weirdness may sometimes occur. The default is a stripped - index (indexStripChars = 1) for - now. When using multiple indexes for a search, - this parameter must be defined identically for - all. Changing the value implies an index reset. - - - - maxTermExpand - Maximum expansion count for a single term (e.g.: - when using wildcards). The default of 10000 is reasonable and - will avoid queries that appear frozen while the engine is - walking the term list. - - - - maxXapianClauses - Maximum number of elementary clauses we can add - to a single Xapian query. In some cases, the result of term - expansion can be multiplicative, and we want to avoid using - excessive memory. The default of 100 000 should be both - high enough in most cases and compatible with current - typical hardware configurations. - - - - nonumbers - If this set to true, no terms will be generated - for numbers. For example "123", "1.5e6", 192.168.1.4, would not - be indexed ("value123" would still be). Numbers are often quite - interesting to search for, and this should probably not be set - except for special situations, ie, scientific documents with huge - amounts of numbers in them. This can only be set for a whole - index, not for a subtree. - - - - dehyphenate - Determines if, given an input of - co-worker, we add a term for - coworker. This possibility is new in version - 1.22, and on by default. Setting the variable to off allows - restoring the previous behaviour. 
- - - - nocjk - If this set to true, specific east asian - (Chinese Korean Japanese) characters/word splitting is - turned off. This will save a small amount of cpu if you - have no CJK documents. If your document base does include - such text but you are not interested in searching it, - setting nocjk may be a significant time - and space saver. - - - - cjkngramlen - This lets you adjust the size of n-grams - used for indexing CJK text. The default value of 2 is - probably appropriate in most cases. A value of 3 would - allow more precision and efficiency on longer words, but - the index will be approximately twice as large. - - - - indexstemminglanguages - A list of languages for which the stem - expansion databases will be built. See - recollindex - 1 or use the - recollindex command - for possible values. You can add a stem expansion database - for a different language by using - recollindex , but it - will be deleted during the next indexing. Only languages - listed in the configuration file are permanent. - - - - defaultcharset - The name of the character set used for - files that do not contain a character set definition (ie: - plain text files). This can be redefined for any - sub-directory. If it is not set at all, the character set - used is the one defined by the nls environment ( - LC_ALL, LC_CTYPE, - LANG), or iso8859-1 - if nothing is set. - - - - unac_except_trans - This is a list of characters, encoded in UTF-8, - which should be handled specially when converting text to - unaccented lowercase. For example, in Swedish, the letter - a with diaeresis has full alphabet - citizenship and should not be turned into an - a. Each element in the space-separated list - has the special character as first element and the translation - following. The handling of both the lowercase and upper-case - versions of a character should be specified, as appartenance to - the list will turn-off both standard accent and case - processing. 
Example for Swedish: - -unac_except_trans = åå Åå ää Ää öö Öö - - - Note that the translation is not limited to a single - character, you could very well have something like - üue in the list. - - The default value set for - unac_except_trans can't be listed here - because I have trouble with SGML and UTF-8, but it only - contains ligature decompositions: german ss, oe, ae, fi, - fl. - - This parameter can't be defined for subdirectories, it - is global, because there is no way to do otherwise when - querying. If you have document sets which would need different - values, you will have to index and query them separately. - - - - maildefcharset - This can be used to define the default - character set specifically for email messages which don't - specify it. This is mainly useful for readpst (libpst) dumps, - which are utf-8 but do not say so. - - - - localfields - This allows setting fields for all documents - under a given directory. Typical usage would be to set an - "rclaptg" field, to be used in mimeview to - select a specific viewer. If several fields are to be set, they - should be separated with a semi-colon (';') character, which there - is currently no way to escape. Also note the initial semi-colon. - Example: - localfields= ;rclaptg=gnus;other = val, then - select specifier viewer with - mimetype|tag=... in - mimeview. - - - - testmodifusemtime - If true, use mtime instead of default ctime to - determine if a file has been modified (in addition to - size, which is always used). Setting this can reduce - re-indexing on systems where extended attributes are - modified (by some other application), but not indexed - (changing extended attributes only affects - ctime). Notes: - - This may prevent detection of change - in some marginal file rename cases (the target would - need to have the same size and - mtime). 
- You should probably also set - noxattrfields to 1 in this case, except if you still - prefer to perform xattr indexing, for example if the - local file update pattern makes it of value (as in - general, there is a risk for pure extended attributes - updates without file modification to go - undetected). - - Perform a full index reset after changing the value of - this parameter. - - - - noxattrfields - Recoll versions 1.19 and later - automatically translate file extended attributes into - document fields (to be processed according to the - parameters from the fields - file). Setting this variable to 1 will disable the - behaviour. - - - - metadatacmds - This allows executing external commands - for each file and storing the output in &RCL; document - fields. This could be used for example to index - external tag data. The value is a list of field names - and commands, don't forget an initial - semi-colon. Example: - -[/some/area/of/the/fs] -metadatacmds = ; tags = tmsu tags %f; otherfield = somecmd -xx %f - - As a specially disgusting hack brought by - &RCL; 1.19.7, if a "field name" begins - with rclmulti, the data returned by - the command is expected to contain multiple field - values, in configuration file format. This allows - setting several fields by executing a single - command. Example: - -metadatacmds = ; rclmulti1 = somecmd %f - - If somecmd returns data in the form - of: - -field1 = value1 -field2 = value for field2 - - field1 - and field2 will be set inside the - document metadata. - - - - - - - - - - Parameters affecting where and how we store things: - - - - - cachedir - - When not explicitly specified, the &RCL; data directories - are stored relative to the configuration directory. If - cachedir is set, the directories are stored - under the specified value instead (e.g. if - cachedir is set to - ~/.cache/recoll, the default - dbdir would be - ~/.cache/recoll/xapiandb instead of - ~/.recoll/xapiandb ). 
This affects the - default values for dbdir, - webcachedir, - mboxcachedir, and - aspellDicDir, which can still be - individually specified to override - cachedir. Note that if you have multiple - configurations, each must have a different - cachedir. - - - - dbdir - The name of the Xapian data directory. It - will be created if needed when the index is - initialized. If this is not an absolute path, it will be - interpreted relative to the configuration directory. The - value can have embedded spaces but starting or trailing - spaces will be trimmed. You cannot use quotes here. - - - - idxstatusfile - The name of the scratch file where the indexer - process updates its status. Default: - idxstatus.txt inside the configuration - directory. - - - - maxfsoccuppc - Maximum file system occupation before we - stop indexing. The value is a percentage, corresponding to - what the "Capacity" df output column shows. The default - value is 0, meaning no checking. - - - - mboxcachedir - The directory where mbox message offsets cache - files are held. This is normally $RECOLL_CONFDIR/mboxcache, but - it may be useful to share a directory between different - configurations. - - - - mboxcacheminmbs - The minimum mbox file size over which we - cache the offsets. There is really no sense in caching - offsets for small files. The default is 5 MB. - - - - webcachedir - This is only used by the web browser - plugin indexing code, and defines where the cache for visited - pages will live. Default: - $RECOLL_CONFDIR/webcache - - - - webcachemaxmbs - This is only used by the web browser - plugin indexing code, and defines the maximum size for the web - page cache. Default: 40 MB. Quite unfortunately, this is only - taken into account when creating the cache file. You need to - delete the file for a change to be taken into account. - - - - - idxflushmb - Threshold (megabytes of new text data) where we - flush from memory to disk index. Setting this can help control - memory usage. 
A value of 0 means no explicit flushing, letting - Xapian use its own default, which is flushing every 10000 (or - XAPIAN_FLUSH_THRESHOLD) documents, which gives little memory - usage control, as memory usage also depends on average document - size. The default value is 10, and it is probably a bit low. If - your system usually has free memory, you can try higher values - between 20 and 80. In my experience, values beyond 100 are - always counterproductive. - - - - - - - - Parameters affecting multithread processing - - The &RCL; indexing process - recollindex can use multiple threads to - speed up indexing on multiprocessor systems. The work done - to index files is divided in several stages and some of the - stages can be executed by multiple threads. The stages are: - - File system walking: this is always performed by - the main thread. - File conversion and data extraction. - Text processing (splitting, stemming, - etc.) - &XAP; index update. - - - You can also read a - - longer document about the transformation of - &RCL; indexing to multithreading. - - The threads configuration is controlled by two - configuration file parameters. - - - - thrQSizes - This variable defines the job input queues - configuration. There are three possible queues for - stages 2, 3 and 4, and this parameter should give the - queue depth for each stage (three integer values). If - a value of -1 is used for a given stage, no queue is - used, and the thread will go on performing the next - stage. In practise, deep queues have not been shown to - increase performance. A value of 0 for the first queue - tells &RCL; to perform autoconfiguration (no need for - the two other values in this case) - this is the - default configuration. - - - - thrTCounts - This defines the number of threads used - for each stage. If a value of -1 is used for one of - the queue depths, the corresponding thread count is - ignored. 
It makes no sense to use a value other than 1 - for the last stage because updating the &XAP; index is - necessarily single-threaded (and protected by a - mutex). - - - - - - The following example would use three queues (of depth 2), - and 4 threads for converting source documents, 2 for - processing their text, and one to update the index. This was - tested to be the best configuration on the test system - (quadri-processor with multiple disks). - -thrQSizes = 2 2 2 -thrTCounts = 4 2 1 - - - - The following example would use a single queue, and the - complete processing for each document would be performed by - a single thread (several documents will still be processed - in parallel in most cases). The threads will use mutual - exclusion when entering the index update stage. In practise - the performance would be close to the precedent case in - general, but worse in certain cases (e.g. a Zip archive - would be performed purely sequentially), so the previous - approach is preferred. YMMV... The 2 last values for - thrTCounts are ignored. - -thrQSizes = 2 -1 -1 -thrTCounts = 6 1 1 - - - - The following example would disable - multithreading. Indexing will be performed by a single - thread. - -thrQSizes = -1 -1 -1 - - - - - - - Miscellaneous parameters: - - - - autodiacsens - IF the index is not stripped, decide if we - automatically trigger diacritics sensitivity if the search - term has accented characters (not in - unac_except_trans). Else you need to use - the query language and the D modifier to - specify diacritics sensitivity. Default is no. - - - - autocasesens - IF the index is not stripped, decide if we - automatically trigger character case sensitivity if the - search term has upper-case characters in any but the first - position. Else you need to use the query language and the - C modifier to specify character-case - sensitivity. Default is yes. - - - - loglevel,daemloglevel - Verbosity level for recoll and - recollindex. 
A value of 4 lists quite a lot of - debug/information messages. 2 only lists errors. The - daemversion is specific to the indexing monitor - daemon. - - - - logfilename, - daemlogfilename - Where the messages should go. 'stderr' can - be used as a special value, and is the default. The - daemversion is specific to the indexing monitor - daemon. - - - - checkneedretryindexscript - This defines the name for a command - executed by recollindex when starting - indexing. If the exit status of the command is 0, - recollindex retries to index all files - which previously could not be indexed because of data - extraction errors. The default value is a script which - checks if any of the common bin - directories have changed (indicating that a helper program - may have been installed). - - - - mondelaypatterns - This allows specify wildcard path patterns - (processed with fnmatch(3) with 0 flag), to match files which - change too often and for which a delay should be observed before - re-indexing. This is a space-separated list, each entry being a - pattern and a time in seconds, separated by a colon. You can - use double quotes if a path entry contains white - space. Example: - -mondelaypatterns = *.log:20 "this one has spaces*:10" - - - - - monixinterval - Minimum interval (seconds) for processing the - indexing queue. The real time monitor does not process each - event when it comes in, but will wait this time for the queue - to accumulate to diminish overhead and in order to aggregate - multiple events to the same file. Default 30 S. - - - - monauxinterval - Period (in seconds) at which the real time - monitor will regenerate the auxiliary databases (spelling, - stemming) if needed. The default is one hour. - - - - monioniceclass, monioniceclassdata - These allow defining the - ionice class and data used by the - indexer (default class 3, no data). - - - - filtermaxseconds - Maximum handler execution time, after which it - is aborted. 
Some postscript programs just loop... - - - - filtermaxmbytes - &RCL; 1.20.7 and later. Maximum handler memory - utilisation. This uses setrlimit(RLIMIT_AS) on most systems - (total virtual memory space size limit). Some programs may start - with 500 MBytes of mapped shared libraries, so take this into - account when choosing a value. The default is a liberal - 2000MB. - - - - filtersdir - A directory to search for the external - input handler scripts used to index some types of files. The - value should not be changed, except if you want to modify - one of the default scripts. The value can be redefined for - any sub-directory. - - - - iconsdir - The name of the directory where - recoll result list icons are - stored. You can change this if you want different - images. - - - - idxabsmlen - &RCL; stores an abstract for each indexed - file inside the database. The text can come from an actual - 'abstract' section in the document or will just be the - beginning of the document. It is stored in the index so - that it can be displayed inside the result lists without - decoding the original - file. The idxabsmlen parameter defines - the size of the stored abstract. The default value is 250 bytes. - The search interface gives you the choice to display this - stored text or a synthetic abstract built by extracting - text around the search terms. If you always - prefer the synthetic abstract, you can reduce this value - and save a little space. - - - - - idxmetastoredlen - Maximum stored length for metadata - fields. This does not affect indexing (the whole field is - processed anyway), just the amount of data stored in the - index for the purpose of displaying fields inside result - lists or previews. The default value is 150 bytes which - may be too low if you have custom fields. - - - - aspellLanguage - Language definitions to use when creating - the aspell dictionary. The value must match a set of - aspell language definition files. 
You can type "aspell - config" to see where these are installed (look for - data-dir). The default if the variable is not set is to - use your desktop national language environment to guess - the value. - - - - noaspell - If this is set, the aspell dictionary - generation is turned off. Useful for cases where you don't - need the functionality or when it is unusable because - aspell crashes during dictionary generation. - - - - mhmboxquirks - This allows definining location-related quirks - for the mailbox handler. Currently only the - tbird flag is defined, and it should be set - for directories which hold - Thunderbird data, as their folder - format is weird. Example: - [/path/to/my/mozilla/mail] -mhmboxquirks = tbird - It should be noted that later &RCL; - versions have improved automatic detection of - Thunderbird folders, so that this - should not be needed at all in most cases. - - - - - - - The fields file diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 38f34896..806cdfd2 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -1,4 +1,4 @@ -# Recoll default main configuration file +# Recoll main configuration file, recoll.conf # The XML tags in the comments are used to help produce the documentation # from the sample/reference file, and not at all at run time, where @@ -11,7 +11,8 @@ # Most of the important values in this file can be set from the GUI # configuration menus, which may be an easier approach than direct editing. -# Parameters affecting what documents we index +# Parameters affecting what documents we +# index # Space-separated list of files or # directories to recursively index.Default to ~ (indexes @@ -19,34 +20,37 @@ # independantly of the value of the followLinks variable. topdirs = ~ -# Wildcard expressions for -# names of files and directories that we should ignore. 
-# White space separated list of wildcard patterns (simple -# ones, not paths, must contain no / ), which will be tested against file -# and directory names. The list in the default configuration does not -# exclude hidden directories (names beginning with a dot), which means that -# it may index quite a few things that you do not want. On the other hand, -# email user agents like Thunderbird usually store messages in hidden -# directories, and you probably want this indexed. One possible solution is -# to have '.*' in 'skippedNames', and add things like '~/.thunderbird' -# '~/.evolution' to 'topdirs'. Not even the file names are indexed for -# patterns in this list, see the 'noContentSuffixes' variable for an -# alternative approach which indexes the file names. Can be redefined for -# any subtree. +# +# +# Files and directories which should be ignored. +# White space separated list of wildcard patterns (simple ones, not paths, +# must contain no / ), which will be tested against file and directory +# names. The list in the default configuration does not exclude hidden +# directories (names beginning with a dot), which means that it may index +# quite a few things that you do not want. On the other hand, email user +# agents like Thunderbird usually store messages in hidden directories, and +# you probably want this indexed. One possible solution is to have '.*' in +# 'skippedNames', and add things like '~/.thunderbird' '~/.evolution' to +# 'topdirs'. Not even the file names are indexed for patterns in this +# list, see the 'noContentSuffixes' variable for an alternative approach +# which indexes the file names. Can be redefined for any +# subtree. 
skippedNames = #* bin CVS Cache cache* .cache caughtspam tmp \ .thumbnails .svn \ *~ .beagle .git .hg .bzr loop.ps .xsession-errors \ .recoll* xapiandb recollrc recoll.conf -# List of name endings (not -# necessarily dot-separated suffixes) for which we don't try MIME type -# identification, and don't uncompress or index content.Only -# the names will be indexed. This complements the now obsoleted mimemap -# recoll_noindex list, which will go away in a future release (the move -# from mimemap to recoll.conf allows editing the list through the -# GUI). This is different from skippedNames because these are name ending -# matches only (not wildcard patterns), and the file name itself gets -# indexed normally. This can be redefined for subdirectories. +# +# +# List of name endings (not necessarily dot-separated suffixes) for +# which we don't try MIME type identification, and don't uncompress or +# index content.Only the names will be indexed. This +# complements the now obsoleted recoll_noindex list from the mimemap file, +# which will go away in a future release (the move from mimemap to +# recoll.conf allows editing the list through the GUI). This is different +# from skippedNames because these are name ending matches only (not +# wildcard patterns), and the file name itself gets indexed normally. This +# can be redefined for subdirectories. noContentSuffixes = .md5 .map \ .o .lib .dll .a .sys .exe .com \ .mpp .mpt .vsd \ @@ -54,20 +58,20 @@ noContentSuffixes = .md5 .map \ .dat .bak .rdf .log.gz .log .db .msf .pid \ ,v ~ # -# Space-separated list of -# wildcard expressions for paths we shouldn't go into.Can -# contain files and directories. The database and configuration directories -# will automatically be added. The expressions are matched 'fnmatch(3)' +# +# +# Paths we should not go into.Space-separated list of +# wildcard expressions for filesystem paths. Can contain files and +# directories. 
The database and configuration directories will +# automatically be added. The expressions are matched using 'fnmatch(3)' # with the FNM_PATHNAME flag set by default. This means that '/' characters # must be matched explicitely. You can set 'skippedPathsFnmPathname' to 0 # to disable the use of FNM_PATHNAME (meaning that '/*/dir3' will match -# '/dir1/dir2/dir3'). The default contains the usual mount point for -# removable media by default to remind people that it is a bad idea to -# naively have recoll work on these (esp. with the monitor: media gets -# indexed on mount, all data gets erased on unmount). Typically the -# presence of '/media' is mostly a reminder, it would only have effect for -# someone who is indexing '/'. Explicitely adding '/media/xxx' to the -# topdirs will override this. +# '/dir1/dir2/dir3'). The default value contains the usual mount point for +# removable media to remind you that it is a bad idea to have Recoll work +# on these (esp. with the monitor: media gets indexed on mount, all data +# gets erased on unmount). Explicitely adding '/media/xxx' to the topdirs +# will override this. skippedPaths = /media # Set to 0 to @@ -75,19 +79,22 @@ skippedPaths = /media # paths. #skippedPathsFnmPathname = 1 -# skippedPaths equivalent specific to +# +# +# skippedPaths equivalent specific to # real time indexing.This enables having parts of the tree # which are initially indexed but not monitored. If daemSkippedPaths is # not set, the daemon uses skippedPaths. #daemSkippedPaths = -# Space-separated list of -# wildcard expresions for names that should be ignored -# inside zip archives.This is used directly by the zip -# handler, and has a function similar to skippedNames, but -# works independantly. Can be redefined for subdirectories. Supported by -# recoll 1.20 and newer. 
See +# +# +# Space-separated list of wildcard expressions for names that should +# be ignored inside zip archives.This is used directly by +# the zip handler, and has a function similar to skippedNames, but works +# independently. Can be redefined for subdirectories. Supported by recoll +# 1.20 and newer. See # https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members # #zipSkippedNames = @@ -119,12 +126,12 @@ skippedPaths = /media # files.We need to decompress these in a # temporary directory for identification, which can be wasteful in some # cases. Limit the waste. Negative means no limit. 0 results in no -# processing of any compressed file. +# processing of any compressed file. Default 50 MB. compressedfilemaxkbs = 50000 # Size limit for text # files.Mostly for skipping monster -# logs. +# logs. Default 20 MB. textfilemaxmbs = 20 # Index the file names of @@ -158,7 +165,8 @@ processwebqueue = 0 # into documents of approximately this size. Will reduce memory usage at # index time and help with loading data in the preview window at query # time. Particularly useful with very big files, such as application or -# system logs. +# system logs. Also see textfilemaxmbs and +# compressedfilemaxkbs. textfilepagekbs = 1000 # Size limit for archive @@ -168,7 +176,8 @@ membermaxkbs = 50000 -# Parameters affecting how we generate terms +# Parameters affecting how we generate +# terms # Changing some of these parameters will imply a full # reindex. Also, when using multiple indexes, it may not make sense @@ -201,9 +210,9 @@ indexStripChars = 1 # restoring the previous behaviour. #dehyphenate = 1 -# Decides if specific east asian +# Decides if specific East Asian # (Chinese Korean Japanese) characters/word splitting is turned -# off.This will save a small amount of cpu if you have no CJK +# off.This will save a small amount of CPU if you have no CJK # documents.
If your document base does include such text but you are not # interested in searching it, setting nocjk may be a # significant time and space saver. @@ -216,10 +225,11 @@ indexStripChars = 1 # as large. #cjkngramlen = 2 -# Languages for -# which to create stemming expansion data.Stemmer names can -# be found on http://www.xapian.org, or by executing 'recollindex -l', or -# this can also be set from a list in the GUI +# +# +# Languages for which to create stemming expansion +# data.Stemmer names can be found by executing 'recollindex +# -l', or this can also be set from a list in the GUI. indexstemminglanguages = english # Default character @@ -246,14 +256,14 @@ indexstemminglanguages = english # Examples: # Swedish: # unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl åå Åå -# German: +# . German: # unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl # In French, you probably want to decompose oe and ae and nobody would type # a German ß # unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl -# Reasonable default for all until someone protests. These decompositions -# are not performed by unac, but I cant imagine someone typing the composed -# forms in a search. +# . The default for all until someone protests follows. These decompositions +# are not performed by unac, but it is unlikely that someone would type the +# composed forms in a search. # unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl @@ -274,7 +284,7 @@ unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl # Use mtime instead of # ctime to test if a file has been modified.The time is used -# in in addition to the size, which is always used. +# in addition to the size, which is always used. # Setting this can reduce re-indexing on systems where extended attributes # are used (by some other application), but not indexed, because changing # extended attributes only affects ctime. 
@@ -305,6 +315,7 @@ noxattrfields = 0 # returns multiple field values inside a text blob formatted as a recoll # configuration file ("fieldname = fieldvalue" lines). The rclmultixx name # will be ignored, and field names and values will be parsed from the data. +# Example: metadatacmds = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf %f # #[/some/area/of/the/fs] #metadatacmds = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf %f @@ -312,24 +323,27 @@ noxattrfields = 0 -# Parameters affecting where and how we store things +# Parameters affecting where and how we store +# things -# Top directory for Recoll -# dataRecoll data directories are normally located relative -# to the configuration directory (e.g. ~/.recoll/xapiandb, -# ~/.recoll/mboxcache). If 'cachedir' is set, the directories are stored under -# the specified value instead (e.g. if cachedir is ~/.cache/recoll, the -# default dbdir would be ~/.cache/recoll/xapiandb). This affects dbdir, -# webcachedir, mboxcachedir, aspellDicDir, which can still be individually -# specified to override cachedir. Note that if you have multiple -# configurations, each must have a different cachedir, there is no -# automatic computation of a subpath under cachedir. +# +# +# Top directory for Recoll data.Recoll data +# directories are normally located relative to the configuration directory +# (e.g. ~/.recoll/xapiandb, ~/.recoll/mboxcache). If 'cachedir' is set, the +# directories are stored under the specified value instead (e.g. if +# cachedir is ~/.cache/recoll, the default dbdir would be +# ~/.cache/recoll/xapiandb). This affects dbdir, webcachedir, +# mboxcachedir, aspellDicDir, which can still be individually specified to +# override cachedir. Note that if you have multiple configurations, each +# must have a different cachedir, there is no automatic computation of a +# subpath under cachedir. 
#cachedir = ~/.cache/recoll # Maximum file system occupation # over which we stop indexing.The value is a percentage, # corresponding to what the "Capacity" df output column shows. The default -# value is 0, meaning no checking. +# value is 0, meaning no checking. maxfsoccuppc = 0 # Xapian database directory @@ -340,9 +354,11 @@ maxfsoccuppc = 0 # ~/.recoll/xapiandb/ dbdir = xapiandb -# Name of the scratch file where -# the indexer process updates its status. Default: -# idxstatus.txt inside the configuration directory +# +# +# Name of the scratch file where the indexer process updates its +# status. Default: idxstatus.txt inside the configuration +# directory. #idxstatusfile = idxstatus.txt # @@ -371,9 +387,9 @@ webcachedir = webcache # # Maximum size in MB of the Web archive. # This is only used by the web history indexing code. -# Default: 100 MB. +# Default: 40 MB. # Reducing the size will not physically truncate the file. -webcachemaxmbs = 100 +webcachemaxmbs = 40 # # @@ -405,21 +421,21 @@ webcachemaxmbs = 100 # result list. Defaults to $prefix/share/recoll/images #iconsdir = /path/to/my/icons -# Parameters affecting indexing performance and resource -# usage +# Parameters affecting indexing performance and +# resource usage # # -# Threshold (megabytes of new data) where we flush from memory to disk -# index. -# Setting this allows some control over memory usage by the indexer -# process. A value of 0 means no explicit flushing, which lets Xapian -# perform its own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD -# documents created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an -# environment variable. As memory usage depends on average document size, -# not only document count, this is not very useful. -# The default value of 10 MB may be a bit low. If you are looking for -# maximum speed, you may want to experiment with values between 20 and +# Threshold (megabytes of new data) where we flush from memory to +# disk index.
Setting this allows some control over memory +# usage by the indexer process. A value of 0 means no explicit flushing, +# which lets Xapian perform its own thing, meaning flushing every +# $XAPIAN_FLUSH_THRESHOLD documents created, modified or deleted: as memory +# usage depends on average document size, not only document count, the +# Xapian approach is not very useful, and you should let Recoll manage +# the flushes. The default value of idxflushmb is 10 MB, and may be a bit +# low. If you are looking for maximum speed, you may want to experiment +# with values between 20 and # 80. In my experience, values beyond 100 are always counterproductive. If # you find otherwise, please drop me a note. idxflushmb = 10 @@ -449,7 +465,7 @@ filtermaxmbytes = 2000 # for each stage (three integer values). If a value of -1 is given for a # given stage, no queue is used, and the thread will go on performing the # next stage. In practise, deep queues have not been shown to increase -# performance. Default: a value of 0 for the first queue tells &RCL; to +# performance. Default: a value of 0 for the first queue tells Recoll to # perform autoconfiguration based on the detected number of CPUs (no need # for the two other values in this case). Use thrQSizes = -1 -1 -1 to # disable multithreading entirely. @@ -463,23 +479,23 @@ thrQSizes = 0 # in thrQSizes: if the first queue depth is 0, all counts are ignored # (autoconfigured); if a value of -1 is used for a queue depth, the # corresponding thread count is ignored. It makes no sense to use a value -# other than 1 for the last stage because updating the &XAP; index is +# other than 1 for the last stage because updating the Xapian index is # necessarily single-threaded (and protected by a mutex). #thrTCounts = 4 2 1 -# Miscellaneous parameters +# Miscellaneous parameters # # -# Debug log verbosity 1-6 2 is errors/warnings -# only. 3 information like document updates, 4 is quite verbose and 6 very -# verbose.
+# Log file verbosity 1-6. A value of 2 will print +# only errors and warnings. 3 will print information like document updates, +# 4 is quite verbose and 6 very verbose. loglevel = 3 # # -# Debug log destination. Use 'stderr' (default) to write to the +# Log file destination. Use 'stderr' (default) to write to the # console. logfilename = stderr @@ -511,12 +527,11 @@ logfilename = stderr # # Indexing process current directory. The input # handlers sometimes leave temporary files in the current directory, so it -# makes sense to have recollindex chdir to some temporary directory. Three -# possible types of values: -# - (literal) tmp : go to temp dir as set by environment (RECOLL_TMPDIR else -# TMPDIR else /tmp) -# - Empty: stay where started -# - Absolute path value: go there. +# makes sense to have recollindex chdir to some temporary directory. If the +# value is empty, the current directory is not changed. If the +# value is (literal) tmp, we use the temporary directory as set by the +# environment (RECOLL_TMPDIR else TMPDIR else /tmp). If the value is an +# absolute path to a directory, we go there. idxrundir = tmp # @@ -525,7 +540,7 @@ idxrundir = tmp # files which previously failed. The default script checks # the modified dates on /usr/bin and /usr/local/bin. A relative path will # be looked up in the filters dirs, then in the path. Use an absolute path -# to do otherwise. +# to do otherwise. checkneedretryindexscript = rclcheckneedretry.sh # @@ -569,9 +584,10 @@ checkneedretryindexscript = rclcheckneedretry.sh # # -# Additional parameter to aspell dictionary creation +# Additional option and parameter to aspell dictionary creation # command.Some aspell packages may need an additional option -# (e.g. on Debian Jessie). See Debian bug 772415. +# (e.g. on Debian Jessie: --local-data-dir=/usr/lib/aspell). See Debian bug +# 772415. 
#aspellAddCreateParam = --local-data-dir=/usr/lib/aspell # @@ -589,18 +605,21 @@ checkneedretryindexscript = rclcheckneedretry.sh # disable the thing. #noaspell = 1 -# +# # -# Seconds between auxiliary databases updates (stemdb, -# aspell).The default is one hour. +# Auxiliary database update interval. The real time +# indexer only updates the auxiliary databases (stemdb, aspell) +# periodically, because it would be too costly to do it for every document +# change. The default period is one hour. #monauxinterval = 3600 # # # Minimum interval (seconds) between processings of the indexing -# queue. The real time monitor does not process each event +# queue. The real time indexer does not process each event # when it comes in, but lets the queue accumulate, to diminish overhead and -# to aggregate multiple events to the same file. Default 30 S. +# to aggregate multiple events affecting the same file. Default: 30 +# seconds. #monixinterval = 30 # @@ -611,14 +630,14 @@ checkneedretryindexscript = rclcheckneedretry.sh # reindexed once in a while. A list of wildcardPattern:seconds pairs. The # patterns are matched with fnmatch(pattern, path, 0) You can quote entries # containing white space with double quotes (quote the whole entry, not the -# pattern). The default is empty. Example:mondelaypatterns = *.log:20 -# "*with spaces.*:30" +# pattern). The default is empty. +# Example: mondelaypatterns = *.log:20 "*with spaces.*:30" #mondelaypatterns = *.log:20 "*with spaces.*:30" # # # ionice class for the real time indexing process -# On platforms where this is supported, the default value is +# on platforms where this is supported. The default value is # 3. # monioniceclass = 3 @@ -631,11 +650,12 @@ checkneedretryindexscript = rclcheckneedretry.sh -# Query-time parameters (no impact on the index) +# Query-time parameters (no impact on the +# index) # # -# auto-trigger diacritics sensitivity (raw index only) +# auto-trigger diacritics sensitivity (raw index only).
# IF the index is not stripped, decide if we automatically trigger # diacritics sensitivity if the search term has accented characters (not in # unac_except_trans). Else you need to use the query language and the "D" @@ -644,7 +664,7 @@ autodiacsens = 0 # # -# auto-trigger case sensitivity (raw index only) IF +# auto-trigger case sensitivity (raw index only). IF # the index is not stripped (see indexStripChars), decide if we # automatically trigger character case sensitivity if the search term has # upper-case characters in any but the first position. Else you need to use @@ -668,14 +688,14 @@ maxXapianClauses = 50000 # # -# Maximum number of positions we walk while populating a snippet for the -# result list.The default of 1,000,000 may be insufficient -# for big documents, the consequence would be snippets with possibly -# meaning-altering missing words. +# Maximum number of positions we walk while populating a snippet for +# the result list. The default of 1,000,000 may be +# insufficient for very big documents, the consequence would be snippets +# with possibly meaning-altering missing words. snippetMaxPosWalk = 1000000 -# Parameters for the PDF input script +# Parameters for the PDF input script # # @@ -693,7 +713,8 @@ snippetMaxPosWalk = 1000000 #pdfattach = 0 -# Parameters set for specific locations +# Parameters set for specific +# locations # You could specify different parameters for a subdirectory like this: #[~/hungariandocs/plain]