From a1a2bbf952d44575afebe1ad4436bfb8753872d8 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 26 May 2016 10:19:46 +0200 Subject: [PATCH] doc:added multithreading section --- packaging/debian/buildppa.sh | 6 +- src/doc/user/usermanual.xml | 97 ++++ src/sampleconf/recoll.conf | 995 ++++++++++++++++++++++------------- 3 files changed, 732 insertions(+), 366 deletions(-) diff --git a/packaging/debian/buildppa.sh b/packaging/debian/buildppa.sh index cc86724e..e7bb4cd0 100644 --- a/packaging/debian/buildppa.sh +++ b/packaging/debian/buildppa.sh @@ -19,7 +19,7 @@ case $RCLVERS in 1.14*) PPANAME=recoll-ppa;; *) PPANAME=recoll15-ppa;; esac -PPANAME=recollexp-ppa +#PPANAME=recollexp-ppa echo "PPA: $PPANAME. Type CR if Ok, else ^C" read rep @@ -42,7 +42,7 @@ check_recoll_orig() debdir=debian # Note: no new releases for lucid: no webkit. Or use old debianrclqt4 dir. series="precise trusty utopic vivid wily xenial" -series=trusty +series= if test "X$series" != X ; then check_recoll_orig @@ -141,7 +141,7 @@ done ### Unity Scope series="trusty utopic vivid wily xenial" -series= +series=xenial debdir=debianunityscope if test ! -d ${debdir}/ ; then diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index cee8e2ea..f196efb9 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -800,6 +800,103 @@ indexedmimetypes = application/pdf + + + + + Indexing thread usage configuration GUI + + The &RCL; indexing process + recollindex can use multiple threads to + speed up indexing on multiprocessor systems. The work done + to index files is divided in several stages and some of the + stages can be executed by multiple threads. The stages are: + + File system walking: this is always performed by + the main thread. + File conversion and data extraction. + Text processing (splitting, stemming, + etc.) + &XAP; index update. + + + You can also read a + + longer document about the transformation of + &RCL; indexing to multithreading. 
+ + The threads configuration is controlled by two + configuration file parameters. + + + + thrQSizes + This variable defines the job input queues + configuration. There are three possible queues for stages + 2, 3 and 4, and this parameter should give the queue depth + for each stage (three integer values). If a value of -1 is + used for a given stage, no queue is used, and the thread + will go on performing the next stage. In practise, deep + queues have not been shown to increase performance. A value + of 0 for the first queue tells &RCL; to perform + autoconfiguration (no need for anything else in this case, + thrTCounts is not used) - this is the default + configuration. + + + + thrTCounts + This defines the number of threads used + for each stage. If a value of -1 is used for one of + the queue depths, the corresponding thread count is + ignored. It makes no sense to use a value other than 1 + for the last stage because updating the &XAP; index is + necessarily single-threaded (and protected by a + mutex). + + + + + + The following example would use three queues (of depth 2), + and 4 threads for converting source documents, 2 for + processing their text, and one to update the index. This was + tested to be the best configuration on the test system + (quadri-processor with multiple disks). + +thrQSizes = 2 2 2 +thrTCounts = 4 2 1 + + + + The following example would use a single queue, and the + complete processing for each document would be performed by + a single thread (several documents will still be processed + in parallel in most cases). The threads will use mutual + exclusion when entering the index update stage. In practise + the performance would be close to the precedent case in + general, but worse in certain cases (e.g. a Zip archive + would be performed purely sequentially), so the previous + approach is preferred. YMMV... The 2 last values for + thrTCounts are ignored. 
+ +thrQSizes = 2 -1 -1 +thrTCounts = 6 1 1 + + + + The following example would disable + multithreading. Indexing will be performed by a single + thread. + +thrQSizes = -1 -1 -1 + + + + + + + The index configuration GUI diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index feeb6868..38f34896 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -1,43 +1,52 @@ -# (C) 2004 J.F.Dockes. License: GPL -# -# Recoll default configuration file. This typically lives in -# $prefix/share/recoll/examples and provides default values. You can -# override selected parameters by adding assigments to -# ~/.recoll/recoll.conf (or $RECOLL_CONFDIR/recoll.conf) -# -# Almost all values in this file can be set from the GUI configuration -# menus, which may be an easier approach than direct editing. -# +# Recoll default main configuration file -# Space-separated list of directories to index. Next line indexes $HOME +# The XML tags in the comments are used to help produce the documentation +# from the sample/reference file, and not at all at run time, where +# comments are just comments. Edit at will. + +# This typically lives in $prefix/share/recoll/examples and provides +# default values. You can override selected parameters by adding assigments +# to ~/.recoll/recoll.conf (or $RECOLL_CONFDIR/recoll.conf) +# +# Most of the important values in this file can be set from the GUI +# configuration menus, which may be an easier approach than direct editing. + +# Parameters affecting what documents we index + +# Space-separated list of files or +# directories to recursively index.Default to ~ (indexes +# $HOME). You can use symbolic links in the list, they will be followed, +# independantly of the value of the followLinks variable. topdirs = ~ -# Wildcard expressions for names of files and directories that we should -# ignore. 
If you need index mozilla/thunderbird mail folders, don't put -# ".*" in there (as was the case with an older sample config) -# These are simple names, not paths (must contain no / ) +# Wildcard expressions for +# names of files and directories that we should ignore. +# White space separated list of wildcard patterns (simple +# ones, not paths, must contain no / ), which will be tested against file +# and directory names. The list in the default configuration does not +# exclude hidden directories (names beginning with a dot), which means that +# it may index quite a few things that you do not want. On the other hand, +# email user agents like Thunderbird usually store messages in hidden +# directories, and you probably want this indexed. One possible solution is +# to have '.*' in 'skippedNames', and add things like '~/.thunderbird' +# '~/.evolution' to 'topdirs'. Not even the file names are indexed for +# patterns in this list, see the 'noContentSuffixes' variable for an +# alternative approach which indexes the file names. Can be redefined for +# any subtree. skippedNames = #* bin CVS Cache cache* .cache caughtspam tmp \ .thumbnails .svn \ *~ .beagle .git .hg .bzr loop.ps .xsession-errors \ .recoll* xapiandb recollrc recoll.conf -# Wildcard expressions for paths we shouldn't go into. The database and -# configuration directories will automatically be added in there. -# We add the usual mount point for removable media by default to remind -# people that it is a bad idea to naively have recoll work on these -# (esp. with the monitor: media gets indexed on mount, all data gets erased -# on unmount...). Typically the presence of /media is mostly a reminder, it -# would only have effect for someone who's indexing / ... -# Explicitely adding /media/xxx to the topdirs will override this. -skippedPaths = /media - -# List of suffixes for which we don't try mime type identification (and -# don't uncompress or index content obviously). 
This complements the now -# obsoleted mimemap recoll_noindex list, which will go away in a future -# release (the move from mimemap to recoll.conf allows editing the list -# through the GUI). This is different from skippedNames because these are -# name ending matches only (not wildcard patterns), and the file name -# itself gets indexed normally. +# List of name endings (not +# necessarily dot-separated suffixes) for which we don't try MIME type +# identification, and don't uncompress or index content.Only +# the names will be indexed. This complements the now obsoleted mimemap +# recoll_noindex list, which will go away in a future release (the move +# from mimemap to recoll.conf allows editing the list through the +# GUI). This is different from skippedNames because these are name ending +# matches only (not wildcard patterns), and the file name itself gets +# indexed normally. This can be redefined for subdirectories. noContentSuffixes = .md5 .map \ .o .lib .dll .a .sys .exe .com \ .mpp .mpt .vsd \ @@ -45,89 +54,196 @@ noContentSuffixes = .md5 .map \ .dat .bak .rdf .log.gz .log .db .msf .pid \ ,v ~ # -# Same for real time indexing. The idea here is that there is stuff that -# you might want to initially index but not monitor. If daemSkippedPaths is -# not set, the daemon uses skippedPaths. -#daemSkippedPaths = +# Space-separated list of +# wildcard expressions for paths we shouldn't go into.Can +# contain files and directories. The database and configuration directories +# will automatically be added. The expressions are matched 'fnmatch(3)' +# with the FNM_PATHNAME flag set by default. This means that '/' characters +# must be matched explicitely. You can set 'skippedPathsFnmPathname' to 0 +# to disable the use of FNM_PATHNAME (meaning that '/*/dir3' will match +# '/dir1/dir2/dir3'). The default contains the usual mount point for +# removable media by default to remind people that it is a bad idea to +# naively have recoll work on these (esp. 
with the monitor: media gets +# indexed on mount, all data gets erased on unmount). Typically the +# presence of '/media' is mostly a reminder, it would only have effect for +# someone who is indexing '/'. Explicitely adding '/media/xxx' to the +# topdirs will override this. +skippedPaths = /media -# Recoll uses FNM_PATHNAME by default when matching skipped paths, which -# means that /dir1/dir2/dir3 is not matched by */dir3. Can't change the -# default now, but you can set the following variable to 0 to disable the -# use of FNM_PATHNAME (see fnmatch(3) man page) +# Set to 0 to +# override use of FNM_PATHNAME for matching skipped +# paths. #skippedPathsFnmPathname = 1 -# Option to follow symbolic links. We normally don't, to avoid duplicated -# indexing (in any case, no effort is made to identify or avoid multiple -# indexing of linked files) +# skippedPaths equivalent specific to +# real time indexing.This enables having parts of the tree +# which are initially indexed but not monitored. If daemSkippedPaths is +# not set, the daemon uses skippedPaths. +#daemSkippedPaths = + + +# Space-separated list of +# wildcard expresions for names that should be ignored +# inside zip archives.This is used directly by the zip +# handler, and has a function similar to skippedNames, but +# works independantly. Can be redefined for subdirectories. Supported by +# recoll 1.20 and newer. See +# https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members +# +#zipSkippedNames = + +# Follow symbolic links during +# indexing.The default is to ignore symbolic links to avoid +# multiple indexing of linked files. No effort is made to avoid duplication +# when this option is set to true. This option can be set individually for +# each of the 'topdirs' members by using sections. It can not be changed +# below the 'topdirs' level. Links in the 'topdirs' list itself are always +# followed. #followLinks = 0 -# Debug messages. 2 is errors/warnings only. 
3 information like doc -# updates, 4 is quite verbose and 6 very verbose -loglevel = 3 -logfilename = stderr +# Restrictive list of +# indexed mime types.Normally not set (in which case all +# supported types are indexed). If it is set, +# only the types from the list will have their contents indexed. The names +# will be indexed anyway if indexallfilenames is set (default). MIME +# type names should be taken from the mimemap file. Can be redefined for +# subtrees. +#indexedmimetypes = -# The following can be used to set different values for logging by the -# indexer (recollindex). The default is to use loglevel/logfilename -#idxloglevel = 3 -#idxlogfilename = stderr +# List of excluded MIME +# types.Lets you exclude some types from indexing. Can be +# redefined for subtrees. +#excludedmimetypes = -# Specific versions of log file name and level for the indexing daemon. The -# default is to use the idx... values if set, else the log... values. -#daemloglevel = 3 -#daemlogfilename = /dev/null +# Size limit for compressed +# files.We need to decompress these in a +# temporary directory for identification, which can be wasteful in some +# cases. Limit the waste. Negative means no limit. 0 results in no +# processing of any compressed file. +compressedfilemaxkbs = 50000 -# Run directory for the indexing process. The filters sometimes leave -# garbage in the current directory, so it makes sense to have recollindex -# chdir to some garbage bin. 3 possible values: -# - (literal) tmp : go to temp dir as set by env (RECOLL_TMPDIR else -# TMPDIR else /tmp) -# - Empty: stay where started -# - Absolute path value: go there. -idxrundir = tmp +# Size limit for text +# files.Mostly for skipping monster +# logs. +textfilemaxmbs = 20 -# Decide if we store character case and diacritics in the index. If we do, +# Index the file names of +# unprocessed filesIndex the names of files the contents of +# which we don't index because of an excluded or unsupported MIME +# type. 
+indexallfilenames = 1 + +# Use a system command +# for file MIME type guessing as a final step in file type +# identificationThis is generally useful, but will usually +# cause the indexing of many bogus 'text' files. See 'systemfilecommand' +# for the command used. +usesystemfilecommand = 1 + +# Command used to guess +# MIME types if the internal methods failsThis should be a +# "file -i" workalike. The file path will be added as a last parameter to +# the command line. 'xdg-mime' works better than the traditional 'file' +# command, and is now the configured default (with a hard-coded fallback to +# 'file') +systemfilecommand = xdg-mime query filetype + +# Decide if we process the +# Web queue.The queue is a directory where the Recoll Web +# browser plugins create the copies of visited pages. +processwebqueue = 0 + +# Page size for text +# files.If this is set, text/plain files will be divided +# into documents of approximately this size. Will reduce memory usage at +# index time and help with loading data in the preview window at query +# time. Particularly useful with very big files, such as application or +# system logs. +textfilepagekbs = 1000 + +# Size limit for archive +# members.This is passed to the filters in the environment +# as RECOLL_FILTER_MAXMEMBERKB. +membermaxkbs = 50000 + + + +# Parameters affecting how we generate terms + +# Changing some of these parameters will imply a full +# reindex. Also, when using multiple indexes, it may not make sense +# to search indexes that don't share the values for these parameters, +# because they usually affect both search and index operations. + + +# Decide if we store +# character case and diacritics in the index.If we do, # searches sensitive to case and diacritics can be performed, but the index -# will be bigger, and some marginal weirdness may sometimes occur. We -# default to a stripped index for now. +# will be bigger, and some marginal weirdness may sometimes occur. The +# default is a stripped index. 
When using multiple indexes for a search, +# this parameter must be defined identically for all. Changing the value +# implies an index reset. indexStripChars = 1 -# IF the index is not stripped. Decide if we automatically trigger -# diacritics sensitivity if the search term has accented characters (not in -# unac_except_trans). Else you need to use the query language and the "D" -# modifier to specify diacritics sensitivity. Default is no. -autodiacsens = 0 +# Decides if terms will be +# generated for numbers.For example "123", "1.5e6", +# 192.168.1.4, would not be indexed if nonumbers is set ("value123" would +# still be). Numbers are often quite interesting to search for, and this +# should probably not be set except for special situations, ie, scientific +# documents with huge amounts of numbers in them, where setting nonumbers +# will reduce the index size. This can only be set for a whole index, not +# for a subtree. +#nonumbers = 0 -# IF the index is not stripped. Decide if we automatically trigger -# character case sensitivity if the search term has upper-case characters -# in any but the first position. Else you need to use the query language -# and the "C" modifier to specify character-case sensitivity. Default is -# yes. -autocasesens = 1 +# Determines if we index +# 'coworker' also when the input is 'co-worker'.This is new +# in version 1.22, and on by default. Setting the variable to off allows +# restoring the previous behaviour. +#dehyphenate = 1 -# Languages for which to build stemming databases at the end of -# indexing. Stemmer names can be found on http://www.xapian.org -# The flag to perform stem expansion at query time is now set from the GUI +# Decides if specific east asian +# (Chinese Korean Japanese) characters/word splitting is turned +# off.This will save a small amount of cpu if you have no CJK +# documents. 
If your document base does include such text but you are not +# interested in searching it, setting nocjk may be a +# significant time and space saver. +#nocjk = 0 + +# This lets you adjust the size of +# n-grams used for indexing CJK text.The default value of 2 is +# probably appropriate in most cases. A value of 3 would allow more precision +# and efficiency on longer words, but the index will be approximately twice +# as large. +#cjkngramlen = 2 + +# Languages for +# which to create stemming expansion data.Stemmer names can +# be found on http://www.xapian.org, or by executing 'recollindex -l', or +# this can also be set from a list in the GUI indexstemminglanguages = english -# Default character set. Values found inside files, ie content tag in html -# documents, will override this. It can be specified per directory (see -# below). Used when converting to utf-8 (internal storage format), so it -# may be quite important for pure text files. -# The default used to be set to iso8859-1, but we now take it from the nls -# environment (LC_ALL/LC_CTYPE/LANG). The ultimate hardwired default is -# still 8859-1. If for some reason you want a general default which doesnt -# match your LANG and is not 8859-1, set it here. -# defaultcharset = iso-8859-1 +# Default character +# set.This is used for files which do not contain a +# character set definition (e.g.: text/plain). Values found inside files, +# e.g. a 'charset' tag in HTML documents, will override it. If this is not +# set, the default character set is the one defined by the NLS environment +# ($LC_ALL, $LC_CTYPE, $LANG), or ultimately iso-8859-1 (cp-1252 in fact). +# If for some reason you want a general default which does not match your +# LANG and is not 8859-1, use this variable. This can be redefined for any +# sub-directory. +#defaultcharset = iso-8859-1 -# A list of characters, encoded in UTF-8, which should be handled specially -# when converting text to unaccented lowercase. 
For example, in Swedish, -# the letter a with diaeresis has full alphabet citizenship and should not -# be turned into an a. +# A list of characters, +# encoded in UTF-8, which should be handled specially +# when converting text to unaccented lowercase.For +# example, in Swedish, the letter a with diaeresis has full alphabet +# citizenship and should not be turned into an a. # Each element in the space-separated list has the special character as # first element and the translation following. The handling of both the # lowercase and upper-case versions of a character should be specified, as # appartenance to the list will turn-off both standard accent and case -# processing. Examples: +# processing. The value is global and affects both indexing and querying. +# Examples: # Swedish: # unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl åå Åå # German: @@ -138,289 +254,48 @@ indexstemminglanguages = english # Reasonable default for all until someone protests. These decompositions # are not performed by unac, but I cant imagine someone typing the composed # forms in a search. +# unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl -# Turn off the indexing of numbers: may reduce the index size if you have -# no use for them -# nonumbers = 0 -# Turn off indexing "coworker" for an input of "co-worker" (in addition to -# co, worker, "co worker". Default is on as of version 1.22 -# dehyphenate = 1 +# Overrides the default +# character set for email messages which don't specify +# one.This is mainly useful for readpst (libpst) dumps, +# which are utf-8 but do not say so. +#maildefcharset= -# Maximum expansion count for a single term (ie: when using wildcards). -# We used to not limit this at all (except for filenames where the limit -# was too low at 1000), but it is unreasonable with a big index. 
-# Default 10 000 -maxTermExpand = 10000 +# Set fields on all files +# (usually of a specific fs area).Syntax is the usual: +# name = value ; attr1 = val1 ; [...] +# value is empty so this needs an initial semi-colon. This is useful, e.g., +# for setting the rclaptg field for application selection inside +# mimeview. +#[/some/app/directory] +#localfields = ; rclaptg = someapp; otherfield = somevalue -# Maximum number of clauses we add to a single Xapian query. In some cases, -# the result of term expansion can be multiplicative, and we want to avoid -# eating all the memory. Default 50000 -maxXapianClauses = 50000 - -# Recoll data directories are normally stored relative to the configuration -# directory (e.g. ~/.recoll/xapiandb, ~/.recoll/mboxcache). If this is set, -# the directories are stored under the specified value instead -# (e.g. if cachedir is ~/.cache/recoll, the default dbdir would be -# ~/.cache/recoll/xapiandb). -# This affects dbdir, webcachedir, mboxcachedir, aspellDicDir, which can -# still be individually specified to override cachedir. -# Note that if you have multiple configurations, each must have a different -# cachedir, there is no automatic computation of a subpath under cachedir. -#cachedir = ~/.cache/recoll - -# Where to store the database (directory). This may be an absolute path, -# else it is taken as relative to cachedir if set, or the configuration -# directory (-c argument or $RECOLL_CONFDIR). If nothing is specified, the -# default is then ~/.recoll/xapiandb/ -dbdir = xapiandb - -# Indexing process threads configuration. If Recoll is configured for -# multithreading, this defines what queues are active and how many threads -# to start for any of them. The default values were found good on a -# quad-core processor. The three steps are file conversion, term extraction -# and conversion and Xapian index update. The three queue values define the -# max number of jobs waiting on one of the corresponding queues. 
Setting a -# value to -1 disables a queue (replaced by a direct call). The thrTcounts -# values define the number of threads to start for each queue. The last -# value can only be one (as Xapian is single-threaded). -# If the first element in thrQSizes is 0, recollindex will attempt to set -# roughly guestimated values based on the number of CPUs. -# -# The following are the best setup on my core i5 system (4 cores, no -# hyperthreading, multiple disks). -#thrQSizes = 2 2 2 -#thrTCounts = 4 2 1 -# The default is to let recoll guess. -thrQSizes = 0 - -# Maximum file system occupation before we stop indexing. The default value -# is 0, meaning no checking. The value is a percentage, corresponding to -# what the "Capacity" df output column shows. -maxfsoccuppc = 0 - -# Threshold (megabytes of new data) where we flush from memory to disk -# index. Setting this (ie to 10) can help control memory usage. -# -# A value of 0 means no explicit flushing, which lets Xapian perform its -# own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD documents -# created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an environment -# variable. As memory usage depends on average document size, not only -# document count, this is not very useful. -# -# The default value of 10 MB may be a bit low. If you are looking for -# maximum speed, you may want to experiment with values between 20 and -# 80. In my experience, values beyond 100 are always counterproductive. If -# you find otherwise, please drop me a note. -idxflushmb = 10 - -# Place to search for executable filters. If RECOLL_FILTERSDIR is set in -# the environment, we use it instead. Defaults to $prefix/share/recoll/filters -# filtersdir = /path/to/my/filters - -# Additional places to search for helper executables. This is only used on -# Windows for now -# recollhelperpath = c:/someprog/bin;c:/someotherprog/bin - -# Place to search for icons. 
The only reason to change this would be if you -# want to change the icons displayed in the result list. -# Defaults to $prefix/share/recoll/images -# iconsdir = /path/to/my/icons - -# Should we use the system's 'file -i' command as a final step in file type -# identification ? This may be useful, but will usually cause the -# indexation of many bogus 'text' files -usesystemfilecommand = 1 -# Actual command to use as "file -i" workalike. -# The file path will be added as a last parameter to the command line. If -# that's not what your preferred command would like, use an intermediary -# script. -# xdg-mime now works better than the traditional "file" command, and is now -# the configured default (with a hard-coded fallback to "file") -systemfilecommand = xdg-mime query filetype -# systemfilecommand = file -i filetype - -# Should we index the file names of files with mime types we don't -# know? (we can otherwise just ignore them) -indexallfilenames = 1 - -# A restrictive list of indexed mime types. Normally not set. If it is set, -# only the types from the list will have their contents indexed (the names -# will be indexed anyway if indexallfilenames is set as by default). Mime -# type names should be taken from the mimemap file. -# -# indexedmimetypes = - -# An excluded list of mime types. It can be redefined in subdirectories, -# so can be used to locally exclude some types. -# -# excludedmimetypes = - -# -# Size limit for archive members. This is passed to the filters in the -# environment as RECOLL_FILTER_MAXMEMBERKB -# -membermaxkbs = 50000 - -# Size limit for compressed files. We need to decompress these in a -# temporary directory for identification, which can be wasteful in some -# cases. Limit the waste. Negative means no limit. 0 results in no -# processing of any compressed file. Used to be -1 by default. -compressedfilemaxkbs = 50000 - -# Size limit for text files. This is for skipping monster logs -textfilemaxmbs = 20 - -# Page size for text files. 
If this is set, text/plain files will be -# divided into documents of approximately this size. May be useful to -# access pieces of big text files which would be problematic to load as one -# piece into the preview window. Might be useful for big logs -textfilepagekbs = 1000 - -# Maximum external filter execution time. Default 20mn. This is mainly -# to avoid infinite loops in postscript files (loop.ps) -filtermaxseconds = 1200 -# Maximum virtual memory space for filter process (setrlimit(RLIMIT_AS)), -# in megabytes. Note that this includes any mapped libs (there is no -# reliable Linux way to limit the data space only), so we need to be a -# bit generous here. Anything over 2000 will be ignored on 32 bits machines. -filtermaxmbytes = 2000 - -# Length of abstracts we store while indexing. Longer will make for a -# bigger db -# idxabsmlen = 250 - -# Truncation length of stored metadata fields. This does not affect -# indexing, just what can be displayed inside results. -# idxmetastoredlen = 150 - -# Language definitions to use when creating the aspell dictionary. -# The value must match a set of aspell language definition files. -# You can type "aspell dicts" to see a list -# The default if this is not set is to use the NLS environment to guess the -# value -# aspellLanguage = en - -# Somme aspell packages may need an additional option (e.g. on Debian -# Jessie). See Debian bug 772415 -# aspellAddCreateParam = --local-data-dir=/usr/lib/aspell - -# The aspell dictionary (aspdict.(lang).rws) is normally stored in the -# directory specified by cachedir if set, or under the configuration -# directory. Set the following to change: -#aspellDicDir = - -# You may also want to set this to have a look at aspell dictionary -# creation errors. But there are always many, so this is mostly for debugging -# aspellKeepStderr = 1 - -# Disabling aspell use. 
The aspell dictionary generation takes some time, -# and some combinations of aspell version, language, and local terms, -# result in aspell dumping core each time. You can disable the aspell -# dictionary generation by setting the following variable: -# noaspell = 1 - -# Timing parameters for the real time mode: -# -# Seconds between auxiliary databases updates (stemdb, aspell): -# monauxinterval = 3600 -# -# Resting time (seconds) during which we let the queue accumulate, in hope -# that events to the same file will merge, before we start indexing: -# monixinterval = 30 -# -# Definitions for files which get a longer delay before reindexing is -# allowed. This is for fast-changing files, that should only be reindexed -# once in a while. A list of wildcardPattern:seconds pairs. The patterns -# are matched with fnmatch(pattern, path, 0) You can quote entries containing -# white space with double quotes. The default is empty, here follows an -# example: -# mondelaypatterns = *.log:20 "*with spaces.*:30" - -# ionice class for monitor (on platforms where this is supported) -# monioniceclass = 3 -# ionice class param for monitor (on platforms where this is supported) -# monioniceclassdata = - -# If this is set, process the directory where the Recoll Web browser plugins -# copy visited pages for indexing. -processwebqueue = 0 -# The path to the Web indexing queue. This is hard-coded in the -# plugin as ~/.recollweb/ToIndex so there should be no need to change it. -#webqueuedir = ~/.recollweb/ToIndex -# This is only used by the web history indexing code, and -# defines where the cache for visited pages will live. Default: -# cachedir/webcache if cachedir is set, else $RECOLL_CONFDIR/webcache -webcachedir = webcache -# This is only used by the web history indexing code, and -# defines the maximum size for the web page cache. Default: 40 MB. -# Reducing the size will not physically truncate the file. 
-webcachemaxmbs = 40 - -# The directory where mbox message offsets cache files are held. This is -# normally named mboxcache under cachedir if set, or else under the -# configuration directory, but it may be useful to share a -# directory between different configurations. -#mboxcachedir = mboxcache - -# The minimum mbox file size over which we cache the offsets. There is -# really no sense in caching offsets for small files. The default is 5 MB. -#mboxcacheminmbs = 5 - -# Maximum number of positions we walk while populating a snippet for the -# result list. The default of 1 000 000 may be insufficient for big -# documents, the consequence would be snippets with possibly -# meaning-altering missing words. -snippetMaxPosWalk = 1000000 - -# Use mtime instead of default ctime to determine if a file has been -# modified (in addition to size, which is always used). +# Use mtime instead of +# ctime to test if a file has been modified.The time is used +# in in addition to the size, which is always used. # Setting this can reduce re-indexing on systems where extended attributes -# are used (by some other applications), but not indexed (changing -# ext. attrs. only affects ctime). +# are used (by some other application), but not indexed, because changing +# extended attributes only affects ctime. # Notes: -# - this may prevent detection of change in some marginal file rename cases +# - This may prevent detection of change in some marginal file rename cases # (the target would need to have the same size and mtime). # - You should probably also set noxattrfields to 1 in this case, except if # you still prefer to perform xattr indexing, for example if the local # file update pattern makes it of value (as in general, there is a risk # for pure extended attributes updates without file modification to go # undetected). Perform a full index reset after changing this. +# testmodifusemtime = 0 -# Disable extended attributes conversion to metadata fields. 
This probably -# needs to be set if testmodifusemtime is set. +# Disable extended attributes +# conversion to metadata fields. This probably needs to be +# set if testmodifusemtime is set. noxattrfields = 0 -# Script used to heuristically check if we need to retry indexing files -# which previously failed. The default script checks the modified dates on -# /usr/bin and /usr/local/bin. A relative path will be looked up in the -# filters dirs, then in the path. Use an absolute path to do otherwise. -checkneedretryindexscript = rclcheckneedretry.sh - -# Parameters for the PDF input script -# Attempt OCR of PDF files with no text content if both tesseract and -# pdftoppm are installed. The default is not to do it because OCR is so -# very slow -#pdfocr = 0 -# Enable PDF attachment extraction, using pdftk (if available). This is -# normally disabled, because it does slow down PDF indexing a bit even if -# not one attachment is ever found. -#pdfattach = 0 - -# You could specify different parameters for a subdirectory like this: -#[~/hungariandocs/plain] -#defaultcharset = iso-8859-2 - -# You can set fields on all files of a specific fs area. (rclaptg can be -# used for application selection inside mimeview). -# Syntax is the usual name = value ; attr1 = val1 ; ... with an empty value -# so needs initial semi-colon -#[/some/app/directory] -#localfields = ; rclaptg = someapp; otherfield = somevalue - -# It's also possible to execute external commands to gather external -# metadata, for example tmsu tags. +# Define commands to +# gather external metadata, e.g. tmsu tags. # There can be several entries, separated by semi-colons, each defining # which field name the data goes into and the command to use. Don't forget the # initial semi-colon. All the field names must be different.
You can use @@ -430,14 +305,408 @@ checkneedretryindexscript = rclcheckneedretry.sh # returns multiple field values inside a text blob formatted as a recoll # configuration file ("fieldname = fieldvalue" lines). The rclmultixx name # will be ignored, and field names and values will be parsed from the data. +# #[/some/area/of/the/fs] #metadatacmds = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf %f + + + +# Parameters affecting where and how we store things + +# Top directory for Recoll +# data. Recoll data directories are normally located relative +# to the configuration directory (e.g. ~/.recoll/xapiandb, +# ~/.recoll/mboxcache). If 'cachedir' is set, the directories are stored under +# the specified value instead (e.g. if cachedir is ~/.cache/recoll, the +# default dbdir would be ~/.cache/recoll/xapiandb). This affects dbdir, +# webcachedir, mboxcachedir, aspellDicDir, which can still be individually +# specified to override cachedir. Note that if you have multiple +# configurations, each must have a different cachedir, there is no +# automatic computation of a subpath under cachedir. +#cachedir = ~/.cache/recoll + +# Maximum file system occupation +# over which we stop indexing. The value is a percentage, +# corresponding to what the "Capacity" df output column shows. The default +# value is 0, meaning no checking. +maxfsoccuppc = 0 + +# Xapian database directory +# location. This will be created on first indexing. If the +# value is not an absolute path, it will be interpreted as relative to +# cachedir if set, or the configuration directory (-c argument or +# $RECOLL_CONFDIR). If nothing is specified, the default is then +# ~/.recoll/xapiandb/ +dbdir = xapiandb + +# Name of the scratch file where +# the indexer process updates its status.
Default: +# idxstatus.txt inside the configuration directory +#idxstatusfile = idxstatus.txt + +# +# +# Directory location for storing mbox message offsets cache +# files. This is normally 'mboxcache' under cachedir if set, +# or else under the configuration directory, but it may be useful to share +# a directory between different configurations. +#mboxcachedir = mboxcache + +# +# +# Minimum mbox file size over which we cache the offsets. +# There is really no sense in caching offsets for small files. The +# default is 5 MB. +#mboxcacheminmbs = 5 + +# +# +# Directory where we store the archived web pages. +# This is only used by the web history indexing code. +# Default: cachedir/webcache if cachedir is set, else +# $RECOLL_CONFDIR/webcache +webcachedir = webcache + +# +# Maximum size in MB of the Web archive. +# This is only used by the web history indexing code. +# Default: 100 MB. +# Reducing the size will not physically truncate the file. +webcachemaxmbs = 100 + +# +# +# The path to the Web indexing queue. This is +# hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no +# need or possibility to change it. +#webqueuedir = ~/.recollweb/ToIndex + +# +# +# Aspell dictionary storage directory location. The +# aspell dictionary (aspdict.(lang).rws) is normally stored in the +# directory specified by cachedir if set, or under the configuration +# directory. +#aspellDicDir = + +# +# +# Directory location for executable input handlers. If +# RECOLL_FILTERSDIR is set in the environment, we use it instead. Defaults +# to $prefix/share/recoll/filters. Can be redefined for +# subdirectories. +#filtersdir = /path/to/my/filters + +# +# +# Directory location for icons. The only reason to +# change this would be if you want to change the icons displayed in the +# result list.
Defaults to $prefix/share/recoll/images +#iconsdir = /path/to/my/icons + +# Parameters affecting indexing performance and resource +# usage + +# +# +# Threshold (megabytes of new data) where we flush from memory to disk +# index. +# Setting this allows some control over memory usage by the indexer +# process. A value of 0 means no explicit flushing, which lets Xapian +# perform its own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD +# documents created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an +# environment variable. As memory usage depends on average document size, +# not only document count, this is not very useful. +# The default value of 10 MB may be a bit low. If you are looking for +# maximum speed, you may want to experiment with values between 20 and +# 80. In my experience, values beyond 100 are always counterproductive. If +# you find otherwise, please drop me a note. +idxflushmb = 10 + +# +# +# Maximum external filter execution time in +# seconds. Default 1200 (20 min). Set to 0 for no limit. This +# is mainly to avoid infinite loops in postscript files +# (loop.ps) +filtermaxseconds = 1200 + +# +# +# Maximum virtual memory space for filter processes +# (setrlimit(RLIMIT_AS)), in megabytes. Note that this +# includes any mapped libs (there is no reliable Linux way to limit the +# data space only), so we need to be a bit generous here. Anything over +# 2000 will be ignored on 32-bit machines. +filtermaxmbytes = 2000 + +# +# +# Stage input queues configuration. There are three +# internal queues in the indexing pipeline stages (file data extraction, +# terms generation, index update). This parameter defines the queue depths +# for each stage (three integer values). If a value of -1 is given for a +# given stage, no queue is used, and the thread will go on performing the +# next stage. In practice, deep queues have not been shown to increase +# performance.
Default: a value of 0 for the first queue tells &RCL; to +# perform autoconfiguration based on the detected number of CPUs (no need +# for the two other values in this case). Use thrQSizes = -1 -1 -1 to +# disable multithreading entirely. +thrQSizes = 0 + +# +# +# Number of threads used for each indexing stage. The +# three stages are: file data extraction, terms generation, index +# update. The use of the counts is also controlled by some special values +# in thrQSizes: if the first queue depth is 0, all counts are ignored +# (autoconfigured); if a value of -1 is used for a queue depth, the +# corresponding thread count is ignored. It makes no sense to use a value +# other than 1 for the last stage because updating the &XAP; index is +# necessarily single-threaded (and protected by a mutex). +#thrTCounts = 4 2 1 + + +# Miscellaneous parameters + +# +# +# Debug log verbosity 1-6. 2 is errors/warnings +# only, 3 is information like document updates, 4 is quite verbose and 6 very +# verbose. +loglevel = 3 + +# +# +# Debug log destination. Use 'stderr' (default) to write to the +# console. +logfilename = stderr + +# +# +# Override loglevel for the indexer. +#idxloglevel = 3 + +# +# +# Override logfilename for the indexer. +#idxlogfilename = stderr + +# +# +# Override loglevel for the indexer in real time +# mode. The default is to use the idx... values if set, else +# the log... values. +#daemloglevel = 3 + +# +# +# Override logfilename for the indexer in real time +# mode. The default is to use the idx... values if set, else +# the log... values. +#daemlogfilename = /dev/null + +# +# +# Indexing process current directory. The input +# handlers sometimes leave temporary files in the current directory, so it +# makes sense to have recollindex chdir to some temporary directory. Three +# possible types of values: +# - (literal) tmp : go to temp dir as set by environment (RECOLL_TMPDIR else +# TMPDIR else /tmp) +# - Empty: stay where started +# - Absolute path value: go there.
+idxrundir = tmp + +# +# +# Script used to heuristically check if we need to retry indexing +# files which previously failed. The default script checks +# the modified dates on /usr/bin and /usr/local/bin. A relative path will +# be looked up in the filters dirs, then in the path. Use an absolute path +# to do otherwise. +checkneedretryindexscript = rclcheckneedretry.sh + +# +# +# Additional places to search for helper executables. +# This is only used on Windows for now. +#recollhelperpath = c:/someprog/bin;c:/someotherprog/bin + +# +# +# Length of abstracts we store while indexing. +# Recoll stores an abstract for each indexed file. +# The text can come from an actual 'abstract' section in the +# document or will just be the beginning of the document. It is stored in +# the index so that it can be displayed inside the result lists without +# decoding the original file. The idxabsmlen parameter +# defines the size of the stored abstract. The default value is 250 +# bytes. The search interface gives you the choice to display this stored +# text or a synthetic abstract built by extracting text around the search +# terms. If you always prefer the synthetic abstract, you can reduce this +# value and save a little space. +#idxabsmlen = 250 + +# +# +# Truncation length of stored metadata fields. This +# does not affect indexing (the whole field is processed anyway), just the +# amount of data stored in the index for the purpose of displaying fields +# inside result lists or previews. The default value is 150 bytes which +# may be too low if you have custom fields. +#idxmetastoredlen = 150 + +# +# +# Language definitions to use when creating the aspell +# dictionary. The value must match a set of aspell language +# definition files. You can type "aspell dicts" to see a list. The default +# if this is not set is to use the NLS environment to guess the +# value.
+#aspellLanguage = en + +# +# +# Additional parameter to aspell dictionary creation +# command. Some aspell packages may need an additional option +# (e.g. on Debian Jessie). See Debian bug 772415. +#aspellAddCreateParam = --local-data-dir=/usr/lib/aspell + +# +# +# Set this to have a look at aspell dictionary creation +# errors. There are always many, so this is mostly for +# debugging. +#aspellKeepStderr = 1 + +# +# +# Disable aspell use. The aspell dictionary generation +# takes time, and some combinations of aspell version, language, and local +# terms, result in aspell crashing, so it sometimes makes sense to just +# disable the thing. +#noaspell = 1 + +# +# +# Seconds between auxiliary databases updates (stemdb, +# aspell). The default is one hour. +#monauxinterval = 3600 + +# +# +# Minimum interval (seconds) between processings of the indexing +# queue. The real time monitor does not process each event +# when it comes in, but lets the queue accumulate, to diminish overhead and +# to aggregate multiple events to the same file. Default: 30 s. +#monixinterval = 30 + +# +# +# Timing parameters for the real time indexing. +# Definitions for files which get a longer delay before reindexing +# is allowed. This is for fast-changing files, that should only be +# reindexed once in a while. A list of wildcardPattern:seconds pairs. The +# patterns are matched with fnmatch(pattern, path, 0). You can quote entries +# containing white space with double quotes (quote the whole entry, not the +# pattern). The default is empty. Example: mondelaypatterns = *.log:20 +# "*with spaces.*:30" +#mondelaypatterns = *.log:20 "*with spaces.*:30" + +# +# +# ionice class for the real time indexing process. +# On platforms where this is supported, the default value is +# 3. +# monioniceclass = 3 + +# +# +# ionice class parameter for the real time indexing process. +# On platforms where this is supported. The default is +# empty.
+#monioniceclassdata = + + + +# Query-time parameters (no impact on the index) + +# +# +# Auto-trigger diacritics sensitivity (raw index only). +# If the index is not stripped, decide if we automatically trigger +# diacritics sensitivity if the search term has accented characters (not in +# unac_except_trans). Else you need to use the query language and the "D" +# modifier to specify diacritics sensitivity. Default is no. +autodiacsens = 0 + +# +# +# Auto-trigger case sensitivity (raw index only). If +# the index is not stripped (see indexStripChars), decide if we +# automatically trigger character case sensitivity if the search term has +# upper-case characters in any but the first position. Else you need to use +# the query language and the "C" modifier to specify character-case +# sensitivity. Default is yes. +autocasesens = 1 + +# Maximum query expansion count +# for a single term (e.g. when using wildcards). This only +# affects queries, not indexing. We used to not limit this at all (except +# for filenames where the limit was too low at 1000), but it is +# unreasonable with a big index. Default 10000. +maxTermExpand = 10000 + +# Maximum number of clauses +# we add to a single Xapian query. This only affects queries, +# not indexing. In some cases, the result of term expansion can be +# multiplicative, and we want to avoid eating all the memory. Default +# 50000. +maxXapianClauses = 50000 + +# +# +# Maximum number of positions we walk while populating a snippet for the +# result list. The default of 1,000,000 may be insufficient +# for big documents, the consequence would be snippets with possibly +# meaning-altering missing words. +snippetMaxPosWalk = 1000000 + + +# Parameters for the PDF input script + +# +# +# Attempt OCR of PDF files with no text content if both tesseract and +# pdftoppm are installed. The default is off because OCR is so +# very slow.
+#pdfocr = 0 + +# +# +# Enable PDF attachment extraction by executing pdftk (if +# available). This is +# normally disabled, because it does slow down PDF indexing a bit even if +# not one attachment is ever found. +#pdfattach = 0 + + +# Parameters set for specific locations + +# You could specify different parameters for a subdirectory like this: +#[~/hungariandocs/plain] +#defaultcharset = iso-8859-2 + [/usr/share/man] followLinks = 1 -# Enable thunderbird mbox format quirks where appropriate, and same for -# mozilla/seamonkey +# +# +# Enable thunderbird/mozilla-seamonkey mbox format quirks. +# Set this for the directory where the email mbox files are +# stored. [~/.thunderbird] mhmboxquirks = tbird [~/.mozilla]