diff --git a/packaging/debian/buildppa.sh b/packaging/debian/buildppa.sh
index cc86724e..e7bb4cd0 100644
--- a/packaging/debian/buildppa.sh
+++ b/packaging/debian/buildppa.sh
@@ -19,7 +19,7 @@ case $RCLVERS in
1.14*) PPANAME=recoll-ppa;;
*) PPANAME=recoll15-ppa;;
esac
-PPANAME=recollexp-ppa
+#PPANAME=recollexp-ppa
echo "PPA: $PPANAME. Type CR if Ok, else ^C"
read rep
@@ -42,7 +42,7 @@ check_recoll_orig()
debdir=debian
# Note: no new releases for lucid: no webkit. Or use old debianrclqt4 dir.
series="precise trusty utopic vivid wily xenial"
-series=trusty
+series=
if test "X$series" != X ; then
check_recoll_orig
@@ -141,7 +141,7 @@ done
### Unity Scope
series="trusty utopic vivid wily xenial"
-series=
+series=xenial
debdir=debianunityscope
if test ! -d ${debdir}/ ; then
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index cee8e2ea..f196efb9 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -800,6 +800,103 @@ indexedmimetypes = application/pdf
+
+
+
+
+ Indexing thread usage configuration GUI
+
+ The &RCL; indexing process
+ recollindex can use multiple threads to
+ speed up indexing on multiprocessor systems. The work done
+ to index files is divided in several stages and some of the
+ stages can be executed by multiple threads. The stages are:
+
+ File system walking: this is always performed by
+ the main thread.
+ File conversion and data extraction.
+ Text processing (splitting, stemming,
+ etc.)
+ &XAP; index update.
+
+
+ You can also read a
+
+ longer document about the transformation of
+ &RCL; indexing to multithreading.
+
+ The threads configuration is controlled by two
+ configuration file parameters.
+
+
+
+ thrQSizes
+ This variable defines the job input queues
+ configuration. There are three possible queues for stages
+ 2, 3 and 4, and this parameter should give the queue depth
+ for each stage (three integer values). If a value of -1 is
+ used for a given stage, no queue is used, and the thread
+ will go on performing the next stage. In practice, deep
+ queues have not been shown to increase performance. A value
+ of 0 for the first queue tells &RCL; to perform
+ autoconfiguration (no need for anything else in this case,
+ thrTCounts is not used) - this is the default
+ configuration.
+
+
+
+ thrTCounts
+ This defines the number of threads used
+ for each stage. If a value of -1 is used for one of
+ the queue depths, the corresponding thread count is
+ ignored. It makes no sense to use a value other than 1
+ for the last stage because updating the &XAP; index is
+ necessarily single-threaded (and protected by a
+ mutex).
+
+
+
+
+
+ The following example would use three queues (of depth 2),
+ and 4 threads for converting source documents, 2 for
+ processing their text, and one to update the index. This was
+ tested to be the best configuration on the test system
+ (quad-processor with multiple disks).
+
+thrQSizes = 2 2 2
+thrTCounts = 4 2 1
+
+
+
+ The following example would use a single queue, and the
+ complete processing for each document would be performed by
+ a single thread (several documents will still be processed
+ in parallel in most cases). The threads will use mutual
+ exclusion when entering the index update stage. In practice
+ the performance would be close to the preceding case in
+ general, but worse in certain cases (e.g. a Zip archive
+ would be performed purely sequentially), so the previous
+ approach is preferred. YMMV... The 2 last values for
+ thrTCounts are ignored.
+
+thrQSizes = 2 -1 -1
+thrTCounts = 6 1 1
+
+
+
+ The following example would disable
+ multithreading. Indexing will be performed by a single
+ thread.
+
+thrQSizes = -1 -1 -1
+
+
+
+
+
+
+
The index configuration GUI
diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf
index feeb6868..38f34896 100644
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@@ -1,43 +1,52 @@
-# (C) 2004 J.F.Dockes. License: GPL
-#
-# Recoll default configuration file. This typically lives in
-# $prefix/share/recoll/examples and provides default values. You can
-# override selected parameters by adding assigments to
-# ~/.recoll/recoll.conf (or $RECOLL_CONFDIR/recoll.conf)
-#
-# Almost all values in this file can be set from the GUI configuration
-# menus, which may be an easier approach than direct editing.
-#
+# Recoll default main configuration file
-# Space-separated list of directories to index. Next line indexes $HOME
+# The XML tags in the comments are used to help produce the documentation
+# from the sample/reference file, and not at all at run time, where
+# comments are just comments. Edit at will.
+
+# This typically lives in $prefix/share/recoll/examples and provides
+# default values. You can override selected parameters by adding assignments
+# to ~/.recoll/recoll.conf (or $RECOLL_CONFDIR/recoll.conf)
+#
+# Most of the important values in this file can be set from the GUI
+# configuration menus, which may be an easier approach than direct editing.
+
+# Parameters affecting what documents we index
+
+# Space-separated list of files or
+# directories to recursively index. Defaults to ~ (indexes
+# $HOME). You can use symbolic links in the list, they will be followed,
+# independently of the value of the followLinks variable.
topdirs = ~
-# Wildcard expressions for names of files and directories that we should
-# ignore. If you need index mozilla/thunderbird mail folders, don't put
-# ".*" in there (as was the case with an older sample config)
-# These are simple names, not paths (must contain no / )
+# Wildcard expressions for
+# names of files and directories that we should ignore.
+# White space separated list of wildcard patterns (simple
+# ones, not paths, must contain no / ), which will be tested against file
+# and directory names. The list in the default configuration does not
+# exclude hidden directories (names beginning with a dot), which means that
+# it may index quite a few things that you do not want. On the other hand,
+# email user agents like Thunderbird usually store messages in hidden
+# directories, and you probably want this indexed. One possible solution is
+# to have '.*' in 'skippedNames', and add things like '~/.thunderbird'
+# '~/.evolution' to 'topdirs'. Not even the file names are indexed for
+# patterns in this list, see the 'noContentSuffixes' variable for an
+# alternative approach which indexes the file names. Can be redefined for
+# any subtree.
skippedNames = #* bin CVS Cache cache* .cache caughtspam tmp \
.thumbnails .svn \
*~ .beagle .git .hg .bzr loop.ps .xsession-errors \
.recoll* xapiandb recollrc recoll.conf
-# Wildcard expressions for paths we shouldn't go into. The database and
-# configuration directories will automatically be added in there.
-# We add the usual mount point for removable media by default to remind
-# people that it is a bad idea to naively have recoll work on these
-# (esp. with the monitor: media gets indexed on mount, all data gets erased
-# on unmount...). Typically the presence of /media is mostly a reminder, it
-# would only have effect for someone who's indexing / ...
-# Explicitely adding /media/xxx to the topdirs will override this.
-skippedPaths = /media
-
-# List of suffixes for which we don't try mime type identification (and
-# don't uncompress or index content obviously). This complements the now
-# obsoleted mimemap recoll_noindex list, which will go away in a future
-# release (the move from mimemap to recoll.conf allows editing the list
-# through the GUI). This is different from skippedNames because these are
-# name ending matches only (not wildcard patterns), and the file name
-# itself gets indexed normally.
+# List of name endings (not
+# necessarily dot-separated suffixes) for which we don't try MIME type
+# identification, and don't uncompress or index content.Only
+# the names will be indexed. This complements the now obsoleted mimemap
+# recoll_noindex list, which will go away in a future release (the move
+# from mimemap to recoll.conf allows editing the list through the
+# GUI). This is different from skippedNames because these are name ending
+# matches only (not wildcard patterns), and the file name itself gets
+# indexed normally. This can be redefined for subdirectories.
noContentSuffixes = .md5 .map \
.o .lib .dll .a .sys .exe .com \
.mpp .mpt .vsd \
@@ -45,89 +54,196 @@ noContentSuffixes = .md5 .map \
.dat .bak .rdf .log.gz .log .db .msf .pid \
,v ~ #
-# Same for real time indexing. The idea here is that there is stuff that
-# you might want to initially index but not monitor. If daemSkippedPaths is
-# not set, the daemon uses skippedPaths.
-#daemSkippedPaths =
+# Space-separated list of
+# wildcard expressions for paths we shouldn't go into.Can
+# contain files and directories. The database and configuration directories
+# will automatically be added. The expressions are matched using 'fnmatch(3)'
+# with the FNM_PATHNAME flag set by default. This means that '/' characters
+# must be matched explicitly. You can set 'skippedPathsFnmPathname' to 0
+# to disable the use of FNM_PATHNAME (meaning that '/*/dir3' will match
+# '/dir1/dir2/dir3'). The default value contains the usual mount point for
+# removable media to remind people that it is a bad idea to
+# naively have recoll work on these (esp. with the monitor: media gets
+# indexed on mount, all data gets erased on unmount). Typically the
+# presence of '/media' is mostly a reminder, it would only have effect for
+# someone who is indexing '/'. Explicitly adding '/media/xxx' to the
+# topdirs will override this.
+skippedPaths = /media
-# Recoll uses FNM_PATHNAME by default when matching skipped paths, which
-# means that /dir1/dir2/dir3 is not matched by */dir3. Can't change the
-# default now, but you can set the following variable to 0 to disable the
-# use of FNM_PATHNAME (see fnmatch(3) man page)
+# Set to 0 to
+# override use of FNM_PATHNAME for matching skipped
+# paths.
#skippedPathsFnmPathname = 1
-# Option to follow symbolic links. We normally don't, to avoid duplicated
-# indexing (in any case, no effort is made to identify or avoid multiple
-# indexing of linked files)
+# skippedPaths equivalent specific to
+# real time indexing.This enables having parts of the tree
+# which are initially indexed but not monitored. If daemSkippedPaths is
+# not set, the daemon uses skippedPaths.
+#daemSkippedPaths =
+
+
+# Space-separated list of
+# wildcard expressions for names that should be ignored
+# inside zip archives. This is used directly by the zip
+# handler, and has a function similar to skippedNames, but
+# works independently. Can be redefined for subdirectories. Supported by
+# recoll 1.20 and newer. See
+# https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members
+#
+#zipSkippedNames =
+
+# Follow symbolic links during
+# indexing.The default is to ignore symbolic links to avoid
+# multiple indexing of linked files. No effort is made to avoid duplication
+# when this option is set to true. This option can be set individually for
+# each of the 'topdirs' members by using sections. It can not be changed
+# below the 'topdirs' level. Links in the 'topdirs' list itself are always
+# followed.
#followLinks = 0
-# Debug messages. 2 is errors/warnings only. 3 information like doc
-# updates, 4 is quite verbose and 6 very verbose
-loglevel = 3
-logfilename = stderr
+# Restrictive list of
+# indexed mime types.Normally not set (in which case all
+# supported types are indexed). If it is set,
+# only the types from the list will have their contents indexed. The names
+# will be indexed anyway if indexallfilenames is set (default). MIME
+# type names should be taken from the mimemap file. Can be redefined for
+# subtrees.
+#indexedmimetypes =
-# The following can be used to set different values for logging by the
-# indexer (recollindex). The default is to use loglevel/logfilename
-#idxloglevel = 3
-#idxlogfilename = stderr
+# List of excluded MIME
+# types.Lets you exclude some types from indexing. Can be
+# redefined for subtrees.
+#excludedmimetypes =
-# Specific versions of log file name and level for the indexing daemon. The
-# default is to use the idx... values if set, else the log... values.
-#daemloglevel = 3
-#daemlogfilename = /dev/null
+# Size limit for compressed
+# files.We need to decompress these in a
+# temporary directory for identification, which can be wasteful in some
+# cases. Limit the waste. Negative means no limit. 0 results in no
+# processing of any compressed file.
+compressedfilemaxkbs = 50000
-# Run directory for the indexing process. The filters sometimes leave
-# garbage in the current directory, so it makes sense to have recollindex
-# chdir to some garbage bin. 3 possible values:
-# - (literal) tmp : go to temp dir as set by env (RECOLL_TMPDIR else
-# TMPDIR else /tmp)
-# - Empty: stay where started
-# - Absolute path value: go there.
-idxrundir = tmp
+# Size limit for text
+# files.Mostly for skipping monster
+# logs.
+textfilemaxmbs = 20
-# Decide if we store character case and diacritics in the index. If we do,
+# Index the file names of
+# unprocessed filesIndex the names of files the contents of
+# which we don't index because of an excluded or unsupported MIME
+# type.
+indexallfilenames = 1
+
+# Use a system command
+# for file MIME type guessing as a final step in file type
+# identificationThis is generally useful, but will usually
+# cause the indexing of many bogus 'text' files. See 'systemfilecommand'
+# for the command used.
+usesystemfilecommand = 1
+
+# Command used to guess
+# MIME types if the internal methods fail. This should be a
+# "file -i" workalike. The file path will be added as a last parameter to
+# the command line. 'xdg-mime' works better than the traditional 'file'
+# command, and is now the configured default (with a hard-coded fallback to
+# 'file')
+systemfilecommand = xdg-mime query filetype
+
+# Decide if we process the
+# Web queue.The queue is a directory where the Recoll Web
+# browser plugins create the copies of visited pages.
+processwebqueue = 0
+
+# Page size for text
+# files.If this is set, text/plain files will be divided
+# into documents of approximately this size. Will reduce memory usage at
+# index time and help with loading data in the preview window at query
+# time. Particularly useful with very big files, such as application or
+# system logs.
+textfilepagekbs = 1000
+
+# Size limit for archive
+# members.This is passed to the filters in the environment
+# as RECOLL_FILTER_MAXMEMBERKB.
+membermaxkbs = 50000
+
+
+
+# Parameters affecting how we generate terms
+
+# Changing some of these parameters will imply a full
+# reindex. Also, when using multiple indexes, it may not make sense
+# to search indexes that don't share the values for these parameters,
+# because they usually affect both search and index operations.
+
+
+# Decide if we store
+# character case and diacritics in the index.If we do,
# searches sensitive to case and diacritics can be performed, but the index
-# will be bigger, and some marginal weirdness may sometimes occur. We
-# default to a stripped index for now.
+# will be bigger, and some marginal weirdness may sometimes occur. The
+# default is a stripped index. When using multiple indexes for a search,
+# this parameter must be defined identically for all. Changing the value
+# implies an index reset.
indexStripChars = 1
-# IF the index is not stripped. Decide if we automatically trigger
-# diacritics sensitivity if the search term has accented characters (not in
-# unac_except_trans). Else you need to use the query language and the "D"
-# modifier to specify diacritics sensitivity. Default is no.
-autodiacsens = 0
+# Decides if terms will be
+# generated for numbers.For example "123", "1.5e6",
+# 192.168.1.4, would not be indexed if nonumbers is set ("value123" would
+# still be). Numbers are often quite interesting to search for, and this
+# should probably not be set except for special situations, ie, scientific
+# documents with huge amounts of numbers in them, where setting nonumbers
+# will reduce the index size. This can only be set for a whole index, not
+# for a subtree.
+#nonumbers = 0
-# IF the index is not stripped. Decide if we automatically trigger
-# character case sensitivity if the search term has upper-case characters
-# in any but the first position. Else you need to use the query language
-# and the "C" modifier to specify character-case sensitivity. Default is
-# yes.
-autocasesens = 1
+# Determines if we index
+# 'coworker' also when the input is 'co-worker'.This is new
+# in version 1.22, and on by default. Setting the variable to off allows
+# restoring the previous behaviour.
+#dehyphenate = 1
-# Languages for which to build stemming databases at the end of
-# indexing. Stemmer names can be found on http://www.xapian.org
-# The flag to perform stem expansion at query time is now set from the GUI
+# Decides if specific east asian
+# (Chinese Korean Japanese) characters/word splitting is turned
+# off.This will save a small amount of cpu if you have no CJK
+# documents. If your document base does include such text but you are not
+# interested in searching it, setting nocjk may be a
+# significant time and space saver.
+#nocjk = 0
+
+# This lets you adjust the size of
+# n-grams used for indexing CJK text.The default value of 2 is
+# probably appropriate in most cases. A value of 3 would allow more precision
+# and efficiency on longer words, but the index will be approximately twice
+# as large.
+#cjkngramlen = 2
+
+# Languages for
+# which to create stemming expansion data.Stemmer names can
+# be found on http://www.xapian.org, or by executing 'recollindex -l', or
+# this can also be set from a list in the GUI
indexstemminglanguages = english
-# Default character set. Values found inside files, ie content tag in html
-# documents, will override this. It can be specified per directory (see
-# below). Used when converting to utf-8 (internal storage format), so it
-# may be quite important for pure text files.
-# The default used to be set to iso8859-1, but we now take it from the nls
-# environment (LC_ALL/LC_CTYPE/LANG). The ultimate hardwired default is
-# still 8859-1. If for some reason you want a general default which doesnt
-# match your LANG and is not 8859-1, set it here.
-# defaultcharset = iso-8859-1
+# Default character
+# set.This is used for files which do not contain a
+# character set definition (e.g.: text/plain). Values found inside files,
+# e.g. a 'charset' tag in HTML documents, will override it. If this is not
+# set, the default character set is the one defined by the NLS environment
+# ($LC_ALL, $LC_CTYPE, $LANG), or ultimately iso-8859-1 (cp-1252 in fact).
+# If for some reason you want a general default which does not match your
+# LANG and is not 8859-1, use this variable. This can be redefined for any
+# sub-directory.
+#defaultcharset = iso-8859-1
-# A list of characters, encoded in UTF-8, which should be handled specially
-# when converting text to unaccented lowercase. For example, in Swedish,
-# the letter a with diaeresis has full alphabet citizenship and should not
-# be turned into an a.
+# A list of characters,
+# encoded in UTF-8, which should be handled specially
+# when converting text to unaccented lowercase.For
+# example, in Swedish, the letter a with diaeresis has full alphabet
+# citizenship and should not be turned into an a.
# Each element in the space-separated list has the special character as
# first element and the translation following. The handling of both the
# lowercase and upper-case versions of a character should be specified, as
# appartenance to the list will turn-off both standard accent and case
-# processing. Examples:
+# processing. The value is global and affects both indexing and querying.
+# Examples:
# Swedish:
# unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl åå Åå
# German:
@@ -138,289 +254,48 @@ indexstemminglanguages = english
# Reasonable default for all until someone protests. These decompositions
# are not performed by unac, but I cant imagine someone typing the composed
# forms in a search.
+# unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl
unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl
-# Turn off the indexing of numbers: may reduce the index size if you have
-# no use for them
-# nonumbers = 0
-# Turn off indexing "coworker" for an input of "co-worker" (in addition to
-# co, worker, "co worker". Default is on as of version 1.22
-# dehyphenate = 1
+# Overrides the default
+# character set for email messages which don't specify
+# one.This is mainly useful for readpst (libpst) dumps,
+# which are utf-8 but do not say so.
+#maildefcharset=
-# Maximum expansion count for a single term (ie: when using wildcards).
-# We used to not limit this at all (except for filenames where the limit
-# was too low at 1000), but it is unreasonable with a big index.
-# Default 10 000
-maxTermExpand = 10000
+# Set fields on all files
+# (usually of a specific fs area).Syntax is the usual:
+# name = value ; attr1 = val1 ; [...]
+# value is empty so this needs an initial semi-colon. This is useful, e.g.,
+# for setting the rclaptg field for application selection inside
+# mimeview.
+#[/some/app/directory]
+#localfields = ; rclaptg = someapp; otherfield = somevalue
-# Maximum number of clauses we add to a single Xapian query. In some cases,
-# the result of term expansion can be multiplicative, and we want to avoid
-# eating all the memory. Default 50000
-maxXapianClauses = 50000
-
-# Recoll data directories are normally stored relative to the configuration
-# directory (e.g. ~/.recoll/xapiandb, ~/.recoll/mboxcache). If this is set,
-# the directories are stored under the specified value instead
-# (e.g. if cachedir is ~/.cache/recoll, the default dbdir would be
-# ~/.cache/recoll/xapiandb).
-# This affects dbdir, webcachedir, mboxcachedir, aspellDicDir, which can
-# still be individually specified to override cachedir.
-# Note that if you have multiple configurations, each must have a different
-# cachedir, there is no automatic computation of a subpath under cachedir.
-#cachedir = ~/.cache/recoll
-
-# Where to store the database (directory). This may be an absolute path,
-# else it is taken as relative to cachedir if set, or the configuration
-# directory (-c argument or $RECOLL_CONFDIR). If nothing is specified, the
-# default is then ~/.recoll/xapiandb/
-dbdir = xapiandb
-
-# Indexing process threads configuration. If Recoll is configured for
-# multithreading, this defines what queues are active and how many threads
-# to start for any of them. The default values were found good on a
-# quad-core processor. The three steps are file conversion, term extraction
-# and conversion and Xapian index update. The three queue values define the
-# max number of jobs waiting on one of the corresponding queues. Setting a
-# value to -1 disables a queue (replaced by a direct call). The thrTcounts
-# values define the number of threads to start for each queue. The last
-# value can only be one (as Xapian is single-threaded).
-# If the first element in thrQSizes is 0, recollindex will attempt to set
-# roughly guestimated values based on the number of CPUs.
-#
-# The following are the best setup on my core i5 system (4 cores, no
-# hyperthreading, multiple disks).
-#thrQSizes = 2 2 2
-#thrTCounts = 4 2 1
-# The default is to let recoll guess.
-thrQSizes = 0
-
-# Maximum file system occupation before we stop indexing. The default value
-# is 0, meaning no checking. The value is a percentage, corresponding to
-# what the "Capacity" df output column shows.
-maxfsoccuppc = 0
-
-# Threshold (megabytes of new data) where we flush from memory to disk
-# index. Setting this (ie to 10) can help control memory usage.
-#
-# A value of 0 means no explicit flushing, which lets Xapian perform its
-# own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD documents
-# created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an environment
-# variable. As memory usage depends on average document size, not only
-# document count, this is not very useful.
-#
-# The default value of 10 MB may be a bit low. If you are looking for
-# maximum speed, you may want to experiment with values between 20 and
-# 80. In my experience, values beyond 100 are always counterproductive. If
-# you find otherwise, please drop me a note.
-idxflushmb = 10
-
-# Place to search for executable filters. If RECOLL_FILTERSDIR is set in
-# the environment, we use it instead. Defaults to $prefix/share/recoll/filters
-# filtersdir = /path/to/my/filters
-
-# Additional places to search for helper executables. This is only used on
-# Windows for now
-# recollhelperpath = c:/someprog/bin;c:/someotherprog/bin
-
-# Place to search for icons. The only reason to change this would be if you
-# want to change the icons displayed in the result list.
-# Defaults to $prefix/share/recoll/images
-# iconsdir = /path/to/my/icons
-
-# Should we use the system's 'file -i' command as a final step in file type
-# identification ? This may be useful, but will usually cause the
-# indexation of many bogus 'text' files
-usesystemfilecommand = 1
-# Actual command to use as "file -i" workalike.
-# The file path will be added as a last parameter to the command line. If
-# that's not what your preferred command would like, use an intermediary
-# script.
-# xdg-mime now works better than the traditional "file" command, and is now
-# the configured default (with a hard-coded fallback to "file")
-systemfilecommand = xdg-mime query filetype
-# systemfilecommand = file -i filetype
-
-# Should we index the file names of files with mime types we don't
-# know? (we can otherwise just ignore them)
-indexallfilenames = 1
-
-# A restrictive list of indexed mime types. Normally not set. If it is set,
-# only the types from the list will have their contents indexed (the names
-# will be indexed anyway if indexallfilenames is set as by default). Mime
-# type names should be taken from the mimemap file.
-#
-# indexedmimetypes =
-
-# An excluded list of mime types. It can be redefined in subdirectories,
-# so can be used to locally exclude some types.
-#
-# excludedmimetypes =
-
-#
-# Size limit for archive members. This is passed to the filters in the
-# environment as RECOLL_FILTER_MAXMEMBERKB
-#
-membermaxkbs = 50000
-
-# Size limit for compressed files. We need to decompress these in a
-# temporary directory for identification, which can be wasteful in some
-# cases. Limit the waste. Negative means no limit. 0 results in no
-# processing of any compressed file. Used to be -1 by default.
-compressedfilemaxkbs = 50000
-
-# Size limit for text files. This is for skipping monster logs
-textfilemaxmbs = 20
-
-# Page size for text files. If this is set, text/plain files will be
-# divided into documents of approximately this size. May be useful to
-# access pieces of big text files which would be problematic to load as one
-# piece into the preview window. Might be useful for big logs
-textfilepagekbs = 1000
-
-# Maximum external filter execution time. Default 20mn. This is mainly
-# to avoid infinite loops in postscript files (loop.ps)
-filtermaxseconds = 1200
-# Maximum virtual memory space for filter process (setrlimit(RLIMIT_AS)),
-# in megabytes. Note that this includes any mapped libs (there is no
-# reliable Linux way to limit the data space only), so we need to be a
-# bit generous here. Anything over 2000 will be ignored on 32 bits machines.
-filtermaxmbytes = 2000
-
-# Length of abstracts we store while indexing. Longer will make for a
-# bigger db
-# idxabsmlen = 250
-
-# Truncation length of stored metadata fields. This does not affect
-# indexing, just what can be displayed inside results.
-# idxmetastoredlen = 150
-
-# Language definitions to use when creating the aspell dictionary.
-# The value must match a set of aspell language definition files.
-# You can type "aspell dicts" to see a list
-# The default if this is not set is to use the NLS environment to guess the
-# value
-# aspellLanguage = en
-
-# Somme aspell packages may need an additional option (e.g. on Debian
-# Jessie). See Debian bug 772415
-# aspellAddCreateParam = --local-data-dir=/usr/lib/aspell
-
-# The aspell dictionary (aspdict.(lang).rws) is normally stored in the
-# directory specified by cachedir if set, or under the configuration
-# directory. Set the following to change:
-#aspellDicDir =
-
-# You may also want to set this to have a look at aspell dictionary
-# creation errors. But there are always many, so this is mostly for debugging
-# aspellKeepStderr = 1
-
-# Disabling aspell use. The aspell dictionary generation takes some time,
-# and some combinations of aspell version, language, and local terms,
-# result in aspell dumping core each time. You can disable the aspell
-# dictionary generation by setting the following variable:
-# noaspell = 1
-
-# Timing parameters for the real time mode:
-#
-# Seconds between auxiliary databases updates (stemdb, aspell):
-# monauxinterval = 3600
-#
-# Resting time (seconds) during which we let the queue accumulate, in hope
-# that events to the same file will merge, before we start indexing:
-# monixinterval = 30
-#
-# Definitions for files which get a longer delay before reindexing is
-# allowed. This is for fast-changing files, that should only be reindexed
-# once in a while. A list of wildcardPattern:seconds pairs. The patterns
-# are matched with fnmatch(pattern, path, 0) You can quote entries containing
-# white space with double quotes. The default is empty, here follows an
-# example:
-# mondelaypatterns = *.log:20 "*with spaces.*:30"
-
-# ionice class for monitor (on platforms where this is supported)
-# monioniceclass = 3
-# ionice class param for monitor (on platforms where this is supported)
-# monioniceclassdata =
-
-# If this is set, process the directory where the Recoll Web browser plugins
-# copy visited pages for indexing.
-processwebqueue = 0
-# The path to the Web indexing queue. This is hard-coded in the
-# plugin as ~/.recollweb/ToIndex so there should be no need to change it.
-#webqueuedir = ~/.recollweb/ToIndex
-# This is only used by the web history indexing code, and
-# defines where the cache for visited pages will live. Default:
-# cachedir/webcache if cachedir is set, else $RECOLL_CONFDIR/webcache
-webcachedir = webcache
-# This is only used by the web history indexing code, and
-# defines the maximum size for the web page cache. Default: 40 MB.
-# Reducing the size will not physically truncate the file.
-webcachemaxmbs = 40
-
-# The directory where mbox message offsets cache files are held. This is
-# normally named mboxcache under cachedir if set, or else under the
-# configuration directory, but it may be useful to share a
-# directory between different configurations.
-#mboxcachedir = mboxcache
-
-# The minimum mbox file size over which we cache the offsets. There is
-# really no sense in caching offsets for small files. The default is 5 MB.
-#mboxcacheminmbs = 5
-
-# Maximum number of positions we walk while populating a snippet for the
-# result list. The default of 1 000 000 may be insufficient for big
-# documents, the consequence would be snippets with possibly
-# meaning-altering missing words.
-snippetMaxPosWalk = 1000000
-
-# Use mtime instead of default ctime to determine if a file has been
-# modified (in addition to size, which is always used).
+# Use mtime instead of
+# ctime to test if a file has been modified. The time is used
+# in addition to the size, which is always used.
# Setting this can reduce re-indexing on systems where extended attributes
-# are used (by some other applications), but not indexed (changing
-# ext. attrs. only affects ctime).
+# are used (by some other application), but not indexed, because changing
+# extended attributes only affects ctime.
# Notes:
-# - this may prevent detection of change in some marginal file rename cases
+# - This may prevent detection of change in some marginal file rename cases
# (the target would need to have the same size and mtime).
# - You should probably also set noxattrfields to 1 in this case, except if
# you still prefer to perform xattr indexing, for example if the local
# file update pattern makes it of value (as in general, there is a risk
# for pure extended attributes updates without file modification to go
# undetected). Perform a full index reset after changing this.
+#
testmodifusemtime = 0
-# Disable extended attributes conversion to metadata fields. This probably
-# needs to be set if testmodifusemtime is set.
+# Disable extended attributes
+# conversion to metadata fields.This probably needs to be
+# set if testmodifusemtime is set.
noxattrfields = 0
-# Script used to heuristically check if we need to retry indexing files
-# which previously failed. The default script checks the modified dates on
-# /usr/bin and /usr/local/bin. A relative path will be looked up in the
-# filters dirs, then in the path. Use an absolute path to do otherwise.
-checkneedretryindexscript = rclcheckneedretry.sh
-
-# Parameters for the PDF input script
-# Attempt OCR of PDF files with no text content if both tesseract and
-# pdftoppm are installed. The default is not to do it because OCR is so
-# very slow
-#pdfocr = 0
-# Enable PDF attachment extraction, using pdftk (if available). This is
-# normally disabled, because it does slow down PDF indexing a bit even if
-# not one attachment is ever found.
-#pdfattach = 0
-
-# You could specify different parameters for a subdirectory like this:
-#[~/hungariandocs/plain]
-#defaultcharset = iso-8859-2
-
-# You can set fields on all files of a specific fs area. (rclaptg can be
-# used for application selection inside mimeview).
-# Syntax is the usual name = value ; attr1 = val1 ; ... with an empty value
-# so needs initial semi-colon
-#[/some/app/directory]
-#localfields = ; rclaptg = someapp; otherfield = somevalue
-
-# It's also possible to execute external commands to gather external
-# metadata, for example tmsu tags.
+# Define commands to
+# gather external metadata, e.g. tmsu tags.
# There can be several entries, separated by semi-colons, each defining
# which field name the data goes into and the command to use. Don't forget the
# initial semi-colon. All the field names must be different. You can use
@@ -430,14 +305,408 @@ checkneedretryindexscript = rclcheckneedretry.sh
# returns multiple field values inside a text blob formatted as a recoll
# configuration file ("fieldname = fieldvalue" lines). The rclmultixx name
# will be ignored, and field names and values will be parsed from the data.
+#
#[/some/area/of/the/fs]
#metadatacmds = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf %f
+
+
+
+# Parameters affecting where and how we store things
+
+# Top directory for Recoll
+# data. Recoll data directories are normally located relative
+# to the configuration directory (e.g. ~/.recoll/xapiandb,
+# ~/.recoll/mboxcache). If 'cachedir' is set, the directories are stored under
+# the specified value instead (e.g. if cachedir is ~/.cache/recoll, the
+# default dbdir would be ~/.cache/recoll/xapiandb). This affects dbdir,
+# webcachedir, mboxcachedir, aspellDicDir, which can still be individually
+# specified to override cachedir. Note that if you have multiple
+# configurations, each must have a different cachedir, there is no
+# automatic computation of a subpath under cachedir.
+#cachedir = ~/.cache/recoll
+
+# Maximum file system occupation
+# over which we stop indexing. The value is a percentage,
+# corresponding to what the "Capacity" df output column shows. The default
+# value is 0, meaning no checking.
+maxfsoccuppc = 0
+
+# Xapian database directory
+# location. This will be created on first indexing. If the
+# value is not an absolute path, it will be interpreted as relative to
+# cachedir if set, or the configuration directory (-c argument or
+# $RECOLL_CONFDIR). If nothing is specified, the default is then
+# ~/.recoll/xapiandb/
+dbdir = xapiandb
+
+# Name of the scratch file where
+# the indexer process updates its status. Default:
+# idxstatus.txt inside the configuration directory
+#idxstatusfile = idxstatus.txt
+
+#
+#
+# Directory location for storing mbox message offsets cache
+# files. This is normally 'mboxcache' under cachedir if set,
+# or else under the configuration directory, but it may be useful to share
+# a directory between different configurations.
+#mboxcachedir = mboxcache
+
+#
+#
+# Minimum mbox file size over which we cache the offsets.
+# There is really no sense in caching offsets for small files. The
+# default is 5 MB.
+#mboxcacheminmbs = 5
+
+#
+#
+# Directory where we store the archived web pages.
+# This is only used by the web history indexing code.
+# Default: cachedir/webcache if cachedir is set, else
+# $RECOLL_CONFDIR/webcache
+webcachedir = webcache
+
+#
+# Maximum size in MB of the Web archive.
+# This is only used by the web history indexing code.
+# Default: 100 MB.
+# Reducing the size will not physically truncate the file.
+webcachemaxmbs = 100
+
+#
+#
+# The path to the Web indexing queue. This is
+# hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no
+# need or possibility to change it.
+#webqueuedir = ~/.recollweb/ToIndex
+
+#
+#
+# Aspell dictionary storage directory location. The
+# aspell dictionary (aspdict.(lang).rws) is normally stored in the
+# directory specified by cachedir if set, or under the configuration
+# directory.
+#aspellDicDir =
+
+#
+#
+# Directory location for executable input handlers. If
+# RECOLL_FILTERSDIR is set in the environment, we use it instead. Defaults
+# to $prefix/share/recoll/filters. Can be redefined for
+# subdirectories.
+#filtersdir = /path/to/my/filters
+
+#
+#
+# Directory location for icons. The only reason to
+# change this would be if you want to change the icons displayed in the
+# result list. Defaults to $prefix/share/recoll/images.
+#iconsdir = /path/to/my/icons
+
+# Parameters affecting indexing performance and resource
+# usage
+
+#
+#
+# Threshold (megabytes of new data) where we flush from memory to disk
+# index.
+# Setting this allows some control over memory usage by the indexer
+# process. A value of 0 means no explicit flushing, which lets Xapian
+# perform its own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD
+# documents created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an
+# environment variable. As memory usage depends on average document size,
+# not only document count, this is not very useful.
+# The default value of 10 MB may be a bit low. If you are looking for
+# maximum speed, you may want to experiment with values between 20 and
+# 80. In my experience, values beyond 100 are always counterproductive. If
+# you find otherwise, please drop me a note.
+idxflushmb = 10
+
+#
+#
+# Maximum external filter execution time in
+# seconds. Default 1200 (20 min). Set to 0 for no limit. This
+# is mainly to avoid infinite loops in postscript files
+# (loop.ps)
+filtermaxseconds = 1200
+
+#
+#
+# Maximum virtual memory space for filter processes
+# (setrlimit(RLIMIT_AS)), in megabytes. Note that this
+# includes any mapped libs (there is no reliable Linux way to limit the
+# data space only), so we need to be a bit generous here. Anything over
+# 2000 will be ignored on 32-bit machines.
+filtermaxmbytes = 2000
+
+#
+#
+# Stage input queues configuration. There are three
+# internal queues in the indexing pipeline stages (file data extraction,
+# terms generation, index update). This parameter defines the queue depths
+# for each stage (three integer values). If a value of -1 is given for a
+# given stage, no queue is used, and the thread will go on performing the
+# next stage. In practice, deep queues have not been shown to increase
+# performance. Default: a value of 0 for the first queue tells Recoll to
+# perform autoconfiguration based on the detected number of CPUs (no need
+# for the two other values in this case). Use thrQSizes = -1 -1 -1 to
+# disable multithreading entirely.
+thrQSizes = 0
+
+#
+#
+# Number of threads used for each indexing stage. The
+# three stages are: file data extraction, terms generation, index
+# update. The use of the counts is also controlled by some special values
+# in thrQSizes: if the first queue depth is 0, all counts are ignored
+# (autoconfigured); if a value of -1 is used for a queue depth, the
+# corresponding thread count is ignored. It makes no sense to use a value
+# other than 1 for the last stage because updating the Xapian index is
+# necessarily single-threaded (and protected by a mutex).
+#thrTCounts = 4 2 1
+
+
+# Miscellaneous parameters
+
+#
+#
+# Debug log verbosity 1-6. 2 is errors/warnings
+# only, 3 is information like document updates, 4 is quite verbose and 6
+# is very verbose.
+loglevel = 3
+
+#
+#
+# Debug log destination. Use 'stderr' (default) to write to the
+# console.
+logfilename = stderr
+
+#
+#
+# Override loglevel for the indexer.
+#idxloglevel = 3
+
+#
+#
+# Override logfilename for the indexer.
+#idxlogfilename = stderr
+
+#
+#
+# Override loglevel for the indexer in real time
+# mode. The default is to use the idx... values if set, else
+# the log... values.
+#daemloglevel = 3
+
+#
+#
+# Override logfilename for the indexer in real time
+# mode. The default is to use the idx... values if set, else
+# the log... values.
+#daemlogfilename = /dev/null
+
+#
+#
+# Indexing process current directory. The input
+# handlers sometimes leave temporary files in the current directory, so it
+# makes sense to have recollindex chdir to some temporary directory. Three
+# possible types of values:
+# - (literal) tmp : go to temp dir as set by environment (RECOLL_TMPDIR else
+# TMPDIR else /tmp)
+# - Empty: stay where started
+# - Absolute path value: go there.
+idxrundir = tmp
+
+#
+#
+# Script used to heuristically check if we need to retry indexing
+# files which previously failed. The default script checks
+# the modified dates on /usr/bin and /usr/local/bin. A relative path will
+# be looked up in the filters dirs, then in the path. Use an absolute path
+# to do otherwise.
+checkneedretryindexscript = rclcheckneedretry.sh
+
+#
+#
+# Additional places to search for helper executables.
+# This is only used on Windows for now.
+#recollhelperpath = c:/someprog/bin;c:/someotherprog/bin
+
+#
+#
+# Length of abstracts we store while indexing.
+# Recoll stores an abstract for each indexed file.
+# The text can come from an actual 'abstract' section in the
+# document or will just be the beginning of the document. It is stored in
+# the index so that it can be displayed inside the result lists without
+# decoding the original file. The idxabsmlen parameter
+# defines the size of the stored abstract. The default value is 250
+# bytes. The search interface gives you the choice to display this stored
+# text or a synthetic abstract built by extracting text around the search
+# terms. If you always prefer the synthetic abstract, you can reduce this
+# value and save a little space.
+#idxabsmlen = 250
+
+#
+#
+# Truncation length of stored metadata fields. This
+# does not affect indexing (the whole field is processed anyway), just the
+# amount of data stored in the index for the purpose of displaying fields
+# inside result lists or previews. The default value is 150 bytes which
+# may be too low if you have custom fields.
+#idxmetastoredlen = 150
+
+#
+#
+# Language definitions to use when creating the aspell
+# dictionary. The value must match a set of aspell language
+# definition files. You can type "aspell dicts" to see a list. The default
+# if this is not set is to use the NLS environment to guess the
+# value.
+#aspellLanguage = en
+
+#
+#
+# Additional parameter to aspell dictionary creation
+# command. Some aspell packages may need an additional option
+# (e.g. on Debian Jessie). See Debian bug 772415.
+#aspellAddCreateParam = --local-data-dir=/usr/lib/aspell
+
+#
+#
+# Set this to have a look at aspell dictionary creation
+# errors. There are always many, so this is mostly for
+# debugging.
+#aspellKeepStderr = 1
+
+#
+#
+# Disable aspell use. The aspell dictionary generation
+# takes time, and some combinations of aspell version, language, and local
+# terms, result in aspell crashing, so it sometimes makes sense to just
+# disable the thing.
+#noaspell = 1
+
+#
+#
+# Seconds between auxiliary databases updates (stemdb,
+# aspell). The default is one hour.
+#monauxinterval = 3600
+
+#
+#
+# Minimum interval (seconds) between processings of the indexing
+# queue. The real time monitor does not process each event
+# when it comes in, but lets the queue accumulate, to diminish overhead and
+# to aggregate multiple events to the same file. Default: 30 seconds.
+#monixinterval = 30
+
+#
+#
+# Timing parameters for the real time indexing.
+# Definitions for files which get a longer delay before reindexing
+# is allowed. This is for fast-changing files, that should only be
+# reindexed once in a while. A list of wildcardPattern:seconds pairs. The
+# patterns are matched with fnmatch(pattern, path, 0). You can quote entries
+# containing white space with double quotes (quote the whole entry, not the
+# pattern). The default is empty. Example: mondelaypatterns = *.log:20
+# "*with spaces.*:30"
+#mondelaypatterns = *.log:20 "*with spaces.*:30"
+
+#
+#
+# ionice class for the real time indexing process.
+# On platforms where this is supported, the default value is
+# 3.
+# monioniceclass = 3
+
+#
+#
+# ionice class parameter for the real time indexing process.
+# Only used on platforms where this is supported. The default is
+# empty.
+#monioniceclassdata =
+
+
+
+# Query-time parameters (no impact on the index)
+
+#
+#
+# Auto-trigger diacritics sensitivity (raw index only).
+# If the index is not stripped, decide if we automatically trigger
+# diacritics sensitivity if the search term has accented characters (not in
+# unac_except_trans). Else you need to use the query language and the "D"
+# modifier to specify diacritics sensitivity. Default is no.
+autodiacsens = 0
+
+#
+#
+# Auto-trigger case sensitivity (raw index only). If
+# the index is not stripped (see indexStripChars), decide if we
+# automatically trigger character case sensitivity if the search term has
+# upper-case characters in any but the first position. Else you need to use
+# the query language and the "C" modifier to specify character-case
+# sensitivity. Default is yes.
+autocasesens = 1
+
+# Maximum query expansion count
+# for a single term (e.g.: when using wildcards). This only
+# affects queries, not indexing. We used to not limit this at all (except
+# for filenames where the limit was too low at 1000), but it is
+# unreasonable with a big index. Default 10000.
+maxTermExpand = 10000
+
+# Maximum number of clauses
+# we add to a single Xapian query. This only affects queries,
+# not indexing. In some cases, the result of term expansion can be
+# multiplicative, and we want to avoid eating all the memory. Default
+# 50000.
+maxXapianClauses = 50000
+
+#
+#
+# Maximum number of positions we walk while populating a snippet for the
+# result list. The default of 1,000,000 may be insufficient
+# for big documents; the consequence would be snippets with possibly
+# meaning-altering missing words.
+snippetMaxPosWalk = 1000000
+
+
+# Parameters for the PDF input script
+
+#
+#
+# Attempt OCR of PDF files with no text content if both tesseract and
+# pdftoppm are installed. The default is off because OCR is so
+# very slow.
+#pdfocr = 0
+
+#
+#
+# Enable PDF attachment extraction by executing pdftk (if
+# available). This is
+# normally disabled, because it does slow down PDF indexing a bit even if
+# not one attachment is ever found.
+#pdfattach = 0
+
+
+# Parameters set for specific locations
+
+# You could specify different parameters for a subdirectory like this:
+#[~/hungariandocs/plain]
+#defaultcharset = iso-8859-2
+
[/usr/share/man]
followLinks = 1
-# Enable thunderbird mbox format quirks where appropriate, and same for
-# mozilla/seamonkey
+#
+#
+# Enable thunderbird/mozilla-seamonkey mbox format quirks.
+# Set this for the directory where the email mbox files are
+# stored.
[~/.thunderbird]
mhmboxquirks = tbird
[~/.mozilla]