diff --git a/src/doc/man/recollindex.1 b/src/doc/man/recollindex.1
index 761c94a1..c8147d01 100644
--- a/src/doc/man/recollindex.1
+++ b/src/doc/man/recollindex.1
@@ -125,16 +125,18 @@ is given, the database will be erased before starting. If option
is given, the database will not be reset, but all files will be considered
as needing reindexing (in place reset).
.PP
-By default,
+As of version 1.21,
.B recollindex
-does not process again files which previously failed to index (for example
-because of a missing helper program). This behaviour is new in version
-1.21, error files were always retried in previous versions.
-If option
+usually does not process again files which previously failed to index (for
+example because of a missing helper program). If option
.B \-k
is given,
.B recollindex
-will try again to process all failed files.
+will try again to process all failed files. Please note that
+.B recollindex
+may also decide to retry failed files if the auxiliary checking script
+defined by the "checkneedretryindexscript" configuration variable indicates
+that this should happen.
.PP
If option
.B
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index db9a06f1..8bf45096 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -38,7 +38,7 @@
This document introduces full text search notions
and describes the installation and use of the &RCL;
- application. It currently describes &RCL; &RCLVERSION;.
+ application. This version describes &RCL; &RCLVERSION;.
@@ -448,9 +448,36 @@ indexedmimetypes = application/pdf
index configuration tool.
-
+
+ Indexing failures
+
+ Indexing may fail for some documents, for a number of
+ reasons: a helper program may be missing, the document may be
+ corrupt, we may fail to uncompress a file because no file
+ system space is available, etc.
+
+ &RCL; versions prior to 1.21 always retried indexing
+ files which had previously caused an error. This guaranteed
+ that anything that may have become indexable (for example
+ because a helper had been installed) would be indexed. However
+ this was bad for performance because some indexing failures
+ may be quite costly (for example failing to uncompress a big
+ file because of insufficient disk space).
+
+ The indexer in &RCL; versions 1.21 and later does not
+ retry failed files by default. Retrying will only occur if an
+ explicit option (-k) is set on the
+ recollindex command line, or if a script
+ executed when recollindex starts up says
+ so. The script is defined by a configuration variable
+ (checkneedretryindexscript), and makes a
+ rather lame attempt at deciding if a helper command may have
+ been installed, by checking if any of the common
+ bin directories have changed.
+
+ Recovery
@@ -5862,7 +5889,7 @@ thrQSizes = -1 -1 -1
- autodiacsens
+ autodiacsensIF the index is not stripped, decide if we
automatically trigger diacritics sensitivity if the search
term has accented characters (not in
@@ -5900,6 +5927,19 @@ thrQSizes = -1 -1 -1
+ checkneedretryindexscript
+ This defines the name for a command
+ executed by recollindex when starting
+ indexing. If the exit status of the command is 0,
+ recollindex retries indexing all files
+ which previously could not be indexed because of data
+ extraction errors. The default value is a script which
+ checks if any of the common bin
+ directories have changed (indicating that a helper program
+ may have been installed).
+
+
+
mondelaypatternsThis allows specify wildcard path patterns
(processed with fnmatch(3) with 0 flag), to match files which
diff --git a/src/filters/rclcheckneedretry.sh b/src/filters/rclcheckneedretry.sh
new file mode 100755
index 00000000..400cf2d5
--- /dev/null
+++ b/src/filters/rclcheckneedretry.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+# Check the modification dates of the common bin directories (listed
+# in the loop below) against the state recorded inside
+# ~/.config/Recoll.org/needidxretrydate
+#
+# If any argument is given, we record the new state instead of
+# generating it (this should be used at the end of an indexing pass
+# with retry set).
+#
+# The script exits with 0 if retrying should be performed (something
+# changed), 1 else.
+
+rfiledir=~/.config/Recoll.org
+rfile="$rfiledir/needidxretrydate"
+nrfile="$rfiledir/tneedidxretrydate"
+
+test -d "$rfiledir" || mkdir -p "$rfiledir"
+
+# If any argument is given, we are called just to record the new
+# state. We do not recompute it as it may have changed during
+# indexing, but just move the state in place. There is nothing to
+# move if no check was run before (e.g. retry forced with -k).
+if test $# != 0 ; then
+    test -f "$nrfile" && mv -f "$nrfile" "$rfile"
+    exit 0
+fi
+
+# Compute state of bin dirs and see if anything changed. The list is
+# inline and quoted so that a $HOME containing blanks is not split:
+> "$nrfile"
+for dir in /usr/bin /usr/local/bin "$HOME/bin" /opt/*/bin; do
+    ls -ld "$dir" >> "$nrfile" 2> /dev/null
+done
+
+if cmp -s "$rfile" "$nrfile" ; then
+    exit 1
+else
+    exit 0
+fi
+
diff --git a/src/index/checkretryfailed.cpp b/src/index/checkretryfailed.cpp
new file mode 100644
index 00000000..b818749c
--- /dev/null
+++ b/src/index/checkretryfailed.cpp
@@ -0,0 +1,54 @@
+/* Copyright (C) 2014 J.F.Dockes
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include "autoconfig.h"
+
+#include <string>
+#include <vector>
+
+#include "rclconfig.h"
+#include "execmd.h"
+#include "debuglog.h"
+#include "checkretryfailed.h"
+
+using namespace std;
+
+// Run the command named by the 'checkneedretryindexscript' config
+// variable. Returns true if it exits with 0. If 'record' is set, one
+// argument is passed so that the script records the state instead.
+bool checkRetryFailed(RclConfig *conf, bool record)
+{
+    string cmd;
+    if (!conf->getConfParam("checkneedretryindexscript", cmd) ||
+        cmd.empty()) {
+        LOGDEB(("checkRetryFailed: 'checkneedretryindexscript' "
+                "not set in config\n"));
+        // Nothing to execute: say no retry in this case.
+        return false;
+    }
+
+    // Look in the filters directory (ies). If not found execpath will
+    // be the same as cmd, and we'll let execvp do its thing.
+    string execpath = conf->findFilter(cmd);
+
+    // The default script treats any argument as a request to record.
+    vector<string> args;
+    if (record) {
+        args.push_back("1");
+    }
+    ExecCmd ecmd;
+    return ecmd.doexec(execpath, args) == 0;
+}
diff --git a/src/index/checkretryfailed.h b/src/index/checkretryfailed.h
new file mode 100644
index 00000000..964d37dc
--- /dev/null
+++ b/src/index/checkretryfailed.h
@@ -0,0 +1,33 @@
+#ifndef _CHECKRETRYFAILED_H_INCLUDED_
+#define _CHECKRETRYFAILED_H_INCLUDED_
+/* Copyright (C) 2015 J.F.Dockes
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+class RclConfig;
+
+/** Check if retrying failed files may be needed. We execute a
+    shell-script for this. The default one checks if any of the common
+    bin directories changed.
+
+    @param conf the config
+    @param record if true, record the state instead of testing
+
+    @return true if retrying should be performed
+*/
+bool checkRetryFailed(RclConfig *conf, bool record);
+
+#endif /* _CHECKRETRYFAILED_H_INCLUDED_ */
diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp
index 57a50ecc..294f3115 100644
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@@ -46,6 +46,7 @@ using namespace std;
#include "fsindexer.h"
#include "rclionice.h"
#include "execmd.h"
+#include "checkretryfailed.h"
// Command line options
static int op_flags;
@@ -418,9 +419,17 @@ int main(int argc, char **argv)
bool rezero((op_flags & OPT_z) != 0);
bool inPlaceReset((op_flags & OPT_Z) != 0);
- int indexerFlags = ConfIndexer::IxFNone;
- if (!(op_flags & OPT_k))
- indexerFlags |= ConfIndexer::IxFNoRetryFailed;
+
+ // We do not retry previously failed files by default. If -k is
+ // set, we do. If the checker script says so, we do too.
+ int indexerFlags = ConfIndexer::IxFNoRetryFailed;
+ if (op_flags & OPT_k) {
+ indexerFlags &= ~ConfIndexer::IxFNoRetryFailed;
+ } else {
+ if (checkRetryFailed(config, false)) {
+ indexerFlags &= ~ConfIndexer::IxFNoRetryFailed;
+ }
+ }
Pidfile pidfile(config->getPidfile());
updater = new MyUpdater(config);
@@ -538,7 +547,12 @@ int main(int argc, char **argv)
LOGERR(("recollindex, initial indexing pass failed, "
"not going into monitor mode\n"));
exit(1);
- }
+ } else {
+ // Record success of indexing pass with failed files retries.
+ if (!(indexerFlags & ConfIndexer::IxFNoRetryFailed)) {
+ checkRetryFailed(config, true);
+ }
+ }
deleteZ(confindexer);
o_reexec->insertArgs(vector(1, "-n"));
LOGINFO(("recollindex: reexecuting with -n after initial full pass\n"));
@@ -573,6 +587,11 @@ int main(int argc, char **argv)
makeIndexerOrExit(config, inPlaceReset);
bool status = confindexer->index(rezero, ConfIndexer::IxTAll,
indexerFlags);
+
+ // Record success of indexing pass with failed files retries.
+ if (status && !(indexerFlags & ConfIndexer::IxFNoRetryFailed)) {
+ checkRetryFailed(config, true);
+ }
if (!status)
cerr << "Indexing failed" << endl;
if (!confindexer->getReason().empty())
diff --git a/src/lib/mkMake.in b/src/lib/mkMake.in
index 468eda15..25479a36 100755
--- a/src/lib/mkMake.in
+++ b/src/lib/mkMake.in
@@ -14,6 +14,7 @@ ${depth}/common/textsplit.cpp \
${depth}/common/unacpp.cpp \
${depth}/index/beaglequeue.cpp \
${depth}/index/bglfetcher.cpp \
+${depth}/index/checkretryfailed.cpp \
${depth}/index/fetcher.cpp \
${depth}/index/fsfetcher.cpp \
${depth}/index/fsindexer.cpp \
diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in
index c5b718b0..a9390da0 100644
--- a/src/sampleconf/recoll.conf.in
+++ b/src/sampleconf/recoll.conf.in
@@ -340,6 +340,12 @@ snippetMaxPosWalk = 1000000
# undetected). Perform a full index reset after changing this.
testmodifusemtime = 0
+# Script used to heuristically check if we need to retry indexing files
+# which previously failed. The default script checks the modification dates
+# of the common bin directories. A relative path will be looked up in the
+# filters dirs, then in the path. Use an absolute path to do otherwise.
+checkneedretryindexscript = rclcheckneedretry.sh
+
# Disable extended attributes conversion to metadata fields. This probably
# needs to be set if testmodifusemtime is set.
noxattrfields = 0