diff --git a/src/doc/man/recollindex.1 b/src/doc/man/recollindex.1 index 761c94a1..c8147d01 100644 --- a/src/doc/man/recollindex.1 +++ b/src/doc/man/recollindex.1 @@ -125,16 +125,18 @@ is given, the database will be erased before starting. If option is given, the database will not be reset, but all files will be considered as needing reindexing (in place reset). .PP -By default, +As of version 1.21, .B recollindex -does not process again files which previously failed to index (for example -because of a missing helper program). This behaviour is new in version -1.21, error files were always retried in previous versions. -If option +usually does not process again files which previously failed to index (for +example because of a missing helper program). If option .B \-k is given, .B recollindex -will try again to process all failed files. +will try again to process all failed files. Please note that +.B recollindex +may also decide to retry failed files if the auxiliary checking script +defined by the "checkneedretryindexscript" configuration variable indicates +that this should happen. .PP If option .B diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index db9a06f1..8bf45096 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -38,7 +38,7 @@ This document introduces full text search notions and describes the installation and use of the &RCL; - application. It currently describes &RCL; &RCLVERSION;. + application. This version describes &RCL; &RCLVERSION;. @@ -448,9 +448,36 @@ indexedmimetypes = application/pdf index configuration tool. - + + Indexing failures + + Indexing may fail for some documents, for a number of + reasons: a helper program may be missing, the document may be + corrupt, we may fail to uncompress a file because no file + system space is available, etc. + + &RCL; versions prior to 1.21 always retried to index + files which had previously caused an error. 
This guaranteed + that anything that may have become indexable (for example + because a helper had been installed) would be indexed. However + this was bad for performance because some indexing failures + may be quite costly (for example failing to uncompress a big + file because of insufficient disk space). + + The indexer in &RCL; versions 1.21 and later does not + retry failed files by default. Retrying will only occur if an + explicit option (-k) is set on the + recollindex command line, or if a script + executed when recollindex starts up says + so. The script is defined by a configuration variable + (checkneedretryindexscript), and makes a + rather lame attempt at deciding if a helper command may have + been installed, by checking if any of the common + bin directories have changed. + + Recovery @@ -5862,7 +5889,7 @@ thrQSizes = -1 -1 -1 - autodiacsens + autodiacsens IF the index is not stripped, decide if we automatically trigger diacritics sensitivity if the search term has accented characters (not in @@ -5900,6 +5927,19 @@ thrQSizes = -1 -1 -1 + checkneedretryindexscript + This defines the name for a command + executed by recollindex when starting + indexing. If the exit status of the command is 0, + recollindex retries to index all files + which previously could not be indexed because of data + extraction errors. The default value is a script which + checks if any of the common bin + directories have changed (indicating that a helper program + may have been installed). 
+ + + mondelaypatterns This allows specify wildcard path patterns (processed with fnmatch(3) with 0 flag), to match files which
diff --git a/src/filters/rclcheckneedretry.sh b/src/filters/rclcheckneedretry.sh
new file mode 100755
index 00000000..400cf2d5
--- /dev/null
+++ b/src/filters/rclcheckneedretry.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+# Compare the state of the common bin directories (see $bindirs below)
+# against the state recorded in ~/.config/Recoll.org/needidxretrydate.
+# The state is the "ls -ld" output for each directory, so it changes
+# when a directory's modification date does (for example because a
+# helper program was installed).
+#
+# If any argument is given, we record the new state instead of
+# generating it (this should be used at the end of an indexing pass
+# with retry set).
+#
+# The script exits with 0 if retrying should be performed (something
+# changed), 1 else.
+
+# Bin dirs to be tested:
+bindirs="/usr/bin /usr/local/bin $HOME/bin /opt/*/bin"
+
+rfiledir=~/.config/Recoll.org
+rfile="$rfiledir/needidxretrydate"
+nrfile="$rfiledir/tneedidxretrydate"
+
+test -d "$rfiledir" || mkdir -p "$rfiledir"
+
+# Write the current state of the bin dirs to the temporary state file.
+compute_state()
+{
+    : > "$nrfile"
+    # $bindirs is left unquoted on purpose: it is a space-separated
+    # list and /opt/*/bin must be expanded by the shell.
+    for dir in $bindirs; do
+        ls -ld "$dir" >> "$nrfile" 2> /dev/null
+    done
+}
+
+# If any argument is given, we are called just to record the new
+# state. We do not recompute it as it may have changed during
+# indexing, but just move the state in place. The only exception is
+# when no check pass left a temporary state behind (there would be
+# nothing to move): we then have to generate it now.
+if test $# != 0 ; then
+    test -f "$nrfile" || compute_state
+    mv -f "$nrfile" "$rfile"
+    exit 0
+fi
+
+# Compute state of bin dirs and see if anything changed:
+compute_state
+
+if cmp -s "$rfile" "$nrfile" ; then
+    exit 1
+else
+    exit 0
+fi
+
diff --git a/src/index/checkretryfailed.cpp b/src/index/checkretryfailed.cpp
new file mode 100644
index 00000000..b818749c
--- /dev/null
+++ b/src/index/checkretryfailed.cpp
@@ -0,0 +1,54 @@
+/* Copyright (C) 2014 J.F.Dockes
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include "autoconfig.h"
+
+#include <string>
+#include <vector>
+
+#include "rclconfig.h"
+#include "execmd.h"
+#include "debuglog.h"
+#include "checkretryfailed.h"
+
+using namespace std;
+
+// Run the "checkneedretryindexscript" command. With record set, the
+// command gets a single "1" argument. True means it exited with 0.
+bool checkRetryFailed(RclConfig *conf, bool record)
+{
+    string script;
+    if (!conf->getConfParam("checkneedretryindexscript", script)) {
+        LOGDEB(("checkRetryFailed: 'checkneedretryindexscript' "
+                "not set in config\n"));
+        // No checker configured: do not retry.
+        return false;
+    }
+
+    // findFilter() looks in the filters directory (ies). If the
+    // command is not found there, it is expected to hand back the
+    // name unchanged, leaving the PATH lookup to execvp.
+    const string execpath = conf->findFilter(script);
+
+    vector<string> args;
+    if (record) {
+        // The default script treats any argument as "record state".
+        args.push_back("1");
+    }
+
+    ExecCmd ecmd;
+    return ecmd.doexec(execpath, args) == 0;
+}
diff --git a/src/index/checkretryfailed.h b/src/index/checkretryfailed.h
new file mode 100644
index 00000000..964d37dc
--- /dev/null
+++ b/src/index/checkretryfailed.h
@@ -0,0 +1,33 @@
+#ifndef _CHECKRETRYFAILED_H_INCLUDED_
+#define _CHECKRETRYFAILED_H_INCLUDED_
+/* Copyright (C) 2015 J.F.Dockes
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/** Check if retrying failed files may be needed. We execute a + shell-script for this. The default one checks if any of the common + bin directories changed. + + @param conf the config + @param record if true, record the state instead of testing + + @return true if retrying should be performed +*/ +class RclConfig; + +bool checkRetryFailed(RclConfig *conf, bool record); + +#endif /* _CHECKRETRYFAILED_H_INCLUDED_ */ diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index 57a50ecc..294f3115 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -46,6 +46,7 @@ using namespace std; #include "fsindexer.h" #include "rclionice.h" #include "execmd.h" +#include "checkretryfailed.h" // Command line options static int op_flags; @@ -418,9 +419,17 @@ int main(int argc, char **argv) bool rezero((op_flags & OPT_z) != 0); bool inPlaceReset((op_flags & OPT_Z) != 0); - int indexerFlags = ConfIndexer::IxFNone; - if (!(op_flags & OPT_k)) - indexerFlags |= ConfIndexer::IxFNoRetryFailed; + + // We do not retry previously failed files by default. If -k is + // set, we do. If the checker script says so, we do too. 
+ int indexerFlags = ConfIndexer::IxFNoRetryFailed; + if (op_flags & OPT_k) { + indexerFlags &= ~ConfIndexer::IxFNoRetryFailed; + } else { + if (checkRetryFailed(config, false)) { + indexerFlags &= ~ConfIndexer::IxFNoRetryFailed; + } + } Pidfile pidfile(config->getPidfile()); updater = new MyUpdater(config); @@ -538,7 +547,12 @@ int main(int argc, char **argv) LOGERR(("recollindex, initial indexing pass failed, " "not going into monitor mode\n")); exit(1); - } + } else { + // Record success of indexing pass with failed files retries. + if (!(indexerFlags & ConfIndexer::IxFNoRetryFailed)) { + checkRetryFailed(config, true); + } + } deleteZ(confindexer); o_reexec->insertArgs(vector(1, "-n")); LOGINFO(("recollindex: reexecuting with -n after initial full pass\n")); @@ -573,6 +587,11 @@ int main(int argc, char **argv) makeIndexerOrExit(config, inPlaceReset); bool status = confindexer->index(rezero, ConfIndexer::IxTAll, indexerFlags); + + // Record success of indexing pass with failed files retries. + if (status && !(indexerFlags & ConfIndexer::IxFNoRetryFailed)) { + checkRetryFailed(config, true); + } if (!status) cerr << "Indexing failed" << endl; if (!confindexer->getReason().empty()) diff --git a/src/lib/mkMake.in b/src/lib/mkMake.in index 468eda15..25479a36 100755 --- a/src/lib/mkMake.in +++ b/src/lib/mkMake.in @@ -14,6 +14,7 @@ ${depth}/common/textsplit.cpp \ ${depth}/common/unacpp.cpp \ ${depth}/index/beaglequeue.cpp \ ${depth}/index/bglfetcher.cpp \ +${depth}/index/checkretryfailed.cpp \ ${depth}/index/fetcher.cpp \ ${depth}/index/fsfetcher.cpp \ ${depth}/index/fsindexer.cpp \ diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index c5b718b0..a9390da0 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -340,6 +340,12 @@ snippetMaxPosWalk = 1000000 # undetected). Perform a full index reset after changing this. 
testmodifusemtime = 0 +# Script used to heuristically check if we need to retry indexing files +# which previously failed. The default script checks the modified dates on +# /usr/bin and /usr/local/bin. A relative path will be looked up in the +# filters dirs, then in the path. Use an absolute path to do otherwise. +checkneedretryindexscript = rclcheckneedretry.sh + # Disable extended attributes conversion to metadata fields. This probably # needs to be set if testmodifusemtime is set. noxattrfields = 0