Let recollindex execute a script at startup to try and guess if it should retry failed files
This commit is contained in:
parent
7161f3a396
commit
d7dea4ae3d
8 changed files with 209 additions and 13 deletions
|
@ -125,16 +125,18 @@ is given, the database will be erased before starting. If option
|
|||
is given, the database will not be reset, but all files will be considered
|
||||
as needing reindexing (in place reset).
|
||||
.PP
|
||||
By default,
|
||||
As of version 1.21,
|
||||
.B recollindex
|
||||
does not process again files which previously failed to index (for example
|
||||
because of a missing helper program). This behaviour is new in version
|
||||
1.21, error files were always retried in previous versions.
|
||||
If option
|
||||
usually does not process again files which previously failed to index (for
|
||||
example because of a missing helper program). If option
|
||||
.B \-k
|
||||
is given,
|
||||
.B recollindex
|
||||
will try again to process all failed files.
|
||||
will try again to process all failed files. Please note that
|
||||
.B recollindex
|
||||
may also decide to retry failed files if the auxiliary checking script
|
||||
defined by the "checkneedretryindexscript" configuration variable indicates
|
||||
that this should happen.
|
||||
.PP
|
||||
If option
|
||||
.B
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
|
||||
<para>This document introduces full text search notions
|
||||
and describes the installation and use of the &RCL;
|
||||
application. It currently describes &RCL; &RCLVERSION;.</para>
|
||||
application. This version describes &RCL; &RCLVERSION;.</para>
|
||||
</abstract>
|
||||
|
||||
|
||||
|
@ -448,9 +448,36 @@ indexedmimetypes = application/pdf
|
|||
index configuration tool.</para>
|
||||
|
||||
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Indexing failures</title>
|
||||
|
||||
<para>Indexing may fail for some documents, for a number of
|
||||
reasons: a helper program may be missing, the document may be
|
||||
corrupt, we may fail to uncompress a file because no file
|
||||
system space is available, etc.</para>
|
||||
|
||||
<para>&RCL; versions prior to 1.21 always tried again to index
|
||||
files which had previously caused an error. This guaranteed
|
||||
that anything that may have become indexable (for example
|
||||
because a helper had been installed) would be indexed. However
|
||||
this was bad for performance because some indexing failures
|
||||
may be quite costly (for example failing to uncompress a big
|
||||
file because of insufficient disk space).</para>
|
||||
|
||||
<para>The indexer in &RCL; versions 1.21 and later does not
|
||||
retry failed files by default. Retrying will only occur if an
|
||||
explicit option (<option>-k</option>) is set on the
|
||||
<command>recollindex</command> command line, or if a script
|
||||
executed when <command>recollindex</command> starts up says
|
||||
so. The script is defined by a configuration variable
|
||||
(<literal>checkneedretryindexscript</literal>), and makes a
|
||||
rather lame attempt at deciding if a helper command may have
|
||||
been installed, by checking if any of the common
|
||||
<filename>bin</filename> directories have changed.</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Recovery</title>
|
||||
|
@ -5862,7 +5889,7 @@ thrQSizes = -1 -1 -1
|
|||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry><term><varname>autodiacsens</varname></term>
|
||||
<varlistentry><term><varname>autodiacsens</varname></term>
|
||||
<listitem><para>If the index is not stripped, decide if we
|
||||
automatically trigger diacritics sensitivity if the search
|
||||
term has accented characters (not in
|
||||
|
@ -5900,6 +5927,19 @@ thrQSizes = -1 -1 -1
|
|||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><varname>checkneedretryindexscript</varname></term>
|
||||
<listitem><para>This defines the name for a command
|
||||
executed by <command>recollindex</command> when starting
|
||||
indexing. If the exit status of the command is 0,
|
||||
<command>recollindex</command> retries to index all files
|
||||
which previously could not be indexed because of data
|
||||
extraction errors. The default value is a script which
|
||||
checks if any of the common <filename>bin</filename>
|
||||
directories have changed (indicating that a helper program
|
||||
may have been installed).</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><varname>mondelaypatterns</varname></term>
|
||||
<listitem><para>This allows specifying wildcard path patterns
|
||||
(processed with fnmatch(3) with 0 flag), to match files which
|
||||
|
|
41
src/filters/rclcheckneedretry.sh
Executable file
41
src/filters/rclcheckneedretry.sh
Executable file
|
@ -0,0 +1,41 @@
|
|||
#!/bin/sh

# Decide if recollindex should retry indexing the files which previously
# failed. We compare the current "ls -ld" listing of the common bin
# directories (/usr/bin, /usr/local/bin, $HOME/bin, /opt/*/bin) against
# the state recorded inside ~/.config/Recoll.org/needidxretrydate
#
# If any argument is given, we record the new state instead of
# generating it (this should be used at the end of an indexing pass
# with retry set).
#
# The script exits with 0 if retrying should be performed (something
# changed), 1 else.

# Recorded state (rfile) and freshly computed state (nrfile). All uses
# below are quoted so that a home directory containing white space works.
rfiledir=~/.config/Recoll.org
rfile=$rfiledir/needidxretrydate
nrfile=$rfiledir/tneedidxretrydate

test -d "$rfiledir" || mkdir -p "$rfiledir"

# If any argument is given, we are called just to record the new
# state. We do not recompute it as it may have changed during
# indexing, but just move the state in place
if test $# != 0 ; then
    # The new state file is only created by a previous check run. If
    # there was none, there is nothing to record and no reason to let
    # mv complain.
    if test -f "$nrfile" ; then
        mv -f "$nrfile" "$rfile"
    fi
    exit 0
fi

# Compute state of bin dirs and see if anything changed. The list is
# written directly in the loop so that each entry stays a single word,
# while the /opt/*/bin pattern is still expanded by the shell.
# Directories which do not exist are silently skipped.
> "$nrfile"
for dir in /usr/bin /usr/local/bin "$HOME/bin" /opt/*/bin ; do
    ls -ld "$dir" >> "$nrfile" 2> /dev/null
done

# cmp also fails when no state was recorded yet, which requests a retry.
if cmp -s "$rfile" "$nrfile" ; then
    exit 1
else
    exit 0
fi
|
||||
|
54
src/index/checkretryfailed.cpp
Normal file
54
src/index/checkretryfailed.cpp
Normal file
|
@ -0,0 +1,54 @@
|
|||
/* Copyright (C) 2014 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "execmd.h"
|
||||
#include "debuglog.h"
|
||||
#include "checkretryfailed.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
bool checkRetryFailed(RclConfig *conf, bool record)
{
    // The name of the checker command comes from the configuration.
    // Without it there is nothing to run: answer "no retry".
    std::string scriptname;
    if (!conf->getConfParam("checkneedretryindexscript", scriptname)) {
        LOGDEB(("checkRetryFailed: 'checkneedretryindexscript' "
                "not set in config\n"));
        return false;
    }

    // Try the filters directories first. When the command is not found
    // there, the name comes back unchanged and locating the program is
    // left to the exec call.
    const std::string execpath = conf->findFilter(scriptname);

    // Recording is requested by passing an argument to the command.
    // The default script only tests for the presence of an argument.
    std::vector<std::string> scriptargs;
    if (record) {
        scriptargs.push_back("1");
    }

    // A zero exit status is the only "yes" answer.
    ExecCmd runner;
    return runner.doexec(execpath, scriptargs) == 0;
}
|
33
src/index/checkretryfailed.h
Normal file
33
src/index/checkretryfailed.h
Normal file
|
@ -0,0 +1,33 @@
|
|||
#ifndef _CHECKRETRYFAILED_H_INCLUDED_
#define _CHECKRETRYFAILED_H_INCLUDED_
/* Copyright (C) 2015 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

// Only used through a pointer here: a forward declaration is enough.
// It is kept above the documentation block so that the latter is
// attached to the function and not to this declaration.
class RclConfig;

/** Check if retrying failed files may be needed. We execute a
    shell-script for this. The default one checks if any of the common
    bin directories changed.

    The command to run is named by the 'checkneedretryindexscript'
    configuration variable.

    @param conf the config
    @param record if true, record the state instead of testing (the
       command is then called with an argument)

    @return true if the command exited with a 0 status, which, when
       testing, means that retrying should be performed. false if the
       command returned any other status or if
       'checkneedretryindexscript' is not set in the configuration.
*/
bool checkRetryFailed(RclConfig *conf, bool record);

#endif /* _CHECKRETRYFAILED_H_INCLUDED_ */
|
|
@ -46,6 +46,7 @@ using namespace std;
|
|||
#include "fsindexer.h"
|
||||
#include "rclionice.h"
|
||||
#include "execmd.h"
|
||||
#include "checkretryfailed.h"
|
||||
|
||||
// Command line options
|
||||
static int op_flags;
|
||||
|
@ -418,9 +419,17 @@ int main(int argc, char **argv)
|
|||
|
||||
bool rezero((op_flags & OPT_z) != 0);
|
||||
bool inPlaceReset((op_flags & OPT_Z) != 0);
|
||||
int indexerFlags = ConfIndexer::IxFNone;
|
||||
if (!(op_flags & OPT_k))
|
||||
indexerFlags |= ConfIndexer::IxFNoRetryFailed;
|
||||
|
||||
// We do not retry previously failed files by default. If -k is
|
||||
// set, we do. If the checker script says so, we do too.
|
||||
int indexerFlags = ConfIndexer::IxFNoRetryFailed;
|
||||
if (op_flags & OPT_k) {
|
||||
indexerFlags &= ~ConfIndexer::IxFNoRetryFailed;
|
||||
} else {
|
||||
if (checkRetryFailed(config, false)) {
|
||||
indexerFlags &= ~ConfIndexer::IxFNoRetryFailed;
|
||||
}
|
||||
}
|
||||
|
||||
Pidfile pidfile(config->getPidfile());
|
||||
updater = new MyUpdater(config);
|
||||
|
@ -538,7 +547,12 @@ int main(int argc, char **argv)
|
|||
LOGERR(("recollindex, initial indexing pass failed, "
|
||||
"not going into monitor mode\n"));
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
// Record success of indexing pass with failed files retries.
|
||||
if (!(indexerFlags & ConfIndexer::IxFNoRetryFailed)) {
|
||||
checkRetryFailed(config, true);
|
||||
}
|
||||
}
|
||||
deleteZ(confindexer);
|
||||
o_reexec->insertArgs(vector<string>(1, "-n"));
|
||||
LOGINFO(("recollindex: reexecuting with -n after initial full pass\n"));
|
||||
|
@ -573,6 +587,11 @@ int main(int argc, char **argv)
|
|||
makeIndexerOrExit(config, inPlaceReset);
|
||||
bool status = confindexer->index(rezero, ConfIndexer::IxTAll,
|
||||
indexerFlags);
|
||||
|
||||
// Record success of indexing pass with failed files retries.
|
||||
if (status && !(indexerFlags & ConfIndexer::IxFNoRetryFailed)) {
|
||||
checkRetryFailed(config, true);
|
||||
}
|
||||
if (!status)
|
||||
cerr << "Indexing failed" << endl;
|
||||
if (!confindexer->getReason().empty())
|
||||
|
|
|
@ -14,6 +14,7 @@ ${depth}/common/textsplit.cpp \
|
|||
${depth}/common/unacpp.cpp \
|
||||
${depth}/index/beaglequeue.cpp \
|
||||
${depth}/index/bglfetcher.cpp \
|
||||
${depth}/index/checkretryfailed.cpp \
|
||||
${depth}/index/fetcher.cpp \
|
||||
${depth}/index/fsfetcher.cpp \
|
||||
${depth}/index/fsindexer.cpp \
|
||||
|
|
|
@ -340,6 +340,12 @@ snippetMaxPosWalk = 1000000
|
|||
# undetected). Perform a full index reset after changing this.
|
||||
testmodifusemtime = 0
|
||||
|
||||
# Script used to heuristically check if we need to retry indexing files
|
||||
# which previously failed. The default script checks the modified dates on
|
||||
# /usr/bin, /usr/local/bin, $HOME/bin and /opt/*/bin. A relative path will be looked up in the
|
||||
# filters dirs, then in the path. Use an absolute path to do otherwise.
|
||||
checkneedretryindexscript = rclcheckneedretry.sh
|
||||
|
||||
# Disable extended attributes conversion to metadata fields. This probably
|
||||
# needs to be set if testmodifusemtime is set.
|
||||
noxattrfields = 0
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue