implemented proper limitation and error reporting in case of truncation for term and query expansions
This commit is contained in:
parent
86515ce52a
commit
a4a7246a12
13 changed files with 217 additions and 105 deletions
|
@ -569,9 +569,9 @@ recoll
|
|||
|
||||
|
||||
<sect2 id="rcl.indexing.config.gui">
|
||||
<title>The indexing configuration GUI</title>
|
||||
<title>The index configuration GUI</title>
|
||||
|
||||
<para>Most parameters for a given indexing configuration can
|
||||
<para>Most parameters for a given index configuration can
|
||||
be set from a <command>recoll</command> GUI running on this
|
||||
configuration (either as default, or by setting
|
||||
<envar>RECOLL_CONFDIR</envar> or the <option>-c</option>
|
||||
|
@ -4219,6 +4219,24 @@ skippedPaths = ~/somedir/∗.txt
|
|||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><varname>maxTermExpand</varname></term>
|
||||
<listitem><para>Maximum expansion count for a single term (e.g.:
|
||||
when using wildcards). The default of 10000 is reasonable and
|
||||
will avoid queries that appear frozen while the engine is
|
||||
walking the term list.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><varname>maxXapianClauses</varname></term>
|
||||
<listitem><para>Maximum number of elementary clauses we can add
|
||||
to a single Xapian query. In some cases, the result of term
|
||||
expansion can be multiplicative, and we want to avoid using
|
||||
excessive memory. The default of 100 000 should be both
|
||||
high enough in most cases and compatible with current
|
||||
typical hardware configurations.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><varname>nonumbers</varname></term>
|
||||
<listitem><para>If this is set to true, no terms will be generated
|
||||
for numbers. For example "123", "1.5e6", 192.168.1.4, would not
|
||||
|
|
|
@ -195,6 +195,34 @@ ConfSearchPanelW::ConfSearchPanelW(QWidget *parent, ConfNull *config)
|
|||
));
|
||||
vboxLayout->addWidget(cp2);
|
||||
|
||||
ConfLink lnk3(new ConfLinkRclRep(config, "maxTermExpand"));
|
||||
ConfParamIntW* cp3 =
|
||||
new ConfParamIntW(this, lnk3,
|
||||
tr("Maximum term expansion count"),
|
||||
tr("<p>Maximum expansion count for a single term "
|
||||
"(e.g.: when using wildcards). The default "
|
||||
"of 10 000 is reasonable and will avoid "
|
||||
"queries that appear frozen while the engine is "
|
||||
"walking the term list."
|
||||
));
|
||||
vboxLayout->addWidget(cp3);
|
||||
|
||||
|
||||
ConfLink lnk4(new ConfLinkRclRep(config, "maxXapianClauses"));
|
||||
ConfParamIntW* cp4 =
|
||||
new ConfParamIntW(this, lnk4,
|
||||
tr("Maximum Xapian clauses count"),
|
||||
tr("<p>Maximum number of elementary clauses we "
|
||||
"add to a single Xapian query. In some cases, "
|
||||
"the result of term expansion can be "
|
||||
"multiplicative, and we want to avoid using "
|
||||
"excessive memory. The default of 100 000 "
|
||||
"should be both high enough in most cases "
|
||||
"and compatible with current typical hardware "
|
||||
"configurations."
|
||||
));
|
||||
vboxLayout->addWidget(cp4);
|
||||
|
||||
vboxLayout->insertStretch(-1);
|
||||
}
|
||||
|
||||
|
|
|
@ -138,7 +138,10 @@ class DocSequence {
|
|||
{
|
||||
return std::list<std::string>();
|
||||
}
|
||||
|
||||
virtual std::string getReason()
|
||||
{
|
||||
return m_reason;
|
||||
}
|
||||
/** Optional functionality. */
|
||||
virtual bool canFilter() {return false;}
|
||||
virtual bool canSort() {return false;}
|
||||
|
@ -154,6 +157,7 @@ class DocSequence {
|
|||
protected:
|
||||
static std::string o_sort_trans;
|
||||
static std::string o_filt_trans;
|
||||
std::string m_reason;
|
||||
private:
|
||||
std::string m_title;
|
||||
};
|
||||
|
@ -206,6 +210,12 @@ public:
|
|||
return false;
|
||||
return m_seq->getEnclosing(doc, pdoc);
|
||||
}
|
||||
virtual std::string getReason()
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return false;
|
||||
return m_seq->getReason();
|
||||
}
|
||||
virtual std::string title() {return m_seq->title();}
|
||||
virtual RefCntr<DocSequence> getSourceSeq() {return m_seq;}
|
||||
|
||||
|
|
|
@ -51,14 +51,16 @@ string DocSequenceDb::getDescription()
|
|||
|
||||
bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, string *sh)
|
||||
{
|
||||
setQuery();
|
||||
if (!setQuery())
|
||||
return false;
|
||||
if (sh) sh->erase();
|
||||
return m_q->getDoc(num, doc);
|
||||
}
|
||||
|
||||
int DocSequenceDb::getResCnt()
|
||||
{
|
||||
setQuery();
|
||||
if (!setQuery())
|
||||
return false;
|
||||
if (m_rescnt < 0) {
|
||||
m_rescnt= m_q->getResCnt();
|
||||
}
|
||||
|
@ -71,7 +73,8 @@ static const string cstr_mre("[...]");
|
|||
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<Rcl::Snippet>& vpabs)
|
||||
{
|
||||
LOGDEB(("DocSequenceDb::getAbstract/pair\n"));
|
||||
setQuery();
|
||||
if (!setQuery())
|
||||
return false;
|
||||
|
||||
// Have to put the limit somewhere.
|
||||
int maxoccs = 500;
|
||||
|
@ -93,7 +96,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<Rcl::Snippet>& vpabs)
|
|||
|
||||
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
|
||||
{
|
||||
setQuery();
|
||||
if (!setQuery())
|
||||
return false;
|
||||
if (m_q->whatDb() &&
|
||||
m_queryBuildAbstract && (doc.syntabs || m_queryReplaceAbstract)) {
|
||||
m_q->makeDocAbstract(doc, vabs);
|
||||
|
@ -105,7 +109,8 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
|
|||
|
||||
int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
|
||||
{
|
||||
setQuery();
|
||||
if (!setQuery())
|
||||
return false;
|
||||
if (m_q->whatDb()) {
|
||||
return m_q->getFirstMatchPage(doc, term);
|
||||
}
|
||||
|
@ -114,7 +119,8 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
|
|||
|
||||
bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
|
||||
{
|
||||
setQuery();
|
||||
if (!setQuery())
|
||||
return false;
|
||||
string udi;
|
||||
if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath,
|
||||
udi))
|
||||
|
@ -124,7 +130,8 @@ bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
|
|||
|
||||
// Return the expansion terms computed by m_q->expand() for doc, converted
// from vector to list. An empty list is returned when setQuery() fails.
list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
    // Only one, checked, call to setQuery(): an additional unchecked call
    // in front of it would just repeat the work (and the error log) when
    // the query setup fails.
    if (!setQuery())
        return list<string>();
    vector<string> v = m_q->expand(doc);
    return list<string>(v.begin(), v.end());
}
|
||||
|
@ -209,13 +216,10 @@ bool DocSequenceDb::setQuery()
|
|||
return true;
|
||||
m_rescnt = -1;
|
||||
m_needSetQuery = !m_q->setQuery(m_fsdata);
|
||||
|
||||
#if 0
|
||||
HighlightData hld;
|
||||
m_fsdata->getTerms(hld);
|
||||
string str;
|
||||
hld.toString(str);
|
||||
fprintf(stderr, "DocSequenceDb::setQuery: terms: %s\n", str.c_str());
|
||||
#endif
|
||||
if (m_needSetQuery) {
|
||||
m_reason = m_q->getReason();
|
||||
LOGERR(("DocSequenceDb::setQuery: rclquery::setQuery failed: %s\n",
|
||||
m_reason.c_str()));
|
||||
}
|
||||
return !m_needSetQuery;
|
||||
}
|
||||
|
|
|
@ -67,6 +67,7 @@ class DocSequenceDb : public DocSequence {
|
|||
bool m_isFiltered;
|
||||
bool m_isSorted;
|
||||
bool m_needSetQuery; // search data changed, need to reapply before fetch
|
||||
|
||||
bool setQuery();
|
||||
};
|
||||
|
||||
|
|
|
@ -319,7 +319,10 @@ int recollq(RclConfig **cfp, int argc, char **argv)
|
|||
query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true);
|
||||
}
|
||||
Chrono chron;
|
||||
query.setQuery(rq);
|
||||
if (!query.setQuery(rq)) {
|
||||
cerr << "Query setup failed: " << query.getReason() << endl;
|
||||
return(1);
|
||||
}
|
||||
int cnt = query.getResCnt();
|
||||
if (!(op_flags & OPT_b)) {
|
||||
cout << "Recoll query: " << rq->getDescription() << endl;
|
||||
|
|
|
@ -337,6 +337,11 @@ void ResListPager::displayPage(RclConfig *config)
|
|||
|
||||
if (pageEmpty()) {
|
||||
chunk << trans("<p><b>No results found</b><br>");
|
||||
string reason = m_docSource->getReason();
|
||||
if (!reason.empty()) {
|
||||
chunk << "<blockquote>" << escapeHtml(reason) <<
|
||||
"</blockquote></p>";
|
||||
} else {
|
||||
HighlightData hldata;
|
||||
m_docSource->getTerms(hldata);
|
||||
vector<string> uterms(hldata.uterms.begin(), hldata.uterms.end());
|
||||
|
@ -368,6 +373,7 @@ void ResListPager::displayPage(RclConfig *config)
|
|||
chunk << "</blockquote></p>";
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
unsigned int resCnt = m_docSource->getResCnt();
|
||||
if (m_winfirst + m_respage.size() < resCnt) {
|
||||
|
|
|
@ -1431,7 +1431,7 @@ bool Db::purgeFile(const string &udi, bool *existed)
|
|||
}
|
||||
|
||||
// File name wild card expansion. This is a specialisation of termMatch
|
||||
bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
|
||||
bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
|
||||
{
|
||||
string pattern = fnexp;
|
||||
names.clear();
|
||||
|
@ -1449,7 +1449,7 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
|
|||
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), pattern, result, -1,
|
||||
if (!termMatch(ET_WILD, string(), pattern, result, max,
|
||||
unsplitFilenameFieldName))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
|
@ -1459,7 +1459,7 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
|
|||
if (names.empty()) {
|
||||
// Build an impossible query: we know it's impossible because we
|
||||
// control the prefixes!
|
||||
names.push_back("XNONENoMatchingTerms");
|
||||
names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -315,7 +315,7 @@ class Db {
|
|||
bool maxYearSpan(int *minyear, int *maxyear);
|
||||
|
||||
/** Wildcard expansion specific to file names. Internal/sdata use only */
|
||||
bool filenameWildExp(const string& exp, vector<string>& names);
|
||||
bool filenameWildExp(const string& exp, vector<string>& names, int max);
|
||||
|
||||
/** Set parameters for synthetic abstract generation */
|
||||
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
|
||||
|
|
|
@ -193,8 +193,13 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
|
|||
m_nq->clear();
|
||||
m_sd = sdata;
|
||||
|
||||
int maxexp = 10000;
|
||||
m_db->getConf()->getConfParam("maxTermExpand", &maxexp);
|
||||
int maxcl = 100000;
|
||||
m_db->getConf()->getConfParam("maxXapianClauses", &maxcl);
|
||||
|
||||
Xapian::Query xq;
|
||||
if (!sdata->toNativeQuery(*m_db, &xq)) {
|
||||
if (!sdata->toNativeQuery(*m_db, &xq, maxexp, maxcl)) {
|
||||
m_reason += sdata->getReason();
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -201,14 +201,16 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
|
|||
|
||||
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
||||
vector<SearchDataClause*>& query,
|
||||
string& reason, void *d)
|
||||
string& reason, void *d,
|
||||
int maxexp, int maxcl)
|
||||
{
|
||||
Xapian::Query xq;
|
||||
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
|
||||
Xapian::Query nq;
|
||||
if (!(*it)->toNativeQuery(db, &nq)) {
|
||||
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n"));
|
||||
reason = (*it)->getReason();
|
||||
if (!(*it)->toNativeQuery(db, &nq, maxexp, maxcl)) {
|
||||
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
|
||||
(*it)->getReason().c_str()));
|
||||
reason += (*it)->getReason() + " ";
|
||||
return false;
|
||||
}
|
||||
if (nq.empty()) {
|
||||
|
@ -236,6 +238,13 @@ bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
|||
} else {
|
||||
xq = Xapian::Query(op, xq, nq);
|
||||
}
|
||||
if (int(xq.get_length()) >= maxcl) {
|
||||
LOGERR(("Maximum Xapian query size exceeded."
|
||||
" Maybe increase maxXapianClauses."));
|
||||
m_reason += "Maximum Xapian query size exceeded."
|
||||
" Maybe increase maxXapianClauses.";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (xq.empty())
|
||||
xq = Xapian::Query::MatchAll;
|
||||
|
@ -244,7 +253,7 @@ bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
|||
return true;
|
||||
}
|
||||
|
||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d, int maxexp, int maxcl)
|
||||
{
|
||||
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
|
||||
m_reason.erase();
|
||||
|
@ -252,8 +261,9 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
|||
// Walk the clause list translating each in turn and building the
|
||||
// Xapian query tree
|
||||
Xapian::Query xq;
|
||||
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
|
||||
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n"));
|
||||
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq, maxexp, maxcl)) {
|
||||
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
|
||||
m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -620,10 +630,10 @@ private:
|
|||
class StringToXapianQ {
|
||||
public:
|
||||
StringToXapianQ(Db& db, HighlightData& hld, const string& field,
|
||||
const string &stmlng, bool boostUser)
|
||||
const string &stmlng, bool boostUser, int maxexp, int maxcl)
|
||||
: m_db(db), m_field(field), m_stemlang(stmlng),
|
||||
m_doBoostUserTerms(boostUser), m_hld(hld), m_autodiacsens(false),
|
||||
m_autocasesens(true)
|
||||
m_autocasesens(true), m_maxexp(maxexp), m_maxcl(maxcl), m_curcl(0)
|
||||
{
|
||||
m_db.getConf()->getConfParam("autodiacsens", &m_autodiacsens);
|
||||
m_db.getConf()->getConfParam("autocasesens", &m_autocasesens);
|
||||
|
@ -635,15 +645,15 @@ public:
|
|||
vector<Xapian::Query> &pqueries,
|
||||
int slack = 0, bool useNear = false);
|
||||
private:
|
||||
void expandTerm(int mods,
|
||||
bool expandTerm(string& ermsg, int mods,
|
||||
const string& term, vector<string>& exp,
|
||||
string& sterm, const string& prefix);
|
||||
// After splitting entry on whitespace: process non-phrase element
|
||||
void processSimpleSpan(const string& span,
|
||||
void processSimpleSpan(string& ermsg, const string& span,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries);
|
||||
// Process phrase/near element
|
||||
void processPhraseOrNear(TextSplitQ *splitData,
|
||||
void processPhraseOrNear(string& ermsg, TextSplitQ *splitData,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries,
|
||||
bool useNear, int slack);
|
||||
|
@ -655,6 +665,9 @@ private:
|
|||
HighlightData& m_hld;
|
||||
bool m_autodiacsens;
|
||||
bool m_autocasesens;
|
||||
int m_maxexp;
|
||||
int m_maxcl;
|
||||
int m_curcl;
|
||||
};
|
||||
|
||||
#if 1
|
||||
|
@ -679,7 +692,7 @@ static void listVector(const string& what, const vector<string>&l)
|
|||
* has it already. Used in the simple case where there is nothing to expand,
|
||||
* and we just return the prefixed term (else Db::termMatch deals with it).
|
||||
*/
|
||||
void StringToXapianQ::expandTerm(int mods,
|
||||
bool StringToXapianQ::expandTerm(string& ermsg, int mods,
|
||||
const string& term,
|
||||
vector<string>& oexp, string &sterm,
|
||||
const string& prefix)
|
||||
|
@ -689,7 +702,7 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
sterm.clear();
|
||||
oexp.clear();
|
||||
if (term.empty())
|
||||
return;
|
||||
return true;
|
||||
|
||||
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
||||
|
||||
|
@ -753,7 +766,7 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
oexp.push_back(prefix + term);
|
||||
m_hld.terms[term] = m_hld.uterms.size() - 1;
|
||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Make objects before the goto jungle to avoid compiler complaints
|
||||
|
@ -770,7 +783,7 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
// expansion, which means that we are casediac-sensitive. There
|
||||
// would be nothing to prevent us to expand from the casediac
|
||||
// synonyms first. To be done later
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang,term,res,m_maxexp,m_field);
|
||||
goto termmatchtoresult;
|
||||
}
|
||||
|
||||
|
@ -778,14 +791,14 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, m_maxexp, m_field);
|
||||
|
||||
#else
|
||||
|
||||
if (o_index_stripchars) {
|
||||
// If the index is raw, we can only come here if nostemexp is unset
|
||||
// and we just need stem expansion.
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang,term,res,m_maxexp,m_field);
|
||||
goto termmatchtoresult;
|
||||
}
|
||||
|
||||
|
@ -854,12 +867,17 @@ exptotermatch:
|
|||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field);
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,m_maxexp,m_field);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Term match entries to vector of terms
|
||||
termmatchtoresult:
|
||||
if (int(res.entries.size()) >= m_maxexp) {
|
||||
ermsg = "Maximum term expansion size exceeded."
|
||||
" Maybe increase maxTermExpand.";
|
||||
return false;
|
||||
}
|
||||
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
oexp.push_back(it->term);
|
||||
|
@ -876,6 +894,7 @@ termmatchtoresult:
|
|||
m_hld.terms[strip_prefix(*it)] = term;
|
||||
}
|
||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
|
||||
|
@ -912,7 +931,7 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
|||
}
|
||||
}
|
||||
|
||||
void StringToXapianQ::processSimpleSpan(const string& span,
|
||||
void StringToXapianQ::processSimpleSpan(string& ermsg, const string& span,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries)
|
||||
{
|
||||
|
@ -927,7 +946,8 @@ void StringToXapianQ::processSimpleSpan(const string& span,
|
|||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
|
||||
expandTerm(mods, span, exp, sterm, prefix);
|
||||
if (!expandTerm(ermsg, mods, span, exp, sterm, prefix))
|
||||
return;
|
||||
|
||||
// Set up the highlight data. No prefix should go in there
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
|
@ -939,6 +959,7 @@ void StringToXapianQ::processSimpleSpan(const string& span,
|
|||
|
||||
// Push either term or OR of stem-expanded set
|
||||
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
||||
m_curcl += exp.size();
|
||||
|
||||
// If sterm (simplified original user term) is not null, give it a
|
||||
// relevance boost. We do this even if no expansion occurred (else
|
||||
|
@ -957,7 +978,7 @@ void StringToXapianQ::processSimpleSpan(const string& span,
|
|||
// NEAR xapian query, the elements of which can themselves be OR
|
||||
// queries if the terms get expanded by stemming or wildcards (we
|
||||
// don't do stemming for PHRASE though)
|
||||
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
||||
void StringToXapianQ::processPhraseOrNear(string& ermsg, TextSplitQ *splitData,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries,
|
||||
bool useNear, int slack)
|
||||
|
@ -999,7 +1020,8 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||
string sterm;
|
||||
vector<string> exp;
|
||||
expandTerm(lmods, *it, exp, sterm, prefix);
|
||||
if (!expandTerm(ermsg, lmods, *it, exp, sterm, prefix))
|
||||
return;
|
||||
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
||||
listVector("", exp);
|
||||
// groups is used for highlighting, we don't want prefixes in there.
|
||||
|
@ -1011,6 +1033,9 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||
groups.push_back(noprefs);
|
||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
exp.begin(), exp.end()));
|
||||
m_curcl += exp.size();
|
||||
if (m_curcl >= m_maxcl)
|
||||
return;
|
||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
||||
if (exp.size() > 1)
|
||||
hadmultiple = true;
|
||||
|
@ -1099,7 +1124,7 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
"slack %d near %d\n",
|
||||
iq.c_str(), m_field.c_str(), mods, slack, useNear));
|
||||
ermsg.erase();
|
||||
|
||||
m_curcl = 0;
|
||||
const StopList stops = m_db.getStopList();
|
||||
|
||||
// Simple whitespace-split input into user-level words and
|
||||
|
@ -1165,12 +1190,18 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
if (splitter.nostemexps.front())
|
||||
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||
m_hld.ugroups.push_back(vector<string>(1, *it));
|
||||
processSimpleSpan(splitter.terms.front(), lmods, pqueries);
|
||||
processSimpleSpan(ermsg,splitter.terms.front(),lmods, pqueries);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
m_hld.ugroups.push_back(vector<string>(1, *it));
|
||||
processPhraseOrNear(&splitter, mods, pqueries, useNear, slack);
|
||||
processPhraseOrNear(ermsg, &splitter, mods, pqueries,
|
||||
useNear, slack);
|
||||
}
|
||||
if (m_curcl >= m_maxcl) {
|
||||
ermsg = "Maximum Xapian query size exceeded."
|
||||
" Maybe increase maxXapianClauses.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
|
@ -1190,7 +1221,8 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
}
|
||||
|
||||
// Translate a simple OR, AND, or EXCL search clause.
|
||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
int maxexp, int maxcl)
|
||||
{
|
||||
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
|
||||
getStemLang().c_str()));
|
||||
|
@ -1216,7 +1248,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
|||
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
||||
(m_parentSearch == 0 && !m_haveWildCards);
|
||||
|
||||
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
|
||||
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm,
|
||||
maxexp, maxcl);
|
||||
if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
|
@ -1240,13 +1273,14 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
|||
// about expanding multiple fragments in the past. We just take the
|
||||
// value blanks and all and expand this against the indexed unsplit
|
||||
// file names
|
||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
|
||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||
int maxexp, int)
|
||||
{
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
||||
vector<string> names;
|
||||
db.filenameWildExp(m_text, names);
|
||||
db.filenameWildExp(m_text, names, maxexp);
|
||||
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
||||
|
||||
if (m_weight != 1.0) {
|
||||
|
@ -1256,7 +1290,8 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
|
|||
}
|
||||
|
||||
// Translate NEAR or PHRASE clause.
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
int maxexp, int maxcl)
|
||||
{
|
||||
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
||||
|
||||
|
@ -1281,7 +1316,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
|||
}
|
||||
string s = cstr_dquote + m_text + cstr_dquote;
|
||||
bool useNear = (m_tp == SCLT_NEAR);
|
||||
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
|
||||
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm,
|
||||
maxexp, maxcl);
|
||||
if (!tr.processUserString(s, getModifiers(), m_reason, pqueries,
|
||||
m_slack, useNear))
|
||||
return false;
|
||||
|
|
|
@ -89,8 +89,7 @@ public:
|
|||
bool haveWildCards() {return m_haveWildCards;}
|
||||
|
||||
/** Translate to Xapian query. rcldb knows about the void* */
|
||||
bool toNativeQuery(Rcl::Db &db, void *);
|
||||
|
||||
bool toNativeQuery(Rcl::Db &db, void *, int maxexp, int maxcl);
|
||||
|
||||
/** We become the owner of cl and will delete it */
|
||||
bool addClause(SearchDataClause *cl);
|
||||
|
@ -175,7 +174,7 @@ private:
|
|||
bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps);
|
||||
bool clausesToQuery(Rcl::Db &db, SClType tp,
|
||||
std::vector<SearchDataClause*>& query,
|
||||
string& reason, void *d);
|
||||
string& reason, void *d, int, int);
|
||||
|
||||
/* Copyconst and assignment private and forbidden */
|
||||
SearchData(const SearchData &) {}
|
||||
|
@ -192,7 +191,7 @@ public:
|
|||
m_modifiers(SDCM_NONE), m_weight(1.0)
|
||||
{}
|
||||
virtual ~SearchDataClause() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, int maxexp, int maxcl) = 0;
|
||||
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
|
||||
virtual std::string getReason() const {return m_reason;}
|
||||
virtual void getTerms(HighlightData & hldata) const = 0;
|
||||
|
@ -266,7 +265,7 @@ public:
|
|||
}
|
||||
|
||||
/** Translate to Xapian query */
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl);
|
||||
|
||||
virtual void getTerms(HighlightData& hldata) const
|
||||
{
|
||||
|
@ -307,7 +306,7 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl);
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -326,7 +325,7 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *, int maxexp, int maxcl);
|
||||
private:
|
||||
int m_slack;
|
||||
};
|
||||
|
@ -338,9 +337,12 @@ public:
|
|||
: SearchDataClause(tp), m_sub(sub)
|
||||
{
|
||||
}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *p)
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *p, int maxexp, int maxcl)
|
||||
{
|
||||
return m_sub->toNativeQuery(db, p);
|
||||
bool ret = m_sub->toNativeQuery(db, p, maxexp, maxcl);
|
||||
if (!ret)
|
||||
m_reason = m_sub->getReason();
|
||||
return ret;
|
||||
}
|
||||
|
||||
virtual void getTerms(HighlightData& hldata) const
|
||||
|
|
|
@ -103,6 +103,17 @@ indexstemminglanguages = english
|
|||
# Actually, this seems a reasonable default for all until someone protests.
|
||||
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
|
||||
|
||||
# Maximum expansion count for a single term (e.g.: when using wildcards).
|
||||
# We used to not limit this at all (except for filenames where the limit
|
||||
# was too low at 1000), but it is unreasonable with a big index.
|
||||
# Default 10 000
|
||||
maxTermExpand = 10000
|
||||
|
||||
# Maximum number of clauses we add to a single Xapian query. In some cases,
|
||||
# the result of term expansion can be multiplicative, and we want to avoid
|
||||
# eating all the memory. Default 100 000
|
||||
maxXapianClauses = 100000
|
||||
|
||||
# Where to store the database (directory). This may be an absolute path,
|
||||
# else it is taken as relative to the configuration directory (-c argument
|
||||
# or $RECOLL_CONFDIR).
|
||||
|
@ -132,18 +143,6 @@ filtersdir = @prefix@/share/recoll/filters
|
|||
# want to change the icons displayed in the result list
|
||||
iconsdir = @prefix@/share/recoll/images
|
||||
|
||||
# A list of characters, encoded in UTF-8, which should be handled specially
|
||||
# when converting text to unaccented lowercase. For example, in Swedish,
|
||||
# the letter a with diaeresis has full alphabet citizenship and should not
|
||||
# be turned into an a. Each element in the space-separated list has the
|
||||
# special character as first element and the translation following
|
||||
# (multiple chars allowed). The handling of both the lowercase and
|
||||
# upper-case versions of a character should be specified, as membership
|
||||
# in the list will turn off both standard accent and case
|
||||
# processing. ** Changing the list implies a full reindex **
|
||||
# Example for Swedish:
|
||||
# unac_except_trans = åå Åå ää Ää öö Öö
|
||||
|
||||
# Should we use the system's 'file -i' command as a final step in file type
|
||||
# identification ? This may be useful, but will usually cause the
|
||||
# indexation of many bogus 'text' files
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue