Prevent highlighting of bogus terms in results (exclude path elements, negative queries and internal terms)

This commit is contained in:
Jean-Francois Dockes 2015-08-08 21:56:45 +02:00
parent fe6174652b
commit e37284f05f
8 changed files with 58 additions and 46 deletions

View file

@ -867,6 +867,8 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
ft.boost = atof(tval.c_str()); ft.boost = atof(tval.c_str());
if (attrs.get("pfxonly", tval)) if (attrs.get("pfxonly", tval))
ft.pfxonly = stringToBool(tval); ft.pfxonly = stringToBool(tval);
if (attrs.get("noterms", tval))
ft.noterms = stringToBool(tval);
m_fldtotraits[stringtolower(*it)] = ft; m_fldtotraits[stringtolower(*it)] = ft;
LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n",
it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost)); it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost));

View file

@ -66,9 +66,9 @@ struct FieldTraits {
int wdfinc; // Index time term frequency increment (default 1) int wdfinc; // Index time term frequency increment (default 1)
double boost; // Query time boost (default 1.0) double boost; // Query time boost (default 1.0)
bool pfxonly; // Suppress prefix-less indexing bool pfxonly; // Suppress prefix-less indexing
bool noterms; // Don't add term to highlight data (e.g.: rclbes)
FieldTraits() FieldTraits()
: wdfinc(1), boost(1.0), pfxonly(false) : wdfinc(1), boost(1.0), pfxonly(false), noterms(false)
{} {}
}; };

View file

@ -71,8 +71,8 @@ static const string xapday_prefix = "D";
static const string xapmonth_prefix = "M"; static const string xapmonth_prefix = "M";
static const string xapyear_prefix = "Y"; static const string xapyear_prefix = "Y";
const string pathelt_prefix = "XP"; const string pathelt_prefix = "XP";
const string udi_prefix("Q"); static const string udi_prefix("Q");
const string parent_prefix("F"); static const string parent_prefix("F");
// Special terms to mark begin/end of field (for anchored searches), and // Special terms to mark begin/end of field (for anchored searches), and
// page breaks // page breaks

View file

@ -533,8 +533,6 @@ private:
string version_string(); string version_string();
extern const string pathelt_prefix; extern const string pathelt_prefix;
extern const string udi_prefix;
extern const string parent_prefix;
extern const string mimetype_prefix; extern const string mimetype_prefix;
extern const string unsplitFilenameFieldName; extern const string unsplitFilenameFieldName;
extern string start_of_field_term; extern string start_of_field_term;

View file

@ -181,7 +181,8 @@ bool SearchData::addClause(SearchDataClause* cl)
return true; return true;
} }
// Am I a file name only search ? This is to turn off term highlighting // Am I a file name only search ? This is to turn off term highlighting.
// There can't be a subclause in a filename search: no possible need to recurse
bool SearchData::fileNameOnly() bool SearchData::fileNameOnly()
{ {
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
@ -190,6 +191,7 @@ bool SearchData::fileNameOnly()
return true; return true;
} }
// The query language creates a lot of subqueries. See if we can merge them.
void SearchData::simplify() void SearchData::simplify()
{ {
for (unsigned int i = 0; i < m_query.size(); i++) { for (unsigned int i = 0; i < m_query.size(); i++) {
@ -249,30 +251,35 @@ void SearchData::simplify()
} }
} }
bool SearchData::singleSimple() // Extract terms and groups for highlighting
{
if (m_query.size() != 1 || !m_filetypes.empty() || !m_nfiletypes.empty() ||
m_haveDates || m_maxSize != size_t(-1) || m_minSize != size_t(-1) ||
m_haveWildCards)
return false;
SearchDataClause *clp = *m_query.begin();
if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR) {
return false;
}
return true;
}
// Extract all term data
void SearchData::getTerms(HighlightData &hld) const void SearchData::getTerms(HighlightData &hld) const
{ {
for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) {
(*it)->getTerms(hld); if (!((*it)->getmodifiers() & SearchDataClause::SDCM_NOTERMS) &&
!(*it)->getexclude()) {
(*it)->getTerms(hld);
}
}
return; return;
} }
// Return a short, constant, human-readable label for a clause type, for use
// in the dump() debugging output. Any value not listed in the switch yields
// "UNKNOWN". The returned pointer refers to a string literal and must not be
// freed or modified by the caller.
static const char * tpToString(SClType t)
{
switch (t) {
case SCLT_AND: return "AND";
case SCLT_OR: return "OR";
case SCLT_FILENAME: return "FILENAME";
case SCLT_PHRASE: return "PHRASE";
case SCLT_NEAR: return "NEAR";
case SCLT_PATH: return "PATH";
case SCLT_SUB: return "SUB";
default: return "UNKNOWN";
}
}
void SearchData::dump(ostream& o) const void SearchData::dump(ostream& o) const
{ {
o << "SearchData: " << " qs " << int(m_query.size()) << o << "SearchData: " << tpToString(m_tp) << " qs " << int(m_query.size()) <<
" ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() << " ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() <<
" hd " << m_haveDates << " maxs " << int(m_maxSize) << " mins " << " hd " << m_haveDates << " maxs " << int(m_maxSize) << " mins " <<
int(m_minSize) << " wc " << m_haveWildCards << "\n"; int(m_minSize) << " wc " << m_haveWildCards << "\n";
@ -291,7 +298,7 @@ void SearchDataClause::dump(ostream& o) const
void SearchDataClauseSimple::dump(ostream& o) const void SearchDataClauseSimple::dump(ostream& o) const
{ {
o << "ClauseSimple: "; o << "ClauseSimple: " << tpToString(m_tp) << " ";
if (m_exclude) if (m_exclude)
o << "- "; o << "- ";
o << "[" ; o << "[" ;
@ -319,9 +326,9 @@ void SearchDataClausePath::dump(ostream& o) const
void SearchDataClauseDist::dump(ostream& o) const void SearchDataClauseDist::dump(ostream& o) const
{ {
if (m_tp == SCLT_NEAR) if (m_tp == SCLT_NEAR)
o << "ClauseDist: NEAR: "; o << "ClauseDist: NEAR ";
else else
o << "ClauseDist: PHRA: "; o << "ClauseDist: PHRA ";
if (m_exclude) if (m_exclude)
o << " - "; o << " - ";

View file

@ -96,9 +96,6 @@ public:
/** Is there anything but a file name search in here ? */ /** Is there anything but a file name search in here ? */
bool fileNameOnly(); bool fileNameOnly();
/** Are we a simple query with one clause? */
bool singleSimple();
/** Do we have wildcards anywhere apart from filename searches ? */ /** Do we have wildcards anywhere apart from filename searches ? */
bool haveWildCards() {return m_haveWildCards;} bool haveWildCards() {return m_haveWildCards;}
@ -228,7 +225,9 @@ private:
class SearchDataClause { class SearchDataClause {
public: public:
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2, enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16}; SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16,
SDCM_NOTERMS=32 // Don't include terms for highlighting
};
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE}; enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
SearchDataClause(SClType tp) SearchDataClause(SClType tp)
@ -278,13 +277,12 @@ public:
{ {
return m_parentSearch ? m_parentSearch->getSoftMaxExp() : -1; return m_parentSearch ? m_parentSearch->getSoftMaxExp() : -1;
} }
virtual void setModifiers(Modifier mod)
{
m_modifiers = mod;
}
virtual void addModifier(Modifier mod) virtual void addModifier(Modifier mod)
{ {
m_modifiers = Modifier(m_modifiers | mod); m_modifiers = m_modifiers | mod;
}
virtual unsigned int getmodifiers() {
return m_modifiers;
} }
virtual void setWeight(float w) virtual void setWeight(float w)
{ {
@ -312,7 +310,7 @@ protected:
SClType m_tp; SClType m_tp;
SearchData *m_parentSearch; SearchData *m_parentSearch;
bool m_haveWildCards; bool m_haveWildCards;
Modifier m_modifiers; unsigned int m_modifiers;
float m_weight; float m_weight;
bool m_exclude; bool m_exclude;
Relation m_rel; Relation m_rel;

View file

@ -25,6 +25,7 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include <sstream> #include <sstream>
#include <iostream>
using namespace std; using namespace std;
#include "xapian.h" #include "xapian.h"
@ -53,9 +54,10 @@ typedef vector<SearchDataClause *>::iterator qlist_it_t;
static const int original_term_wqf_booster = 10; static const int original_term_wqf_booster = 10;
// Expand categories and mime type wild card exps Categories are // Expand doc categories and mime type wild card expressions
// expanded against the configuration, mimetypes against the index //
// (for wildcards). // Categories are expanded against the configuration, mimetypes
// against the index.
bool SearchData::expandFileTypes(Db &db, vector<string>& tps) bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
{ {
const RclConfig *cfg = db.getConf(); const RclConfig *cfg = db.getConf();
@ -101,6 +103,8 @@ static const char *maxXapClauseCaseDiacMsg =
"wildcards ?" "wildcards ?"
; ;
// Walk the clauses list, translate each and add to top Xapian Query
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
vector<SearchDataClause*>& query, vector<SearchDataClause*>& query,
string& reason, void *d) string& reason, void *d)
@ -484,7 +488,8 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
if (noexpansion) { if (noexpansion) {
oexp.push_back(prefix + term); oexp.push_back(prefix + term);
m_hldata.terms[term] = term; m_hldata.terms[term] = term;
LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str())); LOGDEB(("ExpandTerm: noexpansion: final: %s\n",
stringsToString(oexp).c_str()));
return true; return true;
} }
@ -568,6 +573,8 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
string prefix; string prefix;
const FieldTraits *ftp; const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
if (ftp->noterms)
addModifier(SDCM_NOTERMS);
prefix = wrap_prefix(ftp->pfx); prefix = wrap_prefix(ftp->pfx);
} }

View file

@ -43,12 +43,12 @@ keywords= K
xapyearmon = M xapyearmon = M
title = S ; wdfinc = 10 title = S ; wdfinc = 10
mtype = T mtype = T
ext = XE ext = XE; noterms = 1
rclmd5 = XM rclmd5 = XM
dir = XP dir = XP ; noterms = 1
abstract = XS abstract = XS
filename = XSFN filename = XSFN ; noterms = 1
containerfilename = XCFN ; pfxonly = 1 containerfilename = XCFN ; pfxonly = 1 ; noterms = 1
rclUnsplitFN = XSFS rclUnsplitFN = XSFS
xapyear = Y xapyear = Y
recipient = XTO recipient = XTO
@ -58,7 +58,7 @@ recipient = XTO
# by default. # by default.
# Some values are internally reserved by recoll: # Some values are internally reserved by recoll:
# XE (file ext), XP (for path elements), XSFN, XSFS, XXST, XXND, XXPG # XE (file ext), XP (for path elements), XSFN, XSFS, XXST, XXND, XXPG
rclbes = XB rclbes = XB ; noterms = 1
# Using XX was not a good idea. # Using XX was not a good idea.
# #
# I hereby commit to not using XY for Recoll: # I hereby commit to not using XY for Recoll: