Prevent highlighting of bogus terms in results (prevent path elements, negative queries or internal stuff)

2015-08-08 21:56:45 +02:00 · 2015-08-08 21:56:45 +02:00 · e37284f05f
commit e37284f05f
parent fe6174652b
8 changed files with 58 additions and 46 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -867,6 +867,8 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
 	    ft.boost = atof(tval.c_str());
 	if (attrs.get("pfxonly", tval))
 	    ft.pfxonly = stringToBool(tval);
 	if (attrs.get("noterms", tval))
 	    ft.noterms = stringToBool(tval);
 	m_fldtotraits[stringtolower(*it)] = ft;
 	LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", 
 		it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost));
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -66,9 +66,9 @@ struct FieldTraits {
    int    wdfinc; // Index time term frequency increment (default 1)
    double boost; // Query time boost (default 1.0)
    bool   pfxonly; // Suppress prefix-less indexing
-
+    bool   noterms; // Don't add term to highlight data (e.g.: rclbes)
    FieldTraits() 
-        : wdfinc(1), boost(1.0), pfxonly(false)
+        : wdfinc(1), boost(1.0), pfxonly(false), noterms(false)
        {}
 };
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -71,8 +71,8 @@ static const string xapday_prefix = "D";
 static const string xapmonth_prefix = "M";
 static const string xapyear_prefix = "Y";
 const string pathelt_prefix = "XP";
-const string udi_prefix("Q");
+static const string udi_prefix("Q");
-const string parent_prefix("F");
+static const string parent_prefix("F");
 // Special terms to mark begin/end of field (for anchored searches), and
 // page breaks
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -533,8 +533,6 @@ private:
 string version_string();
 extern const string pathelt_prefix;
 extern const string udi_prefix;
 extern const string parent_prefix;
 extern const string mimetype_prefix;
 extern const string unsplitFilenameFieldName;
 extern string start_of_field_term;
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -181,7 +181,8 @@ bool SearchData::addClause(SearchDataClause* cl)
    return true;
 }
-// Am I a file name only search ? This is to turn off term highlighting
+// Am I a file name only search ? This is to turn off term highlighting.
 // There can't be a subclause in a filename search: no possible need to recurse
 bool SearchData::fileNameOnly() 
 {
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
@ -190,6 +191,7 @@ bool SearchData::fileNameOnly()
    return true;
 }
 // The query language creates a lot of subqueries. See if we can merge them.
 void SearchData::simplify()
 {
    for (unsigned int i = 0; i < m_query.size(); i++) {
@ -249,30 +251,35 @@ void SearchData::simplify()
    }
 }
-bool SearchData::singleSimple()
+// Extract terms and groups for highlighting
 {
    if (m_query.size() != 1 || !m_filetypes.empty() || !m_nfiletypes.empty() ||
        m_haveDates || m_maxSize != size_t(-1) || m_minSize != size_t(-1) ||
        m_haveWildCards)
        return false;
    SearchDataClause *clp = *m_query.begin();
    if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR) {
        return false;
    }
    return true;
 }
 // Extract all term data
 void SearchData::getTerms(HighlightData &hld) const
 {
-    for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++)
+    for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) {
-        (*it)->getTerms(hld);
+	if (!((*it)->getmodifiers() & SearchDataClause::SDCM_NOTERMS) &&
 	    !(*it)->getexclude()) {
 	    (*it)->getTerms(hld);
 	}
    }
    return;
 }
 // Translate a clause/search type value into a short fixed label, used
 // when dumping search data. Any value without an explicit case below
 // yields "UNKNOWN". The returned pointer refers to a string literal.
 static const char * tpToString(SClType t)
 {
    switch (t) {
    case SCLT_AND: return "AND";
    case SCLT_OR: return "OR";
    case SCLT_FILENAME: return "FILENAME";
    case SCLT_PHRASE: return "PHRASE";
    case SCLT_NEAR: return "NEAR";
    case SCLT_PATH: return "PATH";
    case SCLT_SUB: return "SUB";
    default: return "UNKNOWN";
    }
 }
 void SearchData::dump(ostream& o) const
 {
-    o << "SearchData: " << " qs " << int(m_query.size()) << 
+    o << "SearchData: " << tpToString(m_tp) << " qs " << int(m_query.size()) << 
        " ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() << 
        " hd " << m_haveDates << " maxs " << int(m_maxSize) << " mins " << 
        int(m_minSize) << " wc " << m_haveWildCards << "\n";
@ -291,7 +298,7 @@ void SearchDataClause::dump(ostream& o) const
 void SearchDataClauseSimple::dump(ostream& o) const
 {
-    o << "ClauseSimple: ";
+    o << "ClauseSimple: " << tpToString(m_tp) << " ";
    if (m_exclude)
        o << "- ";
    o << "[" ;
@ -319,9 +326,9 @@ void SearchDataClausePath::dump(ostream& o) const
 void SearchDataClauseDist::dump(ostream& o) const
 {
    if (m_tp == SCLT_NEAR)
-        o << "ClauseDist: NEAR: ";
+        o << "ClauseDist: NEAR ";
    else
-        o << "ClauseDist: PHRA: ";
+        o << "ClauseDist: PHRA ";
    if (m_exclude)
        o << " - ";
--- a/src/rcldb/searchdata.h
+++ b/src/rcldb/searchdata.h
@ -96,9 +96,6 @@ public:
    /** Is there anything but a file name search in here ? */
    bool fileNameOnly();
    /** Are we a simple query with one clause? */
    bool singleSimple();
    /** Do we have wildcards anywhere apart from filename searches ? */
    bool haveWildCards() {return m_haveWildCards;}
@ -228,7 +225,9 @@ private:
 class SearchDataClause {
 public:
    enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
-		   SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16};
+		   SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16,
 		   SDCM_NOTERMS=32 // Don't include terms for highlighting
    };
    enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
    SearchDataClause(SClType tp) 
@ -278,13 +277,12 @@ public:
    {
 	return m_parentSearch ? m_parentSearch->getSoftMaxExp() : -1;
    }
    virtual void setModifiers(Modifier mod) 
    {
 	m_modifiers = mod;
    }
    virtual void addModifier(Modifier mod) 
    {
-	m_modifiers = Modifier(m_modifiers | mod);
+	m_modifiers = m_modifiers | mod;
    }
    virtual unsigned int getmodifiers() {
 	return m_modifiers;
    }
    virtual void setWeight(float w) 
    {
@ -312,7 +310,7 @@ protected:
    SClType     m_tp;
    SearchData *m_parentSearch;
    bool        m_haveWildCards;
-    Modifier    m_modifiers;
+    unsigned int  m_modifiers;
    float       m_weight;
    bool        m_exclude;
    Relation    m_rel;
--- a/src/rcldb/searchdatatox.cpp
+++ b/src/rcldb/searchdatatox.cpp
@ -25,6 +25,7 @@
 #include <vector>
 #include <algorithm>
 #include <sstream>
 #include <iostream>
 using namespace std;
 #include "xapian.h"
@ -53,9 +54,10 @@ typedef  vector<SearchDataClause *>::iterator qlist_it_t;
 static const int original_term_wqf_booster = 10;
-// Expand categories and mime type wild card exps Categories are
+// Expand doc categories and mime type wild card expressions 
-// expanded against the configuration, mimetypes against the index
+//
-// (for wildcards).
+// Categories are expanded against the configuration, mimetypes
 // against the index.
 bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
 {
    const RclConfig *cfg = db.getConf();
@ -101,6 +103,8 @@ static const char *maxXapClauseCaseDiacMsg =
    "wildcards ?"
    ;
 // Walk the clauses list, translate each and add to top Xapian Query
 bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, 
 				vector<SearchDataClause*>& query, 
 				string& reason, void *d)
@ -484,7 +488,8 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
    if (noexpansion) {
 	oexp.push_back(prefix + term);
 	m_hldata.terms[term] = term;
-	LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
+	LOGDEB(("ExpandTerm: noexpansion: final: %s\n", 
                stringsToString(oexp).c_str()));
 	return true;
    } 
@ -568,6 +573,8 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
 	if (ftp->noterms)
 	    addModifier(SDCM_NOTERMS);
 	prefix = wrap_prefix(ftp->pfx);
    }
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@ -43,12 +43,12 @@ keywords= K
 xapyearmon = M
 title = S ; wdfinc = 10
 mtype = T
-ext = XE
+ext = XE; noterms = 1
 rclmd5 = XM
-dir = XP
+dir = XP ; noterms = 1
 abstract = XS
-filename = XSFN
+filename = XSFN ; noterms = 1
-containerfilename = XCFN ; pfxonly = 1
+containerfilename = XCFN ; pfxonly = 1 ; noterms = 1
 rclUnsplitFN = XSFS
 xapyear = Y
 recipient = XTO
@ -58,7 +58,7 @@ recipient = XTO
 # by default. 
 # Some values are internally reserved by recoll: 
 #   XE (file ext), XP (for path elements), XSFN, XSFS, XXST, XXND, XXPG
-rclbes = XB
+rclbes = XB ; noterms = 1
 # Using XX was not a good idea. 
 #
 # I hereby commit to not using XY for Recoll: