Make Recoll optionally sensitive to case and diacritics

2012-09-14 14:34:27 +02:00 · 2012-09-14 14:34:27 +02:00 · 166624f7f2
commit 166624f7f2
parent 7fcfe27952
30 changed files with 849 additions and 487 deletions
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -14,6 +14,8 @@
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
+#include "autoconfig.h"
+
 #include <stdio.h>
 #include <cstring>
 #include <unistd.h>
@ -53,6 +55,7 @@ using namespace std;
 #include "cancelcheck.h"
 #include "ptmutex.h"
 #include "termproc.h"
+#include "expansiondbs.h"

 #ifndef MAX
 #define MAX(A,B) (A>B?A:B)
@ -84,9 +87,15 @@ static const string xapday_prefix = "D";
 static const string xapmonth_prefix = "M";
 static const string xapyear_prefix = "Y";
 const string pathelt_prefix = "XP";
+#ifdef RCL_INDEX_STRIPCHARS
 const string start_of_field_term = "XXST";
 const string end_of_field_term = "XXND";
 static const string page_break_term = "XXPG";
+#else
+const string start_of_field_term = "XXST/";
+const string end_of_field_term = "XXND/";
+static const string page_break_term = "XXPG/";
+#endif
 // Field name for the unsplit file name. Has to exist in the field file 
 // because of usage in termmatch()
 static const string unsplitFilenameFieldName = "rclUnsplitFN";
@ -197,7 +206,7 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
 {
    for (vector<string>::const_iterator qit = in.begin(); 
 	 qit != in.end(); qit++) {
-	if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
+	if (!has_prefix(*qit))
 	    out.push_back(*qit);
    }
 }
@ -573,7 +582,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
 	for (term = xrdb.termlist_begin(docid);
 	     term != xrdb.termlist_end(docid); term++) {
 	    // Ignore prefixed terms
-	    if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
+	    if (has_prefix(*term))
 		continue;
 	    if (cutoff-- < 0) {
 		LOGDEB0(("makeAbstract: max term count cutoff\n"));
@ -652,7 +661,9 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
 	    vabs.push_back(chunk);
 	    chunk.clear();
 	} else {
-	    chunk += it->second;
+	    if (it->second.compare(end_of_field_term) && 
+		it->second.compare(start_of_field_term))
+		chunk += it->second;
 	}
    }
    if (!chunk.empty())
@ -874,11 +885,13 @@ int Db::termDocCnt(const string& _term)
    if (!m_ndb || !m_ndb->m_isopen)
        return -1;

-    string term;
+    string term = _term;
+#ifdef RCL_INDEX_STRIPCHARS
    if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
 	LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
 	return 0;
    }
+#endif

    if (m_stops.isStop(term)) {
 	LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
@ -994,8 +1007,19 @@ class TextSplitDb : public TextSplitP {
    {}
    // Reimplement text_to_words to add start and end special terms
    virtual bool text_to_words(const string &in);
-    void setprefix(const string& pref) {prefix = pref;}
-    void setwdfinc(int i) {wdfinc = i;}
+
+    void setprefix(const string& pref) 
+    {
+	if (pref.empty())
+	    prefix.clear();
+	else
+	    prefix = wrap_prefix(pref);
+    }
+
+    void setwdfinc(int i) 
+    {
+	wdfinc = i;
+    }

    friend class TermProcIdx;

@ -1127,11 +1151,13 @@ string Db::getSpellingSuggestion(const string& word)
 {
    if (m_ndb == 0)
 	return string();
-    string term;
+    string term = word;
+#ifdef RCL_INDEX_STRIPCHARS
    if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
 	LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
 	return string();
    }
+#endif
    if (!isSpellingCandidate(term))
 	return string();
    return m_ndb->xrdb.get_spelling_suggestion(term);
@ -1239,8 +1265,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    TermProcIdx tpidx;
    TermProc *nxt = &tpidx;
    TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
-//    TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
+    //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
+#ifdef RCL_INDEX_STRIPCHARS
    TermProcPrep tpprep(nxt); nxt = &tpprep;
+#endif

    TextSplitDb splitter(newdocument, nxt);
    tpidx.setTSD(&splitter);
@ -1266,7 +1294,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 	vector<string> vpath;
 	stringToTokens(path, vpath, "/");
 	splitter.curpos = 0;
-	newdocument.add_posting(pathelt_prefix, 
+	newdocument.add_posting(wrap_prefix(pathelt_prefix),
 				splitter.basepos + splitter.curpos++);
 	for (vector<string>::iterator it = vpath.begin(); 
 	     it != vpath.end(); it++){
@ -1274,7 +1302,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 		// Just truncate it. May still be useful because of wildcards
 		*it = it->substr(0, 230);
 	    }
-	    newdocument.add_posting(pathelt_prefix + *it, 
+	    newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
 				    splitter.basepos + splitter.curpos++);
 	}
    }
@ -1319,7 +1347,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,

    ////// Special terms for other metadata. No positions for these.
    // Mime type
-    newdocument.add_term(mimetype_prefix + doc.mimetype);
+    newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);

    // Simple file name indexed unsplit for specific "file name"
    // searches. This is not the same as a filename: clause inside the
@ -1335,9 +1363,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 		utf8truncate(fn, 230);
 	    string::size_type pos = fn.rfind('.');
 	    if (pos != string::npos && pos != fn.length() - 1) {
-		newdocument.add_term(fileext_prefix + fn.substr(pos + 1));
+		newdocument.add_term(wrap_prefix(fileext_prefix) + 
+				     fn.substr(pos + 1));
 	    }
-	    newdocument.add_term(unsplitfilename_prefix + fn);
+	    newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
 	}
    }

@ -1356,12 +1385,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    struct tm *tm = localtime(&mtime);
    char buf[9];
    snprintf(buf, 9, "%04d%02d%02d",
-	     tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
-    newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
+	    tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
+    // Date (YYYYMMDD)
+    newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf)); 
+    // Month (YYYYMM)
    buf[6] = '\0';
-    newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
+    newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
+    // Year (YYYY)
    buf[4] = '\0';
-    newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
+    newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf)); 


    //////////////////////////////////////////////////////////////////
@ -1834,7 +1866,7 @@ bool Db::maxYearSpan(int *minyear, int *maxyear)
    *minyear = 1000000; 
    *maxyear = -1000000;
    TermMatchResult result;
-    if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear"))
+    if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
 	return false;
    for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
 	 it != result.entries.end(); it++) {
@ -1899,30 +1931,32 @@ const string cstr_wildSpecChars = "*?[";
 const string cstr_regSpecChars = "(.[{";

 // Find all index terms that match a wildcard or regular expression
+// If field is set, we return a list of appropriately prefixed terms (which 
+// are going to be used to build a Xapian query).
 bool Db::termMatch(MatchType typ, const string &lang,
 		   const string &root, 
 		   TermMatchResult& res,
 		   int max, 
-		   const string& field,
-                   string *prefixp
-    )
+		   const string& field)
 {
    if (!m_ndb || !m_ndb->m_isopen)
 	return false;
    Xapian::Database xdb = m_ndb->xdb();

-    res.clear();
    XAPTRY(res.dbdoccount = xdb.get_doccount();
           res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
    if (!m_reason.empty())
        return false;

    // Get rid of capitals and accents
-    string droot;
+
+    string droot = root;
+#ifdef RCL_INDEX_STRIPCHARS
    if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
 	LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
 	return false;
    }
+#endif
    string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;

    string prefix;
@ -1932,17 +1966,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
            LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", 
                    field.c_str()));
        } else {
-	    prefix = ftp->pfx;
+	    prefix = wrap_prefix(ftp->pfx);
 	}
-        if (prefixp)
-            *prefixp = prefix;
    }
+    res.prefix = prefix;

    if (typ == ET_STEM) {
 	if (!stemExpand(lang, root, res, max))
 	    return false;
-	sort(res.entries.begin(), res.entries.end());
-	unique(res.entries.begin(), res.entries.end());
 	for (vector<TermMatchEntry>::iterator it = res.entries.begin(); 
 	     it != res.entries.end(); it++) {
 	    XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
@ -2032,7 +2063,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
    TermMatchCmpByTerm tcmp;
    sort(res.entries.begin(), res.entries.end(), tcmp);
    TermMatchTermEqual teq;
-    unique(res.entries.begin(), res.entries.end(), teq);
+    vector<TermMatchEntry>::iterator uit = 
+	unique(res.entries.begin(), res.entries.end(), teq);
+    res.entries.resize(uit - res.entries.begin());
    TermMatchCmpByWcf wcmp;
    sort(res.entries.begin(), res.entries.end(), wcmp);
    if (max > 0) {