split the term expansion code out of rcldb.cpp

2013-01-15 10:06:24 +01:00 · 2013-01-15 10:06:24 +01:00 · 9b4ce08a0d
commit 9b4ce08a0d
parent af214b3aa0
6 changed files with 507 additions and 495 deletions
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -19,8 +19,6 @@
 #include <stdio.h>
 #include <cstring>
 #include <unistd.h>
-#include <fnmatch.h>
-#include <regex.h>
 #include <math.h>
 #include <time.h>

@@ -29,9 +27,7 @@
 #include <algorithm>
 #include <sstream>

-#ifndef NO_NAMESPACES
 using namespace std;
-#endif /* NO_NAMESPACES */

 #include "xapian.h"

@@ -65,9 +61,7 @@ static const string cstr_RCL_IDX_VERSION("1");

 static const string cstr_mbreaks("rclmbreaks");

-#ifndef NO_NAMESPACES
 namespace Rcl {
-#endif

 // Some prefixes that we could get from the fields file, but are not going
 // to ever change.
@@ -94,7 +88,7 @@ const string page_break_term = "XXPG/";

 // Field name for the unsplit file name. Has to exist in the field file 
 // because of usage in termmatch()
-static const string unsplitFilenameFieldName = "rclUnsplitFN";
+const string unsplitFilenameFieldName = "rclUnsplitFN";
 static const string unsplitfilename_prefix = "XSFS";

 string version_string(){
@@ -1358,7 +1352,6 @@ bool Db::needUpdate(const string &udi, const string& sig)
    return true;
 }

-
 // Return existing stem db languages
 vector<string> Db::getStemLangs()
 {
@@ -1581,120 +1574,6 @@ bool Db::purgeFileWrite(const string& udi, const string& uniterm)
    return false;
 }

-// File name wild card expansion. This is a specialisation ot termMatch
-bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
-{
-    string pattern = fnexp;
-    names.clear();
-
-    // If pattern is not capitalized, not quoted (quoted pattern can't
-    // get here currently anyway), and has no wildcards, we add * at
-    // each end: match any substring
-    if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
-	pattern = pattern.substr(1, pattern.size() -2);
-    } else if (pattern.find_first_of(cstr_minwilds) == string::npos && 
-	       !unaciscapital(pattern)) {
-	pattern = "*" + pattern + "*";
-    } // else let it be
-
-    LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
-
-    // We inconditionnally lowercase and strip the pattern, as is done
-    // during indexing. This seems to be the only sane possible
-    // approach with file names and wild cards. termMatch does
-    // stripping conditionally on indexstripchars.
-    string pat1;
-    if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
-	pattern.swap(pat1);
-    }
-
-    TermMatchResult result;
-    if (!termMatch(ET_WILD, string(), pattern, result, max,
-		   unsplitFilenameFieldName))
-	return false;
-    for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
-	 it != result.entries.end(); it++) 
-	names.push_back(it->term);
-
-    if (names.empty()) {
-	// Build an impossible query: we know its impossible because we
-	// control the prefixes!
-	names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
-    }
-    return true;
-}
-
-// Walk the Y terms and return min/max
-bool Db::maxYearSpan(int *minyear, int *maxyear)
-{
-    LOGDEB(("Rcl::Db:maxYearSpan\n"));
-    *minyear = 1000000; 
-    *maxyear = -1000000;
-    TermMatchResult result;
-    if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
-	LOGINFO(("Rcl::Db:maxYearSpan: termMatch failed\n"));
-	return false;
-    }
-    for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
-	 it != result.entries.end(); it++) {
-        if (!it->term.empty()) {
-            int year = atoi(strip_prefix(it->term).c_str());
-            if (year < *minyear)
-                *minyear = year;
-            if (year > *maxyear)
-                *maxyear = year;
-        }
-    }
-    return true;
-}
-
-
-class TermMatchCmpByWcf {
-public:
-    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
-	return r.wcf - l.wcf < 0;
-    }
-};
-class TermMatchCmpByTerm {
-public:
-    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
-	return l.term.compare(r.term) > 0;
-    }
-};
-class TermMatchTermEqual {
-public:
-    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
-	return !l.term.compare(r.term);
-    }
-};
-
-#ifdef RCL_INDEX_STRIPCHARS
-bool Db::stemExpand(const string &langs, const string &term, 
-		    TermMatchResult& result)
-{
-    if (m_ndb == 0 || m_ndb->m_isopen == false)
-	return false;
-    vector<string> exp;
-    StemDb db(m_ndb->xrdb);
-    if (!db.stemExpand(langs, term, exp))
-	return false;
-    result.entries.insert(result.entries.end(), exp.begin(), exp.end());
-    return true;
-}
-#endif
-
-/** Add prefix to all strings in list. 
- * @param prefix already wrapped prefix
- */
-static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
-{
-    if (prefix.empty())
-	return;
-    for (vector<TermMatchEntry>::iterator it = terms.begin(); 
-         it != terms.end(); it++)
-	it->term.insert(0, prefix);
-}
-
 bool Db::dbStats(DbStats& res)
 {
    if (!m_ndb || !m_ndb->m_isopen)
@@ -1711,369 +1590,6 @@ bool Db::dbStats(DbStats& res)
    return true;
 }

-// Find all index terms that match a wildcard or regular expression If
-// field is set, we return a list of appropriately prefixed terms
-// (which are going to be used to build a Xapian query).  This routine
-// performs case/diacritics/stemming expansion and possibly calls
-// idxTermMatch for wildcard/regexp expansion and filtering against
-// the main index terms.
-bool Db::termMatch(int typ_sens, const string &lang,
-		   const string &_term,
-		   TermMatchResult& res,
-		   int max, 
-		   const string& field)
-{
-    int matchtyp = matchTypeTp(typ_sens);
-    if (!m_ndb || !m_ndb->m_isopen)
-	return false;
-    Xapian::Database xrdb = m_ndb->xrdb;
-
-    bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
-    bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
-
-    bool stripped = false;
-#ifdef RCL_INDEX_STRIPCHARS
-    stripped = true;
-#else
-    stripped = o_index_stripchars;
-#endif
-
-    LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
-	    "max %d field [%s] stripped %d\n",
-	    matchtyp, diac_sensitive, case_sensitive, lang.c_str(), 
-	    _term.c_str(), max, field.c_str(), stripped));
-
-    // If index is stripped, no case or diac expansion can be needed:
-    // for the processing inside this routine, everything looks like
-    // we're all-sensitive: no use of expansion db.
-    // Also, convert input to lowercase and strip its accents.
-    string term = _term;
-    if (stripped) {
-	diac_sensitive = case_sensitive = true;
-	if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
-	    LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
-	    return false;
-	}
-    }
-
-#ifndef RCL_INDEX_STRIPCHARS
-    // The case/diac expansion db
-    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
-    XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
-#endif // RCL_INDEX_STRIPCHARS
-
-
-    if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
-#ifdef RCL_INDEX_STRIPCHARS
-	idxTermMatch(typ_sens, lang, term, res, max, field);
-#else
-	RefCntr<StrMatcher> matcher;
-	if (matchtyp == ET_WILD) {
-	    matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
-	} else {
-	    matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
-	}
-	if (!diac_sensitive || !case_sensitive) {
-	    // Perform case/diac expansion on the exp as appropriate and
-	    // expand the result.
-	    vector<string> exp;
-	    if (diac_sensitive) {
-		// Expand for diacritics and case, filtering for same diacritics
-		SynTermTransUnac foldtrans(UNACOP_FOLD);
-		synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
-	    } else if (case_sensitive) {
-		// Expand for diacritics and case, filtering for same case
-		SynTermTransUnac unactrans(UNACOP_UNAC);
-		synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
-	    } else {
-		// Expand for diacritics and case, no filtering
-		synac.synKeyExpand(matcher.getptr(), exp);
-	    }
-	    // Retrieve additional info and filter against the index itself
-	    for (vector<string>::const_iterator it = exp.begin(); 
-		 it != exp.end(); it++) {
-		idxTermMatch(ET_NONE, "", *it, res, max, field);
-	    }
-	} else {
-	    idxTermMatch(typ_sens, lang, term, res, max, field);
-	}
-
-#endif // RCL_INDEX_STRIPCHARS
-
-    } else {
-	// Expansion is STEM or NONE (which may still need case/diac exp)
-
-#ifdef RCL_INDEX_STRIPCHARS
-
-	idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
-
-#else
-	vector<string> lexp;
-	if (diac_sensitive && case_sensitive) {
-	    // No case/diac expansion
-	    lexp.push_back(term);
-	} else if (diac_sensitive) {
-	    // Expand for accents and case, filtering for same accents,
-	    SynTermTransUnac foldtrans(UNACOP_FOLD);
-	    synac.synExpand(term, lexp, &foldtrans);
-	} else if (case_sensitive) {
-	    // Expand for accents and case, filtering for same case
-	    SynTermTransUnac unactrans(UNACOP_UNAC);
-	    synac.synExpand(term, lexp, &unactrans);
-	} else {
-	    // We are neither accent- nor case- sensitive and may need stem
-	    // expansion or not. Expand for accents and case
-	    synac.synExpand(term, lexp);
-	}
-
-	if (matchTypeTp(typ_sens) == ET_STEM) {
-	    // Need stem expansion. Lowercase the result of accent and case
-	    // expansion for input to stemdb.
-	    for (unsigned int i = 0; i < lexp.size(); i++) {
-		string lower;
-		unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
-		lexp[i] = lower;
-	    }
-	    sort(lexp.begin(), lexp.end());
-	    lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
-	    StemDb sdb(xrdb);
-	    vector<string> exp1;
-	    for (vector<string>::const_iterator it = lexp.begin(); 
-		 it != lexp.end(); it++) {
-		sdb.stemExpand(lang, *it, exp1);
-	    }
-	    LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
-
-	    // Expand the resulting list for case (all stemdb content
-	    // is lowercase)
-	    lexp.clear();
-	    for (vector<string>::const_iterator it = exp1.begin(); 
-		 it != exp1.end(); it++) {
-		synac.synExpand(*it, lexp);
-	    }
-	    sort(lexp.begin(), lexp.end());
-	    lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
-	}
-
-	// Filter the result and get the stats, possibly add prefixes.
-	LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
-	for (vector<string>::const_iterator it = lexp.begin();
-	     it != lexp.end(); it++) {
-	    idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
-	}
-    }
-#endif
-
-    TermMatchCmpByTerm tcmp;
-    sort(res.entries.begin(), res.entries.end(), tcmp);
-    TermMatchTermEqual teq;
-    vector<TermMatchEntry>::iterator uit = 
-	unique(res.entries.begin(), res.entries.end(), teq);
-    res.entries.resize(uit - res.entries.begin());
-    TermMatchCmpByWcf wcmp;
-    sort(res.entries.begin(), res.entries.end(), wcmp);
-    if (max > 0) {
-	// Would need a small max and big stem expansion...
-	res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
-    }
-    return true;
-}
-
-// Second phase of wildcard/regexp term expansion after case/diac
-// expansion: expand against main index terms
-bool Db::idxTermMatch(int typ_sens, const string &lang,
-		      const string &root,
-		      TermMatchResult& res,
-		      int max, 
-		      const string& field)
-{
-    int typ = matchTypeTp(typ_sens);
-
-#ifndef RCL_INDEX_STRIPCHARS
-    if (typ == ET_STEM) {
-	LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
-	abort();
-    }
-#endif
-
-    if (!m_ndb || !m_ndb->m_isopen)
-	return false;
-    Xapian::Database xdb = m_ndb->xrdb;
-
-    string prefix;
-    if (!field.empty()) {
-	const FieldTraits *ftp = 0;
-	if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
-            LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", 
-                    field.c_str()));
-        } else {
-	    prefix = wrap_prefix(ftp->pfx);
-	}
-    }
-    res.prefix = prefix;
-
-#ifdef RCL_INDEX_STRIPCHARS
-    if (typ == ET_STEM) {
-	if (!stemExpand(lang, root, res))
-	    return false;
-	for (vector<TermMatchEntry>::iterator it = res.entries.begin(); 
-	     it != res.entries.end(); it++) {
-	    XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
-                   it->docs = xdb.get_termfreq(it->term),
-                   xdb, m_reason);
-            if (!m_reason.empty())
-                return false;
-	    LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
-	}
-        if (!prefix.empty())
-            addPrefix(res.entries, prefix);
-    } else 
-#endif
-    {
-	RefCntr<StrMatcher> matcher;
-	if (typ == ET_REGEXP) {
-	    matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
-	    if (!matcher->ok()) {
-		LOGERR(("termMatch: regcomp failed: %s\n", 
-			matcher->getreason().c_str()))
-		    return false;
-	    }
-	} else if (typ == ET_WILD) {
-	    matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
-	}
-
-	// Find the initial section before any special char
-	string::size_type es = string::npos;
-	if (matcher.isNotNull()) {
-	    es = matcher->baseprefixlen();
-	}
-	string is;
-	switch (es) {
-	case string::npos: is = prefix + root; break;
-	case 0: is = prefix; break;
-	default: is = prefix + root.substr(0, es); break;
-	}
-	LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
-
-        for (int tries = 0; tries < 2; tries++) { 
-            try {
-                Xapian::TermIterator it = xdb.allterms_begin(); 
-                if (!is.empty())
-                    it.skip_to(is.c_str());
-                for (int rcnt = 0; it != xdb.allterms_end(); it++) {
-                    // If we're beyond the terms matching the initial
-                    // string, end
-                    if (!is.empty() && (*it).find(is) != 0)
-                        break;
-                    string term;
-                    if (!prefix.empty())
-                        term = (*it).substr(prefix.length());
-                    else
-                        term = *it;
-
-		    if (matcher.isNotNull() && !matcher->match(term))
-			continue;
-
-                    res.entries.push_back(
-			TermMatchEntry(*it, xdb.get_collection_freq(*it),
-				       it.get_termfreq()));
-
-		    // The problem with truncating here is that this is done
-		    // alphabetically and we may not keep the most frequent 
-		    // terms. OTOH, not doing it may stall the program if
-		    // we are walking the whole term list. We compromise
-		    // by cutting at 2*max
-                    if (max > 0 && ++rcnt >= 2*max)
-			break;
-                }
-                m_reason.erase();
-                break;
-            } catch (const Xapian::DatabaseModifiedError &e) {
-                m_reason = e.get_msg();
-                xdb.reopen();
-                continue;
-            } XCATCHERROR(m_reason);
-            break;
-        }
-	if (!m_reason.empty()) {
-	    LOGERR(("termMatch: %s\n", m_reason.c_str()));
-	    return false;
-	}
-    }
-
-    return true;
-}
-
-/** Term list walking. */
-class TermIter {
-public:
-    Xapian::TermIterator it;
-    Xapian::Database db;
-};
-TermIter *Db::termWalkOpen()
-{
-    if (!m_ndb || !m_ndb->m_isopen)
-	return 0;
-    TermIter *tit = new TermIter;
-    if (tit) {
-	tit->db = m_ndb->xrdb;
-        XAPTRY(tit->it = tit->db.allterms_begin(), tit->db, m_reason);
-	if (!m_reason.empty()) {
-	    LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
-	    return 0;
-	}
-    }
-    return tit;
-}
-bool Db::termWalkNext(TermIter *tit, string &term)
-{
-    XAPTRY(
-	if (tit && tit->it != tit->db.allterms_end()) {
-	    term = *(tit->it)++;
-	    return true;
-	}
-        , tit->db, m_reason);
-
-    if (!m_reason.empty()) {
-	LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
-    }
-    return false;
-}
-void Db::termWalkClose(TermIter *tit)
-{
-    try {
-	delete tit;
-    } catch (...) {}
-}
-
-bool Db::termExists(const string& word)
-{
-    if (!m_ndb || !m_ndb->m_isopen)
-	return 0;
-
-    XAPTRY(if (!m_ndb->xrdb.term_exists(word)) return false,
-           m_ndb->xrdb, m_reason);
-
-    if (!m_reason.empty()) {
-	LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
-	return false;
-    }
-    return true;
-}
-
-
-bool Db::stemDiffers(const string& lang, const string& word, 
-		     const string& base)
-{
-    Xapian::Stem stemmer(lang);
-    if (!stemmer(word).compare(stemmer(base))) {
-	LOGDEB2(("Rcl::Db::stemDiffers: same for %s and %s\n", 
-		word.c_str(), base.c_str()));
-	return false;
-    }
-    return true;
-}
-
 // Retrieve document defined by Unique doc identifier. This is used
 // by the GUI history feature and by open parent/getenclosing
 // ! The return value is always true except for fatal errors. Document
@@ -2120,6 +1636,4 @@ bool Db::getDoc(const string &udi, Doc &doc)
    return false;
 }

-#ifndef NO_NAMESPACES
-}
-#endif
+} // End namespace Rcl