split the term expansion code out of rcldb.cpp
This commit is contained in:
parent
af214b3aa0
commit
9b4ce08a0d
6 changed files with 507 additions and 495 deletions
|
@ -19,8 +19,6 @@
|
|||
#include <stdio.h>
|
||||
#include <cstring>
|
||||
#include <unistd.h>
|
||||
#include <fnmatch.h>
|
||||
#include <regex.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
|
||||
|
@ -29,9 +27,7 @@
|
|||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
#include "xapian.h"
|
||||
|
||||
|
@ -65,9 +61,7 @@ static const string cstr_RCL_IDX_VERSION("1");
|
|||
|
||||
static const string cstr_mbreaks("rclmbreaks");
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
namespace Rcl {
|
||||
#endif
|
||||
|
||||
// Some prefixes that we could get from the fields file, but are not going
|
||||
// to ever change.
|
||||
|
@ -94,7 +88,7 @@ const string page_break_term = "XXPG/";
|
|||
|
||||
// Field name for the unsplit file name. Has to exist in the field file
|
||||
// because of usage in termmatch()
|
||||
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||
const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||
static const string unsplitfilename_prefix = "XSFS";
|
||||
|
||||
string version_string(){
|
||||
|
@ -1358,7 +1352,6 @@ bool Db::needUpdate(const string &udi, const string& sig)
|
|||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Return existing stem db languages
|
||||
vector<string> Db::getStemLangs()
|
||||
{
|
||||
|
@ -1581,120 +1574,6 @@ bool Db::purgeFileWrite(const string& udi, const string& uniterm)
|
|||
return false;
|
||||
}
|
||||
|
||||
// File name wild card expansion. This is a specialisation ot termMatch
|
||||
bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
|
||||
{
|
||||
string pattern = fnexp;
|
||||
names.clear();
|
||||
|
||||
// If pattern is not capitalized, not quoted (quoted pattern can't
|
||||
// get here currently anyway), and has no wildcards, we add * at
|
||||
// each end: match any substring
|
||||
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
||||
pattern = pattern.substr(1, pattern.size() -2);
|
||||
} else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
|
||||
!unaciscapital(pattern)) {
|
||||
pattern = "*" + pattern + "*";
|
||||
} // else let it be
|
||||
|
||||
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
// We inconditionnally lowercase and strip the pattern, as is done
|
||||
// during indexing. This seems to be the only sane possible
|
||||
// approach with file names and wild cards. termMatch does
|
||||
// stripping conditionally on indexstripchars.
|
||||
string pat1;
|
||||
if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
pattern.swap(pat1);
|
||||
}
|
||||
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), pattern, result, max,
|
||||
unsplitFilenameFieldName))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
it != result.entries.end(); it++)
|
||||
names.push_back(it->term);
|
||||
|
||||
if (names.empty()) {
|
||||
// Build an impossible query: we know its impossible because we
|
||||
// control the prefixes!
|
||||
names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Walk the Y terms and return min/max
|
||||
bool Db::maxYearSpan(int *minyear, int *maxyear)
|
||||
{
|
||||
LOGDEB(("Rcl::Db:maxYearSpan\n"));
|
||||
*minyear = 1000000;
|
||||
*maxyear = -1000000;
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
|
||||
LOGINFO(("Rcl::Db:maxYearSpan: termMatch failed\n"));
|
||||
return false;
|
||||
}
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
it != result.entries.end(); it++) {
|
||||
if (!it->term.empty()) {
|
||||
int year = atoi(strip_prefix(it->term).c_str());
|
||||
if (year < *minyear)
|
||||
*minyear = year;
|
||||
if (year > *maxyear)
|
||||
*maxyear = year;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
class TermMatchCmpByWcf {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return r.wcf - l.wcf < 0;
|
||||
}
|
||||
};
|
||||
class TermMatchCmpByTerm {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return l.term.compare(r.term) > 0;
|
||||
}
|
||||
};
|
||||
class TermMatchTermEqual {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return !l.term.compare(r.term);
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
// Stem expansion of a term through the stem database.
//
// @param langs passed unchanged to StemDb::stemExpand().
// @param term the term to expand.
// @param result the expansions are appended to result.entries.
// @return false if the index is not open or if the stem db expansion
//     fails, else true.
bool Db::stemExpand(const string &langs, const string &term,
                    TermMatchResult& result)
{
    if (!m_ndb || !m_ndb->m_isopen)
        return false;

    StemDb stemdb(m_ndb->xrdb);
    vector<string> expansions;
    const bool expanded = stemdb.stemExpand(langs, term, expansions);
    if (!expanded)
        return false;

    // Append the plain strings: one entry is built from each of them
    result.entries.insert(result.entries.end(),
                          expansions.begin(), expansions.end());
    return true;
}
#endif
|
||||
|
||||
/** Add prefix to all strings in list.
|
||||
* @param prefix already wrapped prefix
|
||||
*/
|
||||
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
|
||||
{
|
||||
if (prefix.empty())
|
||||
return;
|
||||
for (vector<TermMatchEntry>::iterator it = terms.begin();
|
||||
it != terms.end(); it++)
|
||||
it->term.insert(0, prefix);
|
||||
}
|
||||
|
||||
bool Db::dbStats(DbStats& res)
|
||||
{
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
|
@ -1711,369 +1590,6 @@ bool Db::dbStats(DbStats& res)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Find all index terms that match a wildcard or regular expression If
|
||||
// field is set, we return a list of appropriately prefixed terms
|
||||
// (which are going to be used to build a Xapian query). This routine
|
||||
// performs case/diacritics/stemming expansion and possibly calls
|
||||
// idxTermMatch for wildcard/regexp expansion and filtering against
|
||||
// the main index terms.
|
||||
bool Db::termMatch(int typ_sens, const string &lang,
|
||||
const string &_term,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field)
|
||||
{
|
||||
int matchtyp = matchTypeTp(typ_sens);
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xrdb = m_ndb->xrdb;
|
||||
|
||||
bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
|
||||
bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
|
||||
|
||||
bool stripped = false;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
stripped = true;
|
||||
#else
|
||||
stripped = o_index_stripchars;
|
||||
#endif
|
||||
|
||||
LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
|
||||
"max %d field [%s] stripped %d\n",
|
||||
matchtyp, diac_sensitive, case_sensitive, lang.c_str(),
|
||||
_term.c_str(), max, field.c_str(), stripped));
|
||||
|
||||
// If index is stripped, no case or diac expansion can be needed:
|
||||
// for the processing inside this routine, everything looks like
|
||||
// we're all-sensitive: no use of expansion db.
|
||||
// Also, convert input to lowercase and strip its accents.
|
||||
string term = _term;
|
||||
if (stripped) {
|
||||
diac_sensitive = case_sensitive = true;
|
||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
// The case/diac expansion db
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
|
||||
if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
#else
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (matchtyp == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
|
||||
} else {
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
|
||||
}
|
||||
if (!diac_sensitive || !case_sensitive) {
|
||||
// Perform case/diac expansion on the exp as appropriate and
|
||||
// expand the result.
|
||||
vector<string> exp;
|
||||
if (diac_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same diacritics
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
|
||||
} else {
|
||||
// Expand for diacritics and case, no filtering
|
||||
synac.synKeyExpand(matcher.getptr(), exp);
|
||||
}
|
||||
// Retrieve additional info and filter against the index itself
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
idxTermMatch(ET_NONE, "", *it, res, max, field);
|
||||
}
|
||||
} else {
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
}
|
||||
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
} else {
|
||||
// Expansion is STEM or NONE (which may still need case/diac exp)
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
|
||||
idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
|
||||
|
||||
#else
|
||||
vector<string> lexp;
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No case/diac expansion
|
||||
lexp.push_back(term);
|
||||
} else if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, lexp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, lexp, &unactrans);
|
||||
} else {
|
||||
// We are neither accent- nor case- sensitive and may need stem
|
||||
// expansion or not. Expand for accents and case
|
||||
synac.synExpand(term, lexp);
|
||||
}
|
||||
|
||||
if (matchTypeTp(typ_sens) == ET_STEM) {
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
lexp[i] = lower;
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
StemDb sdb(xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(lang, *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, lexp);
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
}
|
||||
|
||||
// Filter the result and get the stats, possibly add prefixes.
|
||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
TermMatchCmpByTerm tcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||
TermMatchTermEqual teq;
|
||||
vector<TermMatchEntry>::iterator uit =
|
||||
unique(res.entries.begin(), res.entries.end(), teq);
|
||||
res.entries.resize(uit - res.entries.begin());
|
||||
TermMatchCmpByWcf wcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||
if (max > 0) {
|
||||
// Would need a small max and big stem expansion...
|
||||
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Second phase of wildcard/regexp term expansion after case/diac
|
||||
// expansion: expand against main index terms
|
||||
bool Db::idxTermMatch(int typ_sens, const string &lang,
|
||||
const string &root,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field)
|
||||
{
|
||||
int typ = matchTypeTp(typ_sens);
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xdb = m_ndb->xrdb;
|
||||
|
||||
string prefix;
|
||||
if (!field.empty()) {
|
||||
const FieldTraits *ftp = 0;
|
||||
if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
|
||||
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
||||
field.c_str()));
|
||||
} else {
|
||||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
}
|
||||
res.prefix = prefix;
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
if (!stemExpand(lang, root, res))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
||||
it->docs = xdb.get_termfreq(it->term),
|
||||
xdb, m_reason);
|
||||
if (!m_reason.empty())
|
||||
return false;
|
||||
LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
|
||||
}
|
||||
if (!prefix.empty())
|
||||
addPrefix(res.entries, prefix);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (typ == ET_REGEXP) {
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
|
||||
if (!matcher->ok()) {
|
||||
LOGERR(("termMatch: regcomp failed: %s\n",
|
||||
matcher->getreason().c_str()))
|
||||
return false;
|
||||
}
|
||||
} else if (typ == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
|
||||
}
|
||||
|
||||
// Find the initial section before any special char
|
||||
string::size_type es = string::npos;
|
||||
if (matcher.isNotNull()) {
|
||||
es = matcher->baseprefixlen();
|
||||
}
|
||||
string is;
|
||||
switch (es) {
|
||||
case string::npos: is = prefix + root; break;
|
||||
case 0: is = prefix; break;
|
||||
default: is = prefix + root.substr(0, es); break;
|
||||
}
|
||||
LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
|
||||
|
||||
for (int tries = 0; tries < 2; tries++) {
|
||||
try {
|
||||
Xapian::TermIterator it = xdb.allterms_begin();
|
||||
if (!is.empty())
|
||||
it.skip_to(is.c_str());
|
||||
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
||||
// If we're beyond the terms matching the initial
|
||||
// string, end
|
||||
if (!is.empty() && (*it).find(is) != 0)
|
||||
break;
|
||||
string term;
|
||||
if (!prefix.empty())
|
||||
term = (*it).substr(prefix.length());
|
||||
else
|
||||
term = *it;
|
||||
|
||||
if (matcher.isNotNull() && !matcher->match(term))
|
||||
continue;
|
||||
|
||||
res.entries.push_back(
|
||||
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
||||
it.get_termfreq()));
|
||||
|
||||
// The problem with truncating here is that this is done
|
||||
// alphabetically and we may not keep the most frequent
|
||||
// terms. OTOH, not doing it may stall the program if
|
||||
// we are walking the whole term list. We compromise
|
||||
// by cutting at 2*max
|
||||
if (max > 0 && ++rcnt >= 2*max)
|
||||
break;
|
||||
}
|
||||
m_reason.erase();
|
||||
break;
|
||||
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||
m_reason = e.get_msg();
|
||||
xdb.reopen();
|
||||
continue;
|
||||
} XCATCHERROR(m_reason);
|
||||
break;
|
||||
}
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("termMatch: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Term list walking. */
|
||||
class TermIter {
|
||||
public:
|
||||
Xapian::TermIterator it;
|
||||
Xapian::Database db;
|
||||
};
|
||||
/** Begin a walk of the index term list.
 *
 * @return a newly allocated walk object, positioned on the beginning of
 *     the term list, to be released with termWalkClose(). 0 if the index
 *     is not open or if a Xapian error occurs (the message is then stored
 *     in m_reason).
 */
TermIter *Db::termWalkOpen()
{
    if (!m_ndb || !m_ndb->m_isopen)
        return 0;
    TermIter *tit = new TermIter;
    tit->db = m_ndb->xrdb;
    XAPTRY(tit->it = tit->db.allterms_begin(), tit->db, m_reason);
    if (!m_reason.empty()) {
        LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
        // Don't leak the walk object: the caller only gets a null pointer
        // and has no way to release it.
        termWalkClose(tit);
        return 0;
    }
    return tit;
}
|
||||
/** Retrieve the current term of a term list walk and move to the next one.
 *
 * @param tit walk object, as returned by termWalkOpen(). May be null.
 * @param term output: the term value. Only set when returning true.
 * @return true if a term was retrieved. false if tit is null, if the end
 *     of the list was reached, or if a Xapian error occurred (the message
 *     is then logged and left in m_reason).
 */
bool Db::termWalkNext(TermIter *tit, string &term)
{
    XAPTRY(
        if (tit && tit->it != tit->db.allterms_end()) {
            term = *(tit->it)++;
            return true;
        }
        , tit->db, m_reason);

    if (!m_reason.empty()) {
        LOGERR(("Db::termWalkNext: xapian error: %s\n", m_reason.c_str()));
    }
    return false;
}
|
||||
/** End a term list walk: release the walk object.
 *
 * @param tit walk object, as returned by termWalkOpen().
 */
void Db::termWalkClose(TermIter *tit)
{
    // Nothing is allowed to propagate from the cleanup
    try {
        delete tit;
    } catch (...) {
    }
}
|
||||
|
||||
bool Db::termExists(const string& word)
|
||||
{
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return 0;
|
||||
|
||||
XAPTRY(if (!m_ndb->xrdb.term_exists(word)) return false,
|
||||
m_ndb->xrdb, m_reason);
|
||||
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool Db::stemDiffers(const string& lang, const string& word,
|
||||
const string& base)
|
||||
{
|
||||
Xapian::Stem stemmer(lang);
|
||||
if (!stemmer(word).compare(stemmer(base))) {
|
||||
LOGDEB2(("Rcl::Db::stemDiffers: same for %s and %s\n",
|
||||
word.c_str(), base.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Retrieve document defined by Unique doc identifier. This is used
|
||||
// by the GUI history feature and by open parent/getenclosing
|
||||
// ! The return value is always true except for fatal errors. Document
|
||||
|
@ -2120,6 +1636,4 @@ bool Db::getDoc(const string &udi, Doc &doc)
|
|||
return false;
|
||||
}
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
}
|
||||
#endif
|
||||
} // End namespace Rcl
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue