split the term expansion code out of rcldb.cpp
This commit is contained in:
parent
af214b3aa0
commit
9b4ce08a0d
6 changed files with 507 additions and 495 deletions
|
@ -8,8 +8,8 @@ LIBS = librecoll.a $(LIBRECOLL)
|
|||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rclquery.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o strmatcher.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp strmatcher.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rclquery.o rclterms.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o strmatcher.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp rclterms.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp strmatcher.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
|
||||
librecoll.a : $(DEPS) $(OBJS)
|
||||
ar ru librecoll.a $(OBJS)
|
||||
|
@ -109,6 +109,8 @@ rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
|
|||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldoc.cpp
|
||||
rclquery.o : ../rcldb/rclquery.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp
|
||||
rclterms.o : ../rcldb/rclterms.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclterms.cpp
|
||||
searchdata.o : ../rcldb/searchdata.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp
|
||||
searchdataxml.o : ../rcldb/searchdataxml.cpp $(depth)/mk/localdefs
|
||||
|
@ -316,6 +318,9 @@ rcldoc.dep.stamp : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
|
|||
rclquery.dep.stamp : ../rcldb/rclquery.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep
|
||||
touch rclquery.dep.stamp
|
||||
rclterms.dep.stamp : ../rcldb/rclterms.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclterms.cpp > rclterms.dep
|
||||
touch rclterms.dep.stamp
|
||||
searchdata.dep.stamp : ../rcldb/searchdata.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/searchdata.cpp > searchdata.dep
|
||||
touch searchdata.dep.stamp
|
||||
|
@ -446,6 +451,7 @@ include rclabstract.dep
|
|||
include rcldb.dep
|
||||
include rcldoc.dep
|
||||
include rclquery.dep
|
||||
include rclterms.dep
|
||||
include searchdata.dep
|
||||
include searchdataxml.dep
|
||||
include stemdb.dep
|
||||
|
|
|
@ -48,6 +48,7 @@ ${depth}/rcldb/rclabstract.cpp \
|
|||
${depth}/rcldb/rcldb.cpp \
|
||||
${depth}/rcldb/rcldoc.cpp \
|
||||
${depth}/rcldb/rclquery.cpp \
|
||||
${depth}/rcldb/rclterms.cpp \
|
||||
${depth}/rcldb/searchdata.cpp \
|
||||
${depth}/rcldb/searchdataxml.cpp \
|
||||
${depth}/rcldb/stemdb.cpp \
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include <xapian.h>
|
||||
|
||||
#include "unacpp.h"
|
||||
#include "synfamily.h"
|
||||
|
||||
/** Specialization and overall creation code for the term expansion mechanism
|
||||
* defined in synfamily.h
|
||||
|
|
|
@ -19,8 +19,6 @@
|
|||
#include <stdio.h>
|
||||
#include <cstring>
|
||||
#include <unistd.h>
|
||||
#include <fnmatch.h>
|
||||
#include <regex.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
|
||||
|
@ -29,9 +27,7 @@
|
|||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
#include "xapian.h"
|
||||
|
||||
|
@ -65,9 +61,7 @@ static const string cstr_RCL_IDX_VERSION("1");
|
|||
|
||||
static const string cstr_mbreaks("rclmbreaks");
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
namespace Rcl {
|
||||
#endif
|
||||
|
||||
// Some prefixes that we could get from the fields file, but are not going
|
||||
// to ever change.
|
||||
|
@ -94,7 +88,7 @@ const string page_break_term = "XXPG/";
|
|||
|
||||
// Field name for the unsplit file name. Has to exist in the field file
|
||||
// because of usage in termmatch()
|
||||
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||
const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||
static const string unsplitfilename_prefix = "XSFS";
|
||||
|
||||
string version_string(){
|
||||
|
@ -1358,7 +1352,6 @@ bool Db::needUpdate(const string &udi, const string& sig)
|
|||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Return existing stem db languages
|
||||
vector<string> Db::getStemLangs()
|
||||
{
|
||||
|
@ -1581,120 +1574,6 @@ bool Db::purgeFileWrite(const string& udi, const string& uniterm)
|
|||
return false;
|
||||
}
|
||||
|
||||
// File name wild card expansion. This is a specialisation ot termMatch
|
||||
bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
|
||||
{
|
||||
string pattern = fnexp;
|
||||
names.clear();
|
||||
|
||||
// If pattern is not capitalized, not quoted (quoted pattern can't
|
||||
// get here currently anyway), and has no wildcards, we add * at
|
||||
// each end: match any substring
|
||||
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
||||
pattern = pattern.substr(1, pattern.size() -2);
|
||||
} else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
|
||||
!unaciscapital(pattern)) {
|
||||
pattern = "*" + pattern + "*";
|
||||
} // else let it be
|
||||
|
||||
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
// We inconditionnally lowercase and strip the pattern, as is done
|
||||
// during indexing. This seems to be the only sane possible
|
||||
// approach with file names and wild cards. termMatch does
|
||||
// stripping conditionally on indexstripchars.
|
||||
string pat1;
|
||||
if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
pattern.swap(pat1);
|
||||
}
|
||||
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), pattern, result, max,
|
||||
unsplitFilenameFieldName))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
it != result.entries.end(); it++)
|
||||
names.push_back(it->term);
|
||||
|
||||
if (names.empty()) {
|
||||
// Build an impossible query: we know its impossible because we
|
||||
// control the prefixes!
|
||||
names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Walk the Y terms and return min/max
|
||||
bool Db::maxYearSpan(int *minyear, int *maxyear)
|
||||
{
|
||||
LOGDEB(("Rcl::Db:maxYearSpan\n"));
|
||||
*minyear = 1000000;
|
||||
*maxyear = -1000000;
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
|
||||
LOGINFO(("Rcl::Db:maxYearSpan: termMatch failed\n"));
|
||||
return false;
|
||||
}
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
it != result.entries.end(); it++) {
|
||||
if (!it->term.empty()) {
|
||||
int year = atoi(strip_prefix(it->term).c_str());
|
||||
if (year < *minyear)
|
||||
*minyear = year;
|
||||
if (year > *maxyear)
|
||||
*maxyear = year;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
class TermMatchCmpByWcf {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return r.wcf - l.wcf < 0;
|
||||
}
|
||||
};
|
||||
class TermMatchCmpByTerm {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return l.term.compare(r.term) > 0;
|
||||
}
|
||||
};
|
||||
class TermMatchTermEqual {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return !l.term.compare(r.term);
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
// Stem-expand a term through the stem database and append the resulting
// terms to the result entries.
//
// @param langs stemming language(s), passed as is to StemDb::stemExpand()
// @param term the term to expand
// @param result output: the expansions are appended to result.entries
// @return false if the index is not open or if the stem db expansion
//    fails, else true.
bool Db::stemExpand(const string &langs, const string &term,
                    TermMatchResult& result)
{
    if (!m_ndb || !m_ndb->m_isopen)
        return false;

    StemDb stemdb(m_ndb->xrdb);
    vector<string> expanded;
    if (stemdb.stemExpand(langs, term, expanded)) {
        // Each string is converted to a TermMatchEntry by the insertion
        result.entries.insert(result.entries.end(),
                              expanded.begin(), expanded.end());
        return true;
    }
    return false;
}
#endif
|
||||
|
||||
/** Add prefix to all strings in list.
|
||||
* @param prefix already wrapped prefix
|
||||
*/
|
||||
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
|
||||
{
|
||||
if (prefix.empty())
|
||||
return;
|
||||
for (vector<TermMatchEntry>::iterator it = terms.begin();
|
||||
it != terms.end(); it++)
|
||||
it->term.insert(0, prefix);
|
||||
}
|
||||
|
||||
bool Db::dbStats(DbStats& res)
|
||||
{
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
|
@ -1711,369 +1590,6 @@ bool Db::dbStats(DbStats& res)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Find all index terms that match a wildcard or regular expression If
|
||||
// field is set, we return a list of appropriately prefixed terms
|
||||
// (which are going to be used to build a Xapian query). This routine
|
||||
// performs case/diacritics/stemming expansion and possibly calls
|
||||
// idxTermMatch for wildcard/regexp expansion and filtering against
|
||||
// the main index terms.
|
||||
bool Db::termMatch(int typ_sens, const string &lang,
|
||||
const string &_term,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field)
|
||||
{
|
||||
int matchtyp = matchTypeTp(typ_sens);
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xrdb = m_ndb->xrdb;
|
||||
|
||||
bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
|
||||
bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
|
||||
|
||||
bool stripped = false;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
stripped = true;
|
||||
#else
|
||||
stripped = o_index_stripchars;
|
||||
#endif
|
||||
|
||||
LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
|
||||
"max %d field [%s] stripped %d\n",
|
||||
matchtyp, diac_sensitive, case_sensitive, lang.c_str(),
|
||||
_term.c_str(), max, field.c_str(), stripped));
|
||||
|
||||
// If index is stripped, no case or diac expansion can be needed:
|
||||
// for the processing inside this routine, everything looks like
|
||||
// we're all-sensitive: no use of expansion db.
|
||||
// Also, convert input to lowercase and strip its accents.
|
||||
string term = _term;
|
||||
if (stripped) {
|
||||
diac_sensitive = case_sensitive = true;
|
||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
// The case/diac expansion db
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
|
||||
if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
#else
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (matchtyp == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
|
||||
} else {
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
|
||||
}
|
||||
if (!diac_sensitive || !case_sensitive) {
|
||||
// Perform case/diac expansion on the exp as appropriate and
|
||||
// expand the result.
|
||||
vector<string> exp;
|
||||
if (diac_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same diacritics
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
|
||||
} else {
|
||||
// Expand for diacritics and case, no filtering
|
||||
synac.synKeyExpand(matcher.getptr(), exp);
|
||||
}
|
||||
// Retrieve additional info and filter against the index itself
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
idxTermMatch(ET_NONE, "", *it, res, max, field);
|
||||
}
|
||||
} else {
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
}
|
||||
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
} else {
|
||||
// Expansion is STEM or NONE (which may still need case/diac exp)
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
|
||||
idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
|
||||
|
||||
#else
|
||||
vector<string> lexp;
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No case/diac expansion
|
||||
lexp.push_back(term);
|
||||
} else if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, lexp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, lexp, &unactrans);
|
||||
} else {
|
||||
// We are neither accent- nor case- sensitive and may need stem
|
||||
// expansion or not. Expand for accents and case
|
||||
synac.synExpand(term, lexp);
|
||||
}
|
||||
|
||||
if (matchTypeTp(typ_sens) == ET_STEM) {
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
lexp[i] = lower;
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
StemDb sdb(xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(lang, *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, lexp);
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
}
|
||||
|
||||
// Filter the result and get the stats, possibly add prefixes.
|
||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
TermMatchCmpByTerm tcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||
TermMatchTermEqual teq;
|
||||
vector<TermMatchEntry>::iterator uit =
|
||||
unique(res.entries.begin(), res.entries.end(), teq);
|
||||
res.entries.resize(uit - res.entries.begin());
|
||||
TermMatchCmpByWcf wcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||
if (max > 0) {
|
||||
// Would need a small max and big stem expansion...
|
||||
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Second phase of wildcard/regexp term expansion after case/diac
|
||||
// expansion: expand against main index terms
|
||||
bool Db::idxTermMatch(int typ_sens, const string &lang,
|
||||
const string &root,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field)
|
||||
{
|
||||
int typ = matchTypeTp(typ_sens);
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xdb = m_ndb->xrdb;
|
||||
|
||||
string prefix;
|
||||
if (!field.empty()) {
|
||||
const FieldTraits *ftp = 0;
|
||||
if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
|
||||
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
||||
field.c_str()));
|
||||
} else {
|
||||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
}
|
||||
res.prefix = prefix;
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
if (!stemExpand(lang, root, res))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
||||
it->docs = xdb.get_termfreq(it->term),
|
||||
xdb, m_reason);
|
||||
if (!m_reason.empty())
|
||||
return false;
|
||||
LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
|
||||
}
|
||||
if (!prefix.empty())
|
||||
addPrefix(res.entries, prefix);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (typ == ET_REGEXP) {
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
|
||||
if (!matcher->ok()) {
|
||||
LOGERR(("termMatch: regcomp failed: %s\n",
|
||||
matcher->getreason().c_str()))
|
||||
return false;
|
||||
}
|
||||
} else if (typ == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
|
||||
}
|
||||
|
||||
// Find the initial section before any special char
|
||||
string::size_type es = string::npos;
|
||||
if (matcher.isNotNull()) {
|
||||
es = matcher->baseprefixlen();
|
||||
}
|
||||
string is;
|
||||
switch (es) {
|
||||
case string::npos: is = prefix + root; break;
|
||||
case 0: is = prefix; break;
|
||||
default: is = prefix + root.substr(0, es); break;
|
||||
}
|
||||
LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
|
||||
|
||||
for (int tries = 0; tries < 2; tries++) {
|
||||
try {
|
||||
Xapian::TermIterator it = xdb.allterms_begin();
|
||||
if (!is.empty())
|
||||
it.skip_to(is.c_str());
|
||||
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
||||
// If we're beyond the terms matching the initial
|
||||
// string, end
|
||||
if (!is.empty() && (*it).find(is) != 0)
|
||||
break;
|
||||
string term;
|
||||
if (!prefix.empty())
|
||||
term = (*it).substr(prefix.length());
|
||||
else
|
||||
term = *it;
|
||||
|
||||
if (matcher.isNotNull() && !matcher->match(term))
|
||||
continue;
|
||||
|
||||
res.entries.push_back(
|
||||
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
||||
it.get_termfreq()));
|
||||
|
||||
// The problem with truncating here is that this is done
|
||||
// alphabetically and we may not keep the most frequent
|
||||
// terms. OTOH, not doing it may stall the program if
|
||||
// we are walking the whole term list. We compromise
|
||||
// by cutting at 2*max
|
||||
if (max > 0 && ++rcnt >= 2*max)
|
||||
break;
|
||||
}
|
||||
m_reason.erase();
|
||||
break;
|
||||
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||
m_reason = e.get_msg();
|
||||
xdb.reopen();
|
||||
continue;
|
||||
} XCATCHERROR(m_reason);
|
||||
break;
|
||||
}
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("termMatch: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Term list walking. */
|
||||
class TermIter {
|
||||
public:
|
||||
Xapian::TermIterator it;
|
||||
Xapian::Database db;
|
||||
};
|
||||
/** Begin a walk of the index term list.
 * @return a newly allocated walk state, positioned on the first term, to
 *    be released with termWalkClose(), or 0 if the index is not open or
 *    if Xapian reports an error (which is logged).
 */
TermIter *Db::termWalkOpen()
{
    if (!m_ndb || !m_ndb->m_isopen)
        return 0;
    TermIter *tit = new TermIter;
    if (tit) {
        tit->db = m_ndb->xrdb;
        XAPTRY(tit->it = tit->db.allterms_begin(), tit->db, m_reason);
        if (!m_reason.empty()) {
            LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
            // The caller gets no handle in this case: release the walk
            // state here, else it would be leaked. Same protection as
            // in termWalkClose().
            try {
                delete tit;
            } catch (...) {}
            return 0;
        }
    }
    return tit;
}
|
||||
/** Retrieve the next term of a walk started with termWalkOpen().
 * @param tit walk state. A null value is accepted and yields false.
 * @param term output: the term at the current position. Untouched when
 *    false is returned.
 * @return true if a term was stored (the position is then advanced),
 *    false at the end of the list or on Xapian error (which is logged).
 */
bool Db::termWalkNext(TermIter *tit, string &term)
{
    // NOTE(review): XAPTRY is defined outside of this section. It is
    // presumed to run the statement under Xapian exception handling and to
    // leave the error text, if any, in m_reason -- to be confirmed.
    XAPTRY(
        if (tit && tit->it != tit->db.allterms_end()) {
            term = *(tit->it)++;
            return true;
        }
        , tit->db, m_reason);

    if (!m_reason.empty()) {
        // Log under our own name, not termWalkOpen's.
        LOGERR(("Db::termWalkNext: xapian error: %s\n", m_reason.c_str()));
    }
    return false;
}
|
||||
/** End a term list walk and release its state.
 * @param tit walk state as returned by termWalkOpen(). May be null.
 */
void Db::termWalkClose(TermIter *tit)
{
    // Whatever goes wrong while destroying the walk state must not reach
    // the caller.
    try {
        delete tit;
    } catch (...) {
    }
}
|
||||
|
||||
bool Db::termExists(const string& word)
|
||||
{
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return 0;
|
||||
|
||||
XAPTRY(if (!m_ndb->xrdb.term_exists(word)) return false,
|
||||
m_ndb->xrdb, m_reason);
|
||||
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool Db::stemDiffers(const string& lang, const string& word,
|
||||
const string& base)
|
||||
{
|
||||
Xapian::Stem stemmer(lang);
|
||||
if (!stemmer(word).compare(stemmer(base))) {
|
||||
LOGDEB2(("Rcl::Db::stemDiffers: same for %s and %s\n",
|
||||
word.c_str(), base.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Retrieve document defined by Unique doc identifier. This is used
|
||||
// by the GUI history feature and by open parent/getenclosing
|
||||
// ! The return value is always true except for fatal errors. Document
|
||||
|
@ -2120,6 +1636,4 @@ bool Db::getDoc(const string &udi, Doc &doc)
|
|||
return false;
|
||||
}
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
}
|
||||
#endif
|
||||
} // End namespace Rcl
|
||||
|
|
|
@ -326,7 +326,7 @@ class Db {
|
|||
* @param field if set, defines the field within with the expansion should
|
||||
* be performed. Only used for wildcards and regexps, stemming is
|
||||
* always global. If this is set, the resulting output terms
|
||||
* will be appropriately prefix and the prefix value will be set
|
||||
* will be appropriately prefixed and the prefix value will be set
|
||||
* in the TermMatchResult header
|
||||
*/
|
||||
enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
|
||||
|
@ -440,10 +440,6 @@ private:
|
|||
bool i_close(bool final);
|
||||
// Reinitialize when adding/removing additional dbs
|
||||
bool adjustdbs();
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
bool stemExpand(const string &lang, const string &s,
|
||||
TermMatchResult& result);
|
||||
#endif
|
||||
bool idxTermMatch(int typ_sens, const string &lang, const string &term,
|
||||
TermMatchResult& result, int max = -1,
|
||||
const string& field = cstr_null);
|
||||
|
@ -465,6 +461,7 @@ extern const string pathelt_prefix;
|
|||
extern const string udi_prefix;
|
||||
extern const string parent_prefix;
|
||||
extern const string mimetype_prefix;
|
||||
extern const string unsplitFilenameFieldName;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
extern const string start_of_field_term;
|
||||
extern const string end_of_field_term;
|
||||
|
|
493
src/rcldb/rclterms.cpp
Normal file
493
src/rcldb/rclterms.cpp
Normal file
|
@ -0,0 +1,493 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
/** Things dealing with walking the terms lists and expansion dbs */
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "stemdb.h"
|
||||
#include "expansiondbs.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
// File name wild card expansion. This is a specialisation ot termMatch
|
||||
bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
|
||||
{
|
||||
string pattern = fnexp;
|
||||
names.clear();
|
||||
|
||||
// If pattern is not capitalized, not quoted (quoted pattern can't
|
||||
// get here currently anyway), and has no wildcards, we add * at
|
||||
// each end: match any substring
|
||||
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
||||
pattern = pattern.substr(1, pattern.size() -2);
|
||||
} else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
|
||||
!unaciscapital(pattern)) {
|
||||
pattern = "*" + pattern + "*";
|
||||
} // else let it be
|
||||
|
||||
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
// We inconditionnally lowercase and strip the pattern, as is done
|
||||
// during indexing. This seems to be the only sane possible
|
||||
// approach with file names and wild cards. termMatch does
|
||||
// stripping conditionally on indexstripchars.
|
||||
string pat1;
|
||||
if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
pattern.swap(pat1);
|
||||
}
|
||||
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), pattern, result, max,
|
||||
unsplitFilenameFieldName))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
it != result.entries.end(); it++)
|
||||
names.push_back(it->term);
|
||||
|
||||
if (names.empty()) {
|
||||
// Build an impossible query: we know its impossible because we
|
||||
// control the prefixes!
|
||||
names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Walk the Y terms and return min/max
|
||||
bool Db::maxYearSpan(int *minyear, int *maxyear)
|
||||
{
|
||||
LOGDEB(("Rcl::Db:maxYearSpan\n"));
|
||||
*minyear = 1000000;
|
||||
*maxyear = -1000000;
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
|
||||
LOGINFO(("Rcl::Db:maxYearSpan: termMatch failed\n"));
|
||||
return false;
|
||||
}
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
it != result.entries.end(); it++) {
|
||||
if (!it->term.empty()) {
|
||||
int year = atoi(strip_prefix(it->term).c_str());
|
||||
if (year < *minyear)
|
||||
*minyear = year;
|
||||
if (year > *maxyear)
|
||||
*maxyear = year;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
class TermMatchCmpByWcf {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return r.wcf - l.wcf < 0;
|
||||
}
|
||||
};
|
||||
class TermMatchCmpByTerm {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return l.term.compare(r.term) > 0;
|
||||
}
|
||||
};
|
||||
class TermMatchTermEqual {
|
||||
public:
|
||||
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
||||
return !l.term.compare(r.term);
|
||||
}
|
||||
};
|
||||
|
||||
/** Add prefix to all strings in list.
|
||||
* @param prefix already wrapped prefix
|
||||
*/
|
||||
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
|
||||
{
|
||||
if (prefix.empty())
|
||||
return;
|
||||
for (vector<TermMatchEntry>::iterator it = terms.begin();
|
||||
it != terms.end(); it++)
|
||||
it->term.insert(0, prefix);
|
||||
}
|
||||
|
||||
// Find all index terms that match an input along different expansion modes:
|
||||
// wildcard, regular expression, or stemming. Depending on flags we perform
|
||||
// case and/or diacritics expansion (this can be the only thing requested).
|
||||
// If the "field" parameter is set, we return a list of appropriately
|
||||
// prefixed terms (which are going to be used to build a Xapian
|
||||
// query).
|
||||
// This routine performs case/diacritics/stemming expansion against
|
||||
// the auxiliary tables, and possibly calls idxTermMatch() for work
|
||||
// using the main index terms (filtering, retrieving stats, expansion
|
||||
// in some cases).
|
||||
bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
||||
TermMatchResult& res, int max, const string& field)
|
||||
{
|
||||
int matchtyp = matchTypeTp(typ_sens);
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xrdb = m_ndb->xrdb;
|
||||
|
||||
bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
|
||||
bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
|
||||
|
||||
bool stripped = false;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
stripped = true;
|
||||
#else
|
||||
stripped = o_index_stripchars;
|
||||
#endif
|
||||
|
||||
LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
|
||||
"max %d field [%s] stripped %d\n",
|
||||
matchtyp, diac_sensitive, case_sensitive, lang.c_str(),
|
||||
_term.c_str(), max, field.c_str(), stripped));
|
||||
|
||||
// If index is stripped, no case or diac expansion can be needed:
|
||||
// for the processing inside this routine, everything looks like
|
||||
// we're all-sensitive: no use of expansion db.
|
||||
// Also, convert input to lowercase and strip its accents.
|
||||
string term = _term;
|
||||
if (stripped) {
|
||||
diac_sensitive = case_sensitive = true;
|
||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
// The case/diac expansion db
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
|
||||
if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
#else
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (matchtyp == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
|
||||
} else {
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
|
||||
}
|
||||
if (!diac_sensitive || !case_sensitive) {
|
||||
// Perform case/diac expansion on the exp as appropriate and
|
||||
// expand the result.
|
||||
vector<string> exp;
|
||||
if (diac_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same diacritics
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
|
||||
} else {
|
||||
// Expand for diacritics and case, no filtering
|
||||
synac.synKeyExpand(matcher.getptr(), exp);
|
||||
}
|
||||
// Retrieve additional info and filter against the index itself
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
idxTermMatch(ET_NONE, "", *it, res, max, field);
|
||||
}
|
||||
} else {
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
}
|
||||
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
} else {
|
||||
// Expansion is STEM or NONE (which may still need case/diac exp)
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
|
||||
idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
|
||||
|
||||
#else
|
||||
vector<string> lexp;
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No case/diac expansion
|
||||
lexp.push_back(term);
|
||||
} else if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, lexp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, lexp, &unactrans);
|
||||
} else {
|
||||
// We are neither accent- nor case- sensitive and may need stem
|
||||
// expansion or not. Expand for accents and case
|
||||
synac.synExpand(term, lexp);
|
||||
}
|
||||
|
||||
if (matchTypeTp(typ_sens) == ET_STEM) {
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
lexp[i] = lower;
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
StemDb sdb(xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(lang, *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, lexp);
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
}
|
||||
|
||||
// Filter the result and get the stats, possibly add prefixes.
|
||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
TermMatchCmpByTerm tcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||
TermMatchTermEqual teq;
|
||||
vector<TermMatchEntry>::iterator uit =
|
||||
unique(res.entries.begin(), res.entries.end(), teq);
|
||||
res.entries.resize(uit - res.entries.begin());
|
||||
TermMatchCmpByWcf wcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||
if (max > 0) {
|
||||
// Would need a small max and big stem expansion...
|
||||
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Second phase of wildcard/regexp term expansion after case/diac
|
||||
// expansion: expand against main index terms
|
||||
bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
|
||||
TermMatchResult& res, int max, const string& field)
|
||||
{
|
||||
int typ = matchTypeTp(typ_sens);
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
Xapian::Database xdb = m_ndb->xrdb;
|
||||
|
||||
string prefix;
|
||||
if (!field.empty()) {
|
||||
const FieldTraits *ftp = 0;
|
||||
if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
|
||||
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
||||
field.c_str()));
|
||||
} else {
|
||||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
}
|
||||
res.prefix = prefix;
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
vector<string> exp;
|
||||
StemDb db(m_ndb->xrdb);
|
||||
if (!db.stemExpand(langs, term, exp))
|
||||
return false;
|
||||
res.entries.insert(result.entries.end(), exp.begin(), exp.end());
|
||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
||||
it->docs = xdb.get_termfreq(it->term),
|
||||
xdb, m_reason);
|
||||
if (!m_reason.empty())
|
||||
return false;
|
||||
LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
|
||||
}
|
||||
if (!prefix.empty())
|
||||
addPrefix(res.entries, prefix);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (typ == ET_REGEXP) {
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
|
||||
if (!matcher->ok()) {
|
||||
LOGERR(("termMatch: regcomp failed: %s\n",
|
||||
matcher->getreason().c_str()))
|
||||
return false;
|
||||
}
|
||||
} else if (typ == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
|
||||
}
|
||||
|
||||
// Find the initial section before any special char
|
||||
string::size_type es = string::npos;
|
||||
if (matcher.isNotNull()) {
|
||||
es = matcher->baseprefixlen();
|
||||
}
|
||||
string is;
|
||||
switch (es) {
|
||||
case string::npos: is = prefix + root; break;
|
||||
case 0: is = prefix; break;
|
||||
default: is = prefix + root.substr(0, es); break;
|
||||
}
|
||||
LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
|
||||
|
||||
for (int tries = 0; tries < 2; tries++) {
|
||||
try {
|
||||
Xapian::TermIterator it = xdb.allterms_begin();
|
||||
if (!is.empty())
|
||||
it.skip_to(is.c_str());
|
||||
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
||||
// If we're beyond the terms matching the initial
|
||||
// string, end
|
||||
if (!is.empty() && (*it).find(is) != 0)
|
||||
break;
|
||||
string term;
|
||||
if (!prefix.empty())
|
||||
term = (*it).substr(prefix.length());
|
||||
else
|
||||
term = *it;
|
||||
|
||||
if (matcher.isNotNull() && !matcher->match(term))
|
||||
continue;
|
||||
|
||||
res.entries.push_back(
|
||||
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
||||
it.get_termfreq()));
|
||||
|
||||
// The problem with truncating here is that this is done
|
||||
// alphabetically and we may not keep the most frequent
|
||||
// terms. OTOH, not doing it may stall the program if
|
||||
// we are walking the whole term list. We compromise
|
||||
// by cutting at 2*max
|
||||
if (max > 0 && ++rcnt >= 2*max)
|
||||
break;
|
||||
}
|
||||
m_reason.erase();
|
||||
break;
|
||||
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||
m_reason = e.get_msg();
|
||||
xdb.reopen();
|
||||
continue;
|
||||
} XCATCHERROR(m_reason);
|
||||
break;
|
||||
}
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("termMatch: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Term list walking. */
|
||||
class TermIter {
|
||||
public:
|
||||
Xapian::TermIterator it;
|
||||
Xapian::Database db;
|
||||
};
|
||||
// Start a walk of the whole index term list.
// Returns a newly allocated iterator object, to be released with
// termWalkClose(), or 0 if the database is not open or if Xapian
// reports an error.
TermIter *Db::termWalkOpen()
{
    if (!m_ndb || !m_ndb->m_isopen)
        return 0;
    TermIter *tit = new TermIter;
    tit->db = m_ndb->xrdb;
    XAPTRY(tit->it = tit->db.allterms_begin(), tit->db, m_reason);
    if (!m_reason.empty()) {
        LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
        // Don't leak the iterator object on the error path.
        delete tit;
        return 0;
    }
    return tit;
}
|
||||
// Fetch the next term of a walk started with termWalkOpen().
// @param tit iterator object returned by termWalkOpen()
// @param term receives the current term, the iterator is then advanced
// @return true if a term was stored, false at the end of the list, for
//         a null iterator, or on Xapian error (which is logged).
bool Db::termWalkNext(TermIter *tit, string &term)
{
    XAPTRY(
        if (tit && tit->it != tit->db.allterms_end()) {
            term = *(tit->it)++;
            return true;
        }
        , tit->db, m_reason);

    if (!m_reason.empty()) {
        // Report the error under this function's own name.
        LOGERR(("Db::termWalkNext: xapian error: %s\n", m_reason.c_str()));
    }
    return false;
}
|
||||
// End a term walk: release the iterator object. Exceptions possibly
// thrown during the destruction are deliberately ignored.
void Db::termWalkClose(TermIter *tit)
{
    if (tit == 0)
        return;
    try {
        delete tit;
    } catch (...) {
        // Best effort cleanup: nothing useful can be done here.
    }
}
|
||||
|
||||
bool Db::termExists(const string& word)
|
||||
{
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return 0;
|
||||
|
||||
XAPTRY(if (!m_ndb->xrdb.term_exists(word)) return false,
|
||||
m_ndb->xrdb, m_reason);
|
||||
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Db::stemDiffers(const string& lang, const string& word,
|
||||
const string& base)
|
||||
{
|
||||
Xapian::Stem stemmer(lang);
|
||||
if (!stemmer(word).compare(stemmer(base))) {
|
||||
LOGDEB2(("Rcl::Db::stemDiffers: same for %s and %s\n",
|
||||
word.c_str(), base.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // End namespace Rcl
|
Loading…
Add table
Add a link
Reference in a new issue