513 lines
18 KiB
C++
513 lines
18 KiB
C++
/* Copyright (C) 2004 J.F.Dockes
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
////////////////////////////////////////////////////////////////////
|
|
/** Things dealing with walking the terms lists and expansion dbs */
|
|
|
|
#include "autoconfig.h"
|
|
|
|
#include <string>
|
|
|
|
#include "log.h"
|
|
#include "rcldb.h"
|
|
#include "rcldb_p.h"
|
|
#include "stemdb.h"
|
|
#include "expansiondbs.h"
|
|
#include "strmatcher.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace Rcl {
|
|
|
|
// File name wild card expansion. This is a specialisation ot termMatch
|
|
bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
|
|
{
|
|
string pattern = fnexp;
|
|
names.clear();
|
|
|
|
// If pattern is not capitalized, not quoted (quoted pattern can't
|
|
// get here currently anyway), and has no wildcards, we add * at
|
|
// each end: match any substring
|
|
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
|
pattern = pattern.substr(1, pattern.size() -2);
|
|
} else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
|
|
!unaciscapital(pattern)) {
|
|
pattern = "*" + pattern + "*";
|
|
} // else let it be
|
|
|
|
LOGDEB("Rcl::Db::filenameWildExp: pattern: [" << (pattern) << "]\n" );
|
|
|
|
// We inconditionnally lowercase and strip the pattern, as is done
|
|
// during indexing. This seems to be the only sane possible
|
|
// approach with file names and wild cards. termMatch does
|
|
// stripping conditionally on indexstripchars.
|
|
string pat1;
|
|
if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
|
|
pattern.swap(pat1);
|
|
}
|
|
|
|
TermMatchResult result;
|
|
if (!idxTermMatch(ET_WILD, string(), pattern, result, max,
|
|
unsplitFilenameFieldName))
|
|
return false;
|
|
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
|
it != result.entries.end(); it++)
|
|
names.push_back(it->term);
|
|
|
|
if (names.empty()) {
|
|
// Build an impossible query: we know its impossible because we
|
|
// control the prefixes!
|
|
names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Walk the Y terms and return min/max
|
|
bool Db::maxYearSpan(int *minyear, int *maxyear)
|
|
{
|
|
LOGDEB("Rcl::Db:maxYearSpan\n" );
|
|
*minyear = 1000000;
|
|
*maxyear = -1000000;
|
|
TermMatchResult result;
|
|
if (!idxTermMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
|
|
LOGINFO("Rcl::Db:maxYearSpan: termMatch failed\n" );
|
|
return false;
|
|
}
|
|
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
|
it != result.entries.end(); it++) {
|
|
if (!it->term.empty()) {
|
|
int year = atoi(strip_prefix(it->term).c_str());
|
|
if (year < *minyear)
|
|
*minyear = year;
|
|
if (year > *maxyear)
|
|
*maxyear = year;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Db::getAllDbMimeTypes(std::vector<std::string>& exp)
|
|
{
|
|
Rcl::TermMatchResult res;
|
|
if (!idxTermMatch(Rcl::Db::ET_WILD, "", "*", res, -1, "mtype")) {
|
|
return false;
|
|
}
|
|
for (vector<Rcl::TermMatchEntry>::const_iterator rit = res.entries.begin();
|
|
rit != res.entries.end(); rit++) {
|
|
exp.push_back(Rcl::strip_prefix(rit->term));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
class TermMatchCmpByWcf {
|
|
public:
|
|
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
return r.wcf - l.wcf < 0;
|
|
}
|
|
};
|
|
class TermMatchCmpByTerm {
|
|
public:
|
|
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
return l.term.compare(r.term) > 0;
|
|
}
|
|
};
|
|
class TermMatchTermEqual {
|
|
public:
|
|
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
return !l.term.compare(r.term);
|
|
}
|
|
};
|
|
|
|
static const char *tmtptostr(int typ)
|
|
{
|
|
switch (typ) {
|
|
case Db::ET_WILD: return "wildcard";
|
|
case Db::ET_REGEXP: return "regexp";
|
|
case Db::ET_STEM: return "stem";
|
|
case Db::ET_NONE:
|
|
default: return "none";
|
|
}
|
|
}
|
|
|
|
// Find all index terms that match an input along different expansion modes:
|
|
// wildcard, regular expression, or stemming. Depending on flags we perform
|
|
// case and/or diacritics expansion (this can be the only thing requested).
|
|
// If the "field" parameter is set, we return a list of appropriately
|
|
// prefixed terms (which are going to be used to build a Xapian
|
|
// query).
|
|
// This routine performs case/diacritics/stemming expansion against
|
|
// the auxiliary tables, and possibly calls idxTermMatch() for work
|
|
// using the main index terms (filtering, retrieving stats, expansion
|
|
// in some cases).
|
|
bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
|
TermMatchResult& res, int max, const string& field,
|
|
vector<string>* multiwords)
|
|
{
|
|
int matchtyp = matchTypeTp(typ_sens);
|
|
if (!m_ndb || !m_ndb->m_isopen)
|
|
return false;
|
|
Xapian::Database xrdb = m_ndb->xrdb;
|
|
|
|
bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
|
|
bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
|
|
|
|
LOGDEB0("Db::TermMatch: typ " << (tmtptostr(matchtyp)) << " diacsens " << (diac_sensitive) << " casesens " << (case_sensitive) << " lang [" << (lang) << "] term [" << (_term) << "] max " << (max) << " field [" << (field) << "] stripped " << (o_index_stripchars) << " init res.size " << (res.entries.size()) << "\n" );
|
|
|
|
// If index is stripped, no case or diac expansion can be needed:
|
|
// for the processing inside this routine, everything looks like
|
|
// we're all-sensitive: no use of expansion db.
|
|
// Also, convert input to lowercase and strip its accents.
|
|
string term = _term;
|
|
if (o_index_stripchars) {
|
|
diac_sensitive = case_sensitive = true;
|
|
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
|
LOGERR("Db::termMatch: unac failed for [" << (_term) << "]\n" );
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// The case/diac expansion db
|
|
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
|
XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
|
|
|
|
if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
|
|
std::shared_ptr<StrMatcher> matcher;
|
|
if (matchtyp == ET_WILD) {
|
|
matcher = std::shared_ptr<StrMatcher>(new StrWildMatcher(term));
|
|
} else {
|
|
matcher = std::shared_ptr<StrMatcher>(new StrRegexpMatcher(term));
|
|
}
|
|
if (!diac_sensitive || !case_sensitive) {
|
|
// Perform case/diac expansion on the exp as appropriate and
|
|
// expand the result.
|
|
vector<string> exp;
|
|
if (diac_sensitive) {
|
|
// Expand for diacritics and case, filtering for same diacritics
|
|
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
synac.synKeyExpand(matcher.get(), exp, &foldtrans);
|
|
} else if (case_sensitive) {
|
|
// Expand for diacritics and case, filtering for same case
|
|
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
synac.synKeyExpand(matcher.get(), exp, &unactrans);
|
|
} else {
|
|
// Expand for diacritics and case, no filtering
|
|
synac.synKeyExpand(matcher.get(), exp);
|
|
}
|
|
// Retrieve additional info and filter against the index itself
|
|
for (vector<string>::const_iterator it = exp.begin();
|
|
it != exp.end(); it++) {
|
|
idxTermMatch(ET_NONE, "", *it, res, max, field);
|
|
}
|
|
// And also expand the original expression against the
|
|
// main index: for the common case where the expression
|
|
// had no case/diac expansion (no entry in the exp db if
|
|
// the original term is lowercase and without accents).
|
|
idxTermMatch(typ_sens, lang, term, res, max, field);
|
|
} else {
|
|
idxTermMatch(typ_sens, lang, term, res, max, field);
|
|
}
|
|
|
|
} else {
|
|
// Expansion is STEM or NONE (which may still need synonyms
|
|
// and case/diac exp)
|
|
|
|
vector<string> lexp;
|
|
if (diac_sensitive && case_sensitive) {
|
|
// No case/diac expansion
|
|
lexp.push_back(term);
|
|
} else if (diac_sensitive) {
|
|
// Expand for accents and case, filtering for same accents,
|
|
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
synac.synExpand(term, lexp, &foldtrans);
|
|
} else if (case_sensitive) {
|
|
// Expand for accents and case, filtering for same case
|
|
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
synac.synExpand(term, lexp, &unactrans);
|
|
} else {
|
|
// We are neither accent- nor case- sensitive and may need stem
|
|
// expansion or not. Expand for accents and case
|
|
synac.synExpand(term, lexp);
|
|
}
|
|
|
|
if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) {
|
|
// Note: if any of the above conds is true, we are insensitive to
|
|
// diacs and case (enforced in searchdatatox:termexpand
|
|
// Need stem expansion. Lowercase the result of accent and case
|
|
// expansion for input to stemdb.
|
|
for (unsigned int i = 0; i < lexp.size(); i++) {
|
|
string lower;
|
|
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
|
lexp[i] = lower;
|
|
}
|
|
sort(lexp.begin(), lexp.end());
|
|
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
|
|
if (matchtyp == ET_STEM) {
|
|
StemDb sdb(xrdb);
|
|
vector<string> exp1;
|
|
for (vector<string>::const_iterator it = lexp.begin();
|
|
it != lexp.end(); it++) {
|
|
sdb.stemExpand(lang, *it, exp1);
|
|
}
|
|
exp1.swap(lexp);
|
|
sort(lexp.begin(), lexp.end());
|
|
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
LOGDEB("ExpTerm: stemexp: " << (stringsToString(lexp)) << "\n" );
|
|
}
|
|
|
|
if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) {
|
|
LOGDEB("ExpTerm: got syngroups\n" );
|
|
vector<string> exp1(lexp);
|
|
for (vector<string>::const_iterator it = lexp.begin();
|
|
it != lexp.end(); it++) {
|
|
vector<string> sg = m_syngroups.getgroup(*it);
|
|
if (!sg.empty()) {
|
|
LOGDEB("ExpTerm: syns: " << *it << " -> " << (stringsToString(sg)) << "\n" );
|
|
for (vector<string>::const_iterator it1 = sg.begin();
|
|
it1 != sg.end(); it1++) {
|
|
if (it1->find_first_of(" ") != string::npos) {
|
|
if (multiwords) {
|
|
multiwords->push_back(*it1);
|
|
}
|
|
} else {
|
|
exp1.push_back(*it1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
lexp.swap(exp1);
|
|
sort(lexp.begin(), lexp.end());
|
|
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
}
|
|
|
|
// Expand the resulting list for case (all stemdb content
|
|
// is lowercase)
|
|
vector<string> exp1;
|
|
for (vector<string>::const_iterator it = lexp.begin();
|
|
it != lexp.end(); it++) {
|
|
synac.synExpand(*it, exp1);
|
|
}
|
|
exp1.swap(lexp);
|
|
sort(lexp.begin(), lexp.end());
|
|
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
}
|
|
|
|
// Filter the result and get the stats, possibly add prefixes.
|
|
LOGDEB("ExpandTerm:TM: lexp: " << (stringsToString(lexp)) << "\n" );
|
|
for (vector<string>::const_iterator it = lexp.begin();
|
|
it != lexp.end(); it++) {
|
|
idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
|
|
}
|
|
}
|
|
|
|
TermMatchCmpByTerm tcmp;
|
|
sort(res.entries.begin(), res.entries.end(), tcmp);
|
|
TermMatchTermEqual teq;
|
|
vector<TermMatchEntry>::iterator uit =
|
|
unique(res.entries.begin(), res.entries.end(), teq);
|
|
res.entries.resize(uit - res.entries.begin());
|
|
TermMatchCmpByWcf wcmp;
|
|
sort(res.entries.begin(), res.entries.end(), wcmp);
|
|
if (max > 0) {
|
|
// Would need a small max and big stem expansion...
|
|
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Second phase of wildcard/regexp term expansion after case/diac
|
|
// expansion: expand against main index terms
|
|
bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
|
|
TermMatchResult& res, int max, const string& field)
|
|
{
|
|
int typ = matchTypeTp(typ_sens);
|
|
LOGDEB1("Db::idxTermMatch: typ " << (tmtptostr(typ)) << " lang [" << (lang) << "] term [" << (root) << "] max " << (max) << " field [" << (field) << "] init res.size " << (res.entries.size()) << "\n" );
|
|
|
|
if (typ == ET_STEM) {
|
|
LOGFATAL("RCLDB: internal error: idxTermMatch called with ET_STEM\n" );
|
|
abort();
|
|
}
|
|
|
|
Xapian::Database xdb = m_ndb->xrdb;
|
|
|
|
string prefix;
|
|
if (!field.empty()) {
|
|
const FieldTraits *ftp = 0;
|
|
if (!fieldToTraits(field, &ftp, true) || ftp->pfx.empty()) {
|
|
LOGDEB("Db::termMatch: field is not indexed (no prefix): [" << (field) << "]\n" );
|
|
} else {
|
|
prefix = wrap_prefix(ftp->pfx);
|
|
}
|
|
}
|
|
res.prefix = prefix;
|
|
|
|
std::shared_ptr<StrMatcher> matcher;
|
|
if (typ == ET_REGEXP) {
|
|
matcher = std::shared_ptr<StrMatcher>(new StrRegexpMatcher(root));
|
|
if (!matcher->ok()) {
|
|
LOGERR("termMatch: regcomp failed: " << (matcher->getreason()));
|
|
return false;
|
|
}
|
|
} else if (typ == ET_WILD) {
|
|
matcher = std::shared_ptr<StrMatcher>(new StrWildMatcher(root));
|
|
}
|
|
|
|
// Find the initial section before any special char
|
|
string::size_type es = string::npos;
|
|
if (matcher) {
|
|
es = matcher->baseprefixlen();
|
|
}
|
|
|
|
// Initial section: the part of the prefix+expr before the
|
|
// first wildcard character. We only scan the part of the
|
|
// index where this matches
|
|
string is;
|
|
if (es == string::npos) {
|
|
is = prefix + root;
|
|
} else if (es == 0) {
|
|
is = prefix;
|
|
} else {
|
|
is = prefix + root.substr(0, es);
|
|
}
|
|
LOGDEB2("termMatch: initsec: [" << (is) << "]\n" );
|
|
|
|
for (int tries = 0; tries < 2; tries++) {
|
|
try {
|
|
Xapian::TermIterator it = xdb.allterms_begin();
|
|
if (!is.empty())
|
|
it.skip_to(is.c_str());
|
|
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
|
// If we're beyond the terms matching the initial
|
|
// section, end
|
|
if (!is.empty() && (*it).find(is) != 0)
|
|
break;
|
|
|
|
// Else try to match the term. The matcher content
|
|
// is without prefix, so we remove this if any. We
|
|
// just checked that the index term did begin with
|
|
// the prefix.
|
|
string term;
|
|
if (!prefix.empty()) {
|
|
term = (*it).substr(prefix.length());
|
|
} else {
|
|
if (has_prefix(*it)) {
|
|
continue;
|
|
}
|
|
term = *it;
|
|
}
|
|
|
|
if (matcher && !matcher->match(term))
|
|
continue;
|
|
|
|
res.entries.push_back(
|
|
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
|
it.get_termfreq()));
|
|
|
|
// The problem with truncating here is that this is done
|
|
// alphabetically and we may not keep the most frequent
|
|
// terms. OTOH, not doing it may stall the program if
|
|
// we are walking the whole term list. We compromise
|
|
// by cutting at 2*max
|
|
if (max > 0 && ++rcnt >= 2*max)
|
|
break;
|
|
}
|
|
m_reason.erase();
|
|
break;
|
|
} catch (const Xapian::DatabaseModifiedError &e) {
|
|
m_reason = e.get_msg();
|
|
xdb.reopen();
|
|
continue;
|
|
} XCATCHERROR(m_reason);
|
|
break;
|
|
}
|
|
if (!m_reason.empty()) {
|
|
LOGERR("termMatch: " << (m_reason) << "\n" );
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/** Term list walking. */
|
|
class TermIter {
|
|
public:
|
|
Xapian::TermIterator it;
|
|
Xapian::Database db;
|
|
};
|
|
TermIter *Db::termWalkOpen()
|
|
{
|
|
if (!m_ndb || !m_ndb->m_isopen)
|
|
return 0;
|
|
TermIter *tit = new TermIter;
|
|
if (tit) {
|
|
tit->db = m_ndb->xrdb;
|
|
XAPTRY(tit->it = tit->db.allterms_begin(), tit->db, m_reason);
|
|
if (!m_reason.empty()) {
|
|
LOGERR("Db::termWalkOpen: xapian error: " << (m_reason) << "\n" );
|
|
return 0;
|
|
}
|
|
}
|
|
return tit;
|
|
}
|
|
bool Db::termWalkNext(TermIter *tit, string &term)
|
|
{
|
|
XAPTRY(
|
|
if (tit && tit->it != tit->db.allterms_end()) {
|
|
term = *(tit->it)++;
|
|
return true;
|
|
}
|
|
, tit->db, m_reason);
|
|
|
|
if (!m_reason.empty()) {
|
|
LOGERR("Db::termWalkOpen: xapian error: " << (m_reason) << "\n" );
|
|
}
|
|
return false;
|
|
}
|
|
void Db::termWalkClose(TermIter *tit)
|
|
{
|
|
try {
|
|
delete tit;
|
|
} catch (...) {}
|
|
}
|
|
|
|
bool Db::termExists(const string& word)
|
|
{
|
|
if (!m_ndb || !m_ndb->m_isopen)
|
|
return 0;
|
|
|
|
XAPTRY(if (!m_ndb->xrdb.term_exists(word)) return false,
|
|
m_ndb->xrdb, m_reason);
|
|
|
|
if (!m_reason.empty()) {
|
|
LOGERR("Db::termWalkOpen: xapian error: " << (m_reason) << "\n" );
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Db::stemDiffers(const string& lang, const string& word,
|
|
const string& base)
|
|
{
|
|
Xapian::Stem stemmer(lang);
|
|
if (!stemmer(word).compare(stemmer(base))) {
|
|
LOGDEB2("Rcl::Db::stemDiffers: same for " << (word) << " and " << (base) << "\n" );
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // End namespace Rcl
|