reorganized the term expansion code so that the term explorer works fully with case and diac sensitivity options
This commit is contained in:
parent
1f9e9d200a
commit
e0640357ce
10 changed files with 406 additions and 302 deletions
|
@ -1668,6 +1668,7 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
bool Db::stemExpand(const string &langs, const string &term,
|
||||
TermMatchResult& result)
|
||||
{
|
||||
|
@ -1680,6 +1681,7 @@ bool Db::stemExpand(const string &langs, const string &term,
|
|||
result.entries.insert(result.entries.end(), exp.begin(), exp.end());
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Add prefix to all strings in list.
|
||||
* @param prefix already wrapped prefix
|
||||
|
@ -1693,14 +1695,7 @@ static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
|
|||
it->term.insert(0, prefix);
|
||||
}
|
||||
|
||||
// Find all index terms that match a wildcard or regular expression
|
||||
// If field is set, we return a list of appropriately prefixed terms (which
|
||||
// are going to be used to build a Xapian query).
|
||||
bool Db::termMatch(MatchType typ, const string &lang,
|
||||
const string &_root,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field)
|
||||
bool Db::dbStats(DbStats& res)
|
||||
{
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
|
@ -1713,20 +1708,197 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
, xdb, m_reason);
|
||||
if (!m_reason.empty())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
string droot = _root;
|
||||
// Find all index terms that match a wildcard or regular expression If
|
||||
// field is set, we return a list of appropriately prefixed terms
|
||||
// (which are going to be used to build a Xapian query). This routine
|
||||
// performs case/diacritics/stemming expansion and possibly calls
|
||||
// idxTermMatch for wildcard/regexp expansion and filtering against
|
||||
// the main index terms.
|
||||
bool Db::termMatch(int typ_sens, const string &lang,
|
||||
const string &_term,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field)
|
||||
{
|
||||
int matchtyp = matchTypeTp(typ_sens);
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xrdb = m_ndb->xrdb;
|
||||
|
||||
// If index is stripped, get rid of capitals and accents
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars)
|
||||
bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
|
||||
bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
|
||||
|
||||
bool stripped = false;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
stripped = true;
|
||||
#else
|
||||
stripped = o_index_stripchars;
|
||||
#endif
|
||||
if (!unacmaybefold(_root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", _root.c_str()));
|
||||
|
||||
LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
|
||||
"max %d field [%s] stripped %d\n",
|
||||
matchtyp, diac_sensitive, case_sensitive, lang.c_str(),
|
||||
_term.c_str(), max, field.c_str(), stripped));
|
||||
|
||||
// If index is stripped, no case or diac expansion can be needed:
|
||||
// for the processing inside this routine, everything looks like
|
||||
// we're all-sensitive: no use of expansion db.
|
||||
// Also, convert input to lowercase and strip its accents.
|
||||
string term = _term;
|
||||
if (stripped) {
|
||||
diac_sensitive = case_sensitive = true;
|
||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
string nochars = typ == ET_WILD ? cstr_wildSpecStChars :
|
||||
cstr_regSpecStChars;
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
// The case/diac expansion db
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
|
||||
if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
#else
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (matchtyp == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
|
||||
} else {
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
|
||||
}
|
||||
if (!diac_sensitive || !case_sensitive) {
|
||||
// Perform case/diac expansion on the exp as appropriate and
|
||||
// expand the result.
|
||||
vector<string> exp;
|
||||
if (diac_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same diacritics
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for diacritics and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
|
||||
} else {
|
||||
// Expand for diacritics and case, no filtering
|
||||
synac.synKeyExpand(matcher.getptr(), exp);
|
||||
}
|
||||
// Retrieve additional info and filter against the index itself
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
idxTermMatch(ET_NONE, "", *it, res, max, field);
|
||||
}
|
||||
} else {
|
||||
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||
}
|
||||
|
||||
#endif // RCL_INDEX_STRIPCHARS
|
||||
|
||||
} else {
|
||||
// Expansion is STEM or NONE (which may still need case/diac exp)
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
|
||||
idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
|
||||
|
||||
#else
|
||||
vector<string> lexp;
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No case/diac expansion
|
||||
lexp.push_back(term);
|
||||
} else if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, lexp, &foldtrans);
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, lexp, &unactrans);
|
||||
} else {
|
||||
// We are neither accent- nor case- sensitive and may need stem
|
||||
// expansion or not. Expand for accents and case
|
||||
synac.synExpand(term, lexp);
|
||||
}
|
||||
|
||||
if (matchTypeTp(typ_sens) == ET_STEM) {
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
lexp[i] = lower;
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
StemDb sdb(xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
sdb.stemExpand(lang, *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, lexp);
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
}
|
||||
|
||||
// Filter the result and get the stats, possibly add prefixes.
|
||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
TermMatchCmpByTerm tcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||
TermMatchTermEqual teq;
|
||||
vector<TermMatchEntry>::iterator uit =
|
||||
unique(res.entries.begin(), res.entries.end(), teq);
|
||||
res.entries.resize(uit - res.entries.begin());
|
||||
TermMatchCmpByWcf wcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||
if (max > 0) {
|
||||
// Would need a small max and big stem expansion...
|
||||
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Second phase of wildcard/regexp term expansion after case/diac
|
||||
// expansion: expand against main index terms
|
||||
bool Db::idxTermMatch(int typ_sens, const string &lang,
|
||||
const string &root,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field)
|
||||
{
|
||||
int typ = matchTypeTp(typ_sens);
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xdb = m_ndb->xrdb;
|
||||
|
||||
string prefix;
|
||||
if (!field.empty()) {
|
||||
|
@ -1740,8 +1912,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
}
|
||||
res.prefix = prefix;
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
if (typ == ET_STEM) {
|
||||
if (!stemExpand(lang, droot, res))
|
||||
if (!stemExpand(lang, root, res))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
|
@ -1754,30 +1927,33 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
}
|
||||
if (!prefix.empty())
|
||||
addPrefix(res.entries, prefix);
|
||||
} else {
|
||||
regex_t reg;
|
||||
int errcode;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
RefCntr<StrMatcher> matcher;
|
||||
if (typ == ET_REGEXP) {
|
||||
if ((errcode = regcomp(®, droot.c_str(),
|
||||
REG_EXTENDED|REG_NOSUB))) {
|
||||
char errbuf[200];
|
||||
regerror(errcode, ®, errbuf, 199);
|
||||
LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
|
||||
res.entries.push_back(string(errbuf));
|
||||
regfree(®);
|
||||
return false;
|
||||
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
|
||||
if (!matcher->ok()) {
|
||||
LOGERR(("termMatch: regcomp failed: %s\n",
|
||||
matcher->getreason().c_str()))
|
||||
return false;
|
||||
}
|
||||
} else if (typ == ET_WILD) {
|
||||
matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
|
||||
}
|
||||
|
||||
// Find the initial section before any special char
|
||||
string::size_type es = droot.find_first_of(nochars);
|
||||
string::size_type es = string::npos;
|
||||
if (matcher.isNotNull()) {
|
||||
es = matcher->baseprefixlen();
|
||||
}
|
||||
string is;
|
||||
switch (es) {
|
||||
case string::npos: is = prefix + droot; break;
|
||||
case string::npos: is = prefix + root; break;
|
||||
case 0: is = prefix; break;
|
||||
default: is = prefix + droot.substr(0, es); break;
|
||||
default: is = prefix + root.substr(0, es); break;
|
||||
}
|
||||
LOGDEB1(("termMatch: initsec: [%s]\n", is.c_str()));
|
||||
LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
|
||||
|
||||
for (int tries = 0; tries < 2; tries++) {
|
||||
try {
|
||||
|
@ -1794,18 +1970,13 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
term = (*it).substr(prefix.length());
|
||||
else
|
||||
term = *it;
|
||||
if (typ == ET_WILD) {
|
||||
if (fnmatch(droot.c_str(), term.c_str(), 0) ==
|
||||
FNM_NOMATCH)
|
||||
continue;
|
||||
} else {
|
||||
if (regexec(®, term.c_str(), 0, 0, 0))
|
||||
continue;
|
||||
}
|
||||
// Do we want stem expansion here? We don't do it for now
|
||||
res.entries.push_back(TermMatchEntry(*it,
|
||||
xdb.get_collection_freq(*it),
|
||||
it.get_termfreq()));
|
||||
|
||||
if (matcher.isNotNull() && !matcher->match(term))
|
||||
continue;
|
||||
|
||||
res.entries.push_back(
|
||||
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
||||
it.get_termfreq()));
|
||||
|
||||
// The problem with truncating here is that this is done
|
||||
// alphabetically and we may not keep the most frequent
|
||||
|
@ -1828,25 +1999,8 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
LOGERR(("termMatch: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (typ == ET_REGEXP) {
|
||||
regfree(®);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
TermMatchCmpByTerm tcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||
TermMatchTermEqual teq;
|
||||
vector<TermMatchEntry>::iterator uit =
|
||||
unique(res.entries.begin(), res.entries.end(), teq);
|
||||
res.entries.resize(uit - res.entries.begin());
|
||||
TermMatchCmpByWcf wcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||
if (max > 0) {
|
||||
// Would need a small max and big stem expansion...
|
||||
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue