reorganized the term expansion code so that the term explorer works fully with case and diac sensitivity options
This commit is contained in:
parent
1f9e9d200a
commit
e0640357ce
10 changed files with 406 additions and 302 deletions
|
@ -8,8 +8,8 @@ LIBS = librecoll.a $(LIBRECOLL)
|
||||||
|
|
||||||
all: $(LIBS)
|
all: $(LIBS)
|
||||||
|
|
||||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rclquery.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rclquery.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o strmatcher.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp strmatcher.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||||
|
|
||||||
librecoll.a : $(DEPS) $(OBJS)
|
librecoll.a : $(DEPS) $(OBJS)
|
||||||
ar ru librecoll.a $(OBJS)
|
ar ru librecoll.a $(OBJS)
|
||||||
|
@ -159,6 +159,8 @@ readfile.o : ../utils/readfile.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../utils/readfile.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../utils/readfile.cpp
|
||||||
smallut.o : ../utils/smallut.cpp $(depth)/mk/localdefs
|
smallut.o : ../utils/smallut.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../utils/smallut.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../utils/smallut.cpp
|
||||||
|
strmatcher.o : ../utils/strmatcher.cpp $(depth)/mk/localdefs
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../utils/strmatcher.cpp
|
||||||
transcode.o : ../utils/transcode.cpp $(depth)/mk/localdefs
|
transcode.o : ../utils/transcode.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../utils/transcode.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../utils/transcode.cpp
|
||||||
wipedir.o : ../utils/wipedir.cpp $(depth)/mk/localdefs
|
wipedir.o : ../utils/wipedir.cpp $(depth)/mk/localdefs
|
||||||
|
@ -389,6 +391,9 @@ readfile.dep.stamp : ../utils/readfile.cpp $(depth)/mk/localdefs
|
||||||
smallut.dep.stamp : ../utils/smallut.cpp $(depth)/mk/localdefs
|
smallut.dep.stamp : ../utils/smallut.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../utils/smallut.cpp > smallut.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../utils/smallut.cpp > smallut.dep
|
||||||
touch smallut.dep.stamp
|
touch smallut.dep.stamp
|
||||||
|
strmatcher.dep.stamp : ../utils/strmatcher.cpp $(depth)/mk/localdefs
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../utils/strmatcher.cpp > strmatcher.dep
|
||||||
|
touch strmatcher.dep.stamp
|
||||||
transcode.dep.stamp : ../utils/transcode.cpp $(depth)/mk/localdefs
|
transcode.dep.stamp : ../utils/transcode.cpp $(depth)/mk/localdefs
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../utils/transcode.cpp > transcode.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../utils/transcode.cpp > transcode.dep
|
||||||
touch transcode.dep.stamp
|
touch transcode.dep.stamp
|
||||||
|
@ -466,6 +471,7 @@ include pxattr.dep
|
||||||
include rclionice.dep
|
include rclionice.dep
|
||||||
include readfile.dep
|
include readfile.dep
|
||||||
include smallut.dep
|
include smallut.dep
|
||||||
|
include strmatcher.dep
|
||||||
include transcode.dep
|
include transcode.dep
|
||||||
include wipedir.dep
|
include wipedir.dep
|
||||||
include x11mon.dep
|
include x11mon.dep
|
||||||
|
|
|
@ -73,6 +73,7 @@ ${depth}/utils/pxattr.cpp \
|
||||||
${depth}/utils/rclionice.cpp \
|
${depth}/utils/rclionice.cpp \
|
||||||
${depth}/utils/readfile.cpp \
|
${depth}/utils/readfile.cpp \
|
||||||
${depth}/utils/smallut.cpp \
|
${depth}/utils/smallut.cpp \
|
||||||
|
${depth}/utils/strmatcher.cpp \
|
||||||
${depth}/utils/transcode.cpp \
|
${depth}/utils/transcode.cpp \
|
||||||
${depth}/utils/wipedir.cpp \
|
${depth}/utils/wipedir.cpp \
|
||||||
${depth}/utils/x11mon.cpp \
|
${depth}/utils/x11mon.cpp \
|
||||||
|
|
|
@ -29,9 +29,10 @@
|
||||||
<item>
|
<item>
|
||||||
<layout class="QVBoxLayout" name="verticalLayout">
|
<layout class="QVBoxLayout" name="verticalLayout">
|
||||||
<item>
|
<item>
|
||||||
<layout class="QVBoxLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
<item>
|
<item row="0" column="0">
|
||||||
<layout class="QGridLayout">
|
<widget class="QComboBox" name="expTypeCMB"/>
|
||||||
|
</item>
|
||||||
<item row="0" column="1" colspan="2">
|
<item row="0" column="1" colspan="2">
|
||||||
<widget class="QLineEdit" name="baseWordLE">
|
<widget class="QLineEdit" name="baseWordLE">
|
||||||
<property name="minimumSize">
|
<property name="minimumSize">
|
||||||
|
@ -42,7 +43,31 @@
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QLabel" name="label">
|
||||||
|
<property name="text">
|
||||||
|
<string>Match</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
<item row="1" column="1">
|
<item row="1" column="1">
|
||||||
|
<widget class="QCheckBox" name="caseSensCB">
|
||||||
|
<property name="text">
|
||||||
|
<string>Case</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="2">
|
||||||
|
<widget class="QCheckBox" name="diacSensCB">
|
||||||
|
<property name="text">
|
||||||
|
<string>Accents</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0">
|
||||||
|
<widget class="QComboBox" name="stemLangCMB"/>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="1">
|
||||||
<widget class="QPushButton" name="expandPB">
|
<widget class="QPushButton" name="expandPB">
|
||||||
<property name="enabled">
|
<property name="enabled">
|
||||||
<bool>false</bool>
|
<bool>false</bool>
|
||||||
|
@ -58,13 +83,7 @@
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="1" column="0">
|
<item row="2" column="2">
|
||||||
<widget class="QComboBox" name="stemLangCMB"/>
|
|
||||||
</item>
|
|
||||||
<item row="0" column="0">
|
|
||||||
<widget class="QComboBox" name="expTypeCMB"/>
|
|
||||||
</item>
|
|
||||||
<item row="1" column="2">
|
|
||||||
<widget class="QPushButton" name="dismissPB">
|
<widget class="QPushButton" name="dismissPB">
|
||||||
<property name="enabled">
|
<property name="enabled">
|
||||||
<bool>true</bool>
|
<bool>true</bool>
|
||||||
|
@ -82,8 +101,6 @@
|
||||||
</item>
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</item>
|
</item>
|
||||||
</layout>
|
|
||||||
</item>
|
|
||||||
<item>
|
<item>
|
||||||
<widget class="QLabel" name="statsLBL">
|
<widget class="QLabel" name="statsLBL">
|
||||||
<property name="font">
|
<property name="font">
|
||||||
|
@ -125,7 +142,6 @@
|
||||||
<tabstop>baseWordLE</tabstop>
|
<tabstop>baseWordLE</tabstop>
|
||||||
<tabstop>expandPB</tabstop>
|
<tabstop>expandPB</tabstop>
|
||||||
<tabstop>dismissPB</tabstop>
|
<tabstop>dismissPB</tabstop>
|
||||||
<tabstop>expTypeCMB</tabstop>
|
|
||||||
<tabstop>stemLangCMB</tabstop>
|
<tabstop>stemLangCMB</tabstop>
|
||||||
</tabstops>
|
</tabstops>
|
||||||
<resources/>
|
<resources/>
|
||||||
|
|
|
@ -119,6 +119,16 @@ void SpellW::init()
|
||||||
resTW->setColumnWidth(1, 150);
|
resTW->setColumnWidth(1, 150);
|
||||||
resTW->installEventFilter(this);
|
resTW->installEventFilter(this);
|
||||||
|
|
||||||
|
bool stripped = false;
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
|
stripped = true;
|
||||||
|
#else
|
||||||
|
stripped = o_index_stripchars;
|
||||||
|
#endif
|
||||||
|
if (stripped) {
|
||||||
|
caseSensCB->setEnabled(false);
|
||||||
|
caseSensCB->setEnabled(false);
|
||||||
|
}
|
||||||
modeSet(cmbidx);
|
modeSet(cmbidx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -144,16 +154,23 @@ void SpellW::doExpand()
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Rcl::Db::MatchType mt;
|
int mt;
|
||||||
switch(mode) {
|
switch(mode) {
|
||||||
case TYPECMB_WILD: mt = Rcl::Db::ET_WILD; break;
|
case TYPECMB_WILD: mt = Rcl::Db::ET_WILD; break;
|
||||||
case TYPECMB_REG: mt = Rcl::Db::ET_REGEXP; break;
|
case TYPECMB_REG: mt = Rcl::Db::ET_REGEXP; break;
|
||||||
case TYPECMB_STEM: mt = Rcl::Db::ET_STEM; break;
|
case TYPECMB_STEM: mt = Rcl::Db::ET_STEM; break;
|
||||||
default: mt = Rcl::Db::ET_WILD;
|
default: mt = Rcl::Db::ET_WILD;
|
||||||
}
|
}
|
||||||
|
if (caseSensCB->isChecked()) {
|
||||||
|
mt |= Rcl::Db::ET_CASESENS;
|
||||||
|
}
|
||||||
|
if (diacSensCB->isChecked()) {
|
||||||
|
mt |= Rcl::Db::ET_DIACSENS;
|
||||||
|
}
|
||||||
Rcl::TermMatchResult res;
|
Rcl::TermMatchResult res;
|
||||||
string expr = string((const char *)baseWordLE->text().toUtf8());
|
string expr = string((const char *)baseWordLE->text().toUtf8());
|
||||||
|
Rcl::DbStats dbs;
|
||||||
|
rcldb->dbStats(dbs);
|
||||||
|
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case TYPECMB_WILD:
|
case TYPECMB_WILD:
|
||||||
|
@ -169,7 +186,7 @@ void SpellW::doExpand()
|
||||||
}
|
}
|
||||||
statsLBL->setText(tr("Index: %1 documents, average length %2 terms."
|
statsLBL->setText(tr("Index: %1 documents, average length %2 terms."
|
||||||
"%3 results")
|
"%3 results")
|
||||||
.arg(res.dbdoccount).arg(res.dbavgdoclen, 0, 'f', 1)
|
.arg(dbs.dbdoccount).arg(dbs.dbavgdoclen, 0, 'f', 1)
|
||||||
.arg(res.entries.size()));
|
.arg(res.entries.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -239,7 +256,7 @@ void SpellW::doExpand()
|
||||||
|
|
||||||
for (vector<Rcl::TermMatchEntry>::iterator it = res.entries.begin();
|
for (vector<Rcl::TermMatchEntry>::iterator it = res.entries.begin();
|
||||||
it != res.entries.end(); it++) {
|
it != res.entries.end(); it++) {
|
||||||
LOGDEB(("SpellW::expand: %6d [%s]\n", it->wcf, it->term.c_str()));
|
LOGDEB2(("SpellW::expand: %6d [%s]\n", it->wcf, it->term.c_str()));
|
||||||
char num[30];
|
char num[30];
|
||||||
if (it->wcf)
|
if (it->wcf)
|
||||||
sprintf(num, "%d / %d", it->docs, it->wcf);
|
sprintf(num, "%d / %d", it->docs, it->wcf);
|
||||||
|
@ -259,9 +276,9 @@ void SpellW::showStats()
|
||||||
statsLBL->setText("");
|
statsLBL->setText("");
|
||||||
int row = 0;
|
int row = 0;
|
||||||
|
|
||||||
Rcl::TermMatchResult res;
|
Rcl::DbStats res;
|
||||||
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "azbogusaz", res, 1)) {
|
if (!rcldb->dbStats(res)) {
|
||||||
LOGERR(("SpellW::doExpand:rcldb::termMatch failed\n"));
|
LOGERR(("SpellW::doExpand:rcldb::dbStats failed\n"));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -366,10 +383,17 @@ void SpellW::modeSet(int idx)
|
||||||
comboboxchoice mode = m_c2t[idx];
|
comboboxchoice mode = m_c2t[idx];
|
||||||
resTW->setRowCount(0);
|
resTW->setRowCount(0);
|
||||||
|
|
||||||
if (mode == TYPECMB_STEM)
|
if (mode == TYPECMB_STEM) {
|
||||||
stemLangCMB->setEnabled(true);
|
stemLangCMB->setEnabled(true);
|
||||||
else
|
diacSensCB->setChecked(false);
|
||||||
|
diacSensCB->setEnabled(false);
|
||||||
|
caseSensCB->setChecked(false);
|
||||||
|
caseSensCB->setEnabled(false);
|
||||||
|
} else {
|
||||||
stemLangCMB->setEnabled(false);
|
stemLangCMB->setEnabled(false);
|
||||||
|
diacSensCB->setEnabled(true);
|
||||||
|
caseSensCB->setEnabled(true);
|
||||||
|
}
|
||||||
if (mode == TYPECMB_STATS)
|
if (mode == TYPECMB_STATS)
|
||||||
baseWordLE->setEnabled(false);
|
baseWordLE->setEnabled(false);
|
||||||
else
|
else
|
||||||
|
|
|
@ -170,6 +170,8 @@ int recollq(RclConfig **cfp, int argc, char **argv)
|
||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
case '-':
|
case '-':
|
||||||
// -- : end of options
|
// -- : end of options
|
||||||
|
if (*(*argv) != 0)
|
||||||
|
Usage();
|
||||||
goto endopts;
|
goto endopts;
|
||||||
case 'A': op_flags |= OPT_A; break;
|
case 'A': op_flags |= OPT_A; break;
|
||||||
case 'a': op_flags |= OPT_a; break;
|
case 'a': op_flags |= OPT_a; break;
|
||||||
|
|
|
@ -1668,6 +1668,7 @@ public:
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
bool Db::stemExpand(const string &langs, const string &term,
|
bool Db::stemExpand(const string &langs, const string &term,
|
||||||
TermMatchResult& result)
|
TermMatchResult& result)
|
||||||
{
|
{
|
||||||
|
@ -1680,6 +1681,7 @@ bool Db::stemExpand(const string &langs, const string &term,
|
||||||
result.entries.insert(result.entries.end(), exp.begin(), exp.end());
|
result.entries.insert(result.entries.end(), exp.begin(), exp.end());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/** Add prefix to all strings in list.
|
/** Add prefix to all strings in list.
|
||||||
* @param prefix already wrapped prefix
|
* @param prefix already wrapped prefix
|
||||||
|
@ -1693,14 +1695,7 @@ static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
|
||||||
it->term.insert(0, prefix);
|
it->term.insert(0, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find all index terms that match a wildcard or regular expression
|
bool Db::dbStats(DbStats& res)
|
||||||
// If field is set, we return a list of appropriately prefixed terms (which
|
|
||||||
// are going to be used to build a Xapian query).
|
|
||||||
bool Db::termMatch(MatchType typ, const string &lang,
|
|
||||||
const string &_root,
|
|
||||||
TermMatchResult& res,
|
|
||||||
int max,
|
|
||||||
const string& field)
|
|
||||||
{
|
{
|
||||||
if (!m_ndb || !m_ndb->m_isopen)
|
if (!m_ndb || !m_ndb->m_isopen)
|
||||||
return false;
|
return false;
|
||||||
|
@ -1713,20 +1708,197 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
||||||
, xdb, m_reason);
|
, xdb, m_reason);
|
||||||
if (!m_reason.empty())
|
if (!m_reason.empty())
|
||||||
return false;
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
string droot = _root;
|
// Find all index terms that match a wildcard or regular expression If
|
||||||
|
// field is set, we return a list of appropriately prefixed terms
|
||||||
|
// (which are going to be used to build a Xapian query). This routine
|
||||||
|
// performs case/diacritics/stemming expansion and possibly calls
|
||||||
|
// idxTermMatch for wildcard/regexp expansion and filtering against
|
||||||
|
// the main index terms.
|
||||||
|
bool Db::termMatch(int typ_sens, const string &lang,
|
||||||
|
const string &_term,
|
||||||
|
TermMatchResult& res,
|
||||||
|
int max,
|
||||||
|
const string& field)
|
||||||
|
{
|
||||||
|
int matchtyp = matchTypeTp(typ_sens);
|
||||||
|
if (!m_ndb || !m_ndb->m_isopen)
|
||||||
|
return false;
|
||||||
|
Xapian::Database xrdb = m_ndb->xrdb;
|
||||||
|
|
||||||
// If index is stripped, get rid of capitals and accents
|
bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
|
||||||
#ifndef RCL_INDEX_STRIPCHARS
|
bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
|
||||||
if (o_index_stripchars)
|
|
||||||
|
bool stripped = false;
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
|
stripped = true;
|
||||||
|
#else
|
||||||
|
stripped = o_index_stripchars;
|
||||||
#endif
|
#endif
|
||||||
if (!unacmaybefold(_root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
|
||||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", _root.c_str()));
|
LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
|
||||||
|
"max %d field [%s] stripped %d\n",
|
||||||
|
matchtyp, diac_sensitive, case_sensitive, lang.c_str(),
|
||||||
|
_term.c_str(), max, field.c_str(), stripped));
|
||||||
|
|
||||||
|
// If index is stripped, no case or diac expansion can be needed:
|
||||||
|
// for the processing inside this routine, everything looks like
|
||||||
|
// we're all-sensitive: no use of expansion db.
|
||||||
|
// Also, convert input to lowercase and strip its accents.
|
||||||
|
string term = _term;
|
||||||
|
if (stripped) {
|
||||||
|
diac_sensitive = case_sensitive = true;
|
||||||
|
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
|
LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
string nochars = typ == ET_WILD ? cstr_wildSpecStChars :
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
cstr_regSpecStChars;
|
// The case/diac expansion db
|
||||||
|
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||||
|
XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
|
||||||
|
#endif // RCL_INDEX_STRIPCHARS
|
||||||
|
|
||||||
|
|
||||||
|
if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
|
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||||
|
#else
|
||||||
|
RefCntr<StrMatcher> matcher;
|
||||||
|
if (matchtyp == ET_WILD) {
|
||||||
|
matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
|
||||||
|
} else {
|
||||||
|
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
|
||||||
|
}
|
||||||
|
if (!diac_sensitive || !case_sensitive) {
|
||||||
|
// Perform case/diac expansion on the exp as appropriate and
|
||||||
|
// expand the result.
|
||||||
|
vector<string> exp;
|
||||||
|
if (diac_sensitive) {
|
||||||
|
// Expand for diacritics and case, filtering for same diacritics
|
||||||
|
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||||
|
synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
|
||||||
|
} else if (case_sensitive) {
|
||||||
|
// Expand for diacritics and case, filtering for same case
|
||||||
|
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||||
|
synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
|
||||||
|
} else {
|
||||||
|
// Expand for diacritics and case, no filtering
|
||||||
|
synac.synKeyExpand(matcher.getptr(), exp);
|
||||||
|
}
|
||||||
|
// Retrieve additional info and filter against the index itself
|
||||||
|
for (vector<string>::const_iterator it = exp.begin();
|
||||||
|
it != exp.end(); it++) {
|
||||||
|
idxTermMatch(ET_NONE, "", *it, res, max, field);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
idxTermMatch(typ_sens, lang, term, res, max, field);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // RCL_INDEX_STRIPCHARS
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// Expansion is STEM or NONE (which may still need case/diac exp)
|
||||||
|
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
|
|
||||||
|
idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
|
||||||
|
|
||||||
|
#else
|
||||||
|
vector<string> lexp;
|
||||||
|
if (diac_sensitive && case_sensitive) {
|
||||||
|
// No case/diac expansion
|
||||||
|
lexp.push_back(term);
|
||||||
|
} else if (diac_sensitive) {
|
||||||
|
// Expand for accents and case, filtering for same accents,
|
||||||
|
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||||
|
synac.synExpand(term, lexp, &foldtrans);
|
||||||
|
} else if (case_sensitive) {
|
||||||
|
// Expand for accents and case, filtering for same case
|
||||||
|
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||||
|
synac.synExpand(term, lexp, &unactrans);
|
||||||
|
} else {
|
||||||
|
// We are neither accent- nor case- sensitive and may need stem
|
||||||
|
// expansion or not. Expand for accents and case
|
||||||
|
synac.synExpand(term, lexp);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matchTypeTp(typ_sens) == ET_STEM) {
|
||||||
|
// Need stem expansion. Lowercase the result of accent and case
|
||||||
|
// expansion for input to stemdb.
|
||||||
|
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||||
|
string lower;
|
||||||
|
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||||
|
lexp[i] = lower;
|
||||||
|
}
|
||||||
|
sort(lexp.begin(), lexp.end());
|
||||||
|
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||||
|
StemDb sdb(xrdb);
|
||||||
|
vector<string> exp1;
|
||||||
|
for (vector<string>::const_iterator it = lexp.begin();
|
||||||
|
it != lexp.end(); it++) {
|
||||||
|
sdb.stemExpand(lang, *it, exp1);
|
||||||
|
}
|
||||||
|
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||||
|
|
||||||
|
// Expand the resulting list for case (all stemdb content
|
||||||
|
// is lowercase)
|
||||||
|
lexp.clear();
|
||||||
|
for (vector<string>::const_iterator it = exp1.begin();
|
||||||
|
it != exp1.end(); it++) {
|
||||||
|
synac.synExpand(*it, lexp);
|
||||||
|
}
|
||||||
|
sort(lexp.begin(), lexp.end());
|
||||||
|
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter the result and get the stats, possibly add prefixes.
|
||||||
|
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||||
|
for (vector<string>::const_iterator it = lexp.begin();
|
||||||
|
it != lexp.end(); it++) {
|
||||||
|
idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TermMatchCmpByTerm tcmp;
|
||||||
|
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||||
|
TermMatchTermEqual teq;
|
||||||
|
vector<TermMatchEntry>::iterator uit =
|
||||||
|
unique(res.entries.begin(), res.entries.end(), teq);
|
||||||
|
res.entries.resize(uit - res.entries.begin());
|
||||||
|
TermMatchCmpByWcf wcmp;
|
||||||
|
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||||
|
if (max > 0) {
|
||||||
|
// Would need a small max and big stem expansion...
|
||||||
|
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second phase of wildcard/regexp term expansion after case/diac
|
||||||
|
// expansion: expand against main index terms
|
||||||
|
bool Db::idxTermMatch(int typ_sens, const string &lang,
|
||||||
|
const string &root,
|
||||||
|
TermMatchResult& res,
|
||||||
|
int max,
|
||||||
|
const string& field)
|
||||||
|
{
|
||||||
|
int typ = matchTypeTp(typ_sens);
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (typ == ET_STEM) {
|
||||||
|
LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (!m_ndb || !m_ndb->m_isopen)
|
||||||
|
return false;
|
||||||
|
Xapian::Database xdb = m_ndb->xrdb;
|
||||||
|
|
||||||
string prefix;
|
string prefix;
|
||||||
if (!field.empty()) {
|
if (!field.empty()) {
|
||||||
|
@ -1740,8 +1912,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
||||||
}
|
}
|
||||||
res.prefix = prefix;
|
res.prefix = prefix;
|
||||||
|
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
if (typ == ET_STEM) {
|
if (typ == ET_STEM) {
|
||||||
if (!stemExpand(lang, droot, res))
|
if (!stemExpand(lang, root, res))
|
||||||
return false;
|
return false;
|
||||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||||
it != res.entries.end(); it++) {
|
it != res.entries.end(); it++) {
|
||||||
|
@ -1754,30 +1927,33 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
||||||
}
|
}
|
||||||
if (!prefix.empty())
|
if (!prefix.empty())
|
||||||
addPrefix(res.entries, prefix);
|
addPrefix(res.entries, prefix);
|
||||||
} else {
|
} else
|
||||||
regex_t reg;
|
#endif
|
||||||
int errcode;
|
{
|
||||||
|
RefCntr<StrMatcher> matcher;
|
||||||
if (typ == ET_REGEXP) {
|
if (typ == ET_REGEXP) {
|
||||||
if ((errcode = regcomp(&reg, droot.c_str(),
|
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
|
||||||
REG_EXTENDED|REG_NOSUB))) {
|
if (!matcher->ok()) {
|
||||||
char errbuf[200];
|
LOGERR(("termMatch: regcomp failed: %s\n",
|
||||||
regerror(errcode, &reg, errbuf, 199);
|
matcher->getreason().c_str()))
|
||||||
LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
|
|
||||||
res.entries.push_back(string(errbuf));
|
|
||||||
regfree(&reg);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
} else if (typ == ET_WILD) {
|
||||||
|
matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the initial section before any special char
|
// Find the initial section before any special char
|
||||||
string::size_type es = droot.find_first_of(nochars);
|
string::size_type es = string::npos;
|
||||||
|
if (matcher.isNotNull()) {
|
||||||
|
es = matcher->baseprefixlen();
|
||||||
|
}
|
||||||
string is;
|
string is;
|
||||||
switch (es) {
|
switch (es) {
|
||||||
case string::npos: is = prefix + droot; break;
|
case string::npos: is = prefix + root; break;
|
||||||
case 0: is = prefix; break;
|
case 0: is = prefix; break;
|
||||||
default: is = prefix + droot.substr(0, es); break;
|
default: is = prefix + root.substr(0, es); break;
|
||||||
}
|
}
|
||||||
LOGDEB1(("termMatch: initsec: [%s]\n", is.c_str()));
|
LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
|
||||||
|
|
||||||
for (int tries = 0; tries < 2; tries++) {
|
for (int tries = 0; tries < 2; tries++) {
|
||||||
try {
|
try {
|
||||||
|
@ -1794,17 +1970,12 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
||||||
term = (*it).substr(prefix.length());
|
term = (*it).substr(prefix.length());
|
||||||
else
|
else
|
||||||
term = *it;
|
term = *it;
|
||||||
if (typ == ET_WILD) {
|
|
||||||
if (fnmatch(droot.c_str(), term.c_str(), 0) ==
|
if (matcher.isNotNull() && !matcher->match(term))
|
||||||
FNM_NOMATCH)
|
|
||||||
continue;
|
continue;
|
||||||
} else {
|
|
||||||
if (regexec(&reg, term.c_str(), 0, 0, 0))
|
res.entries.push_back(
|
||||||
continue;
|
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
||||||
}
|
|
||||||
// Do we want stem expansion here? We don't do it for now
|
|
||||||
res.entries.push_back(TermMatchEntry(*it,
|
|
||||||
xdb.get_collection_freq(*it),
|
|
||||||
it.get_termfreq()));
|
it.get_termfreq()));
|
||||||
|
|
||||||
// The problem with truncating here is that this is done
|
// The problem with truncating here is that this is done
|
||||||
|
@ -1828,25 +1999,8 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
||||||
LOGERR(("termMatch: %s\n", m_reason.c_str()));
|
LOGERR(("termMatch: %s\n", m_reason.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (typ == ET_REGEXP) {
|
|
||||||
regfree(&reg);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
TermMatchCmpByTerm tcmp;
|
|
||||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
|
||||||
TermMatchTermEqual teq;
|
|
||||||
vector<TermMatchEntry>::iterator uit =
|
|
||||||
unique(res.entries.begin(), res.entries.end(), teq);
|
|
||||||
res.entries.resize(uit - res.entries.begin());
|
|
||||||
TermMatchCmpByWcf wcmp;
|
|
||||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
|
||||||
if (max > 0) {
|
|
||||||
// Would need a small max and big stem expansion...
|
|
||||||
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -111,13 +111,19 @@ public:
|
||||||
void clear()
|
void clear()
|
||||||
{
|
{
|
||||||
entries.clear();
|
entries.clear();
|
||||||
dbdoccount = 0;
|
|
||||||
dbavgdoclen = 0;
|
|
||||||
}
|
}
|
||||||
// Term expansion
|
// Term expansion
|
||||||
vector<TermMatchEntry> entries;
|
vector<TermMatchEntry> entries;
|
||||||
// If a field was specified, this is the corresponding index prefix
|
// If a field was specified, this is the corresponding index prefix
|
||||||
string prefix;
|
string prefix;
|
||||||
|
};
|
||||||
|
|
||||||
|
class DbStats {
|
||||||
|
public:
|
||||||
|
DbStats()
|
||||||
|
:dbdoccount(0), dbavgdoclen(0), mindoclen(0), maxdoclen(0)
|
||||||
|
{
|
||||||
|
}
|
||||||
// Index-wide stats
|
// Index-wide stats
|
||||||
unsigned int dbdoccount;
|
unsigned int dbdoccount;
|
||||||
double dbavgdoclen;
|
double dbavgdoclen;
|
||||||
|
@ -310,7 +316,9 @@ class Db {
|
||||||
* Expansion is performed with either wildcard or regexp processing
|
* Expansion is performed with either wildcard or regexp processing
|
||||||
* Stem expansion is performed if lang is not empty
|
* Stem expansion is performed if lang is not empty
|
||||||
*
|
*
|
||||||
* @param typ defines the kind of expansion: wildcard, regexp or stemming
|
* @param typ_sens defines the kind of expansion: none, wildcard,
|
||||||
|
* regexp or stemming. "none" will still expand case and
|
||||||
|
* diacritics depending on the casesens and diacsens flags.
|
||||||
* @param lang sets the stemming language(s). Can be a space-separated list
|
* @param lang sets the stemming language(s). Can be a space-separated list
|
||||||
* @param term is the term to expand
|
* @param term is the term to expand
|
||||||
* @param result is the main output
|
* @param result is the main output
|
||||||
|
@ -321,10 +329,16 @@ class Db {
|
||||||
* will be appropriately prefixed and the prefix value will be set
|
* will be appropriately prefixed and the prefix value will be set
|
||||||
* in the TermMatchResult header
|
* in the TermMatchResult header
|
||||||
*/
|
*/
|
||||||
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
|
enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
|
||||||
bool termMatch(MatchType typ, const string &lang, const string &term,
|
ET_DIACSENS=8, ET_CASESENS=16};
|
||||||
|
int matchTypeTp(int tp)
|
||||||
|
{
|
||||||
|
return tp & 7;
|
||||||
|
}
|
||||||
|
bool termMatch(int typ_sens, const string &lang, const string &term,
|
||||||
TermMatchResult& result, int max = -1,
|
TermMatchResult& result, int max = -1,
|
||||||
const string& field = cstr_null);
|
const string& field = cstr_null);
|
||||||
|
bool dbStats(DbStats& stats);
|
||||||
/** Return min and max years for doc mod times in db */
|
/** Return min and max years for doc mod times in db */
|
||||||
bool maxYearSpan(int *minyear, int *maxyear);
|
bool maxYearSpan(int *minyear, int *maxyear);
|
||||||
|
|
||||||
|
@ -426,8 +440,13 @@ private:
|
||||||
bool i_close(bool final);
|
bool i_close(bool final);
|
||||||
// Reinitialize when adding/removing additional dbs
|
// Reinitialize when adding/removing additional dbs
|
||||||
bool adjustdbs();
|
bool adjustdbs();
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
bool stemExpand(const string &lang, const string &s,
|
bool stemExpand(const string &lang, const string &s,
|
||||||
TermMatchResult& result);
|
TermMatchResult& result);
|
||||||
|
#endif
|
||||||
|
bool idxTermMatch(int typ_sens, const string &lang, const string &term,
|
||||||
|
TermMatchResult& result, int max = -1,
|
||||||
|
const string& field = cstr_null);
|
||||||
|
|
||||||
// Flush when idxflushmb is reached
|
// Flush when idxflushmb is reached
|
||||||
bool maybeflush(off_t moretext);
|
bool maybeflush(off_t moretext);
|
||||||
|
|
|
@ -544,11 +544,12 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||||
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
||||||
|
|
||||||
// If there are no wildcards, add term to the list of user-entered terms
|
// If there are no wildcards, add term to the list of user-entered terms
|
||||||
if (!haswild)
|
if (!haswild) {
|
||||||
m_hldata.uterms.insert(term);
|
m_hldata.uterms.insert(term);
|
||||||
|
sterm = term;
|
||||||
|
}
|
||||||
// No stem expansion if there are wildcards or if prevented by caller
|
// No stem expansion if there are wildcards or if prevented by caller
|
||||||
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
|
||||||
if (haswild || getStemLang().empty()) {
|
if (haswild || getStemLang().empty()) {
|
||||||
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
||||||
nostemexp = true;
|
nostemexp = true;
|
||||||
|
@ -557,9 +558,11 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||||
// noexpansion can be modified further down by possible case/diac expansion
|
// noexpansion can be modified further down by possible case/diac expansion
|
||||||
bool noexpansion = nostemexp && !haswild;
|
bool noexpansion = nostemexp && !haswild;
|
||||||
|
|
||||||
|
int termmatchsens = 0;
|
||||||
|
|
||||||
#ifndef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
|
||||||
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
|
||||||
|
|
||||||
if (o_index_stripchars) {
|
if (o_index_stripchars) {
|
||||||
diac_sensitive = case_sensitive = false;
|
diac_sensitive = case_sensitive = false;
|
||||||
|
@ -596,134 +599,29 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
||||||
if (!case_sensitive || !diac_sensitive)
|
if (!case_sensitive || !diac_sensitive)
|
||||||
noexpansion = false;
|
noexpansion = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (case_sensitive)
|
||||||
|
termmatchsens |= Db::ET_CASESENS;
|
||||||
|
if (diac_sensitive)
|
||||||
|
termmatchsens |= Db::ET_DIACSENS;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (noexpansion) {
|
if (noexpansion) {
|
||||||
sterm = term;
|
|
||||||
oexp.push_back(prefix + term);
|
oexp.push_back(prefix + term);
|
||||||
m_hldata.terms[term] = m_hldata.uterms.size() - 1;
|
m_hldata.terms[term] = m_hldata.uterms.size() - 1;
|
||||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef RCL_INDEX_STRIPCHARS
|
Db::MatchType mtyp = haswild ? Db::ET_WILD :
|
||||||
// The case/diac expansion db
|
nostemexp ? Db::ET_NONE : Db::ET_STEM;
|
||||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
|
||||||
XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
|
|
||||||
&unacfoldtrans);
|
|
||||||
#endif // RCL_INDEX_STRIPCHARS
|
|
||||||
|
|
||||||
TermMatchResult res;
|
TermMatchResult res;
|
||||||
|
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
|
||||||
if (haswild) {
|
m_field)) {
|
||||||
#ifndef RCL_INDEX_STRIPCHARS
|
// Let it go through
|
||||||
if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
|
|
||||||
// Perform case/diac expansion on the exp as appropriate and
|
|
||||||
// expand the result.
|
|
||||||
vector<string> exp;
|
|
||||||
if (diac_sensitive) {
|
|
||||||
// Expand for diacritics and case, filtering for same diacritics
|
|
||||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
||||||
synac.keyWildExpand(term, exp, &foldtrans);
|
|
||||||
} else if (case_sensitive) {
|
|
||||||
// Expand for diacritics and case, filtering for same case
|
|
||||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
||||||
synac.keyWildExpand(term, exp, &unactrans);
|
|
||||||
} else {
|
|
||||||
// Expand for diacritics and case, no filtering
|
|
||||||
synac.keyWildExpand(term, exp);
|
|
||||||
}
|
}
|
||||||
// There are no wildcards in the result from above but
|
|
||||||
// calling termMatch gets the result into the right form
|
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
|
||||||
it != exp.end(); it++) {
|
|
||||||
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
|
||||||
maxexpand, m_field);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif // RCL_INDEX_STRIPCHARS
|
|
||||||
|
|
||||||
// Expand the original wildcard expression even if we did the
|
|
||||||
// case/diac dance above,
|
|
||||||
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
|
|
||||||
maxexpand, m_field);
|
|
||||||
goto termmatchtoresult;
|
|
||||||
}
|
|
||||||
|
|
||||||
sterm = term;
|
|
||||||
|
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
|
||||||
|
|
||||||
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
|
||||||
maxexpand, m_field);
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
if (o_index_stripchars) {
|
|
||||||
// If the index is stripped, we can only come here if
|
|
||||||
// nostemexp is unset and we just need stem expansion.
|
|
||||||
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
|
||||||
maxexpand, m_field);
|
|
||||||
} else {
|
|
||||||
vector<string> lexp;
|
|
||||||
if (diac_sensitive && case_sensitive) {
|
|
||||||
// No expansion whatsoever.
|
|
||||||
lexp.push_back(term);
|
|
||||||
} else if (diac_sensitive) {
|
|
||||||
// Expand for accents and case, filtering for same accents,
|
|
||||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
||||||
synac.synExpand(term, lexp, &foldtrans);
|
|
||||||
} else if (case_sensitive) {
|
|
||||||
// Expand for accents and case, filtering for same case
|
|
||||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
||||||
synac.synExpand(term, lexp, &unactrans);
|
|
||||||
} else {
|
|
||||||
// We are neither accent- nor case- sensitive and may need stem
|
|
||||||
// expansion or not. Expand for accents and case
|
|
||||||
synac.synExpand(term, lexp);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!nostemexp) {
|
|
||||||
// Need stem expansion. Lowercase the result of accent and case
|
|
||||||
// expansion for input to stemdb.
|
|
||||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
|
||||||
string lower;
|
|
||||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
|
||||||
lexp[i] = lower;
|
|
||||||
}
|
|
||||||
sort(lexp.begin(), lexp.end());
|
|
||||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
||||||
StemDb sdb(db.m_ndb->xrdb);
|
|
||||||
vector<string> exp1;
|
|
||||||
for (vector<string>::const_iterator it = lexp.begin();
|
|
||||||
it != lexp.end(); it++) {
|
|
||||||
sdb.stemExpand(getStemLang(), *it, exp1);
|
|
||||||
}
|
|
||||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
|
||||||
|
|
||||||
// Expand the resulting list for case (all stemdb content
|
|
||||||
// is lowercase)
|
|
||||||
lexp.clear();
|
|
||||||
for (vector<string>::const_iterator it = exp1.begin();
|
|
||||||
it != exp1.end(); it++) {
|
|
||||||
synac.synExpand(*it, lexp);
|
|
||||||
}
|
|
||||||
sort(lexp.begin(), lexp.end());
|
|
||||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
|
||||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
|
||||||
for (vector<string>::const_iterator it = lexp.begin();
|
|
||||||
it != lexp.end(); it++) {
|
|
||||||
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
|
||||||
maxexpand, m_field);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Term match entries to vector of terms
|
// Term match entries to vector of terms
|
||||||
termmatchtoresult:
|
|
||||||
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
|
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
|
||||||
ermsg = "Maximum term expansion size exceeded."
|
ermsg = "Maximum term expansion size exceeded."
|
||||||
" Maybe increase maxTermExpand.";
|
" Maybe increase maxTermExpand.";
|
||||||
|
@ -734,7 +632,7 @@ termmatchtoresult:
|
||||||
oexp.push_back(it->term);
|
oexp.push_back(it->term);
|
||||||
}
|
}
|
||||||
// If the term does not exist at all in the db, the return from
|
// If the term does not exist at all in the db, the return from
|
||||||
// term match is going to be empty, which is not what we want (we
|
// termMatch() is going to be empty, which is not what we want (we
|
||||||
// would then compute an empty Xapian query)
|
// would then compute an empty Xapian query)
|
||||||
if (oexp.empty())
|
if (oexp.empty())
|
||||||
oexp.push_back(prefix + term);
|
oexp.push_back(prefix + term);
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
#include "xmacros.h"
|
#include "xmacros.h"
|
||||||
#include "synfamily.h"
|
#include "synfamily.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
|
#include "refcntr.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
@ -182,50 +183,35 @@ bool XapComputableSynFamMember::synExpand(const string& term,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool XapComputableSynFamMember::keyWildExpand(const string& inexp,
|
bool XapComputableSynFamMember::synKeyExpand(StrMatcher* inexp,
|
||||||
vector<string>& result,
|
vector<string>& result,
|
||||||
SynTermTrans *filtertrans)
|
SynTermTrans *filtertrans)
|
||||||
{
|
{
|
||||||
LOGDEB(("XapCompSynFam::keyWildExpand: [%s]\n", inexp.c_str()));
|
LOGDEB(("XapCompSynFam::synKeyExpand: [%s]\n", inexp->exp().c_str()));
|
||||||
|
|
||||||
// Transform input into our key format (e.g.: case-folded + diac-stripped)
|
|
||||||
string stripped_exp = (*m_trans)(inexp);
|
|
||||||
|
|
||||||
// If set, compute filtering term (e.g.: only case-folded)
|
// If set, compute filtering term (e.g.: only case-folded)
|
||||||
string filter_exp;
|
RefCntr<StrMatcher> filter_exp;
|
||||||
if (filtertrans)
|
if (filtertrans) {
|
||||||
filter_exp = (*filtertrans)(inexp);
|
filter_exp = RefCntr<StrMatcher>(inexp->clone());
|
||||||
|
filter_exp->setExp((*filtertrans)(inexp->exp()));
|
||||||
// Find the initial section before any special chars
|
|
||||||
string::size_type es = stripped_exp.find_first_of(cstr_wildSpecStChars);
|
|
||||||
string is; // Initial section
|
|
||||||
switch (es) {
|
|
||||||
case string::npos:
|
|
||||||
// No special chars, no expansion.
|
|
||||||
result.push_back(inexp);
|
|
||||||
return true;
|
|
||||||
break;
|
|
||||||
case 0:
|
|
||||||
// Input starts with special char: start at bottom
|
|
||||||
is = m_prefix;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
// Compute initial section
|
|
||||||
is = m_prefix + stripped_exp.substr(0, es);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Input to matching: prefix + transformed input
|
// Transform input into our key format (e.g.: case-folded + diac-stripped),
|
||||||
string matchin = m_prefix + stripped_exp;
|
// and prepend prefix
|
||||||
|
inexp->setExp(m_prefix + (*m_trans)(inexp->exp()));
|
||||||
|
// Find the initial section before any special chars for skipping the keys
|
||||||
|
string::size_type es = inexp->baseprefixlen();
|
||||||
|
string is = inexp->exp().substr(0, es);
|
||||||
string::size_type preflen = m_prefix.size();
|
string::size_type preflen = m_prefix.size();
|
||||||
|
LOGDEB2(("XapCompSynFam::is: [%s]\n", is.c_str()));
|
||||||
|
|
||||||
string ermsg;
|
string ermsg;
|
||||||
try {
|
try {
|
||||||
for (Xapian::TermIterator xit = m_family.getdb().synonym_keys_begin(is);
|
for (Xapian::TermIterator xit = m_family.getdb().synonym_keys_begin(is);
|
||||||
xit != m_family.getdb().synonym_keys_end(is); xit++) {
|
xit != m_family.getdb().synonym_keys_end(is); xit++) {
|
||||||
LOGDEB2((" Checking1 [%s] against [%s]\n", (*xit).c_str(),
|
LOGDEB2((" Checking1 [%s] against [%s]\n", (*xit).c_str(),
|
||||||
matchin.c_str()));
|
inexp->exp().c_str()));
|
||||||
if (fnmatch(matchin.c_str(), (*xit).c_str(), 0) == FNM_NOMATCH)
|
if (!inexp->match(*xit))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// Push all the synonyms if they match the secondary filter
|
// Push all the synonyms if they match the secondary filter
|
||||||
|
@ -233,39 +219,34 @@ bool XapComputableSynFamMember::keyWildExpand(const string& inexp,
|
||||||
m_family.getdb().synonyms_begin(*xit);
|
m_family.getdb().synonyms_begin(*xit);
|
||||||
xit1 != m_family.getdb().synonyms_end(*xit); xit1++) {
|
xit1 != m_family.getdb().synonyms_end(*xit); xit1++) {
|
||||||
string term = *xit1;
|
string term = *xit1;
|
||||||
if (filtertrans) {
|
if (filter_exp.isNotNull()) {
|
||||||
string term1 = (*filtertrans)(term);
|
string term1 = (*filtertrans)(term);
|
||||||
LOGDEB2((" Testing [%s] against [%s]\n",
|
LOGDEB2((" Testing [%s] against [%s]\n",
|
||||||
term1.c_str(), filter_exp.c_str()));
|
term1.c_str(), filter_exp.c_str()));
|
||||||
if (fnmatch(filter_exp.c_str(),
|
if (!filter_exp->match(term1)) {
|
||||||
term1.c_str(), 0) == FNM_NOMATCH) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOGDEB(("XapCompSynFam::keyWildExpand: Pushing %s\n",
|
LOGDEB2(("XapCompSynFam::keyWildExpand: [%s]\n",
|
||||||
(*xit1).c_str()));
|
(*xit1).c_str()));
|
||||||
result.push_back(*xit1);
|
result.push_back(*xit1);
|
||||||
}
|
}
|
||||||
// Same with key itself
|
// Same with key itself
|
||||||
string term = (*xit).substr(preflen);
|
string term = (*xit).substr(preflen);
|
||||||
if (filtertrans) {
|
if (filter_exp.isNotNull()) {
|
||||||
string term1 = (*filtertrans)(term);
|
string term1 = (*filtertrans)(term);
|
||||||
LOGDEB((" Testing [%s] against [%s]\n",
|
LOGDEB2((" Testing [%s] against [%s]\n",
|
||||||
term1.c_str(), filter_exp.c_str()));
|
term1.c_str(), filter_exp->exp().c_str()));
|
||||||
if (fnmatch(filter_exp.c_str(),
|
if (!filter_exp->match(term1)) {
|
||||||
term1.c_str(), 0) == FNM_NOMATCH) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOGDEB(("XapCompSynFam::keyWildExpand: Pushing [%s]\n",
|
LOGDEB2(("XapCompSynFam::keyWildExpand: [%s]\n", term.c_str()));
|
||||||
term.c_str()));
|
|
||||||
result.push_back(term);
|
result.push_back(term);
|
||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR(("XapCompSynFam::keyWildExpand: error: term [%s]\n",
|
LOGERR(("XapCompSynFam::keyWildExpand: xapian: [%s]\n", ermsg.c_str()));
|
||||||
inexp.c_str()));
|
|
||||||
result.push_back(inexp);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
@ -304,6 +285,7 @@ static int op_flags;
|
||||||
#define OPT_l 0x20
|
#define OPT_l 0x20
|
||||||
#define OPT_s 0x40
|
#define OPT_s 0x40
|
||||||
#define OPT_e 0x80
|
#define OPT_e 0x80
|
||||||
|
|
||||||
static string usage =
|
static string usage =
|
||||||
" -d <dbdir> {-s|-a|-u} database dir and synfamily: stem accents/case ustem\n"
|
" -d <dbdir> {-s|-a|-u} database dir and synfamily: stem accents/case ustem\n"
|
||||||
" -l : list members\n"
|
" -l : list members\n"
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
|
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "xmacros.h"
|
#include "xmacros.h"
|
||||||
|
#include "strmatcher.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
|
@ -141,8 +142,9 @@ public:
|
||||||
bool synExpand(const std::string& term, std::vector<std::string>& result,
|
bool synExpand(const std::string& term, std::vector<std::string>& result,
|
||||||
SynTermTrans *filtertrans = 0);
|
SynTermTrans *filtertrans = 0);
|
||||||
|
|
||||||
/** Expand key to wildcard/regexp matching keys */
|
/** Same with also wildcard/regexp expansion of entry against the keys.
|
||||||
bool keyWildExpand(const std::string& in, std::vector<std::string>& result,
|
* The input matcher will be modified to fit our key format. */
|
||||||
|
bool synKeyExpand(StrMatcher* in, std::vector<std::string>& result,
|
||||||
SynTermTrans *filtertrans = 0);
|
SynTermTrans *filtertrans = 0);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue