Make Recoll optionally sensitive to case and diacritics

This commit is contained in:
Jean-Francois Dockes 2012-09-14 14:34:27 +02:00
parent 7fcfe27952
commit 166624f7f2
30 changed files with 849 additions and 487 deletions

View file

@ -63,26 +63,57 @@ bool unacmaybefold(const string &in, string &out,
return true; return true;
} }
// Functions to determine upper-case or accented status could be implemented
// hugely more efficiently inside the unac c code, but they're only used for
// testing user-entered terms, so we don't really care.
bool unaciscapital(const string& in) bool unaciscapital(const string& in)
{ {
LOGDEB2(("unaciscapital: [%s]\n", in.c_str()));
if (in.empty()) if (in.empty())
return false; return false;
Utf8Iter it(in); Utf8Iter it(in);
string shorter; string shorter;
it.appendchartostring(shorter); it.appendchartostring(shorter);
string noacterm, noaclowterm; string lower;
if (!unacmaybefold(shorter, noacterm, "UTF-8", UNACOP_UNAC)) { if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str())); LOGINFO(("unaciscapital: unac/fold failed for [%s]\n", in.c_str()));
return false; return false;
} }
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", UNACOP_UNACFOLD)) { Utf8Iter it1(lower);
LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str())); if (*it != *it1)
return true;
else
return false; return false;
} }
Utf8Iter it1(noacterm); bool unachasuppercase(const string& in)
Utf8Iter it2(noaclowterm); {
if (*it1 != *it2) LOGDEB2(("unachasuppercase: [%s]\n", in.c_str()));
if (in.empty())
return false;
string lower;
if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
LOGINFO(("unachasuppercase: unac/fold failed for [%s]\n", in.c_str()));
return false;
}
if (lower != in)
return true;
else
return false;
}
bool unachasaccents(const string& in)
{
LOGDEB2(("unachasaccents: [%s]\n", in.c_str()));
if (in.empty())
return false;
string noac;
if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
LOGINFO(("unachasaccents: unac/unac failed for [%s]\n", in.c_str()));
return false;
}
if (noac != in)
return true; return true;
else else
return false; return false;
@ -110,9 +141,12 @@ static char usage [] = "\n"
" Default : unaccent\n" " Default : unaccent\n"
" -c : unaccent and casefold\n" " -c : unaccent and casefold\n"
" -C : casefold only\n" " -C : casefold only\n"
"-t <string> test string as capitalized, upper-case anywhere, accents\n"
" the parameter is supposedly utf-8 so this can only work in an utf-8\n"
" locale\n"
"\n"; "\n";
; ;
static void static void
Usage(void) Usage(void)
{ {
@ -123,6 +157,7 @@ Usage(void)
static int op_flags; static int op_flags;
#define OPT_c 0x2 #define OPT_c 0x2
#define OPT_C 0x4 #define OPT_C 0x4
#define OPT_t 0x8
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
@ -140,21 +175,35 @@ int main(int argc, char **argv)
switch (*(*argv)++) { switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break; case 'c': op_flags |= OPT_c; break;
case 'C': op_flags |= OPT_C; break; case 'C': op_flags |= OPT_C; break;
case 't': op_flags |= OPT_t; break;
default: Usage(); break; default: Usage(); break;
} }
argc--; argv++; argc--; argv++;
} }
if (op_flags & OPT_t) {
if (argc != 1)
Usage();
string in = *argv++;argc--;
bool capital, upper, accent;
capital = unaciscapital(in);
upper = unachasuppercase(in);
accent = unachasaccents(in);
cout << "[" << in << "] : " <<
"capitalized: " << (capital ? "Yes. " : "No. ") <<
"has uppercase: " << (upper ? "Yes. " : "No. ") <<
"has accents: " << (accent ? "Yes. " : "No. ") <<
endl;
return 0;
} else {
if (argc != 3)
Usage();
if (op_flags & OPT_c) { if (op_flags & OPT_c) {
op = UNACOP_UNACFOLD; op = UNACOP_UNACFOLD;
} else if (op_flags & OPT_C) { } else if (op_flags & OPT_C) {
op = UNACOP_FOLD; op = UNACOP_FOLD;
} }
if (argc != 3) {
Usage();
}
const char *encoding = *argv++; argc--; const char *encoding = *argv++; argc--;
string ifn = *argv++; argc--; string ifn = *argv++; argc--;
if (!ifn.compare("stdin")) if (!ifn.compare("stdin"))
@ -193,5 +242,6 @@ int main(int argc, char **argv)
close(fd); close(fd);
return 0; return 0;
} }
}
#endif #endif

View file

@ -24,11 +24,17 @@ using std::string;
#endif /* NO_NAMESPACES */ #endif /* NO_NAMESPACES */
// A small stringified wrapper for unac.c // A small stringified wrapper for unac.c
enum UnacOp {UNACOP_UNAC, UNACOP_UNACFOLD, UNACOP_FOLD}; enum UnacOp {UNACOP_UNAC = 1, UNACOP_FOLD = 2, UNACOP_UNACFOLD = 3};
extern bool unacmaybefold(const string& in, string& out, extern bool unacmaybefold(const string& in, string& out,
const char *encoding, UnacOp what); const char *encoding, UnacOp what);
// Utility function to determine if string begins with capital // Utility function to determine if string begins with capital
extern bool unaciscapital(const string& in); extern bool unaciscapital(const string& in);
// Utility function to determine if string has upper-case anywhere
extern bool unachasuppercase(const string& in);
// Utility function to determine if any character is accented. This
// appropriately ignores the characters from unac_except_chars which
// are really separate letters
extern bool unachasaccents(const string& in);
#endif /* _UNACPP_H_INCLUDED_ */ #endif /* _UNACPP_H_INCLUDED_ */

View file

@ -17,6 +17,7 @@
#ifndef TEST_SUBTREELIST #ifndef TEST_SUBTREELIST
#include "cstr.h"
#include "refcntr.h" #include "refcntr.h"
#include "rcldb.h" #include "rcldb.h"
#include "searchdata.h" #include "searchdata.h"
@ -35,7 +36,7 @@ bool subtreelist(RclConfig *config, const string& top,
return false; return false;
} }
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR); Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, cstr_null);
RefCntr<Rcl::SearchData> rq(sd); RefCntr<Rcl::SearchData> rq(sd);
rq->addDirSpec(top); rq->addDirSpec(top);

View file

@ -6,8 +6,8 @@ LIBS = librcl.a
all: $(LIBS) all: $(LIBS)
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o expansiondbs.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp 
docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp expansiondbs.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
librcl.a : $(DEPS) $(OBJS) librcl.a : $(DEPS) $(OBJS)
ar ru librcl.a $(OBJS) ar ru librcl.a $(OBJS)
@ -87,6 +87,8 @@ wasastringtoquery.o : ../query/wasastringtoquery.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp $(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
wasatorcl.o : ../query/wasatorcl.cpp $(depth)/mk/localdefs wasatorcl.o : ../query/wasatorcl.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp $(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
expansiondbs.o : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/expansiondbs.cpp
rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
@ -278,6 +280,9 @@ wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp $(depth)/mk/localde
wasatorcl.dep.stamp : ../query/wasatorcl.cpp $(depth)/mk/localdefs wasatorcl.dep.stamp : ../query/wasatorcl.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep $(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
touch wasatorcl.dep.stamp touch wasatorcl.dep.stamp
expansiondbs.dep.stamp : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/expansiondbs.cpp > expansiondbs.dep
touch expansiondbs.dep.stamp
rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep
touch rcldb.dep.stamp touch rcldb.dep.stamp
@ -405,6 +410,7 @@ include reslistpager.dep
include sortseq.dep include sortseq.dep
include wasastringtoquery.dep include wasastringtoquery.dep
include wasatorcl.dep include wasatorcl.dep
include expansiondbs.dep
include rcldb.dep include rcldb.dep
include rcldoc.dep include rcldoc.dep
include rclquery.dep include rclquery.dep

View file

@ -41,6 +41,7 @@ ${depth}/query/reslistpager.cpp \
${depth}/query/sortseq.cpp \ ${depth}/query/sortseq.cpp \
${depth}/query/wasastringtoquery.cpp \ ${depth}/query/wasastringtoquery.cpp \
${depth}/query/wasatorcl.cpp \ ${depth}/query/wasatorcl.cpp \
${depth}/rcldb/expansiondbs.cpp \
${depth}/rcldb/rcldb.cpp \ ${depth}/rcldb/rcldb.cpp \
${depth}/rcldb/rcldoc.cpp \ ${depth}/rcldb/rcldoc.cpp \
${depth}/rcldb/rclquery.cpp \ ${depth}/rcldb/rclquery.cpp \

View file

@ -93,7 +93,7 @@ SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs)
if (stp && strcasecmp(stp, "or")) { if (stp && strcasecmp(stp, "or")) {
tp = Rcl::SCLT_OR; tp = Rcl::SCLT_OR;
} }
self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp)); self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp, "english"));
return 0; return 0;
} }
@ -715,18 +715,18 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_AttributeError, "query"); PyErr_SetString(PyExc_AttributeError, "query");
return 0; return 0;
} }
// SearchData defaults to stemming in english
// Use default for now but need to add way to specify language
string reason; string reason;
Rcl::SearchData *sd = wasaStringToRcl(rclconfig, utf8, reason); Rcl::SearchData *sd = wasaStringToRcl(rclconfig, dostem ? "english" : "",
utf8, reason);
if (!sd) { if (!sd) {
PyErr_SetString(PyExc_ValueError, reason.c_str()); PyErr_SetString(PyExc_ValueError, reason.c_str());
return 0; return 0;
} }
// SearchData defaults to stemming in english
// Use default for now but need to add way to specify language
if (!dostem)
sd->setStemlang("");
RefCntr<Rcl::SearchData> rq(sd); RefCntr<Rcl::SearchData> rq(sd);
string sf = self->sortfield ? string(self->sortfield) : string(""); string sf = self->sortfield ? string(self->sortfield) : string("");
self->query->setSortBy(sf, self->ascending); self->query->setSortBy(sf, self->ascending);

View file

@ -356,8 +356,9 @@ size_t AdvSearch::stringToSize(QString qsize)
using namespace Rcl; using namespace Rcl;
void AdvSearch::runSearch() void AdvSearch::runSearch()
{ {
string stemLang = prefs.stemlang();
RefCntr<SearchData> sdata(new SearchData(conjunctCMB->currentIndex() == 0 ? RefCntr<SearchData> sdata(new SearchData(conjunctCMB->currentIndex() == 0 ?
SCLT_AND : SCLT_OR)); SCLT_AND : SCLT_OR, stemLang));
bool hasclause = false; bool hasclause = false;
for (list<SearchClauseW*>::iterator it = m_clauseWins.begin(); for (list<SearchClauseW*>::iterator it = m_clauseWins.begin();

View file

@ -372,6 +372,18 @@ void rwSettings(bool writing)
} }
} }
// Compute the stemming language specification to use for queries.
//
// The value comes from the queryStemLang preference (read through the
// global prefs object). The special value "ALL" is replaced by the
// "indexstemminglanguages" configuration parameter when a configuration
// object is available, and by an empty string when there is none.
string PrefsPack::stemlang()
{
    string lang = (const char *)prefs.queryStemLang.toAscii();

    // Anything other than the "ALL" marker is used verbatim.
    if (lang != "ALL")
        return lang;

    // No configuration to ask: fall back to an empty language list.
    if (!theconfig)
        return string();

    // NOTE(review): the result of getConfParam() is not checked here;
    // what lang holds when the parameter is unset depends on that call.
    theconfig->getConfParam("indexstemminglanguages", lang);
    return lang;
}
QString myGetFileName(bool isdir, QString caption, bool filenosave) QString myGetFileName(bool isdir, QString caption, bool filenosave)
{ {
LOGDEB1(("myFileDialog: isdir %d\n", isdir)); LOGDEB1(("myFileDialog: isdir %d\n", isdir));

View file

@ -120,6 +120,8 @@ class PrefsPack {
// Default paragraph format for result list // Default paragraph format for result list
static const char *dfltResListFormat; static const char *dfltResListFormat;
std::string stemlang();
PrefsPack() : PrefsPack() :
respagesize(8), respagesize(8),
reslistfontsize(10), reslistfontsize(10),

View file

@ -756,12 +756,6 @@ void RclMain::startSearch(RefCntr<Rcl::SearchData> sdata)
return; return;
} }
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") {
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
sdata->setStemlang(stemLang);
Rcl::Query *query = new Rcl::Query(rcldb); Rcl::Query *query = new Rcl::Query(rcldb);
query->setCollapseDuplicates(prefs.collapseDuplicates); query->setCollapseDuplicates(prefs.collapseDuplicates);
@ -1073,9 +1067,7 @@ void RclMain::showActiveTypes()
// Get list of all mime types in index. For this, we use a // Get list of all mime types in index. For this, we use a
// wildcard field search on mtype // wildcard field search on mtype
Rcl::TermMatchResult matches; Rcl::TermMatchResult matches;
string prefix; if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype")) {
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype",
&prefix)) {
QMessageBox::warning(0, tr("Error"), QMessageBox::warning(0, tr("Error"),
tr("Index query error"), tr("Index query error"),
QMessageBox::Ok, QMessageBox::Ok,
@ -1088,7 +1080,7 @@ void RclMain::showActiveTypes()
for (vector<Rcl::TermMatchEntry>::const_iterator it = for (vector<Rcl::TermMatchEntry>::const_iterator it =
matches.entries.begin(); matches.entries.begin();
it != matches.entries.end(); it++) { it != matches.entries.end(); it++) {
mtypesfromdb.insert(it->term.substr(prefix.size())); mtypesfromdb.insert(it->term.substr(matches.prefix.size()));
} }
// All types listed in mimeconf: // All types listed in mimeconf:
@ -1771,7 +1763,7 @@ void RclMain::showDocHistory()
} }
// Construct a bogus SearchData structure // Construct a bogus SearchData structure
RefCntr<Rcl::SearchData>searchdata = RefCntr<Rcl::SearchData>searchdata =
RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND)); RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND, cstr_null));
searchdata->setDescription((const char *)tr("History data").toUtf8()); searchdata->setDescription((const char *)tr("History data").toUtf8());

View file

@ -126,23 +126,25 @@ void SSearch::startSimpleSearch()
if (u8.length() == 0) if (u8.length() == 0)
return; return;
string stemlang = prefs.stemlang();
SSearchType tp = (SSearchType)searchTypCMB->currentIndex(); SSearchType tp = (SSearchType)searchTypCMB->currentIndex();
Rcl::SearchData *sdata = 0; Rcl::SearchData *sdata = 0;
if (tp == SST_LANG) { if (tp == SST_LANG) {
string reason; string reason;
if (prefs.autoSuffsEnable) if (prefs.autoSuffsEnable)
sdata = wasaStringToRcl(theconfig, u8, reason, sdata = wasaStringToRcl(theconfig, stemlang, u8, reason,
(const char *)prefs.autoSuffs.toUtf8()); (const char *)prefs.autoSuffs.toUtf8());
else else
sdata = wasaStringToRcl(theconfig, u8, reason); sdata = wasaStringToRcl(theconfig, stemlang, u8, reason);
if (sdata == 0) { if (sdata == 0) {
QMessageBox::warning(0, "Recoll", tr("Bad query string") + ": " + QMessageBox::warning(0, "Recoll", tr("Bad query string") + ": " +
QString::fromAscii(reason.c_str())); QString::fromAscii(reason.c_str()));
return; return;
} }
} else { } else {
sdata = new Rcl::SearchData(Rcl::SCLT_OR); sdata = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
if (sdata == 0) { if (sdata == 0) {
QMessageBox::warning(0, "Recoll", tr("Out of memory")); QMessageBox::warning(0, "Recoll", tr("Out of memory"));
return; return;
@ -166,11 +168,6 @@ void SSearch::startSimpleSearch()
} }
if (prefs.ssearchAutoPhrase && rcldb) { if (prefs.ssearchAutoPhrase && rcldb) {
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") {
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
sdata->setStemlang(stemLang);
sdata->maybeAddAutoPhrase(*rcldb, sdata->maybeAddAutoPhrase(*rcldb,
prefs.ssearchAutoPhraseThreshPC / 100.0); prefs.ssearchAutoPhraseThreshPC / 100.0);
} }
@ -277,10 +274,9 @@ void SSearch::completion()
// Query database // Query database
const int max = 100; const int max = 100;
Rcl::TermMatchResult tmres; Rcl::TermMatchResult tmres;
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") { string stemLang = prefs.stemlang();
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
if (!rcldb->termMatch(Rcl::Db::ET_WILD, stemLang, s, tmres, max) || if (!rcldb->termMatch(Rcl::Db::ET_WILD, stemLang, s, tmres, max) ||
tmres.entries.size() == 0) { tmres.entries.size() == 0) {
QApplication::beep(); QApplication::beep();

View file

@ -120,7 +120,8 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
LOGDEB(("DocSequenceDb::setFiltSpec\n")); LOGDEB(("DocSequenceDb::setFiltSpec\n"));
if (fs.isNotNull()) { if (fs.isNotNull()) {
// We build a search spec by adding a filtering layer to the base one. // We build a search spec by adding a filtering layer to the base one.
m_fsdata = RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND)); m_fsdata = RefCntr<Rcl::SearchData>(
new Rcl::SearchData(Rcl::SCLT_AND, m_sdata->getStemLang()));
Rcl::SearchDataClauseSub *cl = Rcl::SearchDataClauseSub *cl =
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata); new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata);
m_fsdata->addClause(cl); m_fsdata->addClause(cl);
@ -138,6 +139,7 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
string reason; string reason;
Rcl::SearchData *sd = Rcl::SearchData *sd =
wasaStringToRcl(m_q->whatDb()->getConf(), wasaStringToRcl(m_q->whatDb()->getConf(),
m_sdata->getStemLang(),
fs.values[i], reason); fs.values[i], reason);
if (sd) { if (sd) {
Rcl::SearchDataClauseSub *cl1 = Rcl::SearchDataClauseSub *cl1 =

View file

@ -50,7 +50,10 @@ static string vecStringToString(const vector<string>& t)
} }
struct MatchEntry { struct MatchEntry {
// Start/End byte offsets in the document text
pair<int, int> offs; pair<int, int> offs;
// Index of the search group this comes from: this is to relate a
// match to the original user input.
unsigned int grpidx; unsigned int grpidx;
MatchEntry(int sta, int sto, unsigned int idx) MatchEntry(int sta, int sto, unsigned int idx)
: offs(sta, sto), grpidx(idx) : offs(sta, sto), grpidx(idx)
@ -76,11 +79,23 @@ class TextSplitPTR : public TextSplit {
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin(); for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
vit != hdata.groups.end(); vit++) { vit != hdata.groups.end(); vit++) {
if (vit->size() == 1) { if (vit->size() == 1) {
#ifdef RCL_INDEX_STRIPCHARS
m_terms[vit->front()] = vit - hdata.groups.begin(); m_terms[vit->front()] = vit - hdata.groups.begin();
#else
string dumb = vit->front();
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
m_terms[dumb] = vit - hdata.groups.begin();
#endif
} else if (vit->size() > 1) { } else if (vit->size() > 1) {
for (vector<string>::const_iterator it = vit->begin(); for (vector<string>::const_iterator it = vit->begin();
it != vit->end(); it++) { it != vit->end(); it++) {
#ifdef RCL_INDEX_STRIPCHARS
m_gterms.insert(*it); m_gterms.insert(*it);
#else
string dumb = *it;
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
m_gterms.insert(dumb);
#endif
} }
} }
} }

View file

@ -286,7 +286,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
Rcl::SearchData *sd = 0; Rcl::SearchData *sd = 0;
if (op_flags & (OPT_a|OPT_o|OPT_f)) { if (op_flags & (OPT_a|OPT_o|OPT_f)) {
sd = new Rcl::SearchData(Rcl::SCLT_OR); sd = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
Rcl::SearchDataClause *clp = 0; Rcl::SearchDataClause *clp = 0;
if (op_flags & OPT_f) { if (op_flags & OPT_f) {
clp = new Rcl::SearchDataClauseFilename(qs); clp = new Rcl::SearchDataClauseFilename(qs);
@ -305,14 +305,13 @@ int recollq(RclConfig **cfp, int argc, char **argv)
if (sd) if (sd)
sd->addClause(clp); sd->addClause(clp);
} else { } else {
sd = wasaStringToRcl(rclconfig, qs, reason); sd = wasaStringToRcl(rclconfig, stemlang, qs, reason);
} }
if (!sd) { if (!sd) {
cerr << "Query string interpretation failed: " << reason << endl; cerr << "Query string interpretation failed: " << reason << endl;
return 1; return 1;
} }
sd->setStemlang(stemlang);
RefCntr<Rcl::SearchData> rq(sd); RefCntr<Rcl::SearchData> rq(sd);
Rcl::Query query(&rcldb); Rcl::Query query(&rcldb);

View file

@ -32,7 +32,9 @@ using std::list;
#include "refcntr.h" #include "refcntr.h"
#include "textsplit.h" #include "textsplit.h"
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, static Rcl::SearchData *wasaQueryToRcl(RclConfig *config,
const string& stemlang,
WasaQuery *wasa,
const string& autosuffs, string& reason) const string& autosuffs, string& reason)
{ {
if (wasa == 0) { if (wasa == 0) {
@ -47,7 +49,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
Rcl::SearchData *sdata = new Rcl::SearchData *sdata = new
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND : Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
Rcl::SCLT_OR); Rcl::SCLT_OR, stemlang);
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ? LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
"AND" : "OR")); "AND" : "OR"));
@ -250,7 +252,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str())); (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
// Create a subquery. // Create a subquery.
Rcl::SearchData *sub = Rcl::SearchData *sub =
wasaQueryToRcl(config, *it, autosuffs, reason); wasaQueryToRcl(config, stemlang, *it, autosuffs, reason);
if (sub == 0) { if (sub == 0) {
continue; continue;
} }
@ -278,7 +280,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
return sdata; return sdata;
} }
Rcl::SearchData *wasaStringToRcl(RclConfig *config, Rcl::SearchData *wasaStringToRcl(RclConfig *config, const string& stemlang,
const string &qs, string &reason, const string &qs, string &reason,
const string& autosuffs) const string& autosuffs)
{ {
@ -286,5 +288,5 @@ Rcl::SearchData *wasaStringToRcl(RclConfig *config,
WasaQuery *wq = parser.stringToQuery(qs, reason); WasaQuery *wq = parser.stringToQuery(qs, reason);
if (wq == 0) if (wq == 0)
return 0; return 0;
return wasaQueryToRcl(config, wq, autosuffs, reason); return wasaQueryToRcl(config, stemlang, wq, autosuffs, reason);
} }

View file

@ -25,7 +25,7 @@ using std::string;
class RclConfig; class RclConfig;
extern Rcl::SearchData *wasaStringToRcl(RclConfig *, extern Rcl::SearchData *wasaStringToRcl(RclConfig *, const string& stemlang,
const string& query, string &reason, const string& query, string &reason,
const string& autosuffs = string()); const string& autosuffs = string());
#endif /* _WASATORCL_H_INCLUDED_ */ #endif /* _WASATORCL_H_INCLUDED_ */

View file

@ -14,6 +14,9 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ */
#include "autoconfig.h"
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <signal.h> #include <signal.h>
@ -36,7 +39,8 @@ using namespace std;
static string thisprog; static string thisprog;
static string usage = static string usage =
" -d <dbdir> -e <output encoding>\n" " -d <dbdir> \n"
"-e <output encoding>\n"
" -i docid -D : get document data for docid\n" " -i docid -D : get document data for docid\n"
" -i docid -X : delete document docid\n" " -i docid -X : delete document docid\n"
" -i docid -b : 'rebuild' document from term positions\n" " -i docid -b : 'rebuild' document from term positions\n"
@ -112,6 +116,15 @@ static void sigcleanup(int sig)
exit(1); exit(1);
} }
// Tell whether an index term starts with a prefix marker.
//
// When RCL_INDEX_STRIPCHARS is defined, a term is considered prefixed if
// its first byte is an ASCII upper-case letter ('A'..'Z'). Otherwise a
// term is considered prefixed if its first byte is ':'.
// An empty term is never prefixed.
inline bool has_prefix(const string& trm)
{
    if (trm.empty())
        return false;

    const char first = trm[0];
#ifdef RCL_INDEX_STRIPCHARS
    return first >= 'A' && first <= 'Z';
#else
    return first == ':';
#endif
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
string dbdir = path_cat(path_home(), ".recoll/xapiandb"); string dbdir = path_cat(path_home(), ".recoll/xapiandb");
@ -201,8 +214,7 @@ int main(int argc, char **argv)
for (term = db->termlist_begin(docid); for (term = db->termlist_begin(docid);
term != db->termlist_end(docid);term++) { term != db->termlist_end(docid);term++) {
const string& s = *term; const string& s = *term;
if ((op_flags&OPT_l) && if ((op_flags&OPT_l) && has_prefix(s))
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
continue; continue;
cout << op << detailstring(s) << cl << endl; cout << op << detailstring(s) << cl << endl;
} }
@ -210,8 +222,7 @@ int main(int argc, char **argv)
for (term = db->allterms_begin(); for (term = db->allterms_begin();
term != db->allterms_end();term++) { term != db->allterms_end();term++) {
const string& s = *term; const string& s = *term;
if ((op_flags&OPT_l) && if ((op_flags&OPT_l) && has_prefix(s))
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
continue; continue;
if (op_flags & OPT_f) if (op_flags & OPT_f)
cout << db->get_collection_freq(*term) << " " cout << db->get_collection_freq(*term) << " "

View file

@ -72,7 +72,7 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
SynTermTransUnac transunac(UNACOP_UNACFOLD); SynTermTransUnac transunac(UNACOP_UNACFOLD);
XapWritableComputableSynFamMember XapWritableComputableSynFamMember
diacasedb(wdb, synFamDiac, "all", &transunac); diacasedb(wdb, synFamDiCa, "all", &transunac);
diacasedb.recreate(); diacasedb.recreate();
#endif #endif

View file

@ -14,6 +14,8 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ */
#include "autoconfig.h"
#include <stdio.h> #include <stdio.h>
#include <cstring> #include <cstring>
#include <unistd.h> #include <unistd.h>
@ -53,6 +55,7 @@ using namespace std;
#include "cancelcheck.h" #include "cancelcheck.h"
#include "ptmutex.h" #include "ptmutex.h"
#include "termproc.h" #include "termproc.h"
#include "expansiondbs.h"
#ifndef MAX #ifndef MAX
#define MAX(A,B) (A>B?A:B) #define MAX(A,B) (A>B?A:B)
@ -84,9 +87,15 @@ static const string xapday_prefix = "D";
static const string xapmonth_prefix = "M"; static const string xapmonth_prefix = "M";
static const string xapyear_prefix = "Y"; static const string xapyear_prefix = "Y";
const string pathelt_prefix = "XP"; const string pathelt_prefix = "XP";
#ifdef RCL_INDEX_STRIPCHARS
const string start_of_field_term = "XXST"; const string start_of_field_term = "XXST";
const string end_of_field_term = "XXND"; const string end_of_field_term = "XXND";
static const string page_break_term = "XXPG"; static const string page_break_term = "XXPG";
#else
const string start_of_field_term = "XXST/";
const string end_of_field_term = "XXND/";
static const string page_break_term = "XXPG/";
#endif
// Field name for the unsplit file name. Has to exist in the field file // Field name for the unsplit file name. Has to exist in the field file
// because of usage in termmatch() // because of usage in termmatch()
static const string unsplitFilenameFieldName = "rclUnsplitFN"; static const string unsplitFilenameFieldName = "rclUnsplitFN";
@ -197,7 +206,7 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
{ {
for (vector<string>::const_iterator qit = in.begin(); for (vector<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) { qit != in.end(); qit++) {
if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z')) if (!has_prefix(*qit))
out.push_back(*qit); out.push_back(*qit);
} }
} }
@ -573,7 +582,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
for (term = xrdb.termlist_begin(docid); for (term = xrdb.termlist_begin(docid);
term != xrdb.termlist_end(docid); term++) { term != xrdb.termlist_end(docid); term++) {
// Ignore prefixed terms // Ignore prefixed terms
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z') if (has_prefix(*term))
continue; continue;
if (cutoff-- < 0) { if (cutoff-- < 0) {
LOGDEB0(("makeAbstract: max term count cutoff\n")); LOGDEB0(("makeAbstract: max term count cutoff\n"));
@ -652,6 +661,8 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
vabs.push_back(chunk); vabs.push_back(chunk);
chunk.clear(); chunk.clear();
} else { } else {
if (it->second.compare(end_of_field_term) &&
it->second.compare(start_of_field_term))
chunk += it->second; chunk += it->second;
} }
} }
@ -874,11 +885,13 @@ int Db::termDocCnt(const string& _term)
if (!m_ndb || !m_ndb->m_isopen) if (!m_ndb || !m_ndb->m_isopen)
return -1; return -1;
string term; string term = _term;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str())); LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
return 0; return 0;
} }
#endif
if (m_stops.isStop(term)) { if (m_stops.isStop(term)) {
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str())); LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
@ -994,8 +1007,19 @@ class TextSplitDb : public TextSplitP {
{} {}
// Reimplement text_to_words to add start and end special terms // Reimplement text_to_words to add start and end special terms
virtual bool text_to_words(const string &in); virtual bool text_to_words(const string &in);
void setprefix(const string& pref) {prefix = pref;}
void setwdfinc(int i) {wdfinc = i;} void setprefix(const string& pref)
{
if (pref.empty())
prefix.clear();
else
prefix = wrap_prefix(pref);
}
void setwdfinc(int i)
{
wdfinc = i;
}
friend class TermProcIdx; friend class TermProcIdx;
@ -1127,11 +1151,13 @@ string Db::getSpellingSuggestion(const string& word)
{ {
if (m_ndb == 0) if (m_ndb == 0)
return string(); return string();
string term; string term = word;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) { if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str())); LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
return string(); return string();
} }
#endif
if (!isSpellingCandidate(term)) if (!isSpellingCandidate(term))
return string(); return string();
return m_ndb->xrdb.get_spelling_suggestion(term); return m_ndb->xrdb.get_spelling_suggestion(term);
@ -1240,7 +1266,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
TermProc *nxt = &tpidx; TermProc *nxt = &tpidx;
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop; TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon; //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
#ifdef RCL_INDEX_STRIPCHARS
TermProcPrep tpprep(nxt); nxt = &tpprep; TermProcPrep tpprep(nxt); nxt = &tpprep;
#endif
TextSplitDb splitter(newdocument, nxt); TextSplitDb splitter(newdocument, nxt);
tpidx.setTSD(&splitter); tpidx.setTSD(&splitter);
@ -1266,7 +1294,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
vector<string> vpath; vector<string> vpath;
stringToTokens(path, vpath, "/"); stringToTokens(path, vpath, "/");
splitter.curpos = 0; splitter.curpos = 0;
newdocument.add_posting(pathelt_prefix, newdocument.add_posting(wrap_prefix(pathelt_prefix),
splitter.basepos + splitter.curpos++); splitter.basepos + splitter.curpos++);
for (vector<string>::iterator it = vpath.begin(); for (vector<string>::iterator it = vpath.begin();
it != vpath.end(); it++){ it != vpath.end(); it++){
@ -1274,7 +1302,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
// Just truncate it. May still be useful because of wildcards // Just truncate it. May still be useful because of wildcards
*it = it->substr(0, 230); *it = it->substr(0, 230);
} }
newdocument.add_posting(pathelt_prefix + *it, newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
splitter.basepos + splitter.curpos++); splitter.basepos + splitter.curpos++);
} }
} }
@ -1319,7 +1347,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
////// Special terms for other metadata. No positions for these. ////// Special terms for other metadata. No positions for these.
// Mime type // Mime type
newdocument.add_term(mimetype_prefix + doc.mimetype); newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
// Simple file name indexed unsplit for specific "file name" // Simple file name indexed unsplit for specific "file name"
// searches. This is not the same as a filename: clause inside the // searches. This is not the same as a filename: clause inside the
@ -1335,9 +1363,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
utf8truncate(fn, 230); utf8truncate(fn, 230);
string::size_type pos = fn.rfind('.'); string::size_type pos = fn.rfind('.');
if (pos != string::npos && pos != fn.length() - 1) { if (pos != string::npos && pos != fn.length() - 1) {
newdocument.add_term(fileext_prefix + fn.substr(pos + 1)); newdocument.add_term(wrap_prefix(fileext_prefix) +
fn.substr(pos + 1));
} }
newdocument.add_term(unsplitfilename_prefix + fn); newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
} }
} }
@ -1357,11 +1386,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
char buf[9]; char buf[9];
snprintf(buf, 9, "%04d%02d%02d", snprintf(buf, 9, "%04d%02d%02d",
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD) // Date (YYYYMMDD)
newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf));
// Month (YYYYMM)
buf[6] = '\0'; buf[6] = '\0';
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM) newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
// Year (YYYY)
buf[4] = '\0'; buf[4] = '\0';
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY) newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf));
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
@ -1834,7 +1866,7 @@ bool Db::maxYearSpan(int *minyear, int *maxyear)
*minyear = 1000000; *minyear = 1000000;
*maxyear = -1000000; *maxyear = -1000000;
TermMatchResult result; TermMatchResult result;
if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear")) if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
return false; return false;
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin(); for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
it != result.entries.end(); it++) { it != result.entries.end(); it++) {
@ -1899,30 +1931,32 @@ const string cstr_wildSpecChars = "*?[";
const string cstr_regSpecChars = "(.[{"; const string cstr_regSpecChars = "(.[{";
// Find all index terms that match a wildcard or regular expression // Find all index terms that match a wildcard or regular expression
// If field is set, we return a list of appropriately prefixed terms (which
// are going to be used to build a Xapian query).
bool Db::termMatch(MatchType typ, const string &lang, bool Db::termMatch(MatchType typ, const string &lang,
const string &root, const string &root,
TermMatchResult& res, TermMatchResult& res,
int max, int max,
const string& field, const string& field)
string *prefixp
)
{ {
if (!m_ndb || !m_ndb->m_isopen) if (!m_ndb || !m_ndb->m_isopen)
return false; return false;
Xapian::Database xdb = m_ndb->xdb(); Xapian::Database xdb = m_ndb->xdb();
res.clear();
XAPTRY(res.dbdoccount = xdb.get_doccount(); XAPTRY(res.dbdoccount = xdb.get_doccount();
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason); res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
if (!m_reason.empty()) if (!m_reason.empty())
return false; return false;
// Get rid of capitals and accents // Get rid of capitals and accents
string droot;
string droot = root;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) { if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str())); LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
return false; return false;
} }
#endif
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars; string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
string prefix; string prefix;
@ -1932,17 +1966,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
field.c_str())); field.c_str()));
} else { } else {
prefix = ftp->pfx; prefix = wrap_prefix(ftp->pfx);
} }
if (prefixp)
*prefixp = prefix;
} }
res.prefix = prefix;
if (typ == ET_STEM) { if (typ == ET_STEM) {
if (!stemExpand(lang, root, res, max)) if (!stemExpand(lang, root, res, max))
return false; return false;
sort(res.entries.begin(), res.entries.end());
unique(res.entries.begin(), res.entries.end());
for (vector<TermMatchEntry>::iterator it = res.entries.begin(); for (vector<TermMatchEntry>::iterator it = res.entries.begin();
it != res.entries.end(); it++) { it != res.entries.end(); it++) {
XAPTRY(it->wcf = xdb.get_collection_freq(it->term); XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
@ -2032,7 +2063,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
TermMatchCmpByTerm tcmp; TermMatchCmpByTerm tcmp;
sort(res.entries.begin(), res.entries.end(), tcmp); sort(res.entries.begin(), res.entries.end(), tcmp);
TermMatchTermEqual teq; TermMatchTermEqual teq;
vector<TermMatchEntry>::iterator uit =
unique(res.entries.begin(), res.entries.end(), teq); unique(res.entries.begin(), res.entries.end(), teq);
res.entries.resize(uit - res.entries.begin());
TermMatchCmpByWcf wcmp; TermMatchCmpByWcf wcmp;
sort(res.entries.begin(), res.entries.end(), wcmp); sort(res.entries.begin(), res.entries.end(), wcmp);
if (max > 0) { if (max > 0) {

View file

@ -17,6 +17,8 @@
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
#include "autoconfig.h"
#include <string> #include <string>
#include <vector> #include <vector>
@ -73,21 +75,50 @@ class Query;
/** Used for returning result lists for index terms matching some criteria */
class TermMatchEntry {
public:
    /** Build an empty entry: empty term, both counters at zero. */
    TermMatchEntry()
        : wcf(0), docs(0)
    {
    }
    /** Build an entry for term t, with collection frequency f and
     *  document count d. */
    TermMatchEntry(const string& t, int f, int d)
        : term(t), wcf(f), docs(d)
    {
    }
    /** Build an entry for term t. Both counters start at zero so that
     *  no member is ever read uninitialized. */
    TermMatchEntry(const string& t)
        : term(t), wcf(0), docs(0)
    {
    }
    /** Entries compare equal when their terms are equal; the counters
     *  are not part of the comparison. */
    bool operator==(const TermMatchEntry &o) const
    {
        return term == o.term;
    }
    /** Ordering is by term only. */
    bool operator<(const TermMatchEntry &o) const
    {
        return term < o.term;
    }

    string term;
    int wcf;  // Total count of occurrences within collection.
    int docs; // Number of documents containing term.
};
/** Term match result list header: statistics and global info */
class TermMatchResult { class TermMatchResult {
public: public:
TermMatchResult() {clear();} TermMatchResult()
void clear() {entries.clear(); dbdoccount = 0; dbavgdoclen = 0;} {
clear();
}
void clear()
{
entries.clear();
dbdoccount = 0;
dbavgdoclen = 0;
}
// Term expansion
vector<TermMatchEntry> entries; vector<TermMatchEntry> entries;
// If a field was specified, this is the corresponding index prefix
string prefix;
// Index-wide stats
unsigned int dbdoccount; unsigned int dbdoccount;
double dbavgdoclen; double dbavgdoclen;
}; };
@ -95,6 +126,24 @@ public:
#ifdef IDX_THREADS #ifdef IDX_THREADS
extern void *DbUpdWorker(void*); extern void *DbUpdWorker(void*);
#endif // IDX_THREADS #endif // IDX_THREADS
/** Tell if an index term carries a prefix.
 *
 * With RCL_INDEX_STRIPCHARS defined, a term is considered prefixed when
 * its first character is in the 'A'-'Z' range. Otherwise a term is
 * considered prefixed when its first character is ':'.
 * An empty term is never prefixed.
 */
inline bool has_prefix(const string& trm)
{
    if (trm.empty())
        return false;
    const char first = trm[0];
#ifdef RCL_INDEX_STRIPCHARS
    return first >= 'A' && first <= 'Z';
#else
    return first == ':';
#endif
}
/** Turn a bare prefix into the form used inside index terms.
 *
 * With RCL_INDEX_STRIPCHARS defined the prefix is returned unchanged.
 * Otherwise it is returned with cstr_colon added before and after it.
 */
inline string wrap_prefix(const string& pfx)
{
#ifdef RCL_INDEX_STRIPCHARS
    return pfx;
#else
    string wrapped(cstr_colon);
    wrapped += pfx;
    wrapped += cstr_colon;
    return wrapped;
#endif
}
/** /**
* Wrapper class for the native database. * Wrapper class for the native database.
*/ */
@ -132,6 +181,8 @@ class Db {
{ {
if (term.empty() || term.length() > 50) if (term.empty() || term.length() > 50)
return false; return false;
if (has_prefix(term))
return false;
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
!= string::npos) != string::npos)
return false; return false;
@ -205,12 +256,23 @@ class Db {
/** Return the index terms that match the input string /** Return the index terms that match the input string
* Expansion is performed either with either wildcard or regexp processing * Expansion is performed either with either wildcard or regexp processing
* Stem expansion is performed if lang is not empty */ * Stem expansion is performed if lang is not empty
*
* @param typ defines the kind of expansion: wildcard, regexp or stemming
* @param lang sets the stemming language(s). Can be a space-separated list
* @param term is the term to expand
* @param result is the main output
* @param max defines the maximum result count
* @param field if set, defines the field within which the expansion should
* be performed. Only used for wildcards and regexps, stemming is
* always global. If this is set, the resulting output terms
* will be appropriately prefixed and the prefix value will be set
* in the TermMatchResult header
*/
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM}; enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
bool termMatch(MatchType typ, const string &lang, const string &s, bool termMatch(MatchType typ, const string &lang, const string &term,
TermMatchResult& result, int max = -1, TermMatchResult& result, int max = -1,
const string& field = cstr_null, const string& field = cstr_null
string *prefix = 0
); );
/** Return min and max years for doc mod times in db */ /** Return min and max years for doc mod times in db */
bool maxYearSpan(int *minyear, int *maxyear); bool maxYearSpan(int *minyear, int *maxyear);

View file

@ -18,12 +18,17 @@
#ifndef _rcldb_p_h_included_ #ifndef _rcldb_p_h_included_
#define _rcldb_p_h_included_ #define _rcldb_p_h_included_
#include "autoconfig.h"
#include <map> #include <map>
#include <xapian.h>
#ifdef IDX_THREADS #ifdef IDX_THREADS
#include "workqueue.h" #include "workqueue.h"
#include "debuglog.h"
#endif // IDX_THREADS #endif // IDX_THREADS
#include "xapian.h"
#include "xmacros.h" #include "xmacros.h"
namespace Rcl { namespace Rcl {

View file

@ -446,7 +446,7 @@ vector<string> Query::expand(const Doc &doc)
for (Xapian::ESetIterator it = eset.begin(); for (Xapian::ESetIterator it = eset.begin();
it != eset.end(); it++) { it != eset.end(); it++) {
LOGDEB((" [%s]\n", (*it).c_str())); LOGDEB((" [%s]\n", (*it).c_str()));
if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z')) if ((*it).empty() || has_prefix(*it))
continue; continue;
res.push_back(*it); res.push_back(*it);
if (res.size() >= 10) if (res.size() >= 10)

View file

@ -16,17 +16,22 @@
*/ */
// Handle translation from rcl's SearchData structures to Xapian Queries // Handle translation from rcl's SearchData structures to Xapian Queries
#include "autoconfig.h"
#include <stdio.h> #include <stdio.h>
#include <fnmatch.h> #include <fnmatch.h>
#include <string> #include <string>
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
using namespace std;
#include "xapian.h" #include "xapian.h"
#include "cstr.h" #include "cstr.h"
#include "rcldb.h" #include "rcldb.h"
#include "rcldb_p.h"
#include "searchdata.h" #include "searchdata.h"
#include "debuglog.h" #include "debuglog.h"
#include "smallut.h" #include "smallut.h"
@ -36,11 +41,11 @@
#include "stoplist.h" #include "stoplist.h"
#include "rclconfig.h" #include "rclconfig.h"
#include "termproc.h" #include "termproc.h"
#include "synfamily.h"
#include "stemdb.h"
#include "expansiondbs.h"
#ifndef NO_NAMESPACES
using namespace std;
namespace Rcl { namespace Rcl {
#endif
typedef vector<SearchDataClause *>::iterator qlist_it_t; typedef vector<SearchDataClause *>::iterator qlist_it_t;
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t; typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
@ -71,13 +76,23 @@ static const int original_term_wqf_booster = 10;
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA * USA
*/ */
// Helpers for building prefixed terms inside a char buffer:
//  - bufprefix(BUF, L) writes the prefix for letter L at the start of BUF
//  - bpoffs is the number of chars which the prefix occupies, i.e. the
//    offset at which the term data begins.
// The macro bodies are wrapped in do/while(0) so that "bufprefix(b, c);"
// is a single statement, usable in an unbraced if/else.
#ifdef RCL_INDEX_STRIPCHARS
#define bufprefix(BUF, L) do {(BUF)[0] = (L);} while (0)
#define bpoffs 1
#else
#define bufprefix(BUF, L) \
    do {(BUF)[0] = ':'; (BUF)[1] = (L); (BUF)[2] = ':';} while (0)
#define bpoffs 3
#endif
static Xapian::Query static Xapian::Query
date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2) date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
{ {
// Xapian uses a smallbuf and snprintf. Can't be bothered, we're // Xapian uses a smallbuf and snprintf. Can't be bothered, we're
// only doing %d's ! // only doing %d's !
char buf[200]; char buf[200];
sprintf(buf, "D%04d%02d", y1, m1); bufprefix(buf, 'D');
sprintf(buf+bpoffs, "%04d%02d", y1, m1);
vector<Xapian::Query> v; vector<Xapian::Query> v;
int d_last = monthdays(m1, y1); int d_last = monthdays(m1, y1);
@ -88,11 +103,11 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
// Deal with any initial partial month // Deal with any initial partial month
if (d1 > 1 || d_end < d_last) { if (d1 > 1 || d_end < d_last) {
for ( ; d1 <= d_end ; d1++) { for ( ; d1 <= d_end ; d1++) {
sprintf(buf + 7, "%02d", d1); sprintf(buf + 6 + bpoffs, "%02d", d1);
v.push_back(Xapian::Query(buf)); v.push_back(Xapian::Query(buf));
} }
} else { } else {
buf[0] = 'M'; bufprefix(buf, 'M');
v.push_back(Xapian::Query(buf)); v.push_back(Xapian::Query(buf));
} }
@ -102,36 +117,36 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
int m_last = (y1 < y2) ? 12 : m2 - 1; int m_last = (y1 < y2) ? 12 : m2 - 1;
while (++m1 <= m_last) { while (++m1 <= m_last) {
sprintf(buf + 5, "%02d", m1); sprintf(buf + 4 + bpoffs, "%02d", m1);
buf[0] = 'M'; bufprefix(buf, 'M');
v.push_back(Xapian::Query(buf)); v.push_back(Xapian::Query(buf));
} }
if (y1 < y2) { if (y1 < y2) {
while (++y1 < y2) { while (++y1 < y2) {
sprintf(buf + 1, "%04d", y1); sprintf(buf + bpoffs, "%04d", y1);
buf[0] = 'Y'; bufprefix(buf, 'Y');
v.push_back(Xapian::Query(buf)); v.push_back(Xapian::Query(buf));
} }
sprintf(buf + 1, "%04d", y2); sprintf(buf + bpoffs, "%04d", y2);
buf[0] = 'M'; bufprefix(buf, 'M');
for (m1 = 1; m1 < m2; m1++) { for (m1 = 1; m1 < m2; m1++) {
sprintf(buf + 5, "%02d", m1); sprintf(buf + 4 + bpoffs, "%02d", m1);
v.push_back(Xapian::Query(buf)); v.push_back(Xapian::Query(buf));
} }
} }
// The month digits come after the prefix (bpoffs chars) and the 4-digit
// year, so they are written at offset 4 + bpoffs, the same offset as the
// other month writes in this function (this was buf + 5 with a 1-char
// prefix).
sprintf(buf + 4 + bpoffs, "%02d", m2);
// Deal with any final partial month // Deal with any final partial month
if (d2 < monthdays(m2, y2)) { if (d2 < monthdays(m2, y2)) {
buf[0] = 'D'; bufprefix(buf, 'D');
for (d1 = 1 ; d1 <= d2; d1++) { for (d1 = 1 ; d1 <= d2; d1++) {
sprintf(buf + 7, "%02d", d1); sprintf(buf + 6 + bpoffs, "%02d", d1);
v.push_back(Xapian::Query(buf)); v.push_back(Xapian::Query(buf));
} }
} else { } else {
buf[0] = 'M'; bufprefix(buf, 'M');
v.push_back(Xapian::Query(buf)); v.push_back(Xapian::Query(buf));
} }
@ -172,31 +187,27 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
return true; return true;
} }
bool SearchData::toNativeQuery(Rcl::Db &db, void *d) bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
vector<SearchDataClause*>& query,
string& reason, void *d)
{ {
LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
m_stemlang.c_str()));
Xapian::Query xq; Xapian::Query xq;
m_reason.erase(); for (qlist_it_t it = query.begin(); it != query.end(); it++) {
// Walk the clause list translating each in turn and building the
// Xapian query tree
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
Xapian::Query nq; Xapian::Query nq;
if (!(*it)->toNativeQuery(db, &nq, m_stemlang)) { if (!(*it)->toNativeQuery(db, &nq)) {
LOGERR(("SearchData::toNativeQuery: failed\n")); LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n"));
m_reason = (*it)->getReason(); reason = (*it)->getReason();
return false; return false;
} }
if (nq.empty()) { if (nq.empty()) {
LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n")); LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
continue; continue;
} }
// If this structure is an AND list, must use AND_NOT for excl clauses. // If this structure is an AND list, must use AND_NOT for excl clauses.
// Else this is an OR list, and there can't be excl clauses (checked by // Else this is an OR list, and there can't be excl clauses (checked by
// addClause()) // addClause())
Xapian::Query::op op; Xapian::Query::op op;
if (m_tp == SCLT_AND) { if (tp == SCLT_AND) {
if ((*it)->m_tp == SCLT_EXCL) { if ((*it)->m_tp == SCLT_EXCL) {
op = Xapian::Query::OP_AND_NOT; op = Xapian::Query::OP_AND_NOT;
} else { } else {
@ -217,6 +228,23 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
if (xq.empty()) if (xq.empty())
xq = Xapian::Query::MatchAll; xq = Xapian::Query::MatchAll;
*((Xapian::Query *)d) = xq;
return true;
}
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
m_reason.erase();
// Walk the clause list translating each in turn and building the
// Xapian query tree
Xapian::Query xq;
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n"));
return false;
}
if (m_haveDates) { if (m_haveDates) {
// If one of the extremities is unset, compute db extremas // If one of the extremities is unset, compute db extremas
if (m_dates.y1 == 0 || m_dates.y2 == 0) { if (m_dates.y1 == 0 || m_dates.y2 == 0) {
@ -326,10 +354,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
stringToTokens(dit->dir, vpath, "/"); stringToTokens(dit->dir, vpath, "/");
vector<string> pvpath; vector<string> pvpath;
if (dit->dir[0] == '/') if (dit->dir[0] == '/')
pvpath.push_back(pathelt_prefix); pvpath.push_back(wrap_prefix(pathelt_prefix));
for (vector<string>::const_iterator pit = vpath.begin(); for (vector<string>::const_iterator pit = vpath.begin();
pit != vpath.end(); pit++){ pit != vpath.end(); pit++){
pvpath.push_back(pathelt_prefix + *pit); pvpath.push_back(wrap_prefix(pathelt_prefix) + *pit);
} }
Xapian::Query::op tdop; Xapian::Query::op tdop;
if (dit->weight == 1.0) { if (dit->weight == 1.0) {
@ -446,7 +474,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
// My type is AND. Change it to OR and insert two queries, one // My type is AND. Change it to OR and insert two queries, one
// being the original query as a subquery, the other the // being the original query as a subquery, the other the
// phrase. // phrase.
SearchData *sd = new SearchData(m_tp); SearchData *sd = new SearchData(m_tp, m_stemlang);
sd->m_query = m_query; sd->m_query = m_query;
sd->m_stemlang = m_stemlang; sd->m_stemlang = m_stemlang;
m_tp = SCLT_OR; m_tp = SCLT_OR;
@ -586,25 +614,28 @@ public:
{ } { }
bool processUserString(const string &iq, bool processUserString(const string &iq,
int mods,
string &ermsg, string &ermsg,
vector<Xapian::Query> &pqueries, vector<Xapian::Query> &pqueries,
const StopList &stops,
int slack = 0, bool useNear = false); int slack = 0, bool useNear = false);
private: private:
void expandTerm(bool dont, const string& term, vector<string>& exp, void expandTerm(int mods,
const string& term, vector<string>& exp,
string& sterm, const string& prefix); string& sterm, const string& prefix);
// After splitting entry on whitespace: process non-phrase element // After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, bool nostemexp, void processSimpleSpan(const string& span,
int mods,
vector<Xapian::Query> &pqueries); vector<Xapian::Query> &pqueries);
// Process phrase/near element // Process phrase/near element
void processPhraseOrNear(TextSplitQ *splitData, void processPhraseOrNear(TextSplitQ *splitData,
int mods,
vector<Xapian::Query> &pqueries, vector<Xapian::Query> &pqueries,
bool useNear, int slack, int mods); bool useNear, int slack);
Db& m_db; Db& m_db;
const string& m_field; const string& m_field;
const string& m_stemlang; const string& m_stemlang;
bool m_doBoostUserTerms; const bool m_doBoostUserTerms;
HighlightData& m_hld; HighlightData& m_hld;
}; };
@ -619,60 +650,187 @@ static void listVector(const string& what, const vector<string>&l)
} }
#endif #endif
/** Take simple term and expand stem and wildcards /** Expand term into term list, using appropriate mode: stem, wildcards,
* diacritics...
* *
* @param nostemexp don't perform stem expansion. This is mainly used to * @param mods stem expansion, case and diacritics sensitivity control.
* prevent stem expansion inside phrases (because the user probably
* does not expect it). This does NOT prevent wild card expansion.
* Other factors than nostemexp can prevent stem expansion:
* a null stemlang, resulting from a global user preference, a
* capitalized term, or wildcard(s)
* @param term input single word * @param term input single word
* @param exp output expansion list * @param exp output expansion list
* @param sterm output original input term if there were no wildcards * @param sterm output original input term if there were no wildcards
* @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it).
*/ */
void StringToXapianQ::expandTerm(bool nostemexp, void StringToXapianQ::expandTerm(int mods,
const string& term, const string& term,
vector<string>& exp, vector<string>& exp, string &sterm,
string &sterm, const string& prefix) const string& prefix)
{ {
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n", LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp)); m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase(); sterm.clear();
exp.clear(); exp.clear();
if (term.empty()) { if (term.empty())
return; return;
}
bool haswild = term.find_first_of(cstr_minwilds) != string::npos; bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
// No stemming if there are wildcards or prevented globally. // If there are no wildcards, add term to the list of user-entered terms
if (!haswild)
m_hld.uterms.insert(term);
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
// No stem expansion if there are wildcards or if prevented by caller
if (haswild || m_stemlang.empty()) { if (haswild || m_stemlang.empty()) {
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n")); LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true; nostemexp = true;
} }
if (!haswild) bool noexpansion = nostemexp && !haswild;
m_hld.uterms.insert(term);
if (nostemexp && !haswild) { #ifndef RCL_INDEX_STRIPCHARS
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
// If we are working with a raw index, apply the rules for case and
// diacritics sensitivity.
// If any character has a diacritic, we become
// diacritic-sensitive. Note that the way that the test is
// performed (conversion+comparison) will automatically ignore
// accented characters which are actually a separate letter
if (unachasaccents(term))
diac_sensitive = true;
// If any character apart the first is uppercase, we become case-sensitive.
// The first character is reserved for turning off stemming. You need to
// use a query language modifier to search for Floor in a case-sensitive
// way.
Utf8Iter it(term);
it++;
if (unachasuppercase(term.substr(it.getBpos())))
case_sensitive = true;
// If we are sensitive to case or diacritics turn stemming off
if (diac_sensitive || case_sensitive)
nostemexp = true;
if (!case_sensitive || !diac_sensitive)
noexpansion = false;
#endif
if (noexpansion) {
sterm = term; sterm = term;
exp.resize(1); exp.push_back(prefix + term);
exp[0] = prefix + term;
} else { } else {
TermMatchResult res; TermMatchResult res;
if (haswild) { if (haswild) {
// Note that if there are wildcards, we do a direct from-index
// expansion, which means that we are casediac-sensitive. There
// would be nothing to prevent us to expand from the casediac
// synonyms first. To be done later
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
m_field); m_field);
} else { } else {
sterm = term; sterm = term;
#ifdef RCL_INDEX_STRIPCHARS
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
m_field); m_field);
#else
// No stem expansion when diacritic or case sensitivity is
// set, it makes no sense (it would mess with the
// diacritics anyway if they are not in the stem part).
// In these 3 cases, perform appropriate expansion from
// the charstripping db, and do a bogus wildcard expansion
// (there is no wild card) to generate the result:
if (diac_sensitive && case_sensitive) {
// No expansion whatsoever
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
m_field);
} else {
// Access case and diacritics expansion:
vector<string> exp;
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa,
"all", &unacfoldtrans);
if (diac_sensitive) {
// Expand for accents and case, filtering for same accents,
// then bogus wildcard expansion for generating result
SynTermTransUnac foldtrans(UNACOP_FOLD);
synac.synExpand(term, exp, &foldtrans);
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
} }
} else if (case_sensitive) {
// Expand for accents and case, filtering for same case,
// then bogus wildcard expansion for generating result
SynTermTransUnac unactrans(UNACOP_UNAC);
synac.synExpand(term, exp, &unactrans);
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
}
} else {
// Expand for accents and case, then lowercase
// result for input to stemdb.
synac.synExpand(term, exp);
for (unsigned int i = 0; i < exp.size(); i++) {
string lower;
unacmaybefold(exp[i], lower, "UTF-8", UNACOP_FOLD);
exp[i] = lower;
}
sort(exp.begin(), exp.end());
vector<string>::iterator uit =
unique(exp.begin(), exp.end());
exp.resize(uit - exp.begin());
LOGDEB(("ExpandTerm: after casediac: %s\n",
stringsToString(exp).c_str()));
StemDb db(m_db.m_ndb->xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
db.stemExpand(m_stemlang, *it, exp1);
}
LOGDEB(("ExpandTerm: after stem: %s\n",
stringsToString(exp1).c_str()));
// Expand the resulting list for case (all stemdb content
// is lowercase)
exp.clear();
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
synac.synExpand(*it, exp);
}
sort(exp.begin(), exp.end());
uit = unique(exp.begin(), exp.end());
exp.resize(uit - exp.begin());
LOGDEB(("ExpandTerm: after case exp of stem: %s\n",
stringsToString(exp).c_str()));
// Bogus wildcard expand to generate the result
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
}
}
}
#endif
}
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
it != res.entries.end(); it++) { it != res.entries.end(); it++) {
exp.push_back(it->term); exp.push_back(it->term);
} }
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(exp).c_str()));
} }
} }
@ -710,21 +868,22 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
} }
} }
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, void StringToXapianQ::processSimpleSpan(const string& span,
int mods,
vector<Xapian::Query> &pqueries) vector<Xapian::Query> &pqueries)
{ {
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n", LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] mods %x\n",
span.c_str(), int(nostemexp))); span.c_str(), (unsigned int)mods));
vector<string> exp; vector<string> exp;
string sterm; // dumb version of user term string sterm; // dumb version of user term
string prefix; string prefix;
const FieldTraits *ftp; const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) { if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
prefix = ftp->pfx; prefix = wrap_prefix(ftp->pfx);
} }
expandTerm(nostemexp, span, exp, sterm, prefix); expandTerm(mods, span, exp, sterm, prefix);
// Set up the highlight data. No prefix should go in there // Set up the highlight data. No prefix should go in there
for (vector<string>::const_iterator it = exp.begin(); for (vector<string>::const_iterator it = exp.begin();
@ -755,8 +914,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
// queries if the terms get expanded by stemming or wildcards (we // queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though) // don't do stemming for PHRASE though)
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
int mods,
vector<Xapian::Query> &pqueries, vector<Xapian::Query> &pqueries,
bool useNear, int slack, int mods) bool useNear, int slack)
{ {
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE; Xapian::Query::OP_PHRASE;
@ -769,7 +929,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
string prefix; string prefix;
const FieldTraits *ftp; const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) { if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
prefix = ftp->pfx; prefix = wrap_prefix(ftp->pfx);
} }
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) { if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
@ -790,10 +950,12 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|| hadmultiple || hadmultiple
#endif // single OR inside NEAR #endif // single OR inside NEAR
; ;
int lmods = mods;
if (nostemexp)
lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm; string sterm;
vector<string> exp; vector<string> exp;
expandTerm(nostemexp, *it, exp, sterm, prefix); expandTerm(lmods, *it, exp, sterm, prefix);
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size())); LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
listVector("", exp); listVector("", exp);
// groups is used for highlighting, we don't want prefixes in there. // groups is used for highlighting, we don't want prefixes in there.
@ -882,9 +1044,9 @@ static int stringToMods(string& s)
* count) * count)
*/ */
bool StringToXapianQ::processUserString(const string &iq, bool StringToXapianQ::processUserString(const string &iq,
int mods,
string &ermsg, string &ermsg,
vector<Xapian::Query> &pqueries, vector<Xapian::Query> &pqueries,
const StopList& stops,
int slack, int slack,
bool useNear bool useNear
) )
@ -892,6 +1054,8 @@ bool StringToXapianQ::processUserString(const string &iq,
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear)); LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
ermsg.erase(); ermsg.erase();
const StopList stops = m_db.getStopList();
// Simple whitespace-split input into user-level words and // Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase". // double-quoted phrases: word1 word2 "this is a phrase".
// //
@ -930,7 +1094,9 @@ bool StringToXapianQ::processUserString(const string &iq,
TermProcStop tpstop(nxt, stops); nxt = &tpstop; TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon; //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true); //tpcommon.onlygrams(true);
#ifdef RCL_INDEX_STRIPCHARS
TermProcPrep tpprep(nxt); nxt = &tpprep; TermProcPrep tpprep(nxt); nxt = &tpprep;
#endif
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD), TextSplit::TXTS_KEEPWILD),
@ -944,14 +1110,17 @@ bool StringToXapianQ::processUserString(const string &iq,
switch (splitter.terms.size() + terminc) { switch (splitter.terms.size() + terminc) {
case 0: case 0:
continue;// ?? continue;// ??
case 1: case 1: {
int lmods = mods;
if (splitter.nostemexps.front())
lmods |= SearchDataClause::SDCM_NOSTEMMING;
m_hld.ugroups.push_back(vector<string>(1, *it)); m_hld.ugroups.push_back(vector<string>(1, *it));
processSimpleSpan(splitter.terms.front(), processSimpleSpan(splitter.terms.front(), lmods, pqueries);
splitter.nostemexps.front(), pqueries); }
break; break;
default: default:
m_hld.ugroups.push_back(vector<string>(1, *it)); m_hld.ugroups.push_back(vector<string>(1, *it));
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods); processPhraseOrNear(&splitter, mods, pqueries, useNear, slack);
} }
} }
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
@ -971,13 +1140,10 @@ bool StringToXapianQ::processUserString(const string &iq,
} }
// Translate a simple OR, AND, or EXCL search clause. // Translate a simple OR, AND, or EXCL search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
const string& stemlang)
{ {
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
stemlang;
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n", LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
stemlang.c_str())); getStemLang().c_str()));
Xapian::Query *qp = (Xapian::Query *)p; Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query(); *qp = Xapian::Query();
@ -1000,8 +1166,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
(m_parentSearch && !m_parentSearch->haveWildCards()) || (m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards); (m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm); StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
if (!tr.processUserString(m_text, m_reason, pqueries, db.getStopList())) if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n")); LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
@ -1024,8 +1190,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
// about expanding multiple fragments in the past. We just take the // about expanding multiple fragments in the past. We just take the
// value blanks and all and expand this against the indexed unsplit // value blanks and all and expand this against the indexed unsplit
// file names // file names
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
const string&)
{ {
Xapian::Query *qp = (Xapian::Query *)p; Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query(); *qp = Xapian::Query();
@ -1041,11 +1206,8 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
} }
// Translate NEAR or PHRASE clause. // Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
const string& stemlang)
{ {
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
stemlang;
LOGDEB(("SearchDataClauseDist::toNativeQuery\n")); LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
Xapian::Query *qp = (Xapian::Query *)p; Xapian::Query *qp = (Xapian::Query *)p;
@ -1069,8 +1231,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
} }
string s = cstr_dquote + m_text + cstr_dquote; string s = cstr_dquote + m_text + cstr_dquote;
bool useNear = (m_tp == SCLT_NEAR); bool useNear = (m_tp == SCLT_NEAR);
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm); StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(), if (!tr.processUserString(s, getModifiers(), m_reason, pqueries,
m_slack, useNear)) m_slack, useNear))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {

View file

@ -70,9 +70,9 @@ class SearchDataClause;
*/ */
class SearchData { class SearchData {
public: public:
SearchData(SClType tp) SearchData(SClType tp, const string& stemlang)
: m_tp(tp), m_haveDates(false), m_maxSize(size_t(-1)), : m_tp(tp), m_haveDates(false), m_maxSize(size_t(-1)),
m_minSize(size_t(-1)), m_haveWildCards(false) m_minSize(size_t(-1)), m_haveWildCards(false), m_stemlang(stemlang)
{ {
if (m_tp != SCLT_OR && m_tp != SCLT_AND) if (m_tp != SCLT_OR && m_tp != SCLT_AND)
m_tp = SCLT_OR; m_tp = SCLT_OR;
@ -91,6 +91,7 @@ public:
/** Translate to Xapian query. rcldb knows about the void* */ /** Translate to Xapian query. rcldb knows about the void* */
bool toNativeQuery(Rcl::Db &db, void *); bool toNativeQuery(Rcl::Db &db, void *);
/** We become the owner of cl and will delete it */ /** We become the owner of cl and will delete it */
bool addClause(SearchDataClause *cl); bool addClause(SearchDataClause *cl);
@ -109,6 +110,8 @@ public:
m_dirspecs.push_back(DirSpec(t, excl, w)); m_dirspecs.push_back(DirSpec(t, excl, w));
} }
const std::string& getStemLang() {return m_stemlang;}
void setMinSize(size_t size) {m_minSize = size;} void setMinSize(size_t size) {m_minSize = size;}
void setMaxSize(size_t size) {m_maxSize = size;} void setMaxSize(size_t size) {m_maxSize = size;}
@ -120,8 +123,6 @@ public:
/** Add file type to not wanted list */ /** Add file type to not wanted list */
void remFiletype(const std::string& ft) {m_nfiletypes.push_back(ft);} void remFiletype(const std::string& ft) {m_nfiletypes.push_back(ft);}
void setStemlang(const std::string& lang = "english") {m_stemlang = lang;}
/** Retrieve error description */ /** Retrieve error description */
std::string getReason() {return m_reason;} std::string getReason() {return m_reason;}
@ -170,7 +171,12 @@ private:
std::string m_reason; std::string m_reason;
bool m_haveWildCards; bool m_haveWildCards;
std::string m_stemlang; std::string m_stemlang;
bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps); bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps);
bool clausesToQuery(Rcl::Db &db, SClType tp,
std::vector<SearchDataClause*>& query,
string& reason, void *d);
/* Copyconst and assignment private and forbidden */ /* Copyconst and assignment private and forbidden */
SearchData(const SearchData &) {} SearchData(const SearchData &) {}
SearchData& operator=(const SearchData&) {return *this;}; SearchData& operator=(const SearchData&) {return *this;};
@ -186,7 +192,7 @@ public:
m_modifiers(SDCM_NONE), m_weight(1.0) m_modifiers(SDCM_NONE), m_weight(1.0)
{} {}
virtual ~SearchDataClause() {} virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *, const std::string&) = 0; virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;} bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
virtual std::string getReason() const {return m_reason;} virtual std::string getReason() const {return m_reason;}
virtual void getTerms(HighlightData & hldata) const = 0; virtual void getTerms(HighlightData & hldata) const = 0;
@ -199,6 +205,11 @@ public:
{ {
m_parentSearch = p; m_parentSearch = p;
} }
/** Stemming language applying to this clause: cstr_null when the clause
 *  has no parent search or when its modifiers carry SDCM_NOSTEMMING,
 *  else whatever the parent search reports. */
string getStemLang()
{
    if (m_parentSearch == 0)
        return cstr_null;
    if (m_modifiers & SDCM_NOSTEMMING)
        return cstr_null;
    return m_parentSearch->getStemLang();
}
virtual void setModifiers(Modifier mod) virtual void setModifiers(Modifier mod)
{ {
m_modifiers = mod; m_modifiers = mod;
@ -255,7 +266,7 @@ public:
} }
/** Translate to Xapian query */ /** Translate to Xapian query */
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang); virtual bool toNativeQuery(Rcl::Db &, void *);
virtual void getTerms(HighlightData& hldata) const virtual void getTerms(HighlightData& hldata) const
{ {
@ -296,7 +307,7 @@ public:
{ {
} }
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang); virtual bool toNativeQuery(Rcl::Db &, void *);
}; };
/** /**
@ -315,7 +326,7 @@ public:
{ {
} }
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang); virtual bool toNativeQuery(Rcl::Db &, void *);
private: private:
int m_slack; int m_slack;
}; };
@ -323,17 +334,11 @@ private:
/** Subquery */ /** Subquery */
class SearchDataClauseSub : public SearchDataClause { class SearchDataClauseSub : public SearchDataClause {
public: public:
// We take charge of the SearchData * and will delete it.
SearchDataClauseSub(SClType tp, RefCntr<SearchData> sub) SearchDataClauseSub(SClType tp, RefCntr<SearchData> sub)
: SearchDataClause(tp), m_sub(sub) : SearchDataClause(tp), m_sub(sub)
{ {
} }
virtual bool toNativeQuery(Rcl::Db &db, void *p)
virtual ~SearchDataClauseSub()
{
}
virtual bool toNativeQuery(Rcl::Db &db, void *p, const std::string&)
{ {
return m_sub->toNativeQuery(db, p); return m_sub->toNativeQuery(db, p);
} }

View file

@ -19,6 +19,9 @@
* Management of the auxiliary databases listing stems and their expansion * Management of the auxiliary databases listing stems and their expansion
* terms * terms
*/ */
#include "autoconfig.h"
#include <unistd.h> #include <unistd.h>
#include <algorithm> #include <algorithm>
@ -27,13 +30,8 @@
#include <xapian.h> #include <xapian.h>
#include "stemdb.h" #include "stemdb.h"
#include "pathut.h"
#include "debuglog.h" #include "debuglog.h"
#include "smallut.h" #include "smallut.h"
#include "utf8iter.h"
#include "textsplit.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "synfamily.h" #include "synfamily.h"
#include "unacpp.h" #include "unacpp.h"
@ -43,140 +41,6 @@ using namespace std;
namespace Rcl { namespace Rcl {
// Cheap heuristic for spotting words which are not natural language:
// flags any 7-bit ASCII value which is not a lowercase letter. Values
// of 128 and above are never flagged. islower() is deliberately not
// used because its behaviour with 8 bit values is uncertain; a more
// complete test would need full utf-8 decoding.
inline static bool p_notlowerascii(unsigned int c)
{
    const bool below_lowercase = c < 'a';
    const bool above_lowercase_ascii = c > 'z' && c < 128;
    return below_lowercase || above_lowercase_ascii;
}
/**
 * Create the stem to parent terms associations, for each of the given
 * languages.
 *
 * Every term of the index is stemmed with each language's stemmer, and
 * a synonym entry (stem -> term) is stored in the language's member of
 * the stem synonym family whenever the stem differs from the term.
 *
 * @param wdb   writable Xapian database: source of the term list and
 *              destination of the synonym entries.
 * @param langs language names, used both as family member names and as
 *              Xapian::Stem language names.
 * @return true if the maps were built, false if an error was caught
 *         while walking the terms.
 */
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
                        const vector<string>& langs)
{
    LOGDEB(("StemDb::createExpansionDbs\n"));
    Chrono cron;

    // One family member per language, deleted then recreated
    vector<XapWritableSynFamily> stemdbs;
    for (unsigned int i = 0; i < langs.size(); i++) {
        stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
        stemdbs[i].deleteMember(langs[i]);
        stemdbs[i].createMember(langs[i]);
        stemdbs[i].setCurrentMemberName(langs[i]);
    }

    // We walk the list of all terms, and stem each. We skip terms which
    // don't look like natural language.
    // If the stem is not identical to the term, we add a synonym entry.
    // Statistics
    int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
    int stemconst = 0; // Stem == term
    int allsyns = 0; // Total number of entries created

    string ermsg;
    try {
        vector<Xapian::Stem> stemmers;
        for (unsigned int i = 0; i < langs.size(); i++) {
            stemmers.push_back(Xapian::Stem(langs[i]));
        }

        for (Xapian::TermIterator it = wdb.allterms_begin();
             it != wdb.allterms_end(); it++) {
            // TermIterator::operator* returns the term by value. Keep
            // one local copy for the whole iteration: iterators or
            // references taken on the temporary would dangle as soon
            // as the expression which created it ends.
            const string term = *it;

            // If the term has any non-lowercase 7bit char (that is,
            // numbers, capitals and punctuation) dont stem.
            string::const_iterator eit = term.end();
            string::const_iterator sit =
                find_if(term.begin(), eit, p_notlowerascii);
            if (sit != eit) {
                ++nostem;
                LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
                         term.c_str(), *sit));
                continue;
            }

            // Detect and skip CJK terms.
            // We're still sending all other multibyte utf-8 chars to
            // the stemmer, which is not too well defined for
            // xapian<1.0 (very obsolete now), but seems to work
            // anyway. There shouldn't be too many in any case because
            // accents are stripped at this point.
            // The effect of stripping accents on stemming is not good,
            // (e.g: in french partimes -> partim, parti^mes -> part)
            // but fixing the issue would be complicated.
            Utf8Iter utfit(term);
            if (TextSplit::isCJK(*utfit)) {
                // LOGDEB(("stemskipped: Skipping CJK\n"));
                continue;
            }

            // Create stemming synonym for every lang
            for (unsigned int i = 0; i < langs.size(); i++) {
                string stem = stemmers[i](term);
                if (stem == term) {
                    ++stemconst;
                } else {
                    stemdbs[i].addSynonym(stem, term);
                    LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
                             term.c_str(), langs[i].c_str(), stem.c_str()));
                    ++allsyns;
                }
            }
        }
    } XCATCHERROR(ermsg);

    if (!ermsg.empty()) {
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
        return false;
    }

    LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
    LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
            nostem, stemconst, allsyns));
    return true;
}
/**
 * Compute the list of terms which share the input term's stem, for a
 * single expansion language.
 *
 * The stem is computed with the language's Xapian stemmer, then looked
 * up through synExpand(). The input term and the stem are appended to
 * the result if the lookup did not yield them.
 *
 * @return true normally. If anything throws, the input term is appended
 *         to the result and false is returned.
 */
bool StemDb::expandOne(const std::string& lang,
                       const std::string& term,
                       vector<string>& result)
{
    try {
        Xapian::Stem stemmer(lang);
        string stem = stemmer(term);
        LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
                lang.c_str(), term.c_str(), stem.c_str()));

        // A failed lookup is not handled in any special way: we go on
        // with whatever is in the result list
        (void)synExpand(lang, stem, result);

        // Make sure that the user term, then the stem, are in the list
        const bool haveterm =
            find(result.begin(), result.end(), term) != result.end();
        if (!haveterm)
            result.push_back(term);
        const bool havestem =
            find(result.begin(), result.end(), stem) != result.end();
        if (!havestem)
            result.push_back(stem);

        LOGDEB0(("stemExpand:%s: %s ->  %s\n", lang.c_str(), stem.c_str(),
                 stringsToString(result).c_str()));
        return true;
    } catch (...) {
        LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
                lang.c_str()));
        result.push_back(term);
    }
    return false;
}
/** /**
* Expand for one or several languages * Expand for one or several languages
*/ */
@ -186,14 +50,34 @@ bool StemDb::stemExpand(const std::string& langs,
{ {
vector<string> llangs; vector<string> llangs;
stringToStrings(langs, llangs); stringToStrings(langs, llangs);
for (vector<string>::const_iterator it = llangs.begin(); for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) { it != llangs.end(); it++) {
vector<string> oneexp; SynTermTransStem stemmer(*it);
expandOne(*it, term, oneexp); XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
result.insert(result.end(), oneexp.begin(), oneexp.end()); (void)expander.synExpand(term, result);
} }
#ifndef RCL_INDEX_STRIPCHARS
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
SynTermTransStem stemmer(*it);
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
*it, &stemmer);
string unac;
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
(void)expander.synExpand(unac, result);
}
#endif
if (result.empty())
result.push_back(term);
sort(result.begin(), result.end()); sort(result.begin(), result.end());
unique(result.begin(), result.end()); vector<string>::iterator uit = unique(result.begin(), result.end());
result.resize(uit - result.begin());
LOGDEB0(("stemExpand:%s: %s -> %s\n", langs.c_str(), term.c_str(),
stringsToString(result).c_str()));
return true; return true;
} }

View file

@ -55,9 +55,30 @@
#include <xapian.h> #include <xapian.h>
#include "synfamily.h" #include "synfamily.h"
#include "unacpp.h"
namespace Rcl { namespace Rcl {
/* A stemming functor for using with XapComputableSynFamMember */
class SynTermTransStem : public SynTermTrans {
public:
SynTermTransStem(const std::string& lang)
: m_stemmer(lang), m_lang(lang)
{
}
virtual std::string operator()(const std::string& in)
{
string out = m_stemmer(in);
LOGDEB2(("SynTermTransStem(%s): in [%s] out [%s]\n", m_lang.c_str(),
in.c_str(), out.c_str()));
return out;
}
Xapian::Stem m_stemmer;
std::string m_lang;
};
/** Stemdb is a bit special as a SynFamily as we may want to expand for one
* or several members (languages) */
class StemDb : public XapSynFamily { class StemDb : public XapSynFamily {
public: public:
StemDb(Xapian::Database& xdb) StemDb(Xapian::Database& xdb)
@ -69,16 +90,8 @@ public:
bool stemExpand(const std::string& langs, bool stemExpand(const std::string& langs,
const std::string& term, const std::string& term,
std::vector<std::string>& result); std::vector<std::string>& result);
private:
/** Compute stem and call synExpand() */
bool expandOne(const std::string& lang,
const std::string& term,
std::vector<std::string>& result);
}; };
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const std::vector<std::string>& langs);
} }
#endif /* _STEMDB_H_INCLUDED_ */ #endif /* _STEMDB_H_INCLUDED_ */

View file

@ -28,31 +28,6 @@ using namespace std;
namespace Rcl { namespace Rcl {
bool XapSynFamily::synExpand(const string& member, const string& term,
vector<string>& result)
{
string key = entryprefix(member) + term;
string ermsg;
try {
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
xit != m_rdb.synonyms_end(key); xit++) {
result.push_back(*xit);
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
member.c_str(), term.c_str()));
return false;
}
#if 0
string out;
stringsToString(result, out);
LOGDEB0(("XapSynFamily::synExpand:%s: [%s] -> %s\n", member.c_str(),
term.c_str(), out.c_str()));
#endif
return true;
}
bool XapSynFamily::getMembers(vector<string>& members) bool XapSynFamily::getMembers(vector<string>& members)
{ {
string key = memberskey(); string key = memberskey();
@ -100,6 +75,35 @@ bool XapSynFamily::listMap(const string& membername)
return true; return true;
} }
bool XapSynFamily::synExpand(const string& member, const string& term,
vector<string>& result)
{
LOGDEB(("XapSynFamily::synExpand:(%s) %s for %s\n",
m_prefix1.c_str(), term.c_str(), member.c_str()));
string key = entryprefix(member) + term;
string ermsg;
try {
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
xit != m_rdb.synonyms_end(key); xit++) {
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
result.push_back(*xit);
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
member.c_str(), term.c_str()));
result.push_back(term);
return false;
}
// If the input term is not in the list, add it
if (find(result.begin(), result.end(), term) == result.end()) {
result.push_back(term);
}
return true;
}
bool XapWritableSynFamily::deleteMember(const string& membername) bool XapWritableSynFamily::deleteMember(const string& membername)
{ {
string key = entryprefix(membername); string key = entryprefix(membername);
@ -119,31 +123,60 @@ bool XapWritableSynFamily::createMember(const string& membername)
m_wdb.add_synonym(memberskey(), membername); m_wdb.add_synonym(memberskey(), membername);
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("XapSynFamily::createMember: xapian error %s\n", ermsg.c_str())); LOGERR(("XapSynFamily::createMember: error: %s\n", ermsg.c_str()));
return false; return false;
} }
return true; return true;
} }
bool XapWritableSynFamily::addSynonyms(const string& membername, bool XapComputableSynFamMember::synExpand(const string& term,
const string& term, vector<string>& result,
const vector<string>& trans) SynTermTrans *filtertrans)
{ {
string key = entryprefix(membername) + term; string root = (*m_trans)(term);
string filter_root;
if (filtertrans)
filter_root = (*filtertrans)(term);
/* We could call XapSynFamily::synExpand() here instead of doing it
ourselves... */
string key = m_prefix + root;
LOGDEB(("XapCompSynFamMbr::synExpand([%s]): term [%s] root [%s] \n",
m_prefix.c_str(), term.c_str(), root.c_str()));
string ermsg; string ermsg;
try { try {
for (vector<string>::const_iterator it = trans.begin(); for (Xapian::TermIterator xit = m_family.getdb().synonyms_begin(key);
it != trans.end(); it++) { xit != m_family.getdb().synonyms_end(key); xit++) {
m_wdb.add_synonym(key, *it); if (!filtertrans || (*filtertrans)(*xit) == filter_root) {
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
result.push_back(*xit);
}
} }
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("XapSynFamily::addSynonyms: xapian error %s\n", ermsg.c_str())); LOGERR(("XapSynDb::synExpand: error for term [%s] (key %s)\n",
term.c_str(), key.c_str()));
result.push_back(term);
return false; return false;
} }
return true;
// If the input term and root are not in the list, add them
if (find(result.begin(), result.end(), term) == result.end()) {
LOGDEB2((" Pushing %s\n", term.c_str()));
result.push_back(term);
}
if (root != term &&
find(result.begin(), result.end(), root) == result.end()) {
if (!filtertrans || (*filtertrans)(root) == filter_root) {
LOGDEB2((" Pushing %s\n", root.c_str()));
result.push_back(root);
}
} }
return true;
}
} }
@ -169,16 +202,16 @@ using namespace std;
static string thisprog; static string thisprog;
static int op_flags; static int op_flags;
#define OPT_a 0x4
#define OPT_c 0x8
#define OPT_D 0x1 #define OPT_D 0x1
#define OPT_d 0x10
#define OPT_L 0x2 #define OPT_L 0x2
#define OPT_a 0x4
#define OPT_u 0x8
#define OPT_d 0x10
#define OPT_l 0x20 #define OPT_l 0x20
#define OPT_s 0x40 #define OPT_s 0x40
#define OPT_e 0x80 #define OPT_e 0x80
static string usage = static string usage =
" -d <dbdir> {-s|-a|-c} database dir and synfamily: stem accents case\n" " -d <dbdir> {-s|-a|-u} database dir and synfamily: stem accents/case ustem\n"
" -l : list members\n" " -l : list members\n"
" -L <member>: list entries for given member\n" " -L <member>: list entries for given member\n"
" -e <member> <key> : list expansion for given member and key\n" " -e <member> <key> : list expansion for given member and key\n"
@ -209,7 +242,6 @@ int main(int argc, char **argv)
while (**argv) while (**argv)
switch (*(*argv)++) { switch (*(*argv)++) {
case 'a': op_flags |= OPT_a; break; case 'a': op_flags |= OPT_a; break;
case 'c': op_flags |= OPT_c; break;
case 'D': op_flags |= OPT_D; break; case 'D': op_flags |= OPT_D; break;
case 'd': op_flags |= OPT_d; if (argc < 2) Usage(); case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
dbdir = *(++argv); argc--; dbdir = *(++argv); argc--;
@ -223,6 +255,7 @@ int main(int argc, char **argv)
member = *(++argv); argc--; member = *(++argv); argc--;
goto b1; goto b1;
case 's': op_flags |= OPT_s; break; case 's': op_flags |= OPT_s; break;
case 'u': op_flags |= OPT_u; break;
default: Usage(); break; default: Usage(); break;
} }
b1: argc--; argv++; b1: argc--; argv++;
@ -231,12 +264,11 @@ int main(int argc, char **argv)
if (argc != 0) if (argc != 0)
Usage(); Usage();
// We do stem only for now
string familyname; string familyname;
if (op_flags & OPT_a) { if (op_flags & OPT_a) {
familyname = Rcl::synFamDiac; familyname = Rcl::synFamDiCa;
} else if (op_flags &OPT_c) { } else if (op_flags & OPT_u) {
familyname = Rcl::synFamCase; familyname = Rcl::synFamStemUnac;
} else { } else {
familyname = Rcl::synFamStem; familyname = Rcl::synFamStem;
} }

View file

@ -53,38 +53,50 @@ public:
m_prefix1 = std::string(":") + familyname; m_prefix1 = std::string(":") + familyname;
} }
/** Expand one term (e.g.: familier) inside one family member (e.g.: french)
*/
virtual bool synExpand(const std::string& fammember,
const std::string& key,
std::vector<std::string>& result);
/** Retrieve all members of this family (e.g: french english german...) */ /** Retrieve all members of this family (e.g: french english german...) */
virtual bool getMembers(std::vector<std::string>&); virtual bool getMembers(std::vector<std::string>&);
/** debug: list map for one member to stdout */ /** debug: list map for one member to stdout */
virtual bool listMap(const std::string& fam); virtual bool listMap(const std::string& fam);
protected: /** Expand term to list of synonyms for given member */
Xapian::Database m_rdb; bool synExpand(const std::string& membername,
std::string m_prefix1; const std::string& term, std::vector<std::string>& result);
// The prefix shared by all synonym entries inside a family member
virtual std::string entryprefix(const std::string& member) virtual std::string entryprefix(const std::string& member)
{ {
return m_prefix1 + ":" + member + ":"; return m_prefix1 + ":" + member + ":";
} }
// The key for the "list of members" entry
virtual std::string memberskey() virtual std::string memberskey()
{ {
return m_prefix1 + ";" + "members"; return m_prefix1 + ";" + "members";
} }
Xapian::Database& getdb()
{
return m_rdb;
}
protected:
Xapian::Database m_rdb;
std::string m_prefix1;
}; };
/** Modify ops for a synonyms family
*
* A method to add a synonym entry inside a given member would make sense,
* but would not be used presently as all these ops go through
* ComputableSynFamMember objects
*/
class XapWritableSynFamily : public XapSynFamily { class XapWritableSynFamily : public XapSynFamily {
public: public:
/** Construct with Xapian db open for r/w */ /** Construct with Xapian db open for r/w */
XapWritableSynFamily(Xapian::WritableDatabase db, const std::string& pfx) XapWritableSynFamily(Xapian::WritableDatabase db,
: XapSynFamily(db, pfx), m_wdb(db) const std::string& familyname)
: XapSynFamily(db, familyname), m_wdb(db)
{ {
} }
@ -95,36 +107,92 @@ public:
/** Add to list of members. Idempotent, does not affect actual expansions */ /** Add to list of members. Idempotent, does not affect actual expansions */
virtual bool createMember(const std::string& membername); virtual bool createMember(const std::string& membername);
/** Add expansion list for term inside family member (e.g., inside Xapian::WritableDatabase getdb() {return m_wdb;}
* the english member, add expansion for floor -> floors, flooring.. */
virtual bool addSynonyms(const std::string& membername,
const std::string& term,
const std::vector<std::string>& trans);
// Need to call setCurrentMemberName before addSynonym ! protected:
// We don't check it, for speed Xapian::WritableDatabase m_wdb;
virtual void setCurrentMemberName(const std::string& nm) };
/** A functor which transforms a string.
 *
 * Abstract interface: concrete transforms derive from it and implement
 * operator(), which returns the transformed value of its input. */
class SynTermTrans {
public:
    /** Virtual, so that destroying a derived transform through a
     *  SynTermTrans pointer is well defined */
    virtual ~SynTermTrans() {}
    virtual std::string operator()(const std::string&) = 0;
};
/** A member (set of root-synonyms associations) of a SynFamily for
* which the root is computable from the input term.
* The objects use a functor member to compute the term root on input
* (e.g. compute the term stem or casefold it).
*/
class XapComputableSynFamMember {
public:
XapComputableSynFamMember(Xapian::Database xdb, std::string familyname,
std::string membername, SynTermTrans* trans)
: m_family(xdb, familyname), m_membername(membername),
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
{ {
m_currentPrefix = entryprefix(nm);
} }
virtual bool addSynonym(const std::string& term, const std::string& trans)
/** Expand a term to its list of synonyms. If filtertrans is set we
* keep only the results which transform to the same value as the input */
bool synExpand(const std::string& term, std::vector<std::string>& result,
SynTermTrans *filtertrans = 0);
private:
XapSynFamily m_family;
std::string m_membername;
SynTermTrans *m_trans;
std::string m_prefix;
};
/** Computable term root SynFamily member, modify ops */
class XapWritableComputableSynFamMember {
public:
XapWritableComputableSynFamMember(
Xapian::WritableDatabase xdb, std::string familyname,
std::string membername, SynTermTrans* trans)
: m_family(xdb, familyname), m_membername(membername),
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
{ {
std::string key = m_currentPrefix + term; }
virtual bool addSynonym(const std::string& term)
{
LOGDEB2(("addSynonym:me %p term [%s] m_trans %p\n", this,
term.c_str(), m_trans));
std::string transformed = (*m_trans)(term);
LOGDEB2(("addSynonym: transformed [%s]\n", transformed.c_str()));
if (transformed == term)
return true;
std::string ermsg; std::string ermsg;
try { try {
m_wdb.add_synonym(key, trans); m_family.getdb().add_synonym(m_prefix + transformed, term);
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("XapSynFamily::addSynonym: xapian error %s\n", LOGERR(("XapWritableComputableSynFamMember::addSynonym: "
ermsg.c_str())); "xapian error %s\n", ermsg.c_str()));
return false; return false;
} }
return true; return true;
} }
protected: void clear()
Xapian::WritableDatabase m_wdb; {
std::string m_currentPrefix; m_family.deleteMember(m_membername);
}
void recreate()
{
clear();
m_family.createMember(m_membername);
}
private:
XapWritableSynFamily m_family;
std::string m_membername;
SynTermTrans *m_trans;
std::string m_prefix;
}; };
@ -133,11 +201,13 @@ protected:
// //
// Stem expansion family prefix. The family member name is the // Stem expansion family prefix. The family member name is the
// language ("all" for Dia and Cse) // language ("all" for Dia and Cse)
// Lowercase accented stem to expansion
static const std::string synFamStem("Stm"); static const std::string synFamStem("Stm");
static const std::string synFamDiac("Dia"); // Lowercase unaccented stem to expansion
static const std::string synFamCase("Cse"); static const std::string synFamStemUnac("StU");
// Lowercase unaccented term to case and accent variations
static const std::string synFamDiCa("DCa");
} }
#endif /* _SYNFAMILY_H_INCLUDED_ */ #endif /* _SYNFAMILY_H_INCLUDED_ */

View file

@ -35,7 +35,7 @@
# Also reserved: F(parentid), Q(uniqueid) # Also reserved: F(parentid), Q(uniqueid)
title = S ; wdfinc = 10 title = S ; wdfinc = 10
author = A author = A
abstract = abstract = XS
caption = S caption = S
title = S title = S
subject = S subject = S

View file

@ -103,7 +103,7 @@ public:
/** Append current utf-8 possibly multi-byte character to string param. /** Append current utf-8 possibly multi-byte character to string param.
This needs to be fast. No error checking. */ This needs to be fast. No error checking. */
unsigned int appendchartostring(std::string &out) { unsigned int appendchartostring(std::string &out) const {
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(m_cl != 0); assert(m_cl != 0);
#endif #endif