Make Recoll optionally sensitive to case and diacritics

This commit is contained in:
Jean-Francois Dockes 2012-09-14 14:34:27 +02:00
parent 7fcfe27952
commit 166624f7f2
30 changed files with 849 additions and 487 deletions

View file

@ -63,26 +63,57 @@ bool unacmaybefold(const string &in, string &out,
return true;
}
// Functions to determine upper-case or accented status could be implemented
// hugely more efficiently inside the unac c code, but they're only used for
// testing user-entered terms, so we don't really care.
// Tell whether the input begins with a capital letter.
//
// Only the first character of `in` is examined: it is copied to a
// one-character string, case-folded with UNACOP_FOLD, and the folded
// character is compared to the original one. Accents are left alone, so an
// accented lower-case initial is not reported as capital.
//
// Returns false for an empty string or if the case-folding call fails
// (the failure is logged).
bool unaciscapital(const string& in)
{
    LOGDEB2(("unaciscapital: [%s]\n", in.c_str()));
    if (in.empty())
        return false;

    // Isolate the first character: nothing after it matters here.
    Utf8Iter it(in);
    string shorter;
    it.appendchartostring(shorter);

    string lower;
    if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
        LOGINFO(("unaciscapital: unac/fold failed for [%s]\n", in.c_str()));
        return false;
    }

    // Capitalized if and only if folding changed the first character.
    Utf8Iter it1(lower);
    return *it != *it1;
}
// Tell whether the input contains an upper-case character anywhere.
//
// The whole string is case-folded with UNACOP_FOLD (accents are kept) and
// compared to the original: any difference means that at least one
// character was changed by the folding.
//
// Returns false for an empty string or if the case-folding call fails
// (the failure is logged).
bool unachasuppercase(const string& in)
{
    LOGDEB2(("unachasuppercase: [%s]\n", in.c_str()));
    if (in.empty())
        return false;

    string lower;
    if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
        LOGINFO(("unachasuppercase: unac/fold failed for [%s]\n", in.c_str()));
        return false;
    }

    return lower != in;
}
bool unachasaccents(const string& in)
{
LOGDEB2(("unachasaccents: [%s]\n", in.c_str()));
if (in.empty())
return false;
string noac;
if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
LOGINFO(("unachasaccents: unac/unac failed for [%s]\n", in.c_str()));
return false;
}
if (noac != in)
return true;
else
return false;
@ -107,12 +138,15 @@ static char *thisprog;
static char usage [] = "\n"
"[-c|-C] <encoding> <infile> <outfile>\n"
" Default : unaccent\n"
" -c : unaccent and casefold\n"
" -C : casefold only\n"
" Default : unaccent\n"
" -c : unaccent and casefold\n"
" -C : casefold only\n"
"-t <string> test string as capitalized, upper-case anywhere, accents\n"
" the parameter is supposedly utf-8 so this can only work in an utf-8\n"
" locale\n"
"\n";
;
static void
Usage(void)
{
@ -123,6 +157,7 @@ Usage(void)
static int op_flags;
#define OPT_c 0x2
#define OPT_C 0x4
#define OPT_t 0x8
int main(int argc, char **argv)
{
@ -140,58 +175,73 @@ int main(int argc, char **argv)
switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break;
case 'C': op_flags |= OPT_C; break;
case 't': op_flags |= OPT_t; break;
default: Usage(); break;
}
argc--; argv++;
}
if (op_flags & OPT_c) {
op = UNACOP_UNACFOLD;
} else if (op_flags & OPT_C) {
op = UNACOP_FOLD;
}
if (argc != 3) {
Usage();
}
const char *encoding = *argv++; argc--;
string ifn = *argv++; argc--;
if (!ifn.compare("stdin"))
ifn.clear();
const char *ofn = *argv++; argc--;
string reason;
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
string odata;
if (!file_to_string(ifn, odata)) {
cerr << "file_to_string " << ifn << " : " << odata << endl;
return 1;
}
string ndata;
if (!unacmaybefold(odata, ndata, encoding, op)) {
cerr << "unac: " << ndata << endl;
return 1;
}
int fd;
if (strcmp(ofn, "stdout")) {
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
if (op_flags & OPT_t) {
if (argc != 1)
Usage();
string in = *argv++;argc--;
bool capital, upper, accent;
capital = unaciscapital(in);
upper = unachasuppercase(in);
accent = unachasaccents(in);
cout << "[" << in << "] : " <<
"capitalized: " << (capital ? "Yes. " : "No. ") <<
"has uppercase: " << (upper ? "Yes. " : "No. ") <<
"has accents: " << (accent ? "Yes. " : "No. ") <<
endl;
return 0;
} else {
fd = 1;
if (argc != 3)
Usage();
if (op_flags & OPT_c) {
op = UNACOP_UNACFOLD;
} else if (op_flags & OPT_C) {
op = UNACOP_FOLD;
}
const char *encoding = *argv++; argc--;
string ifn = *argv++; argc--;
if (!ifn.compare("stdin"))
ifn.clear();
const char *ofn = *argv++; argc--;
string reason;
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
string odata;
if (!file_to_string(ifn, odata)) {
cerr << "file_to_string " << ifn << " : " << odata << endl;
return 1;
}
string ndata;
if (!unacmaybefold(odata, ndata, encoding, op)) {
cerr << "unac: " << ndata << endl;
return 1;
}
int fd;
if (strcmp(ofn, "stdout")) {
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
} else {
fd = 1;
}
if (fd < 0) {
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
<< endl;
return 1;
}
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
cerr << "Write(2) failed: " << strerror(errno) << endl;
return 1;
}
close(fd);
return 0;
}
if (fd < 0) {
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
<< endl;
return 1;
}
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
cerr << "Write(2) failed: " << strerror(errno) << endl;
return 1;
}
close(fd);
return 0;
}
#endif

View file

@ -24,11 +24,17 @@ using std::string;
#endif /* NO_NAMESPACES */
// A small stringified wrapper for unac.c
enum UnacOp {UNACOP_UNAC, UNACOP_UNACFOLD, UNACOP_FOLD};
enum UnacOp {UNACOP_UNAC = 1, UNACOP_FOLD = 2, UNACOP_UNACFOLD = 3};
extern bool unacmaybefold(const string& in, string& out,
const char *encoding, UnacOp what);
// Utility function to determine if string begins with capital
extern bool unaciscapital(const string& in);
// Utility function to determine if string has upper-case anywhere
extern bool unachasuppercase(const string& in);
// Utility function to determine if any character is accented. This
// appropriately ignores the characters from unac_except_chars which
// are really separate letters
extern bool unachasaccents(const string& in);
#endif /* _UNACPP_H_INCLUDED_ */

View file

@ -17,6 +17,7 @@
#ifndef TEST_SUBTREELIST
#include "cstr.h"
#include "refcntr.h"
#include "rcldb.h"
#include "searchdata.h"
@ -35,7 +36,7 @@ bool subtreelist(RclConfig *config, const string& top,
return false;
}
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR);
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, cstr_null);
RefCntr<Rcl::SearchData> rq(sd);
rq->addDirSpec(top);

View file

@ -6,8 +6,8 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o expansiondbs.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp expansiondbs.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
librcl.a : $(DEPS) $(OBJS)
ar ru librcl.a $(OBJS)
@ -87,6 +87,8 @@ wasastringtoquery.o : ../query/wasastringtoquery.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
wasatorcl.o : ../query/wasatorcl.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
expansiondbs.o : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/expansiondbs.cpp
rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
@ -278,6 +280,9 @@ wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp $(depth)/mk/localde
wasatorcl.dep.stamp : ../query/wasatorcl.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
touch wasatorcl.dep.stamp
expansiondbs.dep.stamp : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/expansiondbs.cpp > expansiondbs.dep
touch expansiondbs.dep.stamp
rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep
touch rcldb.dep.stamp
@ -405,6 +410,7 @@ include reslistpager.dep
include sortseq.dep
include wasastringtoquery.dep
include wasatorcl.dep
include expansiondbs.dep
include rcldb.dep
include rcldoc.dep
include rclquery.dep

View file

@ -41,6 +41,7 @@ ${depth}/query/reslistpager.cpp \
${depth}/query/sortseq.cpp \
${depth}/query/wasastringtoquery.cpp \
${depth}/query/wasatorcl.cpp \
${depth}/rcldb/expansiondbs.cpp \
${depth}/rcldb/rcldb.cpp \
${depth}/rcldb/rcldoc.cpp \
${depth}/rcldb/rclquery.cpp \

View file

@ -93,7 +93,7 @@ SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs)
if (stp && strcasecmp(stp, "or")) {
tp = Rcl::SCLT_OR;
}
self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp));
self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp, "english"));
return 0;
}
@ -715,18 +715,18 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_AttributeError, "query");
return 0;
}
// SearchData defaults to stemming in english
// Use default for now but need to add way to specify language
string reason;
Rcl::SearchData *sd = wasaStringToRcl(rclconfig, utf8, reason);
Rcl::SearchData *sd = wasaStringToRcl(rclconfig, dostem ? "english" : "",
utf8, reason);
if (!sd) {
PyErr_SetString(PyExc_ValueError, reason.c_str());
return 0;
}
// SearchData defaults to stemming in english
// Use default for now but need to add way to specify language
if (!dostem)
sd->setStemlang("");
RefCntr<Rcl::SearchData> rq(sd);
string sf = self->sortfield ? string(self->sortfield) : string("");
self->query->setSortBy(sf, self->ascending);

View file

@ -356,8 +356,9 @@ size_t AdvSearch::stringToSize(QString qsize)
using namespace Rcl;
void AdvSearch::runSearch()
{
string stemLang = prefs.stemlang();
RefCntr<SearchData> sdata(new SearchData(conjunctCMB->currentIndex() == 0 ?
SCLT_AND : SCLT_OR));
SCLT_AND : SCLT_OR, stemLang));
bool hasclause = false;
for (list<SearchClauseW*>::iterator it = m_clauseWins.begin();

View file

@ -372,6 +372,18 @@ void rwSettings(bool writing)
}
}
// Compute the stemming language specification to use for queries.
//
// The value comes from the queryStemLang preference. The special value
// "ALL" is replaced by the "indexstemminglanguages" configuration
// parameter, or by an empty string when no configuration object is
// available.
// NOTE(review): this reads the global `prefs` object rather than this
// instance's own queryStemLang member -- confirm that this is intended.
string PrefsPack::stemlang()
{
    string lang = (const char *)prefs.queryStemLang.toAscii();

    // Any explicit setting is returned as is.
    if (lang != "ALL")
        return lang;

    // "ALL" needs the configuration to be resolved.
    if (!theconfig)
        return string();

    theconfig->getConfParam("indexstemminglanguages", lang);
    return lang;
}
QString myGetFileName(bool isdir, QString caption, bool filenosave)
{
LOGDEB1(("myFileDialog: isdir %d\n", isdir));

View file

@ -120,6 +120,8 @@ class PrefsPack {
// Default paragraph format for result list
static const char *dfltResListFormat;
std::string stemlang();
PrefsPack() :
respagesize(8),
reslistfontsize(10),

View file

@ -756,12 +756,6 @@ void RclMain::startSearch(RefCntr<Rcl::SearchData> sdata)
return;
}
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") {
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
sdata->setStemlang(stemLang);
Rcl::Query *query = new Rcl::Query(rcldb);
query->setCollapseDuplicates(prefs.collapseDuplicates);
@ -1073,9 +1067,7 @@ void RclMain::showActiveTypes()
// Get list of all mime types in index. For this, we use a
// wildcard field search on mtype
Rcl::TermMatchResult matches;
string prefix;
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype",
&prefix)) {
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype")) {
QMessageBox::warning(0, tr("Error"),
tr("Index query error"),
QMessageBox::Ok,
@ -1088,7 +1080,7 @@ void RclMain::showActiveTypes()
for (vector<Rcl::TermMatchEntry>::const_iterator it =
matches.entries.begin();
it != matches.entries.end(); it++) {
mtypesfromdb.insert(it->term.substr(prefix.size()));
mtypesfromdb.insert(it->term.substr(matches.prefix.size()));
}
// All types listed in mimeconf:
@ -1771,7 +1763,7 @@ void RclMain::showDocHistory()
}
// Construct a bogus SearchData structure
RefCntr<Rcl::SearchData>searchdata =
RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND, cstr_null));
searchdata->setDescription((const char *)tr("History data").toUtf8());

View file

@ -126,23 +126,25 @@ void SSearch::startSimpleSearch()
if (u8.length() == 0)
return;
string stemlang = prefs.stemlang();
SSearchType tp = (SSearchType)searchTypCMB->currentIndex();
Rcl::SearchData *sdata = 0;
if (tp == SST_LANG) {
string reason;
if (prefs.autoSuffsEnable)
sdata = wasaStringToRcl(theconfig, u8, reason,
sdata = wasaStringToRcl(theconfig, stemlang, u8, reason,
(const char *)prefs.autoSuffs.toUtf8());
else
sdata = wasaStringToRcl(theconfig, u8, reason);
sdata = wasaStringToRcl(theconfig, stemlang, u8, reason);
if (sdata == 0) {
QMessageBox::warning(0, "Recoll", tr("Bad query string") + ": " +
QString::fromAscii(reason.c_str()));
return;
}
} else {
sdata = new Rcl::SearchData(Rcl::SCLT_OR);
sdata = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
if (sdata == 0) {
QMessageBox::warning(0, "Recoll", tr("Out of memory"));
return;
@ -166,11 +168,6 @@ void SSearch::startSimpleSearch()
}
if (prefs.ssearchAutoPhrase && rcldb) {
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") {
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
sdata->setStemlang(stemLang);
sdata->maybeAddAutoPhrase(*rcldb,
prefs.ssearchAutoPhraseThreshPC / 100.0);
}
@ -277,10 +274,9 @@ void SSearch::completion()
// Query database
const int max = 100;
Rcl::TermMatchResult tmres;
string stemLang = (const char *)prefs.queryStemLang.toAscii();
if (stemLang == "ALL") {
theconfig->getConfParam("indexstemminglanguages", stemLang);
}
string stemLang = prefs.stemlang();
if (!rcldb->termMatch(Rcl::Db::ET_WILD, stemLang, s, tmres, max) ||
tmres.entries.size() == 0) {
QApplication::beep();

View file

@ -120,7 +120,8 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
LOGDEB(("DocSequenceDb::setFiltSpec\n"));
if (fs.isNotNull()) {
// We build a search spec by adding a filtering layer to the base one.
m_fsdata = RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
m_fsdata = RefCntr<Rcl::SearchData>(
new Rcl::SearchData(Rcl::SCLT_AND, m_sdata->getStemLang()));
Rcl::SearchDataClauseSub *cl =
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata);
m_fsdata->addClause(cl);
@ -138,6 +139,7 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
string reason;
Rcl::SearchData *sd =
wasaStringToRcl(m_q->whatDb()->getConf(),
m_sdata->getStemLang(),
fs.values[i], reason);
if (sd) {
Rcl::SearchDataClauseSub *cl1 =

View file

@ -50,7 +50,10 @@ static string vecStringToString(const vector<string>& t)
}
struct MatchEntry {
// Start/End byte offsets in the document text
pair<int, int> offs;
// Index of the search group this comes from: this is to relate a
// match to the original user input.
unsigned int grpidx;
MatchEntry(int sta, int sto, unsigned int idx)
: offs(sta, sto), grpidx(idx)
@ -76,11 +79,23 @@ class TextSplitPTR : public TextSplit {
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
vit != hdata.groups.end(); vit++) {
if (vit->size() == 1) {
#ifdef RCL_INDEX_STRIPCHARS
m_terms[vit->front()] = vit - hdata.groups.begin();
#else
string dumb = vit->front();
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
m_terms[dumb] = vit - hdata.groups.begin();
#endif
} else if (vit->size() > 1) {
for (vector<string>::const_iterator it = vit->begin();
it != vit->end(); it++) {
#ifdef RCL_INDEX_STRIPCHARS
m_gterms.insert(*it);
#else
string dumb = *it;
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
m_gterms.insert(dumb);
#endif
}
}
}

View file

@ -286,7 +286,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
Rcl::SearchData *sd = 0;
if (op_flags & (OPT_a|OPT_o|OPT_f)) {
sd = new Rcl::SearchData(Rcl::SCLT_OR);
sd = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
Rcl::SearchDataClause *clp = 0;
if (op_flags & OPT_f) {
clp = new Rcl::SearchDataClauseFilename(qs);
@ -305,14 +305,13 @@ int recollq(RclConfig **cfp, int argc, char **argv)
if (sd)
sd->addClause(clp);
} else {
sd = wasaStringToRcl(rclconfig, qs, reason);
sd = wasaStringToRcl(rclconfig, stemlang, qs, reason);
}
if (!sd) {
cerr << "Query string interpretation failed: " << reason << endl;
return 1;
}
sd->setStemlang(stemlang);
RefCntr<Rcl::SearchData> rq(sd);
Rcl::Query query(&rcldb);

View file

@ -32,7 +32,9 @@ using std::list;
#include "refcntr.h"
#include "textsplit.h"
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config,
const string& stemlang,
WasaQuery *wasa,
const string& autosuffs, string& reason)
{
if (wasa == 0) {
@ -47,7 +49,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
Rcl::SearchData *sdata = new
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
Rcl::SCLT_OR);
Rcl::SCLT_OR, stemlang);
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
"AND" : "OR"));
@ -250,7 +252,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
// Create a subquery.
Rcl::SearchData *sub =
wasaQueryToRcl(config, *it, autosuffs, reason);
wasaQueryToRcl(config, stemlang, *it, autosuffs, reason);
if (sub == 0) {
continue;
}
@ -278,7 +280,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
return sdata;
}
Rcl::SearchData *wasaStringToRcl(RclConfig *config,
Rcl::SearchData *wasaStringToRcl(RclConfig *config, const string& stemlang,
const string &qs, string &reason,
const string& autosuffs)
{
@ -286,5 +288,5 @@ Rcl::SearchData *wasaStringToRcl(RclConfig *config,
WasaQuery *wq = parser.stringToQuery(qs, reason);
if (wq == 0)
return 0;
return wasaQueryToRcl(config, wq, autosuffs, reason);
return wasaQueryToRcl(config, stemlang, wq, autosuffs, reason);
}

View file

@ -25,7 +25,7 @@ using std::string;
class RclConfig;
extern Rcl::SearchData *wasaStringToRcl(RclConfig *,
extern Rcl::SearchData *wasaStringToRcl(RclConfig *, const string& stemlang,
const string& query, string &reason,
const string& autosuffs = string());
#endif /* _WASATORCL_H_INCLUDED_ */

View file

@ -14,6 +14,9 @@
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
@ -36,7 +39,8 @@ using namespace std;
static string thisprog;
static string usage =
" -d <dbdir> -e <output encoding>\n"
" -d <dbdir> \n"
"-e <output encoding>\n"
" -i docid -D : get document data for docid\n"
" -i docid -X : delete document docid\n"
" -i docid -b : 'rebuild' document from term positions\n"
@ -112,6 +116,15 @@ static void sigcleanup(int sig)
exit(1);
}
// Tell whether an index term carries a field prefix.
//
// When built with RCL_INDEX_STRIPCHARS, a term is considered prefixed if
// it begins with an upper-case ASCII letter ('A'..'Z'). Otherwise a term is
// considered prefixed if it begins with a colon. An empty term never has a
// prefix.
inline bool has_prefix(const string& trm)
{
    if (trm.empty())
        return false;
    const char first = trm[0];
#ifdef RCL_INDEX_STRIPCHARS
    return first >= 'A' && first <= 'Z';
#else
    return first == ':';
#endif
}
int main(int argc, char **argv)
{
string dbdir = path_cat(path_home(), ".recoll/xapiandb");
@ -201,8 +214,7 @@ int main(int argc, char **argv)
for (term = db->termlist_begin(docid);
term != db->termlist_end(docid);term++) {
const string& s = *term;
if ((op_flags&OPT_l) &&
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
if ((op_flags&OPT_l) && has_prefix(s))
continue;
cout << op << detailstring(s) << cl << endl;
}
@ -210,8 +222,7 @@ int main(int argc, char **argv)
for (term = db->allterms_begin();
term != db->allterms_end();term++) {
const string& s = *term;
if ((op_flags&OPT_l) &&
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
if ((op_flags&OPT_l) && has_prefix(s))
continue;
if (op_flags & OPT_f)
cout << db->get_collection_freq(*term) << " "

View file

@ -72,7 +72,7 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
SynTermTransUnac transunac(UNACOP_UNACFOLD);
XapWritableComputableSynFamMember
diacasedb(wdb, synFamDiac, "all", &transunac);
diacasedb(wdb, synFamDiCa, "all", &transunac);
diacasedb.recreate();
#endif

View file

@ -14,6 +14,8 @@
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <stdio.h>
#include <cstring>
#include <unistd.h>
@ -53,6 +55,7 @@ using namespace std;
#include "cancelcheck.h"
#include "ptmutex.h"
#include "termproc.h"
#include "expansiondbs.h"
#ifndef MAX
#define MAX(A,B) (A>B?A:B)
@ -84,9 +87,15 @@ static const string xapday_prefix = "D";
static const string xapmonth_prefix = "M";
static const string xapyear_prefix = "Y";
const string pathelt_prefix = "XP";
#ifdef RCL_INDEX_STRIPCHARS
const string start_of_field_term = "XXST";
const string end_of_field_term = "XXND";
static const string page_break_term = "XXPG";
#else
const string start_of_field_term = "XXST/";
const string end_of_field_term = "XXND/";
static const string page_break_term = "XXPG/";
#endif
// Field name for the unsplit file name. Has to exist in the field file
// because of usage in termmatch()
static const string unsplitFilenameFieldName = "rclUnsplitFN";
@ -197,7 +206,7 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
{
for (vector<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) {
if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
if (!has_prefix(*qit))
out.push_back(*qit);
}
}
@ -573,7 +582,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
for (term = xrdb.termlist_begin(docid);
term != xrdb.termlist_end(docid); term++) {
// Ignore prefixed terms
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
if (has_prefix(*term))
continue;
if (cutoff-- < 0) {
LOGDEB0(("makeAbstract: max term count cutoff\n"));
@ -652,7 +661,9 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
vabs.push_back(chunk);
chunk.clear();
} else {
chunk += it->second;
if (it->second.compare(end_of_field_term) &&
it->second.compare(start_of_field_term))
chunk += it->second;
}
}
if (!chunk.empty())
@ -874,11 +885,13 @@ int Db::termDocCnt(const string& _term)
if (!m_ndb || !m_ndb->m_isopen)
return -1;
string term;
string term = _term;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
return 0;
}
#endif
if (m_stops.isStop(term)) {
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
@ -994,8 +1007,19 @@ class TextSplitDb : public TextSplitP {
{}
// Reimplement text_to_words to add start and end special terms
virtual bool text_to_words(const string &in);
void setprefix(const string& pref) {prefix = pref;}
void setwdfinc(int i) {wdfinc = i;}
// Set the prefix member from a field prefix. A non-empty value is stored
// in its wrap_prefix() form; an empty value just clears the current prefix.
void setprefix(const string& pref)
{
    if (!pref.empty()) {
        prefix = wrap_prefix(pref);
        return;
    }
    prefix.clear();
}
// Store i in the wdfinc member.
// NOTE(review): presumably the within-document-frequency increment applied
// to subsequently indexed terms -- confirm against the users of wdfinc.
void setwdfinc(int i)
{
wdfinc = i;
}
friend class TermProcIdx;
@ -1127,11 +1151,13 @@ string Db::getSpellingSuggestion(const string& word)
{
if (m_ndb == 0)
return string();
string term;
string term = word;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
return string();
}
#endif
if (!isSpellingCandidate(term))
return string();
return m_ndb->xrdb.get_spelling_suggestion(term);
@ -1239,8 +1265,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
TermProcIdx tpidx;
TermProc *nxt = &tpidx;
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
#ifdef RCL_INDEX_STRIPCHARS
TermProcPrep tpprep(nxt); nxt = &tpprep;
#endif
TextSplitDb splitter(newdocument, nxt);
tpidx.setTSD(&splitter);
@ -1266,7 +1294,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
vector<string> vpath;
stringToTokens(path, vpath, "/");
splitter.curpos = 0;
newdocument.add_posting(pathelt_prefix,
newdocument.add_posting(wrap_prefix(pathelt_prefix),
splitter.basepos + splitter.curpos++);
for (vector<string>::iterator it = vpath.begin();
it != vpath.end(); it++){
@ -1274,7 +1302,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
// Just truncate it. May still be useful because of wildcards
*it = it->substr(0, 230);
}
newdocument.add_posting(pathelt_prefix + *it,
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
splitter.basepos + splitter.curpos++);
}
}
@ -1319,7 +1347,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
////// Special terms for other metadata. No positions for these.
// Mime type
newdocument.add_term(mimetype_prefix + doc.mimetype);
newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
// Simple file name indexed unsplit for specific "file name"
// searches. This is not the same as a filename: clause inside the
@ -1335,9 +1363,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
utf8truncate(fn, 230);
string::size_type pos = fn.rfind('.');
if (pos != string::npos && pos != fn.length() - 1) {
newdocument.add_term(fileext_prefix + fn.substr(pos + 1));
newdocument.add_term(wrap_prefix(fileext_prefix) +
fn.substr(pos + 1));
}
newdocument.add_term(unsplitfilename_prefix + fn);
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
}
}
@ -1356,12 +1385,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
struct tm *tm = localtime(&mtime);
char buf[9];
snprintf(buf, 9, "%04d%02d%02d",
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
// Date (YYYYMMDD)
newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf));
// Month (YYYYMM)
buf[6] = '\0';
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
// Year (YYYY)
buf[4] = '\0';
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf));
//////////////////////////////////////////////////////////////////
@ -1834,7 +1866,7 @@ bool Db::maxYearSpan(int *minyear, int *maxyear)
*minyear = 1000000;
*maxyear = -1000000;
TermMatchResult result;
if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear"))
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
return false;
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
it != result.entries.end(); it++) {
@ -1899,30 +1931,32 @@ const string cstr_wildSpecChars = "*?[";
const string cstr_regSpecChars = "(.[{";
// Find all index terms that match a wildcard or regular expression
// If field is set, we return a list of appropriately prefixed terms (which
// are going to be used to build a Xapian query).
bool Db::termMatch(MatchType typ, const string &lang,
const string &root,
TermMatchResult& res,
int max,
const string& field,
string *prefixp
)
const string& field)
{
if (!m_ndb || !m_ndb->m_isopen)
return false;
Xapian::Database xdb = m_ndb->xdb();
res.clear();
XAPTRY(res.dbdoccount = xdb.get_doccount();
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
if (!m_reason.empty())
return false;
// Get rid of capitals and accents
string droot;
string droot = root;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
return false;
}
#endif
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
string prefix;
@ -1932,17 +1966,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
field.c_str()));
} else {
prefix = ftp->pfx;
prefix = wrap_prefix(ftp->pfx);
}
if (prefixp)
*prefixp = prefix;
}
res.prefix = prefix;
if (typ == ET_STEM) {
if (!stemExpand(lang, root, res, max))
return false;
sort(res.entries.begin(), res.entries.end());
unique(res.entries.begin(), res.entries.end());
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
it != res.entries.end(); it++) {
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
@ -2032,7 +2063,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
TermMatchCmpByTerm tcmp;
sort(res.entries.begin(), res.entries.end(), tcmp);
TermMatchTermEqual teq;
unique(res.entries.begin(), res.entries.end(), teq);
vector<TermMatchEntry>::iterator uit =
unique(res.entries.begin(), res.entries.end(), teq);
res.entries.resize(uit - res.entries.begin());
TermMatchCmpByWcf wcmp;
sort(res.entries.begin(), res.entries.end(), wcmp);
if (max > 0) {

View file

@ -17,6 +17,8 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
#include "autoconfig.h"
#include <string>
#include <vector>
@ -73,21 +75,50 @@ class Query;
/** Used for returning result lists for index terms matching some criteria.
 *
 * One entry describes a single index term together with its frequency
 * counters. Equality and ordering only look at the term text, not at the
 * counters.
 */
class TermMatchEntry {
public:
TermMatchEntry() : wcf(0) {}
TermMatchEntry(const string&t, int f, int d) : term(t), wcf(f), docs(d) {}
TermMatchEntry(const string&t) : term(t), wcf(0) {}
bool operator==(const TermMatchEntry &o) const { return term == o.term;}
bool operator<(const TermMatchEntry &o) const { return term < o.term;}
// Empty entry: wcf is zeroed. NOTE(review): docs is not initialised here.
TermMatchEntry()
: wcf(0)
{
}
// Fully specified entry: term text, collection frequency, document count.
TermMatchEntry(const string& t, int f, int d)
: term(t), wcf(f), docs(d)
{
}
// Entry for a term with no statistics yet: wcf is zeroed.
// NOTE(review): docs is not initialised here either.
TermMatchEntry(const string& t)
: term(t), wcf(0)
{
}
// Two entries are equal when their term texts are equal.
bool operator==(const TermMatchEntry &o) const
{
return term == o.term;
}
// Entries are ordered by term text.
bool operator<(const TermMatchEntry &o) const
{
return term < o.term;
}
string term;
int wcf; // Total count of occurrences within collection.
int docs; // Number of documents containing term.
};
/** Term match result list header: statistics and global info.
 *
 * Holds the list of matching terms, the index prefix which was used for
 * the expansion (if any), and index-wide statistics.
 */
class TermMatchResult {
public:
TermMatchResult() {clear();}
void clear() {entries.clear(); dbdoccount = 0; dbavgdoclen = 0;}
// Start with an empty entry list and zeroed statistics.
TermMatchResult()
{
clear();
}
// Empty the entry list and zero the index-wide statistics.
// NOTE(review): prefix is not reset here, callers have to set it.
void clear()
{
entries.clear();
dbdoccount = 0;
dbavgdoclen = 0;
}
// Term expansion
vector<TermMatchEntry> entries;
// If a field was specified, this is the corresponding index prefix
string prefix;
// Index-wide stats
unsigned int dbdoccount;
double dbavgdoclen;
};
@ -95,6 +126,24 @@ public:
#ifdef IDX_THREADS
extern void *DbUpdWorker(void*);
#endif // IDX_THREADS
/**
 * Tell if an index term begins with a field prefix.
 *
 * When RCL_INDEX_STRIPCHARS is defined, a prefixed term is one which begins
 * with an ASCII uppercase letter. Otherwise, a prefixed term is one which
 * begins with a colon. An empty term never has a prefix.
 */
inline bool has_prefix(const string& trm)
{
    if (trm.empty())
        return false;
    const char first = trm[0];
#ifdef RCL_INDEX_STRIPCHARS
    return first >= 'A' && first <= 'Z';
#else
    return first == ':';
#endif
}
/**
 * Turn a bare field prefix into the form used inside index terms.
 *
 * When RCL_INDEX_STRIPCHARS is defined the prefix is returned unchanged.
 * Otherwise it is returned with cstr_colon added on both sides.
 */
inline string wrap_prefix(const string& pfx)
{
#ifndef RCL_INDEX_STRIPCHARS
    string wrapped(cstr_colon);
    wrapped += pfx;
    wrapped += cstr_colon;
    return wrapped;
#else
    return pfx;
#endif
}
/**
* Wrapper class for the native database.
*/
@ -132,6 +181,8 @@ class Db {
{
if (term.empty() || term.length() > 50)
return false;
if (has_prefix(term))
return false;
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
!= string::npos)
return false;
@ -205,12 +256,23 @@ class Db {
/** Return the index terms that match the input string
* Expansion is performed with either wildcard or regexp processing
* Stem expansion is performed if lang is not empty */
* Stem expansion is performed if lang is not empty
*
* @param typ defines the kind of expansion: wildcard, regexp or stemming
* @param lang sets the stemming language(s). Can be a space-separated list
* @param term is the term to expand
* @param result is the main output
* @param max defines the maximum result count
* @param field if set, defines the field within which the expansion should
* be performed. Only used for wildcards and regexps, stemming is
* always global. If this is set, the resulting output terms
* will be appropriately prefixed and the prefix value will be set
* in the TermMatchResult header
*/
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
bool termMatch(MatchType typ, const string &lang, const string &s,
bool termMatch(MatchType typ, const string &lang, const string &term,
TermMatchResult& result, int max = -1,
const string& field = cstr_null,
string *prefix = 0
const string& field = cstr_null
);
/** Return min and max years for doc mod times in db */
bool maxYearSpan(int *minyear, int *maxyear);

View file

@ -18,12 +18,17 @@
#ifndef _rcldb_p_h_included_
#define _rcldb_p_h_included_
#include "autoconfig.h"
#include <map>
#include <xapian.h>
#ifdef IDX_THREADS
#include "workqueue.h"
#include "debuglog.h"
#endif // IDX_THREADS
#include "xapian.h"
#include "xmacros.h"
namespace Rcl {

View file

@ -446,7 +446,7 @@ vector<string> Query::expand(const Doc &doc)
for (Xapian::ESetIterator it = eset.begin();
it != eset.end(); it++) {
LOGDEB((" [%s]\n", (*it).c_str()));
if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z'))
if ((*it).empty() || has_prefix(*it))
continue;
res.push_back(*it);
if (res.size() >= 10)

View file

@ -16,17 +16,22 @@
*/
// Handle translation from rcl's SearchData structures to Xapian Queries
#include "autoconfig.h"
#include <stdio.h>
#include <fnmatch.h>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
#include "xapian.h"
#include "cstr.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "searchdata.h"
#include "debuglog.h"
#include "smallut.h"
@ -36,11 +41,11 @@
#include "stoplist.h"
#include "rclconfig.h"
#include "termproc.h"
#include "synfamily.h"
#include "stemdb.h"
#include "expansiondbs.h"
#ifndef NO_NAMESPACES
using namespace std;
namespace Rcl {
#endif
typedef vector<SearchDataClause *>::iterator qlist_it_t;
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
@ -71,13 +76,23 @@ static const int original_term_wqf_booster = 10;
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
// Helpers for writing an index term prefix at the start of a char buffer.
//  - bufprefix(BUF, L) stores the prefix for letter L at the start of BUF:
//    the single letter when RCL_INDEX_STRIPCHARS is defined, else ":L:".
//  - bpoffs is the number of bytes written by bufprefix(), i.e. the offset
//    of the first byte following the prefix.
// The statements are wrapped in do/while(0) so that a use followed by a
// semicolon is exactly one statement, and stays correct as the body of an
// unbraced if/else.
#ifdef RCL_INDEX_STRIPCHARS
#define bufprefix(BUF, L) do {(BUF)[0] = (L);} while (0)
#define bpoffs 1
#else
#define bufprefix(BUF, L) \
    do {(BUF)[0] = ':'; (BUF)[1] = (L); (BUF)[2] = ':';} while (0)
#define bpoffs 3
#endif
static Xapian::Query
date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
{
// Xapian uses a smallbuf and snprintf. Can't be bothered, we're
// only doing %d's !
char buf[200];
sprintf(buf, "D%04d%02d", y1, m1);
bufprefix(buf, 'D');
sprintf(buf+bpoffs, "%04d%02d", y1, m1);
vector<Xapian::Query> v;
int d_last = monthdays(m1, y1);
@ -88,11 +103,11 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
// Deal with any initial partial month
if (d1 > 1 || d_end < d_last) {
for ( ; d1 <= d_end ; d1++) {
sprintf(buf + 7, "%02d", d1);
sprintf(buf + 6 + bpoffs, "%02d", d1);
v.push_back(Xapian::Query(buf));
}
} else {
buf[0] = 'M';
bufprefix(buf, 'M');
v.push_back(Xapian::Query(buf));
}
@ -102,36 +117,36 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
int m_last = (y1 < y2) ? 12 : m2 - 1;
while (++m1 <= m_last) {
sprintf(buf + 5, "%02d", m1);
buf[0] = 'M';
sprintf(buf + 4 + bpoffs, "%02d", m1);
bufprefix(buf, 'M');
v.push_back(Xapian::Query(buf));
}
if (y1 < y2) {
while (++y1 < y2) {
sprintf(buf + 1, "%04d", y1);
buf[0] = 'Y';
sprintf(buf + bpoffs, "%04d", y1);
bufprefix(buf, 'Y');
v.push_back(Xapian::Query(buf));
}
sprintf(buf + 1, "%04d", y2);
buf[0] = 'M';
sprintf(buf + bpoffs, "%04d", y2);
bufprefix(buf, 'M');
for (m1 = 1; m1 < m2; m1++) {
sprintf(buf + 5, "%02d", m1);
sprintf(buf + 4 + bpoffs, "%02d", m1);
v.push_back(Xapian::Query(buf));
}
}
sprintf(buf + 5, "%02d", m2);
// The 2-digit month goes right after the 4 year digits which follow the
// prefix, at the same offset as the other month writes in this function.
sprintf(buf + 4 + bpoffs, "%02d", m2);
// Deal with any final partial month
if (d2 < monthdays(m2, y2)) {
buf[0] = 'D';
bufprefix(buf, 'D');
for (d1 = 1 ; d1 <= d2; d1++) {
sprintf(buf + 7, "%02d", d1);
sprintf(buf + 6 + bpoffs, "%02d", d1);
v.push_back(Xapian::Query(buf));
}
} else {
buf[0] = 'M';
bufprefix(buf, 'M');
v.push_back(Xapian::Query(buf));
}
@ -172,31 +187,27 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
return true;
}
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
vector<SearchDataClause*>& query,
string& reason, void *d)
{
LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
m_stemlang.c_str()));
Xapian::Query xq;
m_reason.erase();
// Walk the clause list translating each in turn and building the
// Xapian query tree
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
Xapian::Query nq;
if (!(*it)->toNativeQuery(db, &nq, m_stemlang)) {
LOGERR(("SearchData::toNativeQuery: failed\n"));
m_reason = (*it)->getReason();
if (!(*it)->toNativeQuery(db, &nq)) {
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n"));
reason = (*it)->getReason();
return false;
}
if (nq.empty()) {
LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n"));
LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
continue;
}
// If this structure is an AND list, must use AND_NOT for excl clauses.
// Else this is an OR list, and there can't be excl clauses (checked by
// addClause())
Xapian::Query::op op;
if (m_tp == SCLT_AND) {
if (tp == SCLT_AND) {
if ((*it)->m_tp == SCLT_EXCL) {
op = Xapian::Query::OP_AND_NOT;
} else {
@ -217,6 +228,23 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
if (xq.empty())
xq = Xapian::Query::MatchAll;
*((Xapian::Query *)d) = xq;
return true;
}
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
m_reason.erase();
// Walk the clause list translating each in turn and building the
// Xapian query tree
Xapian::Query xq;
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n"));
return false;
}
if (m_haveDates) {
// If one of the extremities is unset, compute db extremas
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
@ -326,10 +354,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
stringToTokens(dit->dir, vpath, "/");
vector<string> pvpath;
if (dit->dir[0] == '/')
pvpath.push_back(pathelt_prefix);
pvpath.push_back(wrap_prefix(pathelt_prefix));
for (vector<string>::const_iterator pit = vpath.begin();
pit != vpath.end(); pit++){
pvpath.push_back(pathelt_prefix + *pit);
pvpath.push_back(wrap_prefix(pathelt_prefix) + *pit);
}
Xapian::Query::op tdop;
if (dit->weight == 1.0) {
@ -446,7 +474,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
// My type is AND. Change it to OR and insert two queries, one
// being the original query as a subquery, the other the
// phrase.
SearchData *sd = new SearchData(m_tp);
SearchData *sd = new SearchData(m_tp, m_stemlang);
sd->m_query = m_query;
sd->m_stemlang = m_stemlang;
m_tp = SCLT_OR;
@ -586,25 +614,28 @@ public:
{ }
bool processUserString(const string &iq,
int mods,
string &ermsg,
vector<Xapian::Query> &pqueries,
const StopList &stops,
int slack = 0, bool useNear = false);
private:
void expandTerm(bool dont, const string& term, vector<string>& exp,
void expandTerm(int mods,
const string& term, vector<string>& exp,
string& sterm, const string& prefix);
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, bool nostemexp,
void processSimpleSpan(const string& span,
int mods,
vector<Xapian::Query> &pqueries);
// Process phrase/near element
void processPhraseOrNear(TextSplitQ *splitData,
int mods,
vector<Xapian::Query> &pqueries,
bool useNear, int slack, int mods);
bool useNear, int slack);
Db& m_db;
const string& m_field;
const string& m_stemlang;
bool m_doBoostUserTerms;
const bool m_doBoostUserTerms;
HighlightData& m_hld;
};
@ -619,60 +650,187 @@ static void listVector(const string& what, const vector<string>&l)
}
#endif
/** Take simple term and expand stem and wildcards
/** Expand term into term list, using appropriate mode: stem, wildcards,
* diacritics...
*
* @param nostemexp don't perform stem expansion. This is mainly used to
* prevent stem expansion inside phrases (because the user probably
* does not expect it). This does NOT prevent wild card expansion.
* Other factors than nostemexp can prevent stem expansion:
* a null stemlang, resulting from a global user preference, a
* capitalized term, or wildcard(s)
* @param mods stem expansion, case and diacritics sensitivity control.
* @param term input single word
* @param exp output expansion list
* @param sterm output original input term if there were no wildcards
* @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it).
*/
void StringToXapianQ::expandTerm(bool nostemexp,
const string& term,
vector<string>& exp,
string &sterm, const string& prefix)
void StringToXapianQ::expandTerm(int mods,
const string& term,
vector<string>& exp, string &sterm,
const string& prefix)
{
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase();
sterm.clear();
exp.clear();
if (term.empty()) {
if (term.empty())
return;
}
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
// No stemming if there are wildcards or prevented globally.
// If there are no wildcards, add term to the list of user-entered terms
if (!haswild)
m_hld.uterms.insert(term);
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
// No stem expansion if there are wildcards or if prevented by caller
if (haswild || m_stemlang.empty()) {
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true;
}
if (!haswild)
m_hld.uterms.insert(term);
bool noexpansion = nostemexp && !haswild;
if (nostemexp && !haswild) {
#ifndef RCL_INDEX_STRIPCHARS
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
// If we are working with a raw index, apply the rules for case and
// diacritics sensitivity.
// If any character has a diacritic, we become
// diacritic-sensitive. Note that the way that the test is
// performed (conversion+comparison) will automatically ignore
// accented characters which are actually a separate letter
if (unachasaccents(term))
diac_sensitive = true;
// If any character apart from the first is uppercase, we become case-sensitive.
// The first character is reserved for turning off stemming. You need to
// use a query language modifier to search for Floor in a case-sensitive
// way.
Utf8Iter it(term);
it++;
if (unachasuppercase(term.substr(it.getBpos())))
case_sensitive = true;
// If we are sensitive to case or diacritics turn stemming off
if (diac_sensitive || case_sensitive)
nostemexp = true;
if (!case_sensitive || !diac_sensitive)
noexpansion = false;
#endif
if (noexpansion) {
sterm = term;
exp.resize(1);
exp[0] = prefix + term;
exp.push_back(prefix + term);
} else {
TermMatchResult res;
if (haswild) {
// Note that if there are wildcards, we do a direct from-index
// expansion, which means that we are casediac-sensitive. There
// would be nothing to prevent us to expand from the casediac
// synonyms first. To be done later
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
m_field);
} else {
sterm = term;
#ifdef RCL_INDEX_STRIPCHARS
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
m_field);
m_field);
#else
// No stem expansion when diacritic or case sensitivity is
// set, it makes no sense (it would mess with the
// diacritics anyway if they are not in the stem part).
// In these 3 cases, perform appropriate expansion from
// the charstripping db, and do a bogus wildcard expansion
// (there is no wild card) to generate the result:
if (diac_sensitive && case_sensitive) {
// No expansion whatsoever
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
m_field);
} else {
// Access case and diacritics expansion:
vector<string> exp;
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa,
"all", &unacfoldtrans);
if (diac_sensitive) {
// Expand for accents and case, filtering for same accents,
// then bogus wildcard expansion for generating result
SynTermTransUnac foldtrans(UNACOP_FOLD);
synac.synExpand(term, exp, &foldtrans);
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
}
} else if (case_sensitive) {
// Expand for accents and case, filtering for same case,
// then bogus wildcard expansion for generating result
SynTermTransUnac unactrans(UNACOP_UNAC);
synac.synExpand(term, exp, &unactrans);
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
}
} else {
// Expand for accents and case, then lowercase
// result for input to stemdb.
synac.synExpand(term, exp);
for (unsigned int i = 0; i < exp.size(); i++) {
string lower;
unacmaybefold(exp[i], lower, "UTF-8", UNACOP_FOLD);
exp[i] = lower;
}
sort(exp.begin(), exp.end());
vector<string>::iterator uit =
unique(exp.begin(), exp.end());
exp.resize(uit - exp.begin());
LOGDEB(("ExpandTerm: after casediac: %s\n",
stringsToString(exp).c_str()));
StemDb db(m_db.m_ndb->xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
db.stemExpand(m_stemlang, *it, exp1);
}
LOGDEB(("ExpandTerm: after stem: %s\n",
stringsToString(exp1).c_str()));
// Expand the resulting list for case (all stemdb content
// is lowercase)
exp.clear();
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
synac.synExpand(*it, exp);
}
sort(exp.begin(), exp.end());
uit = unique(exp.begin(), exp.end());
exp.resize(uit - exp.begin());
LOGDEB(("ExpandTerm: after case exp of stem: %s\n",
stringsToString(exp).c_str()));
// Bogus wildcard expand to generate the result
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
}
}
}
#endif
}
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
it != res.entries.end(); it++) {
exp.push_back(it->term);
}
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(exp).c_str()));
}
}
@ -710,21 +868,22 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
}
}
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
void StringToXapianQ::processSimpleSpan(const string& span,
int mods,
vector<Xapian::Query> &pqueries)
{
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
span.c_str(), int(nostemexp)));
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] mods %x\n",
span.c_str(), (unsigned int)mods));
vector<string> exp;
string sterm; // dumb version of user term
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
prefix = ftp->pfx;
prefix = wrap_prefix(ftp->pfx);
}
expandTerm(nostemexp, span, exp, sterm, prefix);
expandTerm(mods, span, exp, sterm, prefix);
// Set up the highlight data. No prefix should go in there
for (vector<string>::const_iterator it = exp.begin();
@ -755,8 +914,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
int mods,
vector<Xapian::Query> &pqueries,
bool useNear, int slack, int mods)
bool useNear, int slack)
{
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE;
@ -769,7 +929,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
prefix = ftp->pfx;
prefix = wrap_prefix(ftp->pfx);
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
@ -790,10 +950,12 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|| hadmultiple
#endif // single OR inside NEAR
;
int lmods = mods;
if (nostemexp)
lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm;
vector<string> exp;
expandTerm(nostemexp, *it, exp, sterm, prefix);
expandTerm(lmods, *it, exp, sterm, prefix);
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
listVector("", exp);
// groups is used for highlighting, we don't want prefixes in there.
@ -882,9 +1044,9 @@ static int stringToMods(string& s)
* count)
*/
bool StringToXapianQ::processUserString(const string &iq,
int mods,
string &ermsg,
vector<Xapian::Query> &pqueries,
const StopList& stops,
int slack,
bool useNear
)
@ -892,6 +1054,8 @@ bool StringToXapianQ::processUserString(const string &iq,
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
ermsg.erase();
const StopList stops = m_db.getStopList();
// Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase".
//
@ -930,11 +1094,13 @@ bool StringToXapianQ::processUserString(const string &iq,
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true);
#ifdef RCL_INDEX_STRIPCHARS
TermProcPrep tpprep(nxt); nxt = &tpprep;
#endif
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops, nxt);
TextSplit::TXTS_KEEPWILD),
stops, nxt);
tpq.setTSQ(&splitter);
splitter.text_to_words(*it);
@ -944,14 +1110,17 @@ bool StringToXapianQ::processUserString(const string &iq,
switch (splitter.terms.size() + terminc) {
case 0:
continue;// ??
case 1:
case 1: {
int lmods = mods;
if (splitter.nostemexps.front())
lmods |= SearchDataClause::SDCM_NOSTEMMING;
m_hld.ugroups.push_back(vector<string>(1, *it));
processSimpleSpan(splitter.terms.front(),
splitter.nostemexps.front(), pqueries);
processSimpleSpan(splitter.terms.front(), lmods, pqueries);
}
break;
default:
m_hld.ugroups.push_back(vector<string>(1, *it));
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
processPhraseOrNear(&splitter, mods, pqueries, useNear, slack);
}
}
} catch (const Xapian::Error &e) {
@ -971,13 +1140,10 @@ bool StringToXapianQ::processUserString(const string &iq,
}
// Translate a simple OR, AND, or EXCL search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
const string& stemlang)
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
{
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
stemlang;
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
stemlang.c_str()));
getStemLang().c_str()));
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
@ -1000,8 +1166,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(m_text, m_reason, pqueries, db.getStopList()))
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
@ -1024,8 +1190,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
// about expanding multiple fragments in the past. We just take the
// value blanks and all and expand this against the indexed unsplit
// file names
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
const string&)
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
@ -1041,11 +1206,8 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
}
// Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
const string& stemlang)
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
{
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
stemlang;
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
Xapian::Query *qp = (Xapian::Query *)p;
@ -1069,8 +1231,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
}
string s = cstr_dquote + m_text + cstr_dquote;
bool useNear = (m_tp == SCLT_NEAR);
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
if (!tr.processUserString(s, getModifiers(), m_reason, pqueries,
m_slack, useNear))
return false;
if (pqueries.empty()) {

View file

@ -70,9 +70,9 @@ class SearchDataClause;
*/
class SearchData {
public:
SearchData(SClType tp)
SearchData(SClType tp, const string& stemlang)
: m_tp(tp), m_haveDates(false), m_maxSize(size_t(-1)),
m_minSize(size_t(-1)), m_haveWildCards(false)
m_minSize(size_t(-1)), m_haveWildCards(false), m_stemlang(stemlang)
{
if (m_tp != SCLT_OR && m_tp != SCLT_AND)
m_tp = SCLT_OR;
@ -91,6 +91,7 @@ public:
/** Translate to Xapian query. rcldb knows about the void* */
bool toNativeQuery(Rcl::Db &db, void *);
/** We become the owner of cl and will delete it */
bool addClause(SearchDataClause *cl);
@ -109,6 +110,8 @@ public:
m_dirspecs.push_back(DirSpec(t, excl, w));
}
const std::string& getStemLang() {return m_stemlang;}
void setMinSize(size_t size) {m_minSize = size;}
void setMaxSize(size_t size) {m_maxSize = size;}
@ -120,8 +123,6 @@ public:
/** Add file type to not wanted list */
void remFiletype(const std::string& ft) {m_nfiletypes.push_back(ft);}
void setStemlang(const std::string& lang = "english") {m_stemlang = lang;}
/** Retrieve error description */
std::string getReason() {return m_reason;}
@ -170,7 +171,12 @@ private:
std::string m_reason;
bool m_haveWildCards;
std::string m_stemlang;
bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps);
bool clausesToQuery(Rcl::Db &db, SClType tp,
std::vector<SearchDataClause*>& query,
string& reason, void *d);
/* Copyconst and assignment private and forbidden */
SearchData(const SearchData &) {}
SearchData& operator=(const SearchData&) {return *this;};
@ -186,7 +192,7 @@ public:
m_modifiers(SDCM_NONE), m_weight(1.0)
{}
virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *, const std::string&) = 0;
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
virtual std::string getReason() const {return m_reason;}
virtual void getTerms(HighlightData & hldata) const = 0;
@ -199,6 +205,11 @@ public:
{
m_parentSearch = p;
}
/** Return the stemming language(s) to use for this clause.
 *
 * The value is taken from the parent search. It is empty (cstr_null) if
 * there is no parent search or if the SDCM_NOSTEMMING modifier is set.
 */
string getStemLang()
{
    if (m_parentSearch == 0 || (m_modifiers & SDCM_NOSTEMMING))
        return cstr_null;
    return m_parentSearch->getStemLang();
}
virtual void setModifiers(Modifier mod)
{
m_modifiers = mod;
@ -255,7 +266,7 @@ public:
}
/** Translate to Xapian query */
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
virtual bool toNativeQuery(Rcl::Db &, void *);
virtual void getTerms(HighlightData& hldata) const
{
@ -296,7 +307,7 @@ public:
{
}
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
virtual bool toNativeQuery(Rcl::Db &, void *);
};
/**
@ -315,7 +326,7 @@ public:
{
}
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
virtual bool toNativeQuery(Rcl::Db &, void *);
private:
int m_slack;
};
@ -323,17 +334,11 @@ private:
/** Subquery */
class SearchDataClauseSub : public SearchDataClause {
public:
// We take charge of the SearchData * and will delete it.
SearchDataClauseSub(SClType tp, RefCntr<SearchData> sub)
: SearchDataClause(tp), m_sub(sub)
{
}
virtual ~SearchDataClauseSub()
{
}
virtual bool toNativeQuery(Rcl::Db &db, void *p, const std::string&)
virtual bool toNativeQuery(Rcl::Db &db, void *p)
{
return m_sub->toNativeQuery(db, p);
}

View file

@ -19,6 +19,9 @@
* Management of the auxiliary databases listing stems and their expansion
* terms
*/
#include "autoconfig.h"
#include <unistd.h>
#include <algorithm>
@ -27,13 +30,8 @@
#include <xapian.h>
#include "stemdb.h"
#include "pathut.h"
#include "debuglog.h"
#include "smallut.h"
#include "utf8iter.h"
#include "textsplit.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "synfamily.h"
#include "unacpp.h"
@ -43,140 +41,6 @@ using namespace std;
namespace Rcl {
// Fast raw detection of non-natural-language words: look for ascii
// chars which are not lowercase letters. Not too sure what islower()
// would do with 8 bit values, so not using it here. If we want to be
// more complete we'd need to go full utf-8
inline static bool p_notlowerascii(unsigned int c)
{
if (c < 'a' || (c > 'z' && c < 128))
return true;
return false;
}
/**
* Create database of stem to parents associations for a given language.
*/
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const vector<string>& langs)
{
LOGDEB(("StemDb::createExpansionDbs\n"));
Chrono cron;
vector<XapWritableSynFamily> stemdbs;
for (unsigned int i = 0; i < langs.size(); i++) {
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
stemdbs[i].deleteMember(langs[i]);
stemdbs[i].createMember(langs[i]);
stemdbs[i].setCurrentMemberName(langs[i]);
}
// We walk the list of all terms, and stem each. We skip terms which
// don't look like natural language.
// If the stem is not identical to the term, we add a synonym entry.
// Statistics
int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
int stemconst = 0; // Stem == term
int allsyns = 0; // Total number of entries created
string ermsg;
try {
vector<Xapian::Stem> stemmers;
for (unsigned int i = 0; i < langs.size(); i++) {
stemmers.push_back(Xapian::Stem(langs[i]));
}
for (Xapian::TermIterator it = wdb.allterms_begin();
it != wdb.allterms_end(); it++) {
// If the term has any non-lowercase 7bit char (that is,
// numbers, capitals and punctuation) dont stem.
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem;
LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
(*it).c_str(), *sit));
continue;
}
// Detect and skip CJK terms.
// We're still sending all other multibyte utf-8 chars to
// the stemmer, which is not too well defined for
// xapian<1.0 (very obsolete now), but seems to work
// anyway. There shouldn't be too many in any case because
// accents are stripped at this point.
// The effect of stripping accents on stemming is not good,
// (e.g: in french partimes -> partim, parti^mes -> part)
// but fixing the issue would be complicated.
Utf8Iter utfit(*it);
if (TextSplit::isCJK(*utfit)) {
// LOGDEB(("stemskipped: Skipping CJK\n"));
continue;
}
// Create stemming synonym for every lang
for (unsigned int i = 0; i < langs.size(); i++) {
string stem = stemmers[i](*it);
if (stem == *it) {
++stemconst;
} else {
stemdbs[i].addSynonym(stem, *it);
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
(*it).c_str(), langs[i].c_str(), stem.c_str()));
++allsyns;
}
}
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
return false;
}
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
nostem, stemconst, allsyns));
return true;
}
/**
* Expand term to list of all terms which stem to the same term, for one
* expansion language
*/
bool StemDb::expandOne(const std::string& lang,
const std::string& term,
vector<string>& result)
{
try {
Xapian::Stem stemmer(lang);
string stem = stemmer(term);
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
lang.c_str(), term.c_str(), stem.c_str()));
if (!synExpand(lang, stem, result)) {
// ?
}
// If the user term or stem are not in the list, add them
if (find(result.begin(), result.end(), term) == result.end()) {
result.push_back(term);
}
if (find(result.begin(), result.end(), stem) == result.end()) {
result.push_back(stem);
}
LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
stringsToString(result).c_str()));
} catch (...) {
LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
lang.c_str()));
result.push_back(term);
return false;
}
return true;
}
/**
* Expand for one or several languages
*/
@ -186,14 +50,34 @@ bool StemDb::stemExpand(const std::string& langs,
{
vector<string> llangs;
stringToStrings(langs, llangs);
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
vector<string> oneexp;
expandOne(*it, term, oneexp);
result.insert(result.end(), oneexp.begin(), oneexp.end());
SynTermTransStem stemmer(*it);
XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
(void)expander.synExpand(term, result);
}
#ifndef RCL_INDEX_STRIPCHARS
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
SynTermTransStem stemmer(*it);
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
*it, &stemmer);
string unac;
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
(void)expander.synExpand(unac, result);
}
#endif
if (result.empty())
result.push_back(term);
sort(result.begin(), result.end());
unique(result.begin(), result.end());
vector<string>::iterator uit = unique(result.begin(), result.end());
result.resize(uit - result.begin());
LOGDEB0(("stemExpand:%s: %s -> %s\n", langs.c_str(), term.c_str(),
stringsToString(result).c_str()));
return true;
}

View file

@ -55,9 +55,30 @@
#include <xapian.h>
#include "synfamily.h"
#include "unacpp.h"
namespace Rcl {
/* A stemming functor for using with XapComputableSynFamMember */
class SynTermTransStem : public SynTermTrans {
public:
SynTermTransStem(const std::string& lang)
: m_stemmer(lang), m_lang(lang)
{
}
virtual std::string operator()(const std::string& in)
{
string out = m_stemmer(in);
LOGDEB2(("SynTermTransStem(%s): in [%s] out [%s]\n", m_lang.c_str(),
in.c_str(), out.c_str()));
return out;
}
Xapian::Stem m_stemmer;
std::string m_lang;
};
/** Stemdb is a bit special as a SynFamily as we may want to expand for one
* or several members (languages) */
class StemDb : public XapSynFamily {
public:
StemDb(Xapian::Database& xdb)
@ -67,18 +88,10 @@ public:
/** Expand for a number of languages */
bool stemExpand(const std::string& langs,
const std::string& term,
std::vector<std::string>& result);
private:
/** Compute stem and call synExpand() */
bool expandOne(const std::string& lang,
const std::string& term,
std::vector<std::string>& result);
const std::string& term,
std::vector<std::string>& result);
};
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const std::vector<std::string>& langs);
}
#endif /* _STEMDB_H_INCLUDED_ */

View file

@ -28,31 +28,6 @@ using namespace std;
namespace Rcl {
bool XapSynFamily::synExpand(const string& member, const string& term,
vector<string>& result)
{
string key = entryprefix(member) + term;
string ermsg;
try {
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
xit != m_rdb.synonyms_end(key); xit++) {
result.push_back(*xit);
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
member.c_str(), term.c_str()));
return false;
}
#if 0
string out;
stringsToString(result, out);
LOGDEB0(("XapSynFamily::synExpand:%s: [%s] -> %s\n", member.c_str(),
term.c_str(), out.c_str()));
#endif
return true;
}
bool XapSynFamily::getMembers(vector<string>& members)
{
string key = memberskey();
@ -100,6 +75,35 @@ bool XapSynFamily::listMap(const string& membername)
return true;
}
bool XapSynFamily::synExpand(const string& member, const string& term,
vector<string>& result)
{
LOGDEB(("XapSynFamily::synExpand:(%s) %s for %s\n",
m_prefix1.c_str(), term.c_str(), member.c_str()));
string key = entryprefix(member) + term;
string ermsg;
try {
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
xit != m_rdb.synonyms_end(key); xit++) {
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
result.push_back(*xit);
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
member.c_str(), term.c_str()));
result.push_back(term);
return false;
}
// If the input term is not in the list, add it
if (find(result.begin(), result.end(), term) == result.end()) {
result.push_back(term);
}
return true;
}
bool XapWritableSynFamily::deleteMember(const string& membername)
{
string key = entryprefix(membername);
@ -119,32 +123,61 @@ bool XapWritableSynFamily::createMember(const string& membername)
m_wdb.add_synonym(memberskey(), membername);
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("XapSynFamily::createMember: xapian error %s\n", ermsg.c_str()));
LOGERR(("XapSynFamily::createMember: error: %s\n", ermsg.c_str()));
return false;
}
return true;
}
bool XapWritableSynFamily::addSynonyms(const string& membername,
const string& term,
const vector<string>& trans)
bool XapComputableSynFamMember::synExpand(const string& term,
vector<string>& result,
SynTermTrans *filtertrans)
{
string key = entryprefix(membername) + term;
string root = (*m_trans)(term);
string filter_root;
if (filtertrans)
filter_root = (*filtertrans)(term);
/* We could call XapSynFamily::synExpand() here instead of doing it
ourselves... */
string key = m_prefix + root;
LOGDEB(("XapCompSynFamMbr::synExpand([%s]): term [%s] root [%s] \n",
m_prefix.c_str(), term.c_str(), root.c_str()));
string ermsg;
try {
for (vector<string>::const_iterator it = trans.begin();
it != trans.end(); it++) {
m_wdb.add_synonym(key, *it);
for (Xapian::TermIterator xit = m_family.getdb().synonyms_begin(key);
xit != m_family.getdb().synonyms_end(key); xit++) {
if (!filtertrans || (*filtertrans)(*xit) == filter_root) {
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
result.push_back(*xit);
}
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("XapSynFamily::addSynonyms: xapian error %s\n", ermsg.c_str()));
LOGERR(("XapSynDb::synExpand: error for term [%s] (key %s)\n",
term.c_str(), key.c_str()));
result.push_back(term);
return false;
}
// If the input term and root are not in the list, add them
if (find(result.begin(), result.end(), term) == result.end()) {
LOGDEB2((" Pushing %s\n", term.c_str()));
result.push_back(term);
}
if (root != term &&
find(result.begin(), result.end(), root) == result.end()) {
if (!filtertrans || (*filtertrans)(root) == filter_root) {
LOGDEB2((" Pushing %s\n", root.c_str()));
result.push_back(root);
}
}
return true;
}
}
#else // TEST_SYNFAMILY
@ -169,16 +202,16 @@ using namespace std;
static string thisprog;
static int op_flags;
#define OPT_a 0x4
#define OPT_c 0x8
#define OPT_D 0x1
#define OPT_d 0x10
#define OPT_L 0x2
#define OPT_a 0x4
#define OPT_u 0x8
#define OPT_d 0x10
#define OPT_l 0x20
#define OPT_s 0x40
#define OPT_e 0x80
static string usage =
" -d <dbdir> {-s|-a|-c} database dir and synfamily: stem accents case\n"
" -d <dbdir> {-s|-a|-u} database dir and synfamily: stem accents/case ustem\n"
" -l : list members\n"
" -L <member>: list entries for given member\n"
" -e <member> <key> : list expansion for given member and key\n"
@ -209,7 +242,6 @@ int main(int argc, char **argv)
while (**argv)
switch (*(*argv)++) {
case 'a': op_flags |= OPT_a; break;
case 'c': op_flags |= OPT_c; break;
case 'D': op_flags |= OPT_D; break;
case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
dbdir = *(++argv); argc--;
@ -223,6 +255,7 @@ int main(int argc, char **argv)
member = *(++argv); argc--;
goto b1;
case 's': op_flags |= OPT_s; break;
case 'u': op_flags |= OPT_u; break;
default: Usage(); break;
}
b1: argc--; argv++;
@ -231,12 +264,11 @@ int main(int argc, char **argv)
if (argc != 0)
Usage();
// We do stem only for now
string familyname;
if (op_flags & OPT_a) {
familyname = Rcl::synFamDiac;
} else if (op_flags &OPT_c) {
familyname = Rcl::synFamCase;
familyname = Rcl::synFamDiCa;
} else if (op_flags & OPT_u) {
familyname = Rcl::synFamStemUnac;
} else {
familyname = Rcl::synFamStem;
}

View file

@ -42,7 +42,7 @@
namespace Rcl {
class XapSynFamily {
class XapSynFamily {
public:
/**
* Construct from readable xapian database and family name (ie: Stm)
@ -53,38 +53,50 @@ public:
m_prefix1 = std::string(":") + familyname;
}
/** Expand one term (e.g.: familier) inside one family member (e.g.: french)
*/
virtual bool synExpand(const std::string& fammember,
const std::string& key,
std::vector<std::string>& result);
/** Retrieve all members of this family (e.g: french english german...) */
virtual bool getMembers(std::vector<std::string>&);
/** debug: list map for one member to stdout */
virtual bool listMap(const std::string& fam);
protected:
Xapian::Database m_rdb;
std::string m_prefix1;
/** Expand term to list of synonyms for given member */
bool synExpand(const std::string& membername,
const std::string& term, std::vector<std::string>& result);
// The prefix shared by all synonym entries inside a family member
virtual std::string entryprefix(const std::string& member)
{
return m_prefix1 + ":" + member + ":";
}
// The key for the "list of members" entry
virtual std::string memberskey()
{
return m_prefix1 + ";" + "members";
}
Xapian::Database& getdb()
{
return m_rdb;
}
protected:
Xapian::Database m_rdb;
std::string m_prefix1;
};
/** Modify ops for a synonyms family
*
* A method to add a synonym entry inside a given member would make sense,
* but would not be used presently as all these ops go through
* ComputableSynFamMember objects
*/
class XapWritableSynFamily : public XapSynFamily {
public:
/** Construct with Xapian db open for r/w */
XapWritableSynFamily(Xapian::WritableDatabase db, const std::string& pfx)
: XapSynFamily(db, pfx), m_wdb(db)
XapWritableSynFamily(Xapian::WritableDatabase db,
const std::string& familyname)
: XapSynFamily(db, familyname), m_wdb(db)
{
}
@ -95,36 +107,92 @@ public:
/** Add to list of members. Idempotent, does not affect actual expansions */
virtual bool createMember(const std::string& membername);
/** Add expansion list for term inside family member (e.g., inside
* the english member, add expansion for floor -> floors, flooring.. */
virtual bool addSynonyms(const std::string& membername,
const std::string& term,
const std::vector<std::string>& trans);
Xapian::WritableDatabase getdb() {return m_wdb;}
// Need to call setCurrentMemberName before addSynonym !
// We don't check it, for speed
virtual void setCurrentMemberName(const std::string& nm)
protected:
Xapian::WritableDatabase m_wdb;
};
/** A functor which transforms a string.
 *
 * Abstract interface: derived classes implement operator() to compute
 * the transformed value of the input term.
 */
class SynTermTrans {
public:
    /** Virtual destructor: this is a polymorphic base class, objects of
     *  derived classes must be safely destructible through a pointer to
     *  the base. */
    virtual ~SynTermTrans() {}

    /** Return the transformed value of the input string */
    virtual std::string operator()(const std::string&) = 0;
};
/** A member (set of root-synonyms associations) of a SynFamily for
* which the root is computable from the input term.
* The objects use a functor member to compute the term root on input
* (e.g. compute the term stem or casefold it).
*/
class XapComputableSynFamMember {
public:
XapComputableSynFamMember(Xapian::Database xdb, std::string familyname,
std::string membername, SynTermTrans* trans)
: m_family(xdb, familyname), m_membername(membername),
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
{
m_currentPrefix = entryprefix(nm);
}
virtual bool addSynonym(const std::string& term, const std::string& trans)
/** Expand a term to its list of synonyms. If filtertrans is set we
* keep only the results which transform to the same value as the input */
bool synExpand(const std::string& term, std::vector<std::string>& result,
SynTermTrans *filtertrans = 0);
private:
XapSynFamily m_family;
std::string m_membername;
SynTermTrans *m_trans;
std::string m_prefix;
};
/** Computable term root SynFamily member, modify ops */
class XapWritableComputableSynFamMember {
public:
XapWritableComputableSynFamMember(
Xapian::WritableDatabase xdb, std::string familyname,
std::string membername, SynTermTrans* trans)
: m_family(xdb, familyname), m_membername(membername),
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
{
std::string key = m_currentPrefix + term;
}
virtual bool addSynonym(const std::string& term)
{
LOGDEB2(("addSynonym:me %p term [%s] m_trans %p\n", this,
term.c_str(), m_trans));
std::string transformed = (*m_trans)(term);
LOGDEB2(("addSynonym: transformed [%s]\n", transformed.c_str()));
if (transformed == term)
return true;
std::string ermsg;
try {
m_wdb.add_synonym(key, trans);
m_family.getdb().add_synonym(m_prefix + transformed, term);
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("XapSynFamily::addSynonym: xapian error %s\n",
ermsg.c_str()));
LOGERR(("XapWritableComputableSynFamMember::addSynonym: "
"xapian error %s\n", ermsg.c_str()));
return false;
}
return true;
}
protected:
Xapian::WritableDatabase m_wdb;
std::string m_currentPrefix;
void clear()
{
m_family.deleteMember(m_membername);
}
void recreate()
{
clear();
m_family.createMember(m_membername);
}
private:
XapWritableSynFamily m_family;
std::string m_membername;
SynTermTrans *m_trans;
std::string m_prefix;
};
@ -133,11 +201,13 @@ protected:
//
// Stem expansion family prefix. The family member name is the
// language ("all" for Dia and Cse)
// Lowercase accented stem to expansion
static const std::string synFamStem("Stm");
static const std::string synFamDiac("Dia");
static const std::string synFamCase("Cse");
// Lowercase unaccented stem to expansion
static const std::string synFamStemUnac("StU");
// Lowercase unaccented term to case and accent variations
static const std::string synFamDiCa("DCa");
}
#endif /* _SYNFAMILY_H_INCLUDED_ */

View file

@ -35,7 +35,7 @@
# Also reserved: F(parentid), Q(uniqueid)
title = S ; wdfinc = 10
author = A
abstract =
abstract = XS
caption = S
title = S
subject = S

View file

@ -103,7 +103,7 @@ public:
/** Append current utf-8 possibly multi-byte character to string param.
This needs to be fast. No error checking. */
unsigned int appendchartostring(std::string &out) {
unsigned int appendchartostring(std::string &out) const {
#ifdef UTF8ITER_CHECK
assert(m_cl != 0);
#endif