Make Recoll optionally sensitive to case and diacritics
This commit is contained in:
parent
7fcfe27952
commit
166624f7f2
30 changed files with 849 additions and 487 deletions
|
@ -63,26 +63,57 @@ bool unacmaybefold(const string &in, string &out,
|
|||
return true;
|
||||
}
|
||||
|
||||
// Functions to determine upper-case or accented status could be implemented
|
||||
// hugely more efficiently inside the unac c code, but they are only used for
|
||||
// testing user-entered terms, so we don't really care.
|
||||
bool unaciscapital(const string& in)
|
||||
{
|
||||
LOGDEB2(("unaciscapital: [%s]\n", in.c_str()));
|
||||
if (in.empty())
|
||||
return false;
|
||||
Utf8Iter it(in);
|
||||
string shorter;
|
||||
it.appendchartostring(shorter);
|
||||
|
||||
string noacterm, noaclowterm;
|
||||
if (!unacmaybefold(shorter, noacterm, "UTF-8", UNACOP_UNAC)) {
|
||||
LOGINFO(("unaciscapital: unac failed for [%s]\n", in.c_str()));
|
||||
string lower;
|
||||
if (!unacmaybefold(shorter, lower, "UTF-8", UNACOP_FOLD)) {
|
||||
LOGINFO(("unaciscapital: unac/fold failed for [%s]\n", in.c_str()));
|
||||
return false;
|
||||
}
|
||||
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGINFO(("unaciscapital: unacfold failed for [%s]\n", in.c_str()));
|
||||
Utf8Iter it1(lower);
|
||||
if (*it != *it1)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
Utf8Iter it1(noacterm);
|
||||
Utf8Iter it2(noaclowterm);
|
||||
if (*it1 != *it2)
|
||||
}
|
||||
bool unachasuppercase(const string& in)
|
||||
{
|
||||
LOGDEB2(("unachasuppercase: [%s]\n", in.c_str()));
|
||||
if (in.empty())
|
||||
return false;
|
||||
|
||||
string lower;
|
||||
if (!unacmaybefold(in, lower, "UTF-8", UNACOP_FOLD)) {
|
||||
LOGINFO(("unachasuppercase: unac/fold failed for [%s]\n", in.c_str()));
|
||||
return false;
|
||||
}
|
||||
if (lower != in)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
bool unachasaccents(const string& in)
|
||||
{
|
||||
LOGDEB2(("unachasaccents: [%s]\n", in.c_str()));
|
||||
if (in.empty())
|
||||
return false;
|
||||
|
||||
string noac;
|
||||
if (!unacmaybefold(in, noac, "UTF-8", UNACOP_UNAC)) {
|
||||
LOGINFO(("unachasaccents: unac/unac failed for [%s]\n", in.c_str()));
|
||||
return false;
|
||||
}
|
||||
if (noac != in)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
|
@ -107,12 +138,15 @@ static char *thisprog;
|
|||
|
||||
static char usage [] = "\n"
|
||||
"[-c|-C] <encoding> <infile> <outfile>\n"
|
||||
" Default : unaccent\n"
|
||||
" -c : unaccent and casefold\n"
|
||||
" -C : casefold only\n"
|
||||
" Default : unaccent\n"
|
||||
" -c : unaccent and casefold\n"
|
||||
" -C : casefold only\n"
|
||||
"-t <string> test string as capitalized, upper-case anywhere, accents\n"
|
||||
" the parameter is supposedly utf-8 so this can only work in an utf-8\n"
|
||||
" locale\n"
|
||||
"\n";
|
||||
|
||||
;
|
||||
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
|
@ -123,6 +157,7 @@ Usage(void)
|
|||
static int op_flags;
|
||||
#define OPT_c 0x2
|
||||
#define OPT_C 0x4
|
||||
#define OPT_t 0x8
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
|
@ -140,58 +175,73 @@ int main(int argc, char **argv)
|
|||
switch (*(*argv)++) {
|
||||
case 'c': op_flags |= OPT_c; break;
|
||||
case 'C': op_flags |= OPT_C; break;
|
||||
case 't': op_flags |= OPT_t; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
argc--; argv++;
|
||||
}
|
||||
|
||||
if (op_flags & OPT_c) {
|
||||
op = UNACOP_UNACFOLD;
|
||||
} else if (op_flags & OPT_C) {
|
||||
op = UNACOP_FOLD;
|
||||
}
|
||||
|
||||
if (argc != 3) {
|
||||
Usage();
|
||||
}
|
||||
|
||||
const char *encoding = *argv++; argc--;
|
||||
string ifn = *argv++; argc--;
|
||||
if (!ifn.compare("stdin"))
|
||||
ifn.clear();
|
||||
const char *ofn = *argv++; argc--;
|
||||
|
||||
string reason;
|
||||
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
||||
|
||||
string odata;
|
||||
if (!file_to_string(ifn, odata)) {
|
||||
cerr << "file_to_string " << ifn << " : " << odata << endl;
|
||||
return 1;
|
||||
}
|
||||
string ndata;
|
||||
if (!unacmaybefold(odata, ndata, encoding, op)) {
|
||||
cerr << "unac: " << ndata << endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int fd;
|
||||
if (strcmp(ofn, "stdout")) {
|
||||
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
||||
if (op_flags & OPT_t) {
|
||||
if (argc != 1)
|
||||
Usage();
|
||||
string in = *argv++;argc--;
|
||||
bool capital, upper, accent;
|
||||
capital = unaciscapital(in);
|
||||
upper = unachasuppercase(in);
|
||||
accent = unachasaccents(in);
|
||||
cout << "[" << in << "] : " <<
|
||||
"capitalized: " << (capital ? "Yes. " : "No. ") <<
|
||||
"has uppercase: " << (upper ? "Yes. " : "No. ") <<
|
||||
"has accents: " << (accent ? "Yes. " : "No. ") <<
|
||||
endl;
|
||||
return 0;
|
||||
} else {
|
||||
fd = 1;
|
||||
if (argc != 3)
|
||||
Usage();
|
||||
if (op_flags & OPT_c) {
|
||||
op = UNACOP_UNACFOLD;
|
||||
} else if (op_flags & OPT_C) {
|
||||
op = UNACOP_FOLD;
|
||||
}
|
||||
|
||||
const char *encoding = *argv++; argc--;
|
||||
string ifn = *argv++; argc--;
|
||||
if (!ifn.compare("stdin"))
|
||||
ifn.clear();
|
||||
const char *ofn = *argv++; argc--;
|
||||
|
||||
string reason;
|
||||
(void)recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
||||
|
||||
string odata;
|
||||
if (!file_to_string(ifn, odata)) {
|
||||
cerr << "file_to_string " << ifn << " : " << odata << endl;
|
||||
return 1;
|
||||
}
|
||||
string ndata;
|
||||
if (!unacmaybefold(odata, ndata, encoding, op)) {
|
||||
cerr << "unac: " << ndata << endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int fd;
|
||||
if (strcmp(ofn, "stdout")) {
|
||||
fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
|
||||
} else {
|
||||
fd = 1;
|
||||
}
|
||||
if (fd < 0) {
|
||||
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
||||
<< endl;
|
||||
return 1;
|
||||
}
|
||||
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
||||
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
||||
return 1;
|
||||
}
|
||||
close(fd);
|
||||
return 0;
|
||||
}
|
||||
if (fd < 0) {
|
||||
cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
|
||||
<< endl;
|
||||
return 1;
|
||||
}
|
||||
if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
|
||||
cerr << "Write(2) failed: " << strerror(errno) << endl;
|
||||
return 1;
|
||||
}
|
||||
close(fd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -24,11 +24,17 @@ using std::string;
|
|||
#endif /* NO_NAMESPACES */
|
||||
|
||||
// A small stringified wrapper for unac.c
|
||||
enum UnacOp {UNACOP_UNAC, UNACOP_UNACFOLD, UNACOP_FOLD};
|
||||
enum UnacOp {UNACOP_UNAC = 1, UNACOP_FOLD = 2, UNACOP_UNACFOLD = 3};
|
||||
extern bool unacmaybefold(const string& in, string& out,
|
||||
const char *encoding, UnacOp what);
|
||||
|
||||
// Utility function to determine if string begins with capital
|
||||
extern bool unaciscapital(const string& in);
|
||||
// Utility function to determine if string has upper-case anywhere
|
||||
extern bool unachasuppercase(const string& in);
|
||||
// Utility function to determine if any character is accented. This
|
||||
// appropriately ignores the characters from unac_except_chars which
|
||||
// are really separate letters
|
||||
extern bool unachasaccents(const string& in);
|
||||
|
||||
#endif /* _UNACPP_H_INCLUDED_ */
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
#ifndef TEST_SUBTREELIST
|
||||
|
||||
#include "cstr.h"
|
||||
#include "refcntr.h"
|
||||
#include "rcldb.h"
|
||||
#include "searchdata.h"
|
||||
|
@ -35,7 +36,7 @@ bool subtreelist(RclConfig *config, const string& top,
|
|||
return false;
|
||||
}
|
||||
|
||||
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR);
|
||||
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, cstr_null);
|
||||
RefCntr<Rcl::SearchData> rq(sd);
|
||||
|
||||
rq->addDirSpec(top);
|
||||
|
|
|
@ -6,8 +6,8 @@ LIBS = librcl.a
|
|||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o expansiondbs.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp expansiondbs.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
|
||||
librcl.a : $(DEPS) $(OBJS)
|
||||
ar ru librcl.a $(OBJS)
|
||||
|
@ -87,6 +87,8 @@ wasastringtoquery.o : ../query/wasastringtoquery.cpp $(depth)/mk/localdefs
|
|||
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
|
||||
wasatorcl.o : ../query/wasatorcl.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
|
||||
expansiondbs.o : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/expansiondbs.cpp
|
||||
rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
|
||||
rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
|
||||
|
@ -278,6 +280,9 @@ wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp $(depth)/mk/localde
|
|||
wasatorcl.dep.stamp : ../query/wasatorcl.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
|
||||
touch wasatorcl.dep.stamp
|
||||
expansiondbs.dep.stamp : ../rcldb/expansiondbs.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/expansiondbs.cpp > expansiondbs.dep
|
||||
touch expansiondbs.dep.stamp
|
||||
rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep
|
||||
touch rcldb.dep.stamp
|
||||
|
@ -405,6 +410,7 @@ include reslistpager.dep
|
|||
include sortseq.dep
|
||||
include wasastringtoquery.dep
|
||||
include wasatorcl.dep
|
||||
include expansiondbs.dep
|
||||
include rcldb.dep
|
||||
include rcldoc.dep
|
||||
include rclquery.dep
|
||||
|
|
|
@ -41,6 +41,7 @@ ${depth}/query/reslistpager.cpp \
|
|||
${depth}/query/sortseq.cpp \
|
||||
${depth}/query/wasastringtoquery.cpp \
|
||||
${depth}/query/wasatorcl.cpp \
|
||||
${depth}/rcldb/expansiondbs.cpp \
|
||||
${depth}/rcldb/rcldb.cpp \
|
||||
${depth}/rcldb/rcldoc.cpp \
|
||||
${depth}/rcldb/rclquery.cpp \
|
||||
|
|
|
@ -93,7 +93,7 @@ SearchData_init(recoll_SearchDataObject *self, PyObject *args, PyObject *kwargs)
|
|||
if (stp && strcasecmp(stp, "or")) {
|
||||
tp = Rcl::SCLT_OR;
|
||||
}
|
||||
self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp));
|
||||
self->sd = RefCntr<Rcl::SearchData>(new Rcl::SearchData(tp, "english"));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -715,18 +715,18 @@ Query_execute(recoll_QueryObject* self, PyObject *args, PyObject *kwargs)
|
|||
PyErr_SetString(PyExc_AttributeError, "query");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// SearchData defaults to stemming in english
|
||||
// Use default for now but need to add way to specify language
|
||||
string reason;
|
||||
Rcl::SearchData *sd = wasaStringToRcl(rclconfig, utf8, reason);
|
||||
Rcl::SearchData *sd = wasaStringToRcl(rclconfig, dostem ? "english" : "",
|
||||
utf8, reason);
|
||||
|
||||
if (!sd) {
|
||||
PyErr_SetString(PyExc_ValueError, reason.c_str());
|
||||
return 0;
|
||||
}
|
||||
|
||||
// SearchData defaults to stemming in english
|
||||
// Use default for now but need to add way to specify language
|
||||
if (!dostem)
|
||||
sd->setStemlang("");
|
||||
RefCntr<Rcl::SearchData> rq(sd);
|
||||
string sf = self->sortfield ? string(self->sortfield) : string("");
|
||||
self->query->setSortBy(sf, self->ascending);
|
||||
|
|
|
@ -356,8 +356,9 @@ size_t AdvSearch::stringToSize(QString qsize)
|
|||
using namespace Rcl;
|
||||
void AdvSearch::runSearch()
|
||||
{
|
||||
string stemLang = prefs.stemlang();
|
||||
RefCntr<SearchData> sdata(new SearchData(conjunctCMB->currentIndex() == 0 ?
|
||||
SCLT_AND : SCLT_OR));
|
||||
SCLT_AND : SCLT_OR, stemLang));
|
||||
bool hasclause = false;
|
||||
|
||||
for (list<SearchClauseW*>::iterator it = m_clauseWins.begin();
|
||||
|
|
|
@ -372,6 +372,18 @@ void rwSettings(bool writing)
|
|||
}
|
||||
}
|
||||
|
||||
// Compute the stemming language string to use for queries.
//
// Starts from the queryStemLang preference. When that value is the literal
// "ALL", it is replaced by the "indexstemminglanguages" configuration
// parameter if a configuration object is available, and by an empty string
// otherwise. Any other preference value is returned unchanged.
string PrefsPack::stemlang()
{
    string lang = (const char *)prefs.queryStemLang.toAscii();

    // Anything but the "ALL" marker is used as is.
    if (lang != "ALL")
        return lang;

    // No configuration to consult: return an empty language string.
    if (!theconfig)
        return string();

    theconfig->getConfParam("indexstemminglanguages", lang);
    return lang;
}
|
||||
|
||||
QString myGetFileName(bool isdir, QString caption, bool filenosave)
|
||||
{
|
||||
LOGDEB1(("myFileDialog: isdir %d\n", isdir));
|
||||
|
|
|
@ -120,6 +120,8 @@ class PrefsPack {
|
|||
// Default paragraph format for result list
|
||||
static const char *dfltResListFormat;
|
||||
|
||||
std::string stemlang();
|
||||
|
||||
PrefsPack() :
|
||||
respagesize(8),
|
||||
reslistfontsize(10),
|
||||
|
|
|
@ -756,12 +756,6 @@ void RclMain::startSearch(RefCntr<Rcl::SearchData> sdata)
|
|||
return;
|
||||
}
|
||||
|
||||
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
||||
if (stemLang == "ALL") {
|
||||
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
||||
}
|
||||
sdata->setStemlang(stemLang);
|
||||
|
||||
Rcl::Query *query = new Rcl::Query(rcldb);
|
||||
query->setCollapseDuplicates(prefs.collapseDuplicates);
|
||||
|
||||
|
@ -1073,9 +1067,7 @@ void RclMain::showActiveTypes()
|
|||
// Get list of all mime types in index. For this, we use a
|
||||
// wildcard field search on mtype
|
||||
Rcl::TermMatchResult matches;
|
||||
string prefix;
|
||||
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype",
|
||||
&prefix)) {
|
||||
if (!rcldb->termMatch(Rcl::Db::ET_WILD, "", "*", matches, -1, "mtype")) {
|
||||
QMessageBox::warning(0, tr("Error"),
|
||||
tr("Index query error"),
|
||||
QMessageBox::Ok,
|
||||
|
@ -1088,7 +1080,7 @@ void RclMain::showActiveTypes()
|
|||
for (vector<Rcl::TermMatchEntry>::const_iterator it =
|
||||
matches.entries.begin();
|
||||
it != matches.entries.end(); it++) {
|
||||
mtypesfromdb.insert(it->term.substr(prefix.size()));
|
||||
mtypesfromdb.insert(it->term.substr(matches.prefix.size()));
|
||||
}
|
||||
|
||||
// All types listed in mimeconf:
|
||||
|
@ -1771,7 +1763,7 @@ void RclMain::showDocHistory()
|
|||
}
|
||||
// Construct a bogus SearchData structure
|
||||
RefCntr<Rcl::SearchData>searchdata =
|
||||
RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
|
||||
RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND, cstr_null));
|
||||
searchdata->setDescription((const char *)tr("History data").toUtf8());
|
||||
|
||||
|
||||
|
|
|
@ -126,23 +126,25 @@ void SSearch::startSimpleSearch()
|
|||
if (u8.length() == 0)
|
||||
return;
|
||||
|
||||
string stemlang = prefs.stemlang();
|
||||
|
||||
SSearchType tp = (SSearchType)searchTypCMB->currentIndex();
|
||||
Rcl::SearchData *sdata = 0;
|
||||
|
||||
if (tp == SST_LANG) {
|
||||
string reason;
|
||||
if (prefs.autoSuffsEnable)
|
||||
sdata = wasaStringToRcl(theconfig, u8, reason,
|
||||
sdata = wasaStringToRcl(theconfig, stemlang, u8, reason,
|
||||
(const char *)prefs.autoSuffs.toUtf8());
|
||||
else
|
||||
sdata = wasaStringToRcl(theconfig, u8, reason);
|
||||
sdata = wasaStringToRcl(theconfig, stemlang, u8, reason);
|
||||
if (sdata == 0) {
|
||||
QMessageBox::warning(0, "Recoll", tr("Bad query string") + ": " +
|
||||
QString::fromAscii(reason.c_str()));
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
sdata = new Rcl::SearchData(Rcl::SCLT_OR);
|
||||
sdata = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
|
||||
if (sdata == 0) {
|
||||
QMessageBox::warning(0, "Recoll", tr("Out of memory"));
|
||||
return;
|
||||
|
@ -166,11 +168,6 @@ void SSearch::startSimpleSearch()
|
|||
}
|
||||
|
||||
if (prefs.ssearchAutoPhrase && rcldb) {
|
||||
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
||||
if (stemLang == "ALL") {
|
||||
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
||||
}
|
||||
sdata->setStemlang(stemLang);
|
||||
sdata->maybeAddAutoPhrase(*rcldb,
|
||||
prefs.ssearchAutoPhraseThreshPC / 100.0);
|
||||
}
|
||||
|
@ -277,10 +274,9 @@ void SSearch::completion()
|
|||
// Query database
|
||||
const int max = 100;
|
||||
Rcl::TermMatchResult tmres;
|
||||
string stemLang = (const char *)prefs.queryStemLang.toAscii();
|
||||
if (stemLang == "ALL") {
|
||||
theconfig->getConfParam("indexstemminglanguages", stemLang);
|
||||
}
|
||||
|
||||
string stemLang = prefs.stemlang();
|
||||
|
||||
if (!rcldb->termMatch(Rcl::Db::ET_WILD, stemLang, s, tmres, max) ||
|
||||
tmres.entries.size() == 0) {
|
||||
QApplication::beep();
|
||||
|
|
|
@ -120,7 +120,8 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
|
|||
LOGDEB(("DocSequenceDb::setFiltSpec\n"));
|
||||
if (fs.isNotNull()) {
|
||||
// We build a search spec by adding a filtering layer to the base one.
|
||||
m_fsdata = RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
|
||||
m_fsdata = RefCntr<Rcl::SearchData>(
|
||||
new Rcl::SearchData(Rcl::SCLT_AND, m_sdata->getStemLang()));
|
||||
Rcl::SearchDataClauseSub *cl =
|
||||
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata);
|
||||
m_fsdata->addClause(cl);
|
||||
|
@ -138,6 +139,7 @@ bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
|
|||
string reason;
|
||||
Rcl::SearchData *sd =
|
||||
wasaStringToRcl(m_q->whatDb()->getConf(),
|
||||
m_sdata->getStemLang(),
|
||||
fs.values[i], reason);
|
||||
if (sd) {
|
||||
Rcl::SearchDataClauseSub *cl1 =
|
||||
|
|
|
@ -50,7 +50,10 @@ static string vecStringToString(const vector<string>& t)
|
|||
}
|
||||
|
||||
struct MatchEntry {
|
||||
// Start/End byte offsets in the document text
|
||||
pair<int, int> offs;
|
||||
// Index of the search group this comes from: this is to relate a
|
||||
// match to the original user input.
|
||||
unsigned int grpidx;
|
||||
MatchEntry(int sta, int sto, unsigned int idx)
|
||||
: offs(sta, sto), grpidx(idx)
|
||||
|
@ -76,11 +79,23 @@ class TextSplitPTR : public TextSplit {
|
|||
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
||||
vit != hdata.groups.end(); vit++) {
|
||||
if (vit->size() == 1) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
m_terms[vit->front()] = vit - hdata.groups.begin();
|
||||
#else
|
||||
string dumb = vit->front();
|
||||
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||
m_terms[dumb] = vit - hdata.groups.begin();
|
||||
#endif
|
||||
} else if (vit->size() > 1) {
|
||||
for (vector<string>::const_iterator it = vit->begin();
|
||||
it != vit->end(); it++) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
m_gterms.insert(*it);
|
||||
#else
|
||||
string dumb = *it;
|
||||
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||
m_gterms.insert(dumb);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -286,7 +286,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
|
|||
Rcl::SearchData *sd = 0;
|
||||
|
||||
if (op_flags & (OPT_a|OPT_o|OPT_f)) {
|
||||
sd = new Rcl::SearchData(Rcl::SCLT_OR);
|
||||
sd = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
|
||||
Rcl::SearchDataClause *clp = 0;
|
||||
if (op_flags & OPT_f) {
|
||||
clp = new Rcl::SearchDataClauseFilename(qs);
|
||||
|
@ -305,14 +305,13 @@ int recollq(RclConfig **cfp, int argc, char **argv)
|
|||
if (sd)
|
||||
sd->addClause(clp);
|
||||
} else {
|
||||
sd = wasaStringToRcl(rclconfig, qs, reason);
|
||||
sd = wasaStringToRcl(rclconfig, stemlang, qs, reason);
|
||||
}
|
||||
|
||||
if (!sd) {
|
||||
cerr << "Query string interpretation failed: " << reason << endl;
|
||||
return 1;
|
||||
}
|
||||
sd->setStemlang(stemlang);
|
||||
|
||||
RefCntr<Rcl::SearchData> rq(sd);
|
||||
Rcl::Query query(&rcldb);
|
||||
|
|
|
@ -32,7 +32,9 @@ using std::list;
|
|||
#include "refcntr.h"
|
||||
#include "textsplit.h"
|
||||
|
||||
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
||||
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config,
|
||||
const string& stemlang,
|
||||
WasaQuery *wasa,
|
||||
const string& autosuffs, string& reason)
|
||||
{
|
||||
if (wasa == 0) {
|
||||
|
@ -47,7 +49,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||
|
||||
Rcl::SearchData *sdata = new
|
||||
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
|
||||
Rcl::SCLT_OR);
|
||||
Rcl::SCLT_OR, stemlang);
|
||||
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
|
||||
"AND" : "OR"));
|
||||
|
||||
|
@ -250,7 +252,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
|
||||
// Create a subquery.
|
||||
Rcl::SearchData *sub =
|
||||
wasaQueryToRcl(config, *it, autosuffs, reason);
|
||||
wasaQueryToRcl(config, stemlang, *it, autosuffs, reason);
|
||||
if (sub == 0) {
|
||||
continue;
|
||||
}
|
||||
|
@ -278,7 +280,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
|
|||
return sdata;
|
||||
}
|
||||
|
||||
Rcl::SearchData *wasaStringToRcl(RclConfig *config,
|
||||
Rcl::SearchData *wasaStringToRcl(RclConfig *config, const string& stemlang,
|
||||
const string &qs, string &reason,
|
||||
const string& autosuffs)
|
||||
{
|
||||
|
@ -286,5 +288,5 @@ Rcl::SearchData *wasaStringToRcl(RclConfig *config,
|
|||
WasaQuery *wq = parser.stringToQuery(qs, reason);
|
||||
if (wq == 0)
|
||||
return 0;
|
||||
return wasaQueryToRcl(config, wq, autosuffs, reason);
|
||||
return wasaQueryToRcl(config, stemlang, wq, autosuffs, reason);
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ using std::string;
|
|||
|
||||
class RclConfig;
|
||||
|
||||
extern Rcl::SearchData *wasaStringToRcl(RclConfig *,
|
||||
extern Rcl::SearchData *wasaStringToRcl(RclConfig *, const string& stemlang,
|
||||
const string& query, string &reason,
|
||||
const string& autosuffs = string());
|
||||
#endif /* _WASATORCL_H_INCLUDED_ */
|
||||
|
|
|
@ -14,6 +14,9 @@
|
|||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <signal.h>
|
||||
|
@ -36,7 +39,8 @@ using namespace std;
|
|||
static string thisprog;
|
||||
|
||||
static string usage =
|
||||
" -d <dbdir> -e <output encoding>\n"
|
||||
" -d <dbdir> \n"
|
||||
"-e <output encoding>\n"
|
||||
" -i docid -D : get document data for docid\n"
|
||||
" -i docid -X : delete document docid\n"
|
||||
" -i docid -b : 'rebuild' document from term positions\n"
|
||||
|
@ -112,6 +116,15 @@ static void sigcleanup(int sig)
|
|||
exit(1);
|
||||
}
|
||||
|
||||
// Tell whether an index term carries a prefix.
//
// With RCL_INDEX_STRIPCHARS defined, a term counts as prefixed when its
// first byte is an ASCII upper-case letter ('A'..'Z'). Otherwise a term
// counts as prefixed when it begins with ':'. An empty term is never prefixed.
inline bool has_prefix(const string& trm)
{
    if (trm.empty())
        return false;

    const char first = trm[0];
#ifdef RCL_INDEX_STRIPCHARS
    return first >= 'A' && first <= 'Z';
#else
    return first == ':';
#endif
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
string dbdir = path_cat(path_home(), ".recoll/xapiandb");
|
||||
|
@ -201,8 +214,7 @@ int main(int argc, char **argv)
|
|||
for (term = db->termlist_begin(docid);
|
||||
term != db->termlist_end(docid);term++) {
|
||||
const string& s = *term;
|
||||
if ((op_flags&OPT_l) &&
|
||||
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
|
||||
if ((op_flags&OPT_l) && has_prefix(s))
|
||||
continue;
|
||||
cout << op << detailstring(s) << cl << endl;
|
||||
}
|
||||
|
@ -210,8 +222,7 @@ int main(int argc, char **argv)
|
|||
for (term = db->allterms_begin();
|
||||
term != db->allterms_end();term++) {
|
||||
const string& s = *term;
|
||||
if ((op_flags&OPT_l) &&
|
||||
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
|
||||
if ((op_flags&OPT_l) && has_prefix(s))
|
||||
continue;
|
||||
if (op_flags & OPT_f)
|
||||
cout << db->get_collection_freq(*term) << " "
|
||||
|
|
|
@ -72,7 +72,7 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||
|
||||
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
||||
XapWritableComputableSynFamMember
|
||||
diacasedb(wdb, synFamDiac, "all", &transunac);
|
||||
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
||||
diacasedb.recreate();
|
||||
#endif
|
||||
|
||||
|
|
|
@ -14,6 +14,8 @@
|
|||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <cstring>
|
||||
#include <unistd.h>
|
||||
|
@ -53,6 +55,7 @@ using namespace std;
|
|||
#include "cancelcheck.h"
|
||||
#include "ptmutex.h"
|
||||
#include "termproc.h"
|
||||
#include "expansiondbs.h"
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(A,B) (A>B?A:B)
|
||||
|
@ -84,9 +87,15 @@ static const string xapday_prefix = "D";
|
|||
static const string xapmonth_prefix = "M";
|
||||
static const string xapyear_prefix = "Y";
|
||||
const string pathelt_prefix = "XP";
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
const string start_of_field_term = "XXST";
|
||||
const string end_of_field_term = "XXND";
|
||||
static const string page_break_term = "XXPG";
|
||||
#else
|
||||
const string start_of_field_term = "XXST/";
|
||||
const string end_of_field_term = "XXND/";
|
||||
static const string page_break_term = "XXPG/";
|
||||
#endif
|
||||
// Field name for the unsplit file name. Has to exist in the field file
|
||||
// because of usage in termmatch()
|
||||
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||
|
@ -197,7 +206,7 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
|
|||
{
|
||||
for (vector<string>::const_iterator qit = in.begin();
|
||||
qit != in.end(); qit++) {
|
||||
if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
|
||||
if (!has_prefix(*qit))
|
||||
out.push_back(*qit);
|
||||
}
|
||||
}
|
||||
|
@ -573,7 +582,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||
for (term = xrdb.termlist_begin(docid);
|
||||
term != xrdb.termlist_end(docid); term++) {
|
||||
// Ignore prefixed terms
|
||||
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
|
||||
if (has_prefix(*term))
|
||||
continue;
|
||||
if (cutoff-- < 0) {
|
||||
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
||||
|
@ -652,7 +661,9 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||
vabs.push_back(chunk);
|
||||
chunk.clear();
|
||||
} else {
|
||||
chunk += it->second;
|
||||
if (it->second.compare(end_of_field_term) &&
|
||||
it->second.compare(start_of_field_term))
|
||||
chunk += it->second;
|
||||
}
|
||||
}
|
||||
if (!chunk.empty())
|
||||
|
@ -874,11 +885,13 @@ int Db::termDocCnt(const string& _term)
|
|||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return -1;
|
||||
|
||||
string term;
|
||||
string term = _term;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (m_stops.isStop(term)) {
|
||||
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
||||
|
@ -994,8 +1007,19 @@ class TextSplitDb : public TextSplitP {
|
|||
{}
|
||||
// Reimplement text_to_words to add start and end special terms
|
||||
virtual bool text_to_words(const string &in);
|
||||
void setprefix(const string& pref) {prefix = pref;}
|
||||
void setwdfinc(int i) {wdfinc = i;}
|
||||
|
||||
void setprefix(const string& pref)
|
||||
{
|
||||
if (pref.empty())
|
||||
prefix.clear();
|
||||
else
|
||||
prefix = wrap_prefix(pref);
|
||||
}
|
||||
|
||||
void setwdfinc(int i)
|
||||
{
|
||||
wdfinc = i;
|
||||
}
|
||||
|
||||
friend class TermProcIdx;
|
||||
|
||||
|
@ -1127,11 +1151,13 @@ string Db::getSpellingSuggestion(const string& word)
|
|||
{
|
||||
if (m_ndb == 0)
|
||||
return string();
|
||||
string term;
|
||||
string term = word;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
||||
return string();
|
||||
}
|
||||
#endif
|
||||
if (!isSpellingCandidate(term))
|
||||
return string();
|
||||
return m_ndb->xrdb.get_spelling_suggestion(term);
|
||||
|
@ -1239,8 +1265,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||
TermProcIdx tpidx;
|
||||
TermProc *nxt = &tpidx;
|
||||
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
||||
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
||||
#endif
|
||||
|
||||
TextSplitDb splitter(newdocument, nxt);
|
||||
tpidx.setTSD(&splitter);
|
||||
|
@ -1266,7 +1294,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||
vector<string> vpath;
|
||||
stringToTokens(path, vpath, "/");
|
||||
splitter.curpos = 0;
|
||||
newdocument.add_posting(pathelt_prefix,
|
||||
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
||||
splitter.basepos + splitter.curpos++);
|
||||
for (vector<string>::iterator it = vpath.begin();
|
||||
it != vpath.end(); it++){
|
||||
|
@ -1274,7 +1302,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||
// Just truncate it. May still be useful because of wildcards
|
||||
*it = it->substr(0, 230);
|
||||
}
|
||||
newdocument.add_posting(pathelt_prefix + *it,
|
||||
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
||||
splitter.basepos + splitter.curpos++);
|
||||
}
|
||||
}
|
||||
|
@ -1319,7 +1347,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||
|
||||
////// Special terms for other metadata. No positions for these.
|
||||
// Mime type
|
||||
newdocument.add_term(mimetype_prefix + doc.mimetype);
|
||||
newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
||||
|
||||
// Simple file name indexed unsplit for specific "file name"
|
||||
// searches. This is not the same as a filename: clause inside the
|
||||
|
@ -1335,9 +1363,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||
utf8truncate(fn, 230);
|
||||
string::size_type pos = fn.rfind('.');
|
||||
if (pos != string::npos && pos != fn.length() - 1) {
|
||||
newdocument.add_term(fileext_prefix + fn.substr(pos + 1));
|
||||
newdocument.add_term(wrap_prefix(fileext_prefix) +
|
||||
fn.substr(pos + 1));
|
||||
}
|
||||
newdocument.add_term(unsplitfilename_prefix + fn);
|
||||
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1356,12 +1385,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||
struct tm *tm = localtime(&mtime);
|
||||
char buf[9];
|
||||
snprintf(buf, 9, "%04d%02d%02d",
|
||||
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
||||
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
|
||||
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
||||
// Date (YYYYMMDD)
|
||||
newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf));
|
||||
// Month (YYYYMM)
|
||||
buf[6] = '\0';
|
||||
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
|
||||
newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
|
||||
// Year (YYYY)
|
||||
buf[4] = '\0';
|
||||
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
|
||||
newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf));
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
|
@ -1834,7 +1866,7 @@ bool Db::maxYearSpan(int *minyear, int *maxyear)
|
|||
*minyear = 1000000;
|
||||
*maxyear = -1000000;
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear"))
|
||||
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
it != result.entries.end(); it++) {
|
||||
|
@ -1899,30 +1931,32 @@ const string cstr_wildSpecChars = "*?[";
|
|||
const string cstr_regSpecChars = "(.[{";
|
||||
|
||||
// Find all index terms that match a wildcard or regular expression
|
||||
// If field is set, we return a list of appropriately prefixed terms (which
|
||||
// are going to be used to build a Xapian query).
|
||||
bool Db::termMatch(MatchType typ, const string &lang,
|
||||
const string &root,
|
||||
TermMatchResult& res,
|
||||
int max,
|
||||
const string& field,
|
||||
string *prefixp
|
||||
)
|
||||
const string& field)
|
||||
{
|
||||
if (!m_ndb || !m_ndb->m_isopen)
|
||||
return false;
|
||||
Xapian::Database xdb = m_ndb->xdb();
|
||||
|
||||
res.clear();
|
||||
XAPTRY(res.dbdoccount = xdb.get_doccount();
|
||||
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
|
||||
if (!m_reason.empty())
|
||||
return false;
|
||||
|
||||
// Get rid of capitals and accents
|
||||
string droot;
|
||||
|
||||
string droot = root;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
||||
|
||||
string prefix;
|
||||
|
@ -1932,17 +1966,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
||||
field.c_str()));
|
||||
} else {
|
||||
prefix = ftp->pfx;
|
||||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
if (prefixp)
|
||||
*prefixp = prefix;
|
||||
}
|
||||
res.prefix = prefix;
|
||||
|
||||
if (typ == ET_STEM) {
|
||||
if (!stemExpand(lang, root, res, max))
|
||||
return false;
|
||||
sort(res.entries.begin(), res.entries.end());
|
||||
unique(res.entries.begin(), res.entries.end());
|
||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
||||
|
@ -2032,7 +2063,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
TermMatchCmpByTerm tcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), tcmp);
|
||||
TermMatchTermEqual teq;
|
||||
unique(res.entries.begin(), res.entries.end(), teq);
|
||||
vector<TermMatchEntry>::iterator uit =
|
||||
unique(res.entries.begin(), res.entries.end(), teq);
|
||||
res.entries.resize(uit - res.entries.begin());
|
||||
TermMatchCmpByWcf wcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||
if (max > 0) {
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
@ -73,21 +75,50 @@ class Query;
|
|||
/** Used for returning result lists for index terms matching some criteria */
|
||||
class TermMatchEntry {
|
||||
public:
|
||||
TermMatchEntry() : wcf(0) {}
|
||||
TermMatchEntry(const string&t, int f, int d) : term(t), wcf(f), docs(d) {}
|
||||
TermMatchEntry(const string&t) : term(t), wcf(0) {}
|
||||
bool operator==(const TermMatchEntry &o) const { return term == o.term;}
|
||||
bool operator<(const TermMatchEntry &o) const { return term < o.term;}
|
||||
TermMatchEntry()
|
||||
: wcf(0)
|
||||
{
|
||||
}
|
||||
TermMatchEntry(const string& t, int f, int d)
|
||||
: term(t), wcf(f), docs(d)
|
||||
{
|
||||
}
|
||||
TermMatchEntry(const string& t)
|
||||
: term(t), wcf(0)
|
||||
{
|
||||
}
|
||||
bool operator==(const TermMatchEntry &o) const
|
||||
{
|
||||
return term == o.term;
|
||||
}
|
||||
bool operator<(const TermMatchEntry &o) const
|
||||
{
|
||||
return term < o.term;
|
||||
}
|
||||
|
||||
string term;
|
||||
int wcf; // Total count of occurrences within collection.
|
||||
int docs; // Number of documents containing term.
|
||||
};
|
||||
|
||||
/** Term match result list header: statistics and global info */
|
||||
class TermMatchResult {
|
||||
public:
|
||||
TermMatchResult() {clear();}
|
||||
void clear() {entries.clear(); dbdoccount = 0; dbavgdoclen = 0;}
|
||||
TermMatchResult()
|
||||
{
|
||||
clear();
|
||||
}
|
||||
void clear()
|
||||
{
|
||||
entries.clear();
|
||||
dbdoccount = 0;
|
||||
dbavgdoclen = 0;
|
||||
}
|
||||
// Term expansion
|
||||
vector<TermMatchEntry> entries;
|
||||
// If a field was specified, this is the corresponding index prefix
|
||||
string prefix;
|
||||
// Index-wide stats
|
||||
unsigned int dbdoccount;
|
||||
double dbavgdoclen;
|
||||
};
|
||||
|
@ -95,6 +126,24 @@ public:
|
|||
#ifdef IDX_THREADS
|
||||
extern void *DbUpdWorker(void*);
|
||||
#endif // IDX_THREADS
|
||||
|
||||
// Tell whether a term starts with a prefix marker.
// When RCL_INDEX_STRIPCHARS is defined, a prefixed term is one whose first
// byte is an ASCII uppercase letter ('A'..'Z'). Otherwise, a prefixed term
// is one whose first byte is ':'.
// An empty term is never considered prefixed, so trm[0] is only read on a
// non-empty string.
inline bool has_prefix(const string& trm)
|
||||
{
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
|
||||
#else
|
||||
return !trm.empty() && trm[0] == ':';
|
||||
#endif
|
||||
}
|
||||
// Turn a bare prefix into the form used at the start of a term.
// When RCL_INDEX_STRIPCHARS is defined the prefix is returned unchanged.
// Otherwise it is returned with cstr_colon prepended and appended.
// Note that in the second configuration an empty pfx still yields the two
// delimiters, so callers wanting "no prefix" must not call this on an empty
// string.
// NOTE(review): cstr_colon is presumably ":" (has_prefix() tests for a
// leading ':' in the same configuration) -- confirm in cstr.h.
inline string wrap_prefix(const string& pfx)
|
||||
{
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
return pfx;
|
||||
#else
|
||||
return cstr_colon + pfx + cstr_colon;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper class for the native database.
|
||||
*/
|
||||
|
@ -132,6 +181,8 @@ class Db {
|
|||
{
|
||||
if (term.empty() || term.length() > 50)
|
||||
return false;
|
||||
if (has_prefix(term))
|
||||
return false;
|
||||
if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
|
||||
!= string::npos)
|
||||
return false;
|
||||
|
@ -205,12 +256,23 @@ class Db {
|
|||
|
||||
/** Return the index terms that match the input string
|
||||
* Expansion is performed with either wildcard or regexp processing
|
||||
* Stem expansion is performed if lang is not empty */
|
||||
* Stem expansion is performed if lang is not empty
|
||||
*
|
||||
* @param typ defines the kind of expansion: wildcard, regexp or stemming
|
||||
* @param lang sets the stemming language(s). Can be a space-separated list
|
||||
* @param term is the term to expand
|
||||
* @param result is the main output
|
||||
* @param max defines the maximum result count
|
||||
* @param field if set, defines the field within which the expansion should
|
||||
* be performed. Only used for wildcards and regexps, stemming is
|
||||
* always global. If this is set, the resulting output terms
|
||||
* will be appropriately prefixed and the prefix value will be set
|
||||
* in the TermMatchResult header
|
||||
*/
|
||||
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
|
||||
bool termMatch(MatchType typ, const string &lang, const string &s,
|
||||
bool termMatch(MatchType typ, const string &lang, const string &term,
|
||||
TermMatchResult& result, int max = -1,
|
||||
const string& field = cstr_null,
|
||||
string *prefix = 0
|
||||
const string& field = cstr_null
|
||||
);
|
||||
/** Return min and max years for doc mod times in db */
|
||||
bool maxYearSpan(int *minyear, int *maxyear);
|
||||
|
|
|
@ -18,12 +18,17 @@
|
|||
#ifndef _rcldb_p_h_included_
|
||||
#define _rcldb_p_h_included_
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
#include <xapian.h>
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
#include "workqueue.h"
|
||||
#include "debuglog.h"
|
||||
#endif // IDX_THREADS
|
||||
#include "xapian.h"
|
||||
|
||||
#include "xmacros.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
|
|
@ -446,7 +446,7 @@ vector<string> Query::expand(const Doc &doc)
|
|||
for (Xapian::ESetIterator it = eset.begin();
|
||||
it != eset.end(); it++) {
|
||||
LOGDEB((" [%s]\n", (*it).c_str()));
|
||||
if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z'))
|
||||
if ((*it).empty() || has_prefix(*it))
|
||||
continue;
|
||||
res.push_back(*it);
|
||||
if (res.size() >= 10)
|
||||
|
|
|
@ -16,17 +16,22 @@
|
|||
*/
|
||||
|
||||
// Handle translation from rcl's SearchData structures to Xapian Queries
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <fnmatch.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
#include "xapian.h"
|
||||
|
||||
#include "cstr.h"
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "searchdata.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
|
@ -36,11 +41,11 @@
|
|||
#include "stoplist.h"
|
||||
#include "rclconfig.h"
|
||||
#include "termproc.h"
|
||||
#include "synfamily.h"
|
||||
#include "stemdb.h"
|
||||
#include "expansiondbs.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
namespace Rcl {
|
||||
#endif
|
||||
|
||||
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
||||
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
||||
|
@ -71,13 +76,23 @@ static const int original_term_wqf_booster = 10;
|
|||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
|
||||
// Helpers for building prefixed terms in a plain char buffer.
// bufprefix(BUF, L) writes the prefix for the single prefix letter L at the
// start of BUF: the bare letter when RCL_INDEX_STRIPCHARS is defined, else
// the three characters ':' L ':'.
// bpoffs is the number of characters that prefix occupies (1 or 3), i.e. the
// offset in BUF at which the data following the prefix starts.
// bufprefix does not write a terminating NUL.
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#define bufprefix(BUF, L) {(BUF)[0] = L;}
|
||||
#define bpoffs 1
|
||||
#else
|
||||
#define bufprefix(BUF, L) {(BUF)[0] = ':'; (BUF)[1] = L; (BUF)[2] = ':';}
|
||||
#define bpoffs 3
|
||||
#endif
|
||||
|
||||
static Xapian::Query
|
||||
date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
||||
{
|
||||
// Xapian uses a smallbuf and snprintf. Can't be bothered, we're
|
||||
// only doing %d's !
|
||||
char buf[200];
|
||||
sprintf(buf, "D%04d%02d", y1, m1);
|
||||
bufprefix(buf, 'D');
|
||||
sprintf(buf+bpoffs, "%04d%02d", y1, m1);
|
||||
vector<Xapian::Query> v;
|
||||
|
||||
int d_last = monthdays(m1, y1);
|
||||
|
@ -88,11 +103,11 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
|||
// Deal with any initial partial month
|
||||
if (d1 > 1 || d_end < d_last) {
|
||||
for ( ; d1 <= d_end ; d1++) {
|
||||
sprintf(buf + 7, "%02d", d1);
|
||||
sprintf(buf + 6 + bpoffs, "%02d", d1);
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
} else {
|
||||
buf[0] = 'M';
|
||||
bufprefix(buf, 'M');
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
|
||||
|
@ -102,36 +117,36 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
|||
|
||||
int m_last = (y1 < y2) ? 12 : m2 - 1;
|
||||
while (++m1 <= m_last) {
|
||||
sprintf(buf + 5, "%02d", m1);
|
||||
buf[0] = 'M';
|
||||
sprintf(buf + 4 + bpoffs, "%02d", m1);
|
||||
bufprefix(buf, 'M');
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
|
||||
if (y1 < y2) {
|
||||
while (++y1 < y2) {
|
||||
sprintf(buf + 1, "%04d", y1);
|
||||
buf[0] = 'Y';
|
||||
sprintf(buf + bpoffs, "%04d", y1);
|
||||
bufprefix(buf, 'Y');
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
sprintf(buf + 1, "%04d", y2);
|
||||
buf[0] = 'M';
|
||||
sprintf(buf + bpoffs, "%04d", y2);
|
||||
bufprefix(buf, 'M');
|
||||
for (m1 = 1; m1 < m2; m1++) {
|
||||
sprintf(buf + 5, "%02d", m1);
|
||||
sprintf(buf + 4 + bpoffs, "%02d", m1);
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
}
|
||||
|
||||
sprintf(buf + 5, "%02d", m2);
|
||||
sprintf(buf + 2 + bpoffs, "%02d", m2);
|
||||
|
||||
// Deal with any final partial month
|
||||
if (d2 < monthdays(m2, y2)) {
|
||||
buf[0] = 'D';
|
||||
bufprefix(buf, 'D');
|
||||
for (d1 = 1 ; d1 <= d2; d1++) {
|
||||
sprintf(buf + 7, "%02d", d1);
|
||||
sprintf(buf + 6 + bpoffs, "%02d", d1);
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
} else {
|
||||
buf[0] = 'M';
|
||||
bufprefix(buf, 'M');
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
|
||||
|
@ -172,31 +187,27 @@ bool SearchData::expandFileTypes(RclConfig *cfg, vector<string>& tps)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
||||
vector<SearchDataClause*>& query,
|
||||
string& reason, void *d)
|
||||
{
|
||||
LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n",
|
||||
m_stemlang.c_str()));
|
||||
Xapian::Query xq;
|
||||
m_reason.erase();
|
||||
|
||||
// Walk the clause list translating each in turn and building the
|
||||
// Xapian query tree
|
||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
|
||||
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
|
||||
Xapian::Query nq;
|
||||
if (!(*it)->toNativeQuery(db, &nq, m_stemlang)) {
|
||||
LOGERR(("SearchData::toNativeQuery: failed\n"));
|
||||
m_reason = (*it)->getReason();
|
||||
if (!(*it)->toNativeQuery(db, &nq)) {
|
||||
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n"));
|
||||
reason = (*it)->getReason();
|
||||
return false;
|
||||
}
|
||||
if (nq.empty()) {
|
||||
LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n"));
|
||||
LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
|
||||
continue;
|
||||
}
|
||||
// If this structure is an AND list, must use AND_NOT for excl clauses.
|
||||
// Else this is an OR list, and there can't be excl clauses (checked by
|
||||
// addClause())
|
||||
Xapian::Query::op op;
|
||||
if (m_tp == SCLT_AND) {
|
||||
if (tp == SCLT_AND) {
|
||||
if ((*it)->m_tp == SCLT_EXCL) {
|
||||
op = Xapian::Query::OP_AND_NOT;
|
||||
} else {
|
||||
|
@ -217,6 +228,23 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
|||
if (xq.empty())
|
||||
xq = Xapian::Query::MatchAll;
|
||||
|
||||
*((Xapian::Query *)d) = xq;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||
{
|
||||
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
|
||||
m_reason.erase();
|
||||
|
||||
// Walk the clause list translating each in turn and building the
|
||||
// Xapian query tree
|
||||
Xapian::Query xq;
|
||||
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
|
||||
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_haveDates) {
|
||||
// If one of the extremities is unset, compute db extremas
|
||||
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
|
||||
|
@ -326,10 +354,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
|||
stringToTokens(dit->dir, vpath, "/");
|
||||
vector<string> pvpath;
|
||||
if (dit->dir[0] == '/')
|
||||
pvpath.push_back(pathelt_prefix);
|
||||
pvpath.push_back(wrap_prefix(pathelt_prefix));
|
||||
for (vector<string>::const_iterator pit = vpath.begin();
|
||||
pit != vpath.end(); pit++){
|
||||
pvpath.push_back(pathelt_prefix + *pit);
|
||||
pvpath.push_back(wrap_prefix(pathelt_prefix) + *pit);
|
||||
}
|
||||
Xapian::Query::op tdop;
|
||||
if (dit->weight == 1.0) {
|
||||
|
@ -446,7 +474,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
|||
// My type is AND. Change it to OR and insert two queries, one
|
||||
// being the original query as a subquery, the other the
|
||||
// phrase.
|
||||
SearchData *sd = new SearchData(m_tp);
|
||||
SearchData *sd = new SearchData(m_tp, m_stemlang);
|
||||
sd->m_query = m_query;
|
||||
sd->m_stemlang = m_stemlang;
|
||||
m_tp = SCLT_OR;
|
||||
|
@ -586,25 +614,28 @@ public:
|
|||
{ }
|
||||
|
||||
bool processUserString(const string &iq,
|
||||
int mods,
|
||||
string &ermsg,
|
||||
vector<Xapian::Query> &pqueries,
|
||||
const StopList &stops,
|
||||
int slack = 0, bool useNear = false);
|
||||
private:
|
||||
void expandTerm(bool dont, const string& term, vector<string>& exp,
|
||||
void expandTerm(int mods,
|
||||
const string& term, vector<string>& exp,
|
||||
string& sterm, const string& prefix);
|
||||
// After splitting entry on whitespace: process non-phrase element
|
||||
void processSimpleSpan(const string& span, bool nostemexp,
|
||||
void processSimpleSpan(const string& span,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries);
|
||||
// Process phrase/near element
|
||||
void processPhraseOrNear(TextSplitQ *splitData,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries,
|
||||
bool useNear, int slack, int mods);
|
||||
bool useNear, int slack);
|
||||
|
||||
Db& m_db;
|
||||
const string& m_field;
|
||||
const string& m_stemlang;
|
||||
bool m_doBoostUserTerms;
|
||||
const bool m_doBoostUserTerms;
|
||||
HighlightData& m_hld;
|
||||
};
|
||||
|
||||
|
@ -619,60 +650,187 @@ static void listVector(const string& what, const vector<string>&l)
|
|||
}
|
||||
#endif
|
||||
|
||||
/** Take simple term and expand stem and wildcards
|
||||
/** Expand term into term list, using appropriate mode: stem, wildcards,
|
||||
* diacritics...
|
||||
*
|
||||
* @param nostemexp don't perform stem expansion. This is mainly used to
|
||||
* prevent stem expansion inside phrases (because the user probably
|
||||
* does not expect it). This does NOT prevent wild card expansion.
|
||||
* Other factors than nostemexp can prevent stem expansion:
|
||||
* a null stemlang, resulting from a global user preference, a
|
||||
* capitalized term, or wildcard(s)
|
||||
* @param mods stem expansion, case and diacritics sensitivity control.
|
||||
* @param term input single word
|
||||
* @param exp output expansion list
|
||||
* @param sterm output original input term if there were no wildcards
|
||||
* @param prefix field prefix in index. We could recompute it, but the caller
|
||||
* has it already. Used in the simple case where there is nothing to expand,
|
||||
* and we just return the prefixed term (else Db::termMatch deals with it).
|
||||
*/
|
||||
void StringToXapianQ::expandTerm(bool nostemexp,
|
||||
const string& term,
|
||||
vector<string>& exp,
|
||||
string &sterm, const string& prefix)
|
||||
void StringToXapianQ::expandTerm(int mods,
|
||||
const string& term,
|
||||
vector<string>& exp, string &sterm,
|
||||
const string& prefix)
|
||||
{
|
||||
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
|
||||
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
|
||||
sterm.erase();
|
||||
sterm.clear();
|
||||
exp.clear();
|
||||
if (term.empty()) {
|
||||
if (term.empty())
|
||||
return;
|
||||
}
|
||||
|
||||
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
||||
|
||||
// No stemming if there are wildcards or prevented globally.
|
||||
// If there are no wildcards, add term to the list of user-entered terms
|
||||
if (!haswild)
|
||||
m_hld.uterms.insert(term);
|
||||
|
||||
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
||||
|
||||
// No stem expansion if there are wildcards or if prevented by caller
|
||||
if (haswild || m_stemlang.empty()) {
|
||||
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
||||
nostemexp = true;
|
||||
}
|
||||
|
||||
if (!haswild)
|
||||
m_hld.uterms.insert(term);
|
||||
bool noexpansion = nostemexp && !haswild;
|
||||
|
||||
if (nostemexp && !haswild) {
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
||||
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
||||
|
||||
// If we are working with a raw index, apply the rules for case and
|
||||
// diacritics sensitivity.
|
||||
|
||||
// If any character has a diacritic, we become
|
||||
// diacritic-sensitive. Note that the way that the test is
|
||||
// performed (conversion+comparison) will automatically ignore
|
||||
// accented characters which are actually a separate letter
|
||||
if (unachasaccents(term))
|
||||
diac_sensitive = true;
|
||||
|
||||
// If any character apart from the first is uppercase, we become case-sensitive.
|
||||
// The first character is reserved for turning off stemming. You need to
|
||||
// use a query language modifier to search for Floor in a case-sensitive
|
||||
// way.
|
||||
Utf8Iter it(term);
|
||||
it++;
|
||||
if (unachasuppercase(term.substr(it.getBpos())))
|
||||
case_sensitive = true;
|
||||
|
||||
// If we are sensitive to case or diacritics turn stemming off
|
||||
if (diac_sensitive || case_sensitive)
|
||||
nostemexp = true;
|
||||
|
||||
if (!case_sensitive || !diac_sensitive)
|
||||
noexpansion = false;
|
||||
#endif
|
||||
|
||||
if (noexpansion) {
|
||||
sterm = term;
|
||||
exp.resize(1);
|
||||
exp[0] = prefix + term;
|
||||
exp.push_back(prefix + term);
|
||||
} else {
|
||||
TermMatchResult res;
|
||||
if (haswild) {
|
||||
// Note that if there are wildcards, we do a direct from-index
|
||||
// expansion, which means that we are casediac-sensitive. There
|
||||
// would be nothing to prevent us from expanding from the casediac
|
||||
// synonyms first. To be done later
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
||||
m_field);
|
||||
} else {
|
||||
sterm = term;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
|
||||
m_field);
|
||||
m_field);
|
||||
#else
|
||||
// No stem expansion when diacritic or case sensitivity is
|
||||
// set, it makes no sense (it would mess with the
|
||||
// diacritics anyway if they are not in the stem part).
|
||||
// In these 3 cases, perform appropriate expansion from
|
||||
// the charstripping db, and do a bogus wildcard expansion
|
||||
// (there is no wild card) to generate the result:
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No expansion whatsoever
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
||||
m_field);
|
||||
} else {
|
||||
// Access case and diacritics expansion:
|
||||
vector<string> exp;
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa,
|
||||
"all", &unacfoldtrans);
|
||||
|
||||
if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
// then bogus wildcard expansion for generating result
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, exp, &foldtrans);
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
||||
-1, m_field);
|
||||
}
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case,
|
||||
// then bogus wildcard expansion for generating result
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, exp, &unactrans);
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
||||
-1, m_field);
|
||||
}
|
||||
} else {
|
||||
// Expand for accents and case, then lowercase
|
||||
// result for input to stemdb.
|
||||
synac.synExpand(term, exp);
|
||||
for (unsigned int i = 0; i < exp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(exp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
exp[i] = lower;
|
||||
}
|
||||
sort(exp.begin(), exp.end());
|
||||
vector<string>::iterator uit =
|
||||
unique(exp.begin(), exp.end());
|
||||
exp.resize(uit - exp.begin());
|
||||
LOGDEB(("ExpandTerm: after casediac: %s\n",
|
||||
stringsToString(exp).c_str()));
|
||||
|
||||
StemDb db(m_db.m_ndb->xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
db.stemExpand(m_stemlang, *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpandTerm: after stem: %s\n",
|
||||
stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
exp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, exp);
|
||||
}
|
||||
sort(exp.begin(), exp.end());
|
||||
uit = unique(exp.begin(), exp.end());
|
||||
exp.resize(uit - exp.begin());
|
||||
|
||||
LOGDEB(("ExpandTerm: after case exp of stem: %s\n",
|
||||
stringsToString(exp).c_str()));
|
||||
|
||||
// Bogus wildcard expand to generate the result
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
||||
-1, m_field);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
exp.push_back(it->term);
|
||||
}
|
||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(exp).c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -710,21 +868,22 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
|||
}
|
||||
}
|
||||
|
||||
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
||||
void StringToXapianQ::processSimpleSpan(const string& span,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries)
|
||||
{
|
||||
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
|
||||
span.c_str(), int(nostemexp)));
|
||||
LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] mods %x\n",
|
||||
span.c_str(), (unsigned int)mods));
|
||||
vector<string> exp;
|
||||
string sterm; // dumb version of user term
|
||||
|
||||
string prefix;
|
||||
const FieldTraits *ftp;
|
||||
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
||||
prefix = ftp->pfx;
|
||||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
|
||||
expandTerm(nostemexp, span, exp, sterm, prefix);
|
||||
expandTerm(mods, span, exp, sterm, prefix);
|
||||
|
||||
// Set up the highlight data. No prefix should go in there
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
|
@ -755,8 +914,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
|||
// queries if the terms get expanded by stemming or wildcards (we
|
||||
// don't do stemming for PHRASE though)
|
||||
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
||||
int mods,
|
||||
vector<Xapian::Query> &pqueries,
|
||||
bool useNear, int slack, int mods)
|
||||
bool useNear, int slack)
|
||||
{
|
||||
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
||||
Xapian::Query::OP_PHRASE;
|
||||
|
@ -769,7 +929,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||
string prefix;
|
||||
const FieldTraits *ftp;
|
||||
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
||||
prefix = ftp->pfx;
|
||||
prefix = wrap_prefix(ftp->pfx);
|
||||
}
|
||||
|
||||
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
|
||||
|
@ -790,10 +950,12 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||
|| hadmultiple
|
||||
#endif // single OR inside NEAR
|
||||
;
|
||||
|
||||
int lmods = mods;
|
||||
if (nostemexp)
|
||||
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||
string sterm;
|
||||
vector<string> exp;
|
||||
expandTerm(nostemexp, *it, exp, sterm, prefix);
|
||||
expandTerm(lmods, *it, exp, sterm, prefix);
|
||||
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
||||
listVector("", exp);
|
||||
// groups is used for highlighting, we don't want prefixes in there.
|
||||
|
@ -882,9 +1044,9 @@ static int stringToMods(string& s)
|
|||
* count)
|
||||
*/
|
||||
bool StringToXapianQ::processUserString(const string &iq,
|
||||
int mods,
|
||||
string &ermsg,
|
||||
vector<Xapian::Query> &pqueries,
|
||||
const StopList& stops,
|
||||
int slack,
|
||||
bool useNear
|
||||
)
|
||||
|
@ -892,6 +1054,8 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
|
||||
ermsg.erase();
|
||||
|
||||
const StopList stops = m_db.getStopList();
|
||||
|
||||
// Simple whitespace-split input into user-level words and
|
||||
// double-quoted phrases: word1 word2 "this is a phrase".
|
||||
//
|
||||
|
@ -930,11 +1094,13 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
||||
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
||||
//tpcommon.onlygrams(true);
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
||||
#endif
|
||||
|
||||
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
stops, nxt);
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
stops, nxt);
|
||||
tpq.setTSQ(&splitter);
|
||||
splitter.text_to_words(*it);
|
||||
|
||||
|
@ -944,14 +1110,17 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
switch (splitter.terms.size() + terminc) {
|
||||
case 0:
|
||||
continue;// ??
|
||||
case 1:
|
||||
case 1: {
|
||||
int lmods = mods;
|
||||
if (splitter.nostemexps.front())
|
||||
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||
m_hld.ugroups.push_back(vector<string>(1, *it));
|
||||
processSimpleSpan(splitter.terms.front(),
|
||||
splitter.nostemexps.front(), pqueries);
|
||||
processSimpleSpan(splitter.terms.front(), lmods, pqueries);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
m_hld.ugroups.push_back(vector<string>(1, *it));
|
||||
processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
|
||||
processPhraseOrNear(&splitter, mods, pqueries, useNear, slack);
|
||||
}
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
|
@ -971,13 +1140,10 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
}
|
||||
|
||||
// Translate a simple OR, AND, or EXCL search clause.
|
||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string& stemlang)
|
||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
||||
{
|
||||
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
|
||||
stemlang;
|
||||
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
|
||||
stemlang.c_str()));
|
||||
getStemLang().c_str()));
|
||||
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
@ -1000,8 +1166,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
||||
(m_parentSearch == 0 && !m_haveWildCards);
|
||||
|
||||
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
|
||||
if (!tr.processUserString(m_text, m_reason, pqueries, db.getStopList()))
|
||||
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
|
||||
if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||
|
@ -1024,8 +1190,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||
// about expanding multiple fragments in the past. We just take the
|
||||
// value blanks and all and expand this against the indexed unsplit
|
||||
// file names
|
||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string&)
|
||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
|
||||
{
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
@ -1041,11 +1206,8 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
|||
}
|
||||
|
||||
// Translate NEAR or PHRASE clause.
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string& stemlang)
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
||||
{
|
||||
const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
|
||||
stemlang;
|
||||
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
||||
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
|
@ -1069,8 +1231,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
|||
}
|
||||
string s = cstr_dquote + m_text + cstr_dquote;
|
||||
bool useNear = (m_tp == SCLT_NEAR);
|
||||
StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
|
||||
if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
|
||||
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
|
||||
if (!tr.processUserString(s, getModifiers(), m_reason, pqueries,
|
||||
m_slack, useNear))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
|
|
|
@ -70,9 +70,9 @@ class SearchDataClause;
|
|||
*/
|
||||
class SearchData {
|
||||
public:
|
||||
SearchData(SClType tp)
|
||||
SearchData(SClType tp, const string& stemlang)
|
||||
: m_tp(tp), m_haveDates(false), m_maxSize(size_t(-1)),
|
||||
m_minSize(size_t(-1)), m_haveWildCards(false)
|
||||
m_minSize(size_t(-1)), m_haveWildCards(false), m_stemlang(stemlang)
|
||||
{
|
||||
if (m_tp != SCLT_OR && m_tp != SCLT_AND)
|
||||
m_tp = SCLT_OR;
|
||||
|
@ -91,6 +91,7 @@ public:
|
|||
/** Translate to Xapian query. rcldb knows about the void* */
|
||||
bool toNativeQuery(Rcl::Db &db, void *);
|
||||
|
||||
|
||||
/** We become the owner of cl and will delete it */
|
||||
bool addClause(SearchDataClause *cl);
|
||||
|
||||
|
@ -109,6 +110,8 @@ public:
|
|||
m_dirspecs.push_back(DirSpec(t, excl, w));
|
||||
}
|
||||
|
||||
const std::string& getStemLang() {return m_stemlang;}
|
||||
|
||||
void setMinSize(size_t size) {m_minSize = size;}
|
||||
void setMaxSize(size_t size) {m_maxSize = size;}
|
||||
|
||||
|
@ -120,8 +123,6 @@ public:
|
|||
/** Add file type to not wanted list */
|
||||
void remFiletype(const std::string& ft) {m_nfiletypes.push_back(ft);}
|
||||
|
||||
void setStemlang(const std::string& lang = "english") {m_stemlang = lang;}
|
||||
|
||||
/** Retrieve error description */
|
||||
std::string getReason() {return m_reason;}
|
||||
|
||||
|
@ -170,7 +171,12 @@ private:
|
|||
std::string m_reason;
|
||||
bool m_haveWildCards;
|
||||
std::string m_stemlang;
|
||||
|
||||
bool expandFileTypes(RclConfig *cfg, std::vector<std::string>& exptps);
|
||||
bool clausesToQuery(Rcl::Db &db, SClType tp,
|
||||
std::vector<SearchDataClause*>& query,
|
||||
string& reason, void *d);
|
||||
|
||||
/* Copyconst and assignment private and forbidden */
|
||||
SearchData(const SearchData &) {}
|
||||
SearchData& operator=(const SearchData&) {return *this;};
|
||||
|
@ -186,7 +192,7 @@ public:
|
|||
m_modifiers(SDCM_NONE), m_weight(1.0)
|
||||
{}
|
||||
virtual ~SearchDataClause() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const std::string&) = 0;
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
|
||||
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
|
||||
virtual std::string getReason() const {return m_reason;}
|
||||
virtual void getTerms(HighlightData & hldata) const = 0;
|
||||
|
@ -199,6 +205,11 @@ public:
|
|||
{
|
||||
m_parentSearch = p;
|
||||
}
|
||||
/** Stemming language to use when expanding this clause: cstr_null if
 *  the clause carries the SDCM_NOSTEMMING modifier or has no parent
 *  search, else the language string held by the parent search. */
string getStemLang()
{
    if (m_parentSearch == 0 || (m_modifiers & SDCM_NOSTEMMING))
        return cstr_null;
    return m_parentSearch->getStemLang();
}
|
||||
virtual void setModifiers(Modifier mod)
|
||||
{
|
||||
m_modifiers = mod;
|
||||
|
@ -255,7 +266,7 @@ public:
|
|||
}
|
||||
|
||||
/** Translate to Xapian query */
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
|
||||
virtual void getTerms(HighlightData& hldata) const
|
||||
{
|
||||
|
@ -296,7 +307,7 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -315,7 +326,7 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *, const std::string& stemlang);
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
private:
|
||||
int m_slack;
|
||||
};
|
||||
|
@ -323,17 +334,11 @@ private:
|
|||
/** Subquery */
|
||||
class SearchDataClauseSub : public SearchDataClause {
|
||||
public:
|
||||
// We take charge of the SearchData * and will delete it.
|
||||
SearchDataClauseSub(SClType tp, RefCntr<SearchData> sub)
|
||||
: SearchDataClause(tp), m_sub(sub)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~SearchDataClauseSub()
|
||||
{
|
||||
}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *p, const std::string&)
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *p)
|
||||
{
|
||||
return m_sub->toNativeQuery(db, p);
|
||||
}
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
* Management of the auxiliary databases listing stems and their expansion
|
||||
* terms
|
||||
*/
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
@ -27,13 +30,8 @@
|
|||
#include <xapian.h>
|
||||
|
||||
#include "stemdb.h"
|
||||
#include "pathut.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "utf8iter.h"
|
||||
#include "textsplit.h"
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "synfamily.h"
|
||||
#include "unacpp.h"
|
||||
|
||||
|
@ -43,140 +41,6 @@ using namespace std;
|
|||
|
||||
namespace Rcl {
|
||||
|
||||
// Cheap test used to spot words which are not natural language: flags
// any 7-bit ASCII value which is not a lowercase letter (digits,
// capitals, punctuation, control characters). Values of 128 and above
// are never flagged. islower() is avoided on purpose, as its behaviour
// with 8 bit values is uncertain; a more complete check would need
// full utf-8 decoding.
inline static bool p_notlowerascii(unsigned int c)
{
    const bool belowLowercase = c < 'a';
    const bool asciiAboveLowercase = c > 'z' && c < 128;
    return belowLowercase || asciiAboveLowercase;
}
|
||||
|
||||
/**
|
||||
* Create database of stem to parents associations for a given language.
|
||||
*/
|
||||
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
const vector<string>& langs)
|
||||
{
|
||||
LOGDEB(("StemDb::createExpansionDbs\n"));
|
||||
Chrono cron;
|
||||
|
||||
vector<XapWritableSynFamily> stemdbs;
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
|
||||
stemdbs[i].deleteMember(langs[i]);
|
||||
stemdbs[i].createMember(langs[i]);
|
||||
stemdbs[i].setCurrentMemberName(langs[i]);
|
||||
}
|
||||
|
||||
// We walk the list of all terms, and stem each. We skip terms which
|
||||
// don't look like natural language.
|
||||
// If the stem is not identical to the term, we add a synonym entry.
|
||||
// Statistics
|
||||
int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
|
||||
int stemconst = 0; // Stem == term
|
||||
int allsyns = 0; // Total number of entries created
|
||||
|
||||
string ermsg;
|
||||
try {
|
||||
vector<Xapian::Stem> stemmers;
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
stemmers.push_back(Xapian::Stem(langs[i]));
|
||||
}
|
||||
|
||||
for (Xapian::TermIterator it = wdb.allterms_begin();
|
||||
it != wdb.allterms_end(); it++) {
|
||||
// If the term has any non-lowercase 7bit char (that is,
|
||||
// numbers, capitals and punctuation) dont stem.
|
||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||
++nostem;
|
||||
LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
|
||||
(*it).c_str(), *sit));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Detect and skip CJK terms.
|
||||
// We're still sending all other multibyte utf-8 chars to
|
||||
// the stemmer, which is not too well defined for
|
||||
// xapian<1.0 (very obsolete now), but seems to work
|
||||
// anyway. There shouldn't be too many in any case because
|
||||
// accents are stripped at this point.
|
||||
// The effect of stripping accents on stemming is not good,
|
||||
// (e.g: in french partimes -> partim, parti^mes -> part)
|
||||
// but fixing the issue would be complicated.
|
||||
Utf8Iter utfit(*it);
|
||||
if (TextSplit::isCJK(*utfit)) {
|
||||
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create stemming synonym for every lang
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
string stem = stemmers[i](*it);
|
||||
if (stem == *it) {
|
||||
++stemconst;
|
||||
} else {
|
||||
stemdbs[i].addSynonym(stem, *it);
|
||||
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
|
||||
(*it).c_str(), langs[i].c_str(), stem.c_str()));
|
||||
++allsyns;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
|
||||
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
||||
nostem, stemconst, allsyns));
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expand term to list of all terms which stem to the same term, for one
|
||||
* expansion language
|
||||
*/
|
||||
bool StemDb::expandOne(const std::string& lang,
|
||||
const std::string& term,
|
||||
vector<string>& result)
|
||||
{
|
||||
try {
|
||||
Xapian::Stem stemmer(lang);
|
||||
string stem = stemmer(term);
|
||||
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
||||
lang.c_str(), term.c_str(), stem.c_str()));
|
||||
|
||||
if (!synExpand(lang, stem, result)) {
|
||||
// ?
|
||||
}
|
||||
|
||||
// If the user term or stem are not in the list, add them
|
||||
if (find(result.begin(), result.end(), term) == result.end()) {
|
||||
result.push_back(term);
|
||||
}
|
||||
if (find(result.begin(), result.end(), stem) == result.end()) {
|
||||
result.push_back(stem);
|
||||
}
|
||||
LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
|
||||
stringsToString(result).c_str()));
|
||||
|
||||
} catch (...) {
|
||||
LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
|
||||
lang.c_str()));
|
||||
result.push_back(term);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expand for one or several languages
|
||||
*/
|
||||
|
@ -186,14 +50,34 @@ bool StemDb::stemExpand(const std::string& langs,
|
|||
{
|
||||
vector<string> llangs;
|
||||
stringToStrings(langs, llangs);
|
||||
|
||||
for (vector<string>::const_iterator it = llangs.begin();
|
||||
it != llangs.end(); it++) {
|
||||
vector<string> oneexp;
|
||||
expandOne(*it, term, oneexp);
|
||||
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
||||
SynTermTransStem stemmer(*it);
|
||||
XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
|
||||
(void)expander.synExpand(term, result);
|
||||
}
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
for (vector<string>::const_iterator it = llangs.begin();
|
||||
it != llangs.end(); it++) {
|
||||
SynTermTransStem stemmer(*it);
|
||||
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
|
||||
*it, &stemmer);
|
||||
string unac;
|
||||
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
|
||||
(void)expander.synExpand(unac, result);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (result.empty())
|
||||
result.push_back(term);
|
||||
|
||||
sort(result.begin(), result.end());
|
||||
unique(result.begin(), result.end());
|
||||
vector<string>::iterator uit = unique(result.begin(), result.end());
|
||||
result.resize(uit - result.begin());
|
||||
LOGDEB0(("stemExpand:%s: %s -> %s\n", langs.c_str(), term.c_str(),
|
||||
stringsToString(result).c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -55,9 +55,30 @@
|
|||
#include <xapian.h>
|
||||
|
||||
#include "synfamily.h"
|
||||
#include "unacpp.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
/* A stemming functor for using with XapComputableSynFamMember */
|
||||
class SynTermTransStem : public SynTermTrans {
|
||||
public:
|
||||
SynTermTransStem(const std::string& lang)
|
||||
: m_stemmer(lang), m_lang(lang)
|
||||
{
|
||||
}
|
||||
virtual std::string operator()(const std::string& in)
|
||||
{
|
||||
string out = m_stemmer(in);
|
||||
LOGDEB2(("SynTermTransStem(%s): in [%s] out [%s]\n", m_lang.c_str(),
|
||||
in.c_str(), out.c_str()));
|
||||
return out;
|
||||
}
|
||||
Xapian::Stem m_stemmer;
|
||||
std::string m_lang;
|
||||
};
|
||||
|
||||
/** Stemdb is a bit special as a SynFamily as we may want to expand for one
|
||||
* or several members (languages) */
|
||||
class StemDb : public XapSynFamily {
|
||||
public:
|
||||
StemDb(Xapian::Database& xdb)
|
||||
|
@ -67,18 +88,10 @@ public:
|
|||
|
||||
/** Expand for a number of languages */
|
||||
bool stemExpand(const std::string& langs,
|
||||
const std::string& term,
|
||||
std::vector<std::string>& result);
|
||||
private:
|
||||
/** Compute stem and call synExpand() */
|
||||
bool expandOne(const std::string& lang,
|
||||
const std::string& term,
|
||||
std::vector<std::string>& result);
|
||||
const std::string& term,
|
||||
std::vector<std::string>& result);
|
||||
};
|
||||
|
||||
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
const std::vector<std::string>& langs);
|
||||
|
||||
}
|
||||
|
||||
#endif /* _STEMDB_H_INCLUDED_ */
|
||||
|
|
|
@ -28,31 +28,6 @@ using namespace std;
|
|||
|
||||
namespace Rcl {
|
||||
|
||||
bool XapSynFamily::synExpand(const string& member, const string& term,
|
||||
vector<string>& result)
|
||||
{
|
||||
string key = entryprefix(member) + term;
|
||||
string ermsg;
|
||||
try {
|
||||
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
|
||||
xit != m_rdb.synonyms_end(key); xit++) {
|
||||
result.push_back(*xit);
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
|
||||
member.c_str(), term.c_str()));
|
||||
return false;
|
||||
}
|
||||
#if 0
|
||||
string out;
|
||||
stringsToString(result, out);
|
||||
LOGDEB0(("XapSynFamily::synExpand:%s: [%s] -> %s\n", member.c_str(),
|
||||
term.c_str(), out.c_str()));
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XapSynFamily::getMembers(vector<string>& members)
|
||||
{
|
||||
string key = memberskey();
|
||||
|
@ -100,6 +75,35 @@ bool XapSynFamily::listMap(const string& membername)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool XapSynFamily::synExpand(const string& member, const string& term,
|
||||
vector<string>& result)
|
||||
{
|
||||
LOGDEB(("XapSynFamily::synExpand:(%s) %s for %s\n",
|
||||
m_prefix1.c_str(), term.c_str(), member.c_str()));
|
||||
|
||||
string key = entryprefix(member) + term;
|
||||
string ermsg;
|
||||
try {
|
||||
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
|
||||
xit != m_rdb.synonyms_end(key); xit++) {
|
||||
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
|
||||
result.push_back(*xit);
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
|
||||
member.c_str(), term.c_str()));
|
||||
result.push_back(term);
|
||||
return false;
|
||||
}
|
||||
// If the input term is not in the list, add it
|
||||
if (find(result.begin(), result.end(), term) == result.end()) {
|
||||
result.push_back(term);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XapWritableSynFamily::deleteMember(const string& membername)
|
||||
{
|
||||
string key = entryprefix(membername);
|
||||
|
@ -119,32 +123,61 @@ bool XapWritableSynFamily::createMember(const string& membername)
|
|||
m_wdb.add_synonym(memberskey(), membername);
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapSynFamily::createMember: xapian error %s\n", ermsg.c_str()));
|
||||
LOGERR(("XapSynFamily::createMember: error: %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XapWritableSynFamily::addSynonyms(const string& membername,
|
||||
const string& term,
|
||||
const vector<string>& trans)
|
||||
bool XapComputableSynFamMember::synExpand(const string& term,
|
||||
vector<string>& result,
|
||||
SynTermTrans *filtertrans)
|
||||
{
|
||||
string key = entryprefix(membername) + term;
|
||||
string root = (*m_trans)(term);
|
||||
string filter_root;
|
||||
if (filtertrans)
|
||||
filter_root = (*filtertrans)(term);
|
||||
|
||||
/* We could call XapSynFamily::synExpand() here instead of doing it
|
||||
ourselves... */
|
||||
string key = m_prefix + root;
|
||||
|
||||
LOGDEB(("XapCompSynFamMbr::synExpand([%s]): term [%s] root [%s] \n",
|
||||
m_prefix.c_str(), term.c_str(), root.c_str()));
|
||||
|
||||
string ermsg;
|
||||
try {
|
||||
for (vector<string>::const_iterator it = trans.begin();
|
||||
it != trans.end(); it++) {
|
||||
m_wdb.add_synonym(key, *it);
|
||||
for (Xapian::TermIterator xit = m_family.getdb().synonyms_begin(key);
|
||||
xit != m_family.getdb().synonyms_end(key); xit++) {
|
||||
if (!filtertrans || (*filtertrans)(*xit) == filter_root) {
|
||||
LOGDEB2((" Pushing %s\n", (*xit).c_str()));
|
||||
result.push_back(*xit);
|
||||
}
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapSynFamily::addSynonyms: xapian error %s\n", ermsg.c_str()));
|
||||
LOGERR(("XapSynDb::synExpand: error for term [%s] (key %s)\n",
|
||||
term.c_str(), key.c_str()));
|
||||
result.push_back(term);
|
||||
return false;
|
||||
}
|
||||
|
||||
// If the input term and root are not in the list, add them
|
||||
if (find(result.begin(), result.end(), term) == result.end()) {
|
||||
LOGDEB2((" Pushing %s\n", term.c_str()));
|
||||
result.push_back(term);
|
||||
}
|
||||
if (root != term &&
|
||||
find(result.begin(), result.end(), root) == result.end()) {
|
||||
if (!filtertrans || (*filtertrans)(root) == filter_root) {
|
||||
LOGDEB2((" Pushing %s\n", root.c_str()));
|
||||
result.push_back(root);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
#else // TEST_SYNFAMILY
|
||||
|
@ -169,16 +202,16 @@ using namespace std;
|
|||
|
||||
static string thisprog;
|
||||
static int op_flags;
|
||||
#define OPT_a 0x4
|
||||
#define OPT_c 0x8
|
||||
#define OPT_D 0x1
|
||||
#define OPT_d 0x10
|
||||
#define OPT_L 0x2
|
||||
#define OPT_a 0x4
|
||||
#define OPT_u 0x8
|
||||
#define OPT_d 0x10
|
||||
#define OPT_l 0x20
|
||||
#define OPT_s 0x40
|
||||
#define OPT_e 0x80
|
||||
static string usage =
|
||||
" -d <dbdir> {-s|-a|-c} database dir and synfamily: stem accents case\n"
|
||||
" -d <dbdir> {-s|-a|-u} database dir and synfamily: stem accents/case ustem\n"
|
||||
" -l : list members\n"
|
||||
" -L <member>: list entries for given member\n"
|
||||
" -e <member> <key> : list expansion for given member and key\n"
|
||||
|
@ -209,7 +242,6 @@ int main(int argc, char **argv)
|
|||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'a': op_flags |= OPT_a; break;
|
||||
case 'c': op_flags |= OPT_c; break;
|
||||
case 'D': op_flags |= OPT_D; break;
|
||||
case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
|
||||
dbdir = *(++argv); argc--;
|
||||
|
@ -223,6 +255,7 @@ int main(int argc, char **argv)
|
|||
member = *(++argv); argc--;
|
||||
goto b1;
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
case 'u': op_flags |= OPT_u; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
b1: argc--; argv++;
|
||||
|
@ -231,12 +264,11 @@ int main(int argc, char **argv)
|
|||
if (argc != 0)
|
||||
Usage();
|
||||
|
||||
// We do stem only for now
|
||||
string familyname;
|
||||
if (op_flags & OPT_a) {
|
||||
familyname = Rcl::synFamDiac;
|
||||
} else if (op_flags &OPT_c) {
|
||||
familyname = Rcl::synFamCase;
|
||||
familyname = Rcl::synFamDiCa;
|
||||
} else if (op_flags & OPT_u) {
|
||||
familyname = Rcl::synFamStemUnac;
|
||||
} else {
|
||||
familyname = Rcl::synFamStem;
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
|
||||
namespace Rcl {
|
||||
|
||||
class XapSynFamily {
|
||||
class XapSynFamily {
|
||||
public:
|
||||
/**
|
||||
* Construct from readable xapian database and family name (ie: Stm)
|
||||
|
@ -53,38 +53,50 @@ public:
|
|||
m_prefix1 = std::string(":") + familyname;
|
||||
}
|
||||
|
||||
/** Expand one term (e.g.: familier) inside one family member (e.g: french)
|
||||
*/
|
||||
virtual bool synExpand(const std::string& fammember,
|
||||
const std::string& key,
|
||||
std::vector<std::string>& result);
|
||||
|
||||
/** Retrieve all members of this family (e.g: french english german...) */
|
||||
virtual bool getMembers(std::vector<std::string>&);
|
||||
|
||||
/** debug: list map for one member to stdout */
|
||||
virtual bool listMap(const std::string& fam);
|
||||
|
||||
protected:
|
||||
Xapian::Database m_rdb;
|
||||
std::string m_prefix1;
|
||||
/** Expand term to list of synonyms for given member */
|
||||
bool synExpand(const std::string& membername,
|
||||
const std::string& term, std::vector<std::string>& result);
|
||||
|
||||
// The prefix shared by all synonym entries inside a family member
|
||||
virtual std::string entryprefix(const std::string& member)
|
||||
{
|
||||
return m_prefix1 + ":" + member + ":";
|
||||
}
|
||||
|
||||
// The key for the "list of members" entry
|
||||
virtual std::string memberskey()
|
||||
{
|
||||
return m_prefix1 + ";" + "members";
|
||||
}
|
||||
|
||||
Xapian::Database& getdb()
|
||||
{
|
||||
return m_rdb;
|
||||
}
|
||||
|
||||
protected:
|
||||
Xapian::Database m_rdb;
|
||||
std::string m_prefix1;
|
||||
};
|
||||
|
||||
/** Modify ops for a synonyms family
|
||||
*
|
||||
* A method to add a synonym entry inside a given member would make sense,
|
||||
* but would not be used presently as all these ops go through
|
||||
* ComputableSynFamMember objects
|
||||
*/
|
||||
class XapWritableSynFamily : public XapSynFamily {
|
||||
public:
|
||||
/** Construct with Xapian db open for r/w */
|
||||
XapWritableSynFamily(Xapian::WritableDatabase db, const std::string& pfx)
|
||||
: XapSynFamily(db, pfx), m_wdb(db)
|
||||
XapWritableSynFamily(Xapian::WritableDatabase db,
|
||||
const std::string& familyname)
|
||||
: XapSynFamily(db, familyname), m_wdb(db)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -95,36 +107,92 @@ public:
|
|||
/** Add to list of members. Idempotent, does not affect actual expansions */
|
||||
virtual bool createMember(const std::string& membername);
|
||||
|
||||
/** Add expansion list for term inside family member (e.g., inside
|
||||
* the english member, add expansion for floor -> floors, flooring.. */
|
||||
virtual bool addSynonyms(const std::string& membername,
|
||||
const std::string& term,
|
||||
const std::vector<std::string>& trans);
|
||||
Xapian::WritableDatabase getdb() {return m_wdb;}
|
||||
|
||||
// Need to call setCurrentMemberName before addSynonym !
|
||||
// We don't check it, for speed
|
||||
virtual void setCurrentMemberName(const std::string& nm)
|
||||
protected:
|
||||
Xapian::WritableDatabase m_wdb;
|
||||
};
|
||||
|
||||
/**
 * A functor which transforms a string.
 *
 * This is an abstract interface: concrete transformations derive from
 * it and are handled through SynTermTrans pointers, so the destructor
 * is virtual to keep destruction through the base type well defined.
 */
class SynTermTrans {
public:
    virtual ~SynTermTrans() {}
    /** Return the transformed value of the input string */
    virtual std::string operator()(const std::string&) = 0;
};
|
||||
|
||||
/** A member (set of root-synonyms associations) of a SynFamily for
|
||||
* which the root is computable from the input term.
|
||||
* The objects use a functor member to compute the term root on input
|
||||
* (e.g. compute the term stem or casefold it).
|
||||
*/
|
||||
class XapComputableSynFamMember {
|
||||
public:
|
||||
XapComputableSynFamMember(Xapian::Database xdb, std::string familyname,
|
||||
std::string membername, SynTermTrans* trans)
|
||||
: m_family(xdb, familyname), m_membername(membername),
|
||||
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
|
||||
{
|
||||
m_currentPrefix = entryprefix(nm);
|
||||
}
|
||||
virtual bool addSynonym(const std::string& term, const std::string& trans)
|
||||
|
||||
/** Expand a term to its list of synonyms. If filtertrans is set we
|
||||
* keep only the results which transform to the same value as the input */
|
||||
bool synExpand(const std::string& term, std::vector<std::string>& result,
|
||||
SynTermTrans *filtertrans = 0);
|
||||
|
||||
private:
|
||||
XapSynFamily m_family;
|
||||
std::string m_membername;
|
||||
SynTermTrans *m_trans;
|
||||
std::string m_prefix;
|
||||
};
|
||||
|
||||
/** Computable term root SynFamily member, modify ops */
|
||||
class XapWritableComputableSynFamMember {
|
||||
public:
|
||||
XapWritableComputableSynFamMember(
|
||||
Xapian::WritableDatabase xdb, std::string familyname,
|
||||
std::string membername, SynTermTrans* trans)
|
||||
: m_family(xdb, familyname), m_membername(membername),
|
||||
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
|
||||
{
|
||||
std::string key = m_currentPrefix + term;
|
||||
}
|
||||
|
||||
virtual bool addSynonym(const std::string& term)
|
||||
{
|
||||
LOGDEB2(("addSynonym:me %p term [%s] m_trans %p\n", this,
|
||||
term.c_str(), m_trans));
|
||||
std::string transformed = (*m_trans)(term);
|
||||
LOGDEB2(("addSynonym: transformed [%s]\n", transformed.c_str()));
|
||||
if (transformed == term)
|
||||
return true;
|
||||
|
||||
std::string ermsg;
|
||||
try {
|
||||
m_wdb.add_synonym(key, trans);
|
||||
m_family.getdb().add_synonym(m_prefix + transformed, term);
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapSynFamily::addSynonym: xapian error %s\n",
|
||||
ermsg.c_str()));
|
||||
LOGERR(("XapWritableComputableSynFamMember::addSynonym: "
|
||||
"xapian error %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
Xapian::WritableDatabase m_wdb;
|
||||
std::string m_currentPrefix;
|
||||
void clear()
|
||||
{
|
||||
m_family.deleteMember(m_membername);
|
||||
}
|
||||
|
||||
void recreate()
|
||||
{
|
||||
clear();
|
||||
m_family.createMember(m_membername);
|
||||
}
|
||||
|
||||
private:
|
||||
XapWritableSynFamily m_family;
|
||||
std::string m_membername;
|
||||
SynTermTrans *m_trans;
|
||||
std::string m_prefix;
|
||||
};
|
||||
|
||||
|
||||
|
@ -133,11 +201,13 @@ protected:
|
|||
//
|
||||
// Stem expansion family prefix. The family member name is the
|
||||
// language ("all" for Dia and Cse)
|
||||
|
||||
// Lowercase accented stem to expansion
|
||||
static const std::string synFamStem("Stm");
|
||||
static const std::string synFamDiac("Dia");
|
||||
static const std::string synFamCase("Cse");
|
||||
|
||||
|
||||
// Lowercase unaccented stem to expansion
|
||||
static const std::string synFamStemUnac("StU");
|
||||
// Lowercase unaccented term to case and accent variations
|
||||
static const std::string synFamDiCa("DCa");
|
||||
}
|
||||
|
||||
#endif /* _SYNFAMILY_H_INCLUDED_ */
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
# Also reserved: F(parentid), Q(uniqueid)
|
||||
title = S ; wdfinc = 10
|
||||
author = A
|
||||
abstract =
|
||||
abstract = XS
|
||||
caption = S
|
||||
title = S
|
||||
subject = S
|
||||
|
|
|
@ -103,7 +103,7 @@ public:
|
|||
|
||||
/** Append current utf-8 possibly multi-byte character to string param.
|
||||
This needs to be fast. No error checking. */
|
||||
unsigned int appendchartostring(std::string &out) {
|
||||
unsigned int appendchartostring(std::string &out) const {
|
||||
#ifdef UTF8ITER_CHECK
|
||||
assert(m_cl != 0);
|
||||
#endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue