Ensure that recoll configured with indexStripChars=1 behaves as if it had been compiled with -DRCL_INDEX_STRIPCHARS
This commit is contained in:
parent
48e9a4f901
commit
e22b347767
17 changed files with 425 additions and 260 deletions
|
@ -23,9 +23,9 @@
|
||||||
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
#include <iostream>
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <vector>
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include ASPELL_INCLUDE
|
#include ASPELL_INCLUDE
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@
|
||||||
#include "execmd.h"
|
#include "execmd.h"
|
||||||
#include "rclaspell.h"
|
#include "rclaspell.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
#include "unacpp.h"
|
||||||
#include "ptmutex.h"
|
#include "ptmutex.h"
|
||||||
|
|
||||||
// Just a place where we keep the Aspell library entry points together
|
// Just a place where we keep the Aspell library entry points together
|
||||||
|
@ -260,6 +260,14 @@ public:
|
||||||
while (m_db.termWalkNext(m_tit, *m_input)) {
|
while (m_db.termWalkNext(m_tit, *m_input)) {
|
||||||
if (!Rcl::Db::isSpellingCandidate(*m_input))
|
if (!Rcl::Db::isSpellingCandidate(*m_input))
|
||||||
continue;
|
continue;
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (!o_index_stripchars) {
|
||||||
|
string lower;
|
||||||
|
if (!unacmaybefold(*m_input, lower, "UTF-8", UNACOP_FOLD))
|
||||||
|
continue;
|
||||||
|
m_input->swap(lower);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// Got a non-empty sort-of appropriate term, let's send it to
|
// Got a non-empty sort-of appropriate term, let's send it to
|
||||||
// aspell
|
// aspell
|
||||||
m_input->append("\n");
|
m_input->append("\n");
|
||||||
|
@ -335,17 +343,29 @@ bool Aspell::make_speller(string& reason)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
|
bool Aspell::check(const string &iterm, string& reason)
|
||||||
{
|
{
|
||||||
LOGDEB2(("Aspell::check [%s]\n", term.c_str()));
|
LOGDEB2(("Aspell::check [%s]\n", iterm.c_str()));
|
||||||
|
string mterm(iterm);
|
||||||
|
|
||||||
if (!ok() || !make_speller(reason))
|
if (!ok() || !make_speller(reason))
|
||||||
return false;
|
return false;
|
||||||
if (term.empty())
|
if (iterm.empty())
|
||||||
return true; //??
|
return true; //??
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (!o_index_stripchars) {
|
||||||
|
string lower;
|
||||||
|
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
|
LOGERR(("Aspell::check : cant lowercase input\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
mterm.swap(lower);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
int ret = aapi.aspell_speller_check(m_data->m_speller,
|
int ret = aapi.aspell_speller_check(m_data->m_speller,
|
||||||
term.c_str(), term.length());
|
mterm.c_str(), mterm.length());
|
||||||
reason.clear();
|
reason.clear();
|
||||||
switch (ret) {
|
switch (ret) {
|
||||||
case 0: return false;
|
case 0: return false;
|
||||||
|
@ -358,19 +378,31 @@ bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Aspell::suggest(Rcl::Db &db, const string &term,
|
bool Aspell::suggest(Rcl::Db &db, const string &_term,
|
||||||
list<string>& suggestions, string& reason)
|
list<string>& suggestions, string& reason)
|
||||||
{
|
{
|
||||||
if (!ok() || !make_speller(reason))
|
if (!ok() || !make_speller(reason))
|
||||||
return false;
|
return false;
|
||||||
if (term.empty())
|
string mterm(_term);
|
||||||
|
if (mterm.empty())
|
||||||
return true; //??
|
return true; //??
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (!o_index_stripchars) {
|
||||||
|
string lower;
|
||||||
|
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
|
||||||
|
LOGERR(("Aspell::check : cant lowercase input\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
mterm.swap(lower);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
AspellCanHaveError *ret;
|
AspellCanHaveError *ret;
|
||||||
|
|
||||||
const AspellWordList *wl =
|
const AspellWordList *wl =
|
||||||
aapi.aspell_speller_suggest(m_data->m_speller,
|
aapi.aspell_speller_suggest(m_data->m_speller,
|
||||||
term.c_str(), term.length());
|
mterm.c_str(), mterm.length());
|
||||||
if (wl == 0) {
|
if (wl == 0) {
|
||||||
reason = aapi.aspell_speller_error_message(m_data->m_speller);
|
reason = aapi.aspell_speller_error_message(m_data->m_speller);
|
||||||
return false;
|
return false;
|
||||||
|
@ -385,7 +417,7 @@ bool Aspell::suggest(Rcl::Db &db, const string &term,
|
||||||
// ******** This should depend if
|
// ******** This should depend if
|
||||||
// stemming is turned on or not for querying *******
|
// stemming is turned on or not for querying *******
|
||||||
string sw(word);
|
string sw(word);
|
||||||
if (db.termExists(sw) && db.stemDiffers("english", sw, term))
|
if (db.termExists(sw) && db.stemDiffers("english", sw, mterm))
|
||||||
suggestions.push_back(word);
|
suggestions.push_back(word);
|
||||||
}
|
}
|
||||||
aapi.delete_aspell_string_enumeration(els);
|
aapi.delete_aspell_string_enumeration(els);
|
||||||
|
@ -418,7 +450,6 @@ using namespace std;
|
||||||
|
|
||||||
static char *thisprog;
|
static char *thisprog;
|
||||||
RclConfig *rclconfig;
|
RclConfig *rclconfig;
|
||||||
Rcl::Db rcldb;
|
|
||||||
|
|
||||||
static char usage [] =
|
static char usage [] =
|
||||||
" -b : build dictionary\n"
|
" -b : build dictionary\n"
|
||||||
|
@ -477,7 +508,9 @@ int main(int argc, char **argv)
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
|
Rcl::Db rcldb(rclconfig);
|
||||||
|
|
||||||
|
if (!rcldb.open(Rcl::Db::DbRO, 0)) {
|
||||||
fprintf(stderr, "Could not open database in %s\n", dbdir.c_str());
|
fprintf(stderr, "Could not open database in %s\n", dbdir.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,11 +37,6 @@
|
||||||
#include "rclconfig.h"
|
#include "rclconfig.h"
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
|
||||||
using std::string;
|
|
||||||
using std::list;
|
|
||||||
#endif // NO_NAMESPACES
|
|
||||||
|
|
||||||
class AspellData;
|
class AspellData;
|
||||||
|
|
||||||
class Aspell {
|
class Aspell {
|
||||||
|
@ -53,26 +48,26 @@ class Aspell {
|
||||||
bool ok() const;
|
bool ok() const;
|
||||||
|
|
||||||
/** Find the aspell command and shared library, init function pointers */
|
/** Find the aspell command and shared library, init function pointers */
|
||||||
bool init(string &reason);
|
bool init(std::string &reason);
|
||||||
|
|
||||||
/** Build dictionary out of index term list. This is done at the end
|
/** Build dictionary out of index term list. This is done at the end
|
||||||
* of an indexing pass. */
|
* of an indexing pass. */
|
||||||
bool buildDict(Rcl::Db &db, string &reason);
|
bool buildDict(Rcl::Db &db, std::string &reason);
|
||||||
|
|
||||||
/** Check that word is in dictionary. ret==false && !reason.empty() => err*/
|
/** Check that word is in dictionary. ret==false && !reason.empty() => err*/
|
||||||
bool check(Rcl::Db &db, const string& term, string& reason);
|
bool check(const std::string& term, std::string& reason);
|
||||||
|
|
||||||
/** Return a list of possible expansions for a given word */
|
/** Return a list of possible expansions for a given word */
|
||||||
bool suggest(Rcl::Db &db, const string& term, list<string> &suggestions,
|
bool suggest(Rcl::Db &db, const std::string& term,
|
||||||
string &reason);
|
std::list<std::string> &suggestions, std::string &reason);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
string dicPath();
|
std::string dicPath();
|
||||||
RclConfig *m_config;
|
RclConfig *m_config;
|
||||||
string m_lang;
|
std::string m_lang;
|
||||||
AspellData *m_data;
|
AspellData *m_data;
|
||||||
|
|
||||||
bool make_speller(string& reason);
|
bool make_speller(std::string& reason);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* RCL_USE_ASPELL */
|
#endif /* RCL_USE_ASPELL */
|
||||||
|
|
|
@ -15,6 +15,8 @@
|
||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
*/
|
*/
|
||||||
#ifndef TEST_RCLCONFIG
|
#ifndef TEST_RCLCONFIG
|
||||||
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
@ -34,6 +36,7 @@
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include "cstr.h"
|
#include "cstr.h"
|
||||||
#include "pathut.h"
|
#include "pathut.h"
|
||||||
|
@ -45,15 +48,8 @@
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "fstreewalk.h"
|
#include "fstreewalk.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
using namespace std;
|
bool o_index_stripchars;
|
||||||
#endif /* NO_NAMESPACES */
|
|
||||||
|
|
||||||
#ifndef MIN
|
|
||||||
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
|
|
||||||
#endif
|
|
||||||
#ifndef MAX
|
|
||||||
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool ParamStale::needrecompute()
|
bool ParamStale::needrecompute()
|
||||||
|
@ -77,6 +73,7 @@ bool ParamStale::needrecompute()
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ParamStale::init(RclConfig *rconf, ConfNull *cnf, const string& nm)
|
void ParamStale::init(RclConfig *rconf, ConfNull *cnf, const string& nm)
|
||||||
{
|
{
|
||||||
parent = rconf;
|
parent = rconf;
|
||||||
|
@ -239,6 +236,14 @@ bool RclConfig::updateMainConfig()
|
||||||
FsTreeWalker::setNoFnmPathname();
|
FsTreeWalker::setNoFnmPathname();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
static int m_index_stripchars_init = 0;
|
||||||
|
if (!m_index_stripchars_init) {
|
||||||
|
getConfParam("indexStripChars", &o_index_stripchars);
|
||||||
|
m_index_stripchars_init = 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -303,5 +303,13 @@ class RclConfig {
|
||||||
bool readFieldsConfig(const string& errloc);
|
bool readFieldsConfig(const string& errloc);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// This global variable defines if we are running with an index
|
||||||
|
// stripped of accents and case or a raw one. Ideally, it should be
|
||||||
|
// constant, but it needs to be initialized from the configuration, so
|
||||||
|
// there is no way to do this. It never changes after initialization
|
||||||
|
// of course. When set, it is supposed to make all of recoll behave as
|
||||||
|
// if it had been compiled with RCL_INDEX_STRIPCHARS
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
extern bool o_index_stripchars;
|
||||||
|
#endif
|
||||||
#endif /* _RCLCONFIG_H_INCLUDED_ */
|
#endif /* _RCLCONFIG_H_INCLUDED_ */
|
||||||
|
|
|
@ -197,10 +197,14 @@ void QtGuiResListPager::suggest(const vector<string>uterms,
|
||||||
// If the term is in the index, we don't suggest alternatives.
|
// If the term is in the index, we don't suggest alternatives.
|
||||||
// Actually, we may want to check the frequencies and propose something
|
// Actually, we may want to check the frequencies and propose something
|
||||||
// anyway if a possible variation is much more common (as google does)
|
// anyway if a possible variation is much more common (as google does)
|
||||||
if (aspell->check(*rcldb, *uit, reason))
|
#warning need to take case and diacs sensibility into account somehow
|
||||||
|
// Maybe use the xapian index instead ? How to retrieve the sensitivity flags ?
|
||||||
|
if (0) {
|
||||||
|
if (aspell->check(*uit, reason))
|
||||||
continue;
|
continue;
|
||||||
else if (!reason.empty())
|
else if (!reason.empty())
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) {
|
if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) {
|
||||||
LOGERR(("QtGuiResListPager::suggest: aspell failed: %s\n",
|
LOGERR(("QtGuiResListPager::suggest: aspell failed: %s\n",
|
||||||
reason.c_str()));
|
reason.c_str()));
|
||||||
|
@ -336,6 +340,7 @@ ResList::~ResList()
|
||||||
QT_TR_NOOP("Open"),
|
QT_TR_NOOP("Open"),
|
||||||
QT_TR_NOOP("(show query)"),
|
QT_TR_NOOP("(show query)"),
|
||||||
QT_TR_NOOP("<p><i>Alternate spellings (accents suppressed): </i>"),
|
QT_TR_NOOP("<p><i>Alternate spellings (accents suppressed): </i>"),
|
||||||
|
QT_TR_NOOP("<p><i>Alternate spellings: </i>"),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -79,22 +79,30 @@ class TextSplitPTR : public TextSplit {
|
||||||
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
||||||
vit != hdata.groups.end(); vit++) {
|
vit != hdata.groups.end(); vit++) {
|
||||||
if (vit->size() == 1) {
|
if (vit->size() == 1) {
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
m_terms[vit->front()] = vit - hdata.groups.begin();
|
if (o_index_stripchars) {
|
||||||
#else
|
#endif
|
||||||
string dumb = vit->front();
|
m_terms[vit->front()] = vit - hdata.groups.begin();
|
||||||
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
m_terms[dumb] = vit - hdata.groups.begin();
|
} else {
|
||||||
|
string dumb = vit->front();
|
||||||
|
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||||
|
m_terms[dumb] = vit - hdata.groups.begin();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
} else if (vit->size() > 1) {
|
} else if (vit->size() > 1) {
|
||||||
for (vector<string>::const_iterator it = vit->begin();
|
for (vector<string>::const_iterator it = vit->begin();
|
||||||
it != vit->end(); it++) {
|
it != vit->end(); it++) {
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
#endif
|
||||||
m_gterms.insert(*it);
|
m_gterms.insert(*it);
|
||||||
#else
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
string dumb = *it;
|
string dumb = *it;
|
||||||
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
|
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||||
m_gterms.insert(dumb);
|
m_gterms.insert(dumb);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -320,9 +320,16 @@ void ResListPager::displayPage(RclConfig *config)
|
||||||
map<string, vector<string> > spellings;
|
map<string, vector<string> > spellings;
|
||||||
suggest(uterms, spellings);
|
suggest(uterms, spellings);
|
||||||
if (!spellings.empty()) {
|
if (!spellings.empty()) {
|
||||||
chunk <<
|
if (o_index_stripchars) {
|
||||||
trans("<p><i>Alternate spellings (accents suppressed): </i>")
|
chunk <<
|
||||||
<< "<br /><blockquote>";
|
trans("<p><i>Alternate spellings (accents suppressed): </i>")
|
||||||
|
<< "<br /><blockquote>";
|
||||||
|
} else {
|
||||||
|
chunk <<
|
||||||
|
trans("<p><i>Alternate spellings: </i>")
|
||||||
|
<< "<br /><blockquote>";
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
for (map<string, vector<string> >::const_iterator it0 =
|
for (map<string, vector<string> >::const_iterator it0 =
|
||||||
spellings.begin(); it0 != spellings.end(); it0++) {
|
spellings.begin(); it0 != spellings.end(); it0++) {
|
||||||
|
|
|
@ -116,12 +116,20 @@ static void sigcleanup(int sig)
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
bool o_index_stripchars;
|
||||||
|
#endif
|
||||||
|
|
||||||
inline bool has_prefix(const string& trm)
|
inline bool has_prefix(const string& trm)
|
||||||
{
|
{
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
return trm.size() && 'A' <= trm[0] && trm[0] <= 'Z';
|
if (o_index_stripchars) {
|
||||||
#else
|
#endif
|
||||||
return trm.size() > 0 && trm[0] == ':';
|
return trm.size() && 'A' <= trm[0] && trm[0] <= 'Z';
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
return trm.size() > 0 && trm[0] == ':';
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,10 +209,22 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
try {
|
try {
|
||||||
db = new Xapian::Database(dbdir);
|
db = new Xapian::Database(dbdir);
|
||||||
|
|
||||||
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
|
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
|
||||||
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
|
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
|
||||||
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
// If we have terms with a leading ':' it's a new style,
|
||||||
|
// unstripped index
|
||||||
|
{
|
||||||
|
Xapian::TermIterator term = db->allterms_begin(":");
|
||||||
|
if (term == db->allterms_end())
|
||||||
|
o_index_stripchars = true;
|
||||||
|
else
|
||||||
|
o_index_stripchars = false;
|
||||||
|
cout<<"DB: terms are "<<(o_index_stripchars?"stripped":"raw")<<endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (op_flags & OPT_T) {
|
if (op_flags & OPT_T) {
|
||||||
Xapian::TermIterator term;
|
Xapian::TermIterator term;
|
||||||
string printable;
|
string printable;
|
||||||
|
|
|
@ -63,17 +63,19 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
// Unaccented stem dbs
|
// Unaccented stem dbs
|
||||||
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
||||||
// We can reuse the same stemmer pointers, the objects are stateless.
|
// We can reuse the same stemmer pointers, the objects are stateless.
|
||||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
if (!o_index_stripchars) {
|
||||||
unacstemdbs.push_back(
|
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||||
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
|
unacstemdbs.push_back(
|
||||||
stemmers.back().getptr()));
|
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
|
||||||
unacstemdbs.back().recreate();
|
stemmers.back().getptr()));
|
||||||
|
unacstemdbs.back().recreate();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
||||||
XapWritableComputableSynFamMember
|
XapWritableComputableSynFamMember
|
||||||
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
||||||
diacasedb.recreate();
|
if (!o_index_stripchars)
|
||||||
|
diacasedb.recreate();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Walk the list of all terms, and stem/unac each.
|
// Walk the list of all terms, and stem/unac each.
|
||||||
|
@ -109,8 +111,10 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
// is the input to the stem db, and add a synonym from the
|
// is the input to the stem db, and add a synonym from the
|
||||||
// stripped term to the cased and accented one, for accent
|
// stripped term to the cased and accented one, for accent
|
||||||
// and case expansion at query time
|
// and case expansion at query time
|
||||||
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
if (!o_index_stripchars) {
|
||||||
diacasedb.addSynonym(*it);
|
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
||||||
|
diacasedb.addSynonym(*it);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Create stemming synonym for every language. The input is the
|
// Create stemming synonym for every language. The input is the
|
||||||
|
@ -124,12 +128,15 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
// the unaccented term. While this may be incorrect, it is
|
// the unaccented term. While this may be incorrect, it is
|
||||||
// also necessary for searching in a diacritic-insensitive
|
// also necessary for searching in a diacritic-insensitive
|
||||||
// way on a raw index
|
// way on a raw index
|
||||||
string unac;
|
if (!o_index_stripchars) {
|
||||||
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
|
string unac;
|
||||||
if (unac != lower)
|
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
|
||||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
if (unac != lower) {
|
||||||
unacstemdbs[i].addSynonym(unac);
|
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||||
|
unacstemdbs[i].addSynonym(unac);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
|
|
|
@ -24,10 +24,13 @@
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
/* A Capitals/Diacritics removal functor for using with
|
/** A Capitals/Diacritics removal functor for using with
|
||||||
XapComputableSynFamMember */
|
* XapComputableSynFamMember */
|
||||||
class SynTermTransUnac : public SynTermTrans {
|
class SynTermTransUnac : public SynTermTrans {
|
||||||
public:
|
public:
|
||||||
|
/** Constructor
|
||||||
|
* @param op defines if we remove diacritics, case or both
|
||||||
|
*/
|
||||||
SynTermTransUnac(UnacOp op)
|
SynTermTransUnac(UnacOp op)
|
||||||
: m_op(op)
|
: m_op(op)
|
||||||
{
|
{
|
||||||
|
@ -43,7 +46,9 @@ public:
|
||||||
UnacOp m_op;
|
UnacOp m_op;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Walk the Xapian term list and create all the expansion dbs in one go */
|
/** Walk the Xapian term list and create all the expansion dbs in one go.
|
||||||
|
*
|
||||||
|
*/
|
||||||
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
const std::vector<std::string>& langs);
|
const std::vector<std::string>& langs);
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,10 +92,11 @@ const string start_of_field_term = "XXST";
|
||||||
const string end_of_field_term = "XXND";
|
const string end_of_field_term = "XXND";
|
||||||
static const string page_break_term = "XXPG";
|
static const string page_break_term = "XXPG";
|
||||||
#else
|
#else
|
||||||
const string start_of_field_term = "XXST/";
|
string start_of_field_term;
|
||||||
const string end_of_field_term = "XXND/";
|
string end_of_field_term;
|
||||||
static const string page_break_term = "XXPG/";
|
const string page_break_term = "XXPG/";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Field name for the unsplit file name. Has to exist in the field file
|
// Field name for the unsplit file name. Has to exist in the field file
|
||||||
// because of usage in termmatch()
|
// because of usage in termmatch()
|
||||||
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||||
|
@ -683,6 +684,18 @@ Db::Db(RclConfig *cfp)
|
||||||
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
||||||
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
||||||
{
|
{
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (start_of_field_term.empty()) {
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
start_of_field_term = "XXST";
|
||||||
|
end_of_field_term = "XXND";
|
||||||
|
} else {
|
||||||
|
start_of_field_term = "XXST/";
|
||||||
|
end_of_field_term = "XXND/";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
m_ndb = new Native(this);
|
m_ndb = new Native(this);
|
||||||
if (m_config) {
|
if (m_config) {
|
||||||
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
||||||
|
@ -886,12 +899,13 @@ int Db::termDocCnt(const string& _term)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
string term = _term;
|
string term = _term;
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
if (o_index_stripchars)
|
||||||
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
|
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (m_stops.isStop(term)) {
|
if (m_stops.isStop(term)) {
|
||||||
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
||||||
|
@ -1151,13 +1165,17 @@ string Db::getSpellingSuggestion(const string& word)
|
||||||
{
|
{
|
||||||
if (m_ndb == 0)
|
if (m_ndb == 0)
|
||||||
return string();
|
return string();
|
||||||
|
|
||||||
string term = word;
|
string term = word;
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars)
|
||||||
|
#endif
|
||||||
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
||||||
return string();
|
return string();
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
if (!isSpellingCandidate(term))
|
if (!isSpellingCandidate(term))
|
||||||
return string();
|
return string();
|
||||||
return m_ndb->xrdb.get_spelling_suggestion(term);
|
return m_ndb->xrdb.get_spelling_suggestion(term);
|
||||||
|
@ -1266,9 +1284,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||||
TermProc *nxt = &tpidx;
|
TermProc *nxt = &tpidx;
|
||||||
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
||||||
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
|
||||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
TermProcPrep tpprep(nxt);
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars)
|
||||||
#endif
|
#endif
|
||||||
|
nxt = &tpprep;
|
||||||
|
|
||||||
TextSplitDb splitter(newdocument, nxt);
|
TextSplitDb splitter(newdocument, nxt);
|
||||||
tpidx.setTSD(&splitter);
|
tpidx.setTSD(&splitter);
|
||||||
|
@ -1951,12 +1972,15 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
||||||
// Get rid of capitals and accents
|
// Get rid of capitals and accents
|
||||||
|
|
||||||
string droot = root;
|
string droot = root;
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
|
||||||
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
if (o_index_stripchars)
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
|
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
||||||
|
|
||||||
string prefix;
|
string prefix;
|
||||||
|
|
|
@ -129,18 +129,27 @@ extern void *DbUpdWorker(void*);
|
||||||
|
|
||||||
inline bool has_prefix(const string& trm)
|
inline bool has_prefix(const string& trm)
|
||||||
{
|
{
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
|
if (o_index_stripchars) {
|
||||||
#else
|
#endif
|
||||||
return !trm.empty() && trm[0] == ':';
|
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
return !trm.empty() && trm[0] == ':';
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline string wrap_prefix(const string& pfx)
|
inline string wrap_prefix(const string& pfx)
|
||||||
{
|
{
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
return pfx;
|
if (o_index_stripchars) {
|
||||||
#else
|
#endif
|
||||||
return cstr_colon + pfx + cstr_colon;
|
return pfx;
|
||||||
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
} else {
|
||||||
|
return cstr_colon + pfx + cstr_colon;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -384,9 +393,13 @@ private:
|
||||||
string version_string();
|
string version_string();
|
||||||
|
|
||||||
extern const string pathelt_prefix;
|
extern const string pathelt_prefix;
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
extern const string start_of_field_term;
|
extern const string start_of_field_term;
|
||||||
extern const string end_of_field_term;
|
extern const string end_of_field_term;
|
||||||
|
#else
|
||||||
|
extern string start_of_field_term;
|
||||||
|
extern string end_of_field_term;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* _DB_H_INCLUDED_ */
|
#endif /* _DB_H_INCLUDED_ */
|
||||||
|
|
|
@ -79,10 +79,22 @@ static const int original_term_wqf_booster = 10;
|
||||||
|
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
#define bufprefix(BUF, L) {(BUF)[0] = L;}
|
#define bufprefix(BUF, L) {(BUF)[0] = L;}
|
||||||
#define bpoffs 1
|
#define bpoffs() 1
|
||||||
#else
|
#else
|
||||||
#define bufprefix(BUF, L) {(BUF)[0] = ':'; (BUF)[1] = L; (BUF)[2] = ':';}
|
static inline void bufprefix(char *buf, char c)
|
||||||
#define bpoffs 3
|
{
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
buf[0] = c;
|
||||||
|
} else {
|
||||||
|
buf[0] = ':';
|
||||||
|
buf[1] = c;
|
||||||
|
buf[2] = ':';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static inline int bpoffs()
|
||||||
|
{
|
||||||
|
return o_index_stripchars ? 1 : 3;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static Xapian::Query
|
static Xapian::Query
|
||||||
|
@ -92,7 +104,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
||||||
// only doing %d's !
|
// only doing %d's !
|
||||||
char buf[200];
|
char buf[200];
|
||||||
bufprefix(buf, 'D');
|
bufprefix(buf, 'D');
|
||||||
sprintf(buf+bpoffs, "%04d%02d", y1, m1);
|
sprintf(buf+bpoffs(), "%04d%02d", y1, m1);
|
||||||
vector<Xapian::Query> v;
|
vector<Xapian::Query> v;
|
||||||
|
|
||||||
int d_last = monthdays(m1, y1);
|
int d_last = monthdays(m1, y1);
|
||||||
|
@ -103,7 +115,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
||||||
// Deal with any initial partial month
|
// Deal with any initial partial month
|
||||||
if (d1 > 1 || d_end < d_last) {
|
if (d1 > 1 || d_end < d_last) {
|
||||||
for ( ; d1 <= d_end ; d1++) {
|
for ( ; d1 <= d_end ; d1++) {
|
||||||
sprintf(buf + 6 + bpoffs, "%02d", d1);
|
sprintf(buf + 6 + bpoffs(), "%02d", d1);
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -117,32 +129,32 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
||||||
|
|
||||||
int m_last = (y1 < y2) ? 12 : m2 - 1;
|
int m_last = (y1 < y2) ? 12 : m2 - 1;
|
||||||
while (++m1 <= m_last) {
|
while (++m1 <= m_last) {
|
||||||
sprintf(buf + 4 + bpoffs, "%02d", m1);
|
sprintf(buf + 4 + bpoffs(), "%02d", m1);
|
||||||
bufprefix(buf, 'M');
|
bufprefix(buf, 'M');
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (y1 < y2) {
|
if (y1 < y2) {
|
||||||
while (++y1 < y2) {
|
while (++y1 < y2) {
|
||||||
sprintf(buf + bpoffs, "%04d", y1);
|
sprintf(buf + bpoffs(), "%04d", y1);
|
||||||
bufprefix(buf, 'Y');
|
bufprefix(buf, 'Y');
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
sprintf(buf + bpoffs, "%04d", y2);
|
sprintf(buf + bpoffs(), "%04d", y2);
|
||||||
bufprefix(buf, 'M');
|
bufprefix(buf, 'M');
|
||||||
for (m1 = 1; m1 < m2; m1++) {
|
for (m1 = 1; m1 < m2; m1++) {
|
||||||
sprintf(buf + 4 + bpoffs, "%02d", m1);
|
sprintf(buf + 4 + bpoffs(), "%02d", m1);
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sprintf(buf + 2 + bpoffs, "%02d", m2);
|
sprintf(buf + 2 + bpoffs(), "%02d", m2);
|
||||||
|
|
||||||
// Deal with any final partial month
|
// Deal with any final partial month
|
||||||
if (d2 < monthdays(m2, y2)) {
|
if (d2 < monthdays(m2, y2)) {
|
||||||
bufprefix(buf, 'D');
|
bufprefix(buf, 'D');
|
||||||
for (d1 = 1 ; d1 <= d2; d1++) {
|
for (d1 = 1 ; d1 <= d2; d1++) {
|
||||||
sprintf(buf + 6 + bpoffs, "%02d", d1);
|
sprintf(buf + 6 + bpoffs(), "%02d", d1);
|
||||||
v.push_back(Xapian::Query(buf));
|
v.push_back(Xapian::Query(buf));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -663,13 +675,13 @@ static void listVector(const string& what, const vector<string>&l)
|
||||||
*/
|
*/
|
||||||
void StringToXapianQ::expandTerm(int mods,
|
void StringToXapianQ::expandTerm(int mods,
|
||||||
const string& term,
|
const string& term,
|
||||||
vector<string>& exp, string &sterm,
|
vector<string>& oexp, string &sterm,
|
||||||
const string& prefix)
|
const string& prefix)
|
||||||
{
|
{
|
||||||
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
||||||
mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));
|
mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));
|
||||||
sterm.clear();
|
sterm.clear();
|
||||||
exp.clear();
|
oexp.clear();
|
||||||
if (term.empty())
|
if (term.empty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
@ -693,145 +705,161 @@ void StringToXapianQ::expandTerm(int mods,
|
||||||
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
||||||
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
||||||
|
|
||||||
// If we are working with a raw index, apply the rules for case and
|
if (o_index_stripchars) {
|
||||||
// diacritics sensitivity.
|
diac_sensitive = case_sensitive = false;
|
||||||
|
} else {
|
||||||
|
// If we are working with a raw index, apply the rules for case and
|
||||||
|
// diacritics sensitivity.
|
||||||
|
|
||||||
// If any character has a diacritic, we become
|
// If any character has a diacritic, we become
|
||||||
// diacritic-sensitive. Note that the way that the test is
|
// diacritic-sensitive. Note that the way that the test is
|
||||||
// performed (conversion+comparison) will automatically ignore
|
// performed (conversion+comparison) will automatically ignore
|
||||||
// accented characters which are actually a separate letter
|
// accented characters which are actually a separate letter
|
||||||
if (unachasaccents(term))
|
if (unachasaccents(term))
|
||||||
diac_sensitive = true;
|
diac_sensitive = true;
|
||||||
|
|
||||||
// If any character apart the first is uppercase, we become case-sensitive.
|
// If any character apart from the first is uppercase, we become
|
||||||
// The first character is reserved for turning off stemming. You need to
|
// case-sensitive. The first character is reserved for
|
||||||
// use a query language modifier to search for Floor in a case-sensitive
|
// turning off stemming. You need to use a query language
|
||||||
// way.
|
// modifier to search for Floor in a case-sensitive way.
|
||||||
Utf8Iter it(term);
|
Utf8Iter it(term);
|
||||||
it++;
|
it++;
|
||||||
if (unachasuppercase(term.substr(it.getBpos())))
|
if (unachasuppercase(term.substr(it.getBpos())))
|
||||||
case_sensitive = true;
|
case_sensitive = true;
|
||||||
|
|
||||||
// If we are sensitive to case or diacritics turn stemming off
|
// If we are sensitive to case or diacritics turn stemming off
|
||||||
if (diac_sensitive || case_sensitive)
|
if (diac_sensitive || case_sensitive)
|
||||||
nostemexp = true;
|
nostemexp = true;
|
||||||
|
|
||||||
if (!case_sensitive || !diac_sensitive)
|
if (!case_sensitive || !diac_sensitive)
|
||||||
noexpansion = false;
|
noexpansion = false;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (noexpansion) {
|
if (noexpansion) {
|
||||||
sterm = term;
|
sterm = term;
|
||||||
exp.push_back(prefix + term);
|
oexp.push_back(prefix + term);
|
||||||
} else {
|
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||||
TermMatchResult res;
|
return;
|
||||||
if (haswild) {
|
}
|
||||||
// Note that if there are wildcards, we do a direct from-index
|
|
||||||
// expansion, which means that we are casediac-sensitive. There
|
|
||||||
// would be nothing to prevent us to expand from the casediac
|
|
||||||
// synonyms first. To be done later
|
|
||||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
|
||||||
m_field);
|
|
||||||
} else {
|
|
||||||
sterm = term;
|
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
|
||||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
|
|
||||||
m_field);
|
|
||||||
#else
|
|
||||||
// No stem expansion when diacritic or case sensitivity is
|
|
||||||
// set, it makes no sense (it would mess with the
|
|
||||||
// diacritics anyway if they are not in the stem part).
|
|
||||||
// In these 3 cases, perform appropriate expansion from
|
|
||||||
// the charstripping db, and do a bogus wildcard expansion
|
|
||||||
// (there is no wild card) to generate the result:
|
|
||||||
if (diac_sensitive && case_sensitive) {
|
|
||||||
// No expansion whatsoever
|
|
||||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
|
||||||
m_field);
|
|
||||||
} else {
|
|
||||||
// Access case and diacritics expansion:
|
|
||||||
vector<string> exp;
|
|
||||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
|
||||||
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa,
|
|
||||||
"all", &unacfoldtrans);
|
|
||||||
|
|
||||||
if (diac_sensitive) {
|
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||||
// Expand for accents and case, filtering for same accents,
|
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all",
|
||||||
// then bogus wildcard expansion for generating result
|
&unacfoldtrans);
|
||||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
vector<string> lexp;
|
||||||
synac.synExpand(term, exp, &foldtrans);
|
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
|
||||||
it != exp.end(); it++) {
|
|
||||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
|
||||||
-1, m_field);
|
|
||||||
}
|
|
||||||
} else if (case_sensitive) {
|
|
||||||
// Expand for accents and case, filtering for same case,
|
|
||||||
// then bogus wildcard expansion for generating result
|
|
||||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
||||||
synac.synExpand(term, exp, &unactrans);
|
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
|
||||||
it != exp.end(); it++) {
|
|
||||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
|
||||||
-1, m_field);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Expand for accents and case, then lowercase
|
|
||||||
// result for input to stemdb.
|
|
||||||
synac.synExpand(term, exp);
|
|
||||||
for (unsigned int i = 0; i < exp.size(); i++) {
|
|
||||||
string lower;
|
|
||||||
unacmaybefold(exp[i], lower, "UTF-8", UNACOP_FOLD);
|
|
||||||
exp[i] = lower;
|
|
||||||
}
|
|
||||||
sort(exp.begin(), exp.end());
|
|
||||||
vector<string>::iterator uit =
|
|
||||||
unique(exp.begin(), exp.end());
|
|
||||||
exp.resize(uit - exp.begin());
|
|
||||||
LOGDEB(("ExpandTerm: after casediac: %s\n",
|
|
||||||
stringsToString(exp).c_str()));
|
|
||||||
|
|
||||||
StemDb db(m_db.m_ndb->xrdb);
|
TermMatchResult res;
|
||||||
vector<string> exp1;
|
if (haswild) {
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
// Note that if there are wildcards, we do a direct from-index
|
||||||
it != exp.end(); it++) {
|
// expansion, which means that we are casediac-sensitive. There
|
||||||
db.stemExpand(m_stemlang, *it, exp1);
|
// would be nothing to prevent us from expanding from the casediac
|
||||||
}
|
// synonyms first. To be done later
|
||||||
LOGDEB(("ExpandTerm: after stem: %s\n",
|
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
||||||
stringsToString(exp1).c_str()));
|
m_field);
|
||||||
|
goto termmatchtoresult;
|
||||||
// Expand the resulting list for case (all stemdb content
|
|
||||||
// is lowercase)
|
|
||||||
exp.clear();
|
|
||||||
for (vector<string>::const_iterator it = exp1.begin();
|
|
||||||
it != exp1.end(); it++) {
|
|
||||||
synac.synExpand(*it, exp);
|
|
||||||
}
|
|
||||||
sort(exp.begin(), exp.end());
|
|
||||||
uit = unique(exp.begin(), exp.end());
|
|
||||||
exp.resize(uit - exp.begin());
|
|
||||||
|
|
||||||
LOGDEB(("ExpandTerm: after case exp of stem: %s\n",
|
|
||||||
stringsToString(exp).c_str()));
|
|
||||||
|
|
||||||
// Bogus wildcard expand to generate the result
|
|
||||||
for (vector<string>::const_iterator it = exp.begin();
|
|
||||||
it != exp.end(); it++) {
|
|
||||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
|
||||||
-1, m_field);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
|
||||||
it != res.entries.end(); it++) {
|
|
||||||
exp.push_back(it->term);
|
|
||||||
}
|
|
||||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(exp).c_str()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sterm = term;
|
||||||
|
|
||||||
|
#ifdef RCL_INDEX_STRIPCHARS
|
||||||
|
|
||||||
|
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
if (o_index_stripchars) {
|
||||||
|
// If the index is stripped, we can only come here if nostemexp is unset
|
||||||
|
// and we just need stem expansion.
|
||||||
|
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||||
|
goto termmatchtoresult;
|
||||||
|
}
|
||||||
|
|
||||||
|
// No stem expansion when diacritic or case sensitivity is set, it
|
||||||
|
// makes no sense (it would mess with the diacritics anyway if
|
||||||
|
// they are not in the stem part). In these 3 cases, perform
|
||||||
|
// appropriate expansion from the charstripping db, and do a bogus
|
||||||
|
// wildcard expansion (there is no wild card) to generate the
|
||||||
|
// result:
|
||||||
|
|
||||||
|
if (diac_sensitive && case_sensitive) {
|
||||||
|
// No expansion whatsoever
|
||||||
|
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
|
||||||
|
goto termmatchtoresult;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (diac_sensitive) {
|
||||||
|
// Expand for accents and case, filtering for same accents,
|
||||||
|
// then bogus wildcard expansion for generating result
|
||||||
|
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||||
|
synac.synExpand(term, lexp, &foldtrans);
|
||||||
|
goto exptotermatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (case_sensitive) {
|
||||||
|
// Expand for accents and case, filtering for same case, then
|
||||||
|
// bogus wildcard expansion for generating result
|
||||||
|
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||||
|
synac.synExpand(term, lexp, &unactrans);
|
||||||
|
goto exptotermatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We are neither accent- nor case- sensitive and may need stem
|
||||||
|
// expansion or not.
|
||||||
|
|
||||||
|
// Expand for accents and case
|
||||||
|
synac.synExpand(term, lexp);
|
||||||
|
LOGDEB(("ExpTerm: casediac: %s\n", stringsToString(lexp).c_str()));
|
||||||
|
if (nostemexp)
|
||||||
|
goto exptotermatch;
|
||||||
|
|
||||||
|
// Need stem expansion. Lowercase the result of accent and case
|
||||||
|
// expansion for input to stemdb.
|
||||||
|
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||||
|
string lower;
|
||||||
|
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||||
|
lexp[i] = lower;
|
||||||
|
}
|
||||||
|
sort(lexp.begin(), lexp.end());
|
||||||
|
{
|
||||||
|
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
|
||||||
|
lexp.resize(uit - lexp.begin());
|
||||||
|
StemDb db(m_db.m_ndb->xrdb);
|
||||||
|
vector<string> exp1;
|
||||||
|
for (vector<string>::const_iterator it = lexp.begin();
|
||||||
|
it != lexp.end(); it++) {
|
||||||
|
db.stemExpand(m_stemlang, *it, exp1);
|
||||||
|
}
|
||||||
|
LOGDEB(("ExpTerm: stem: %s\n", stringsToString(exp1).c_str()));
|
||||||
|
|
||||||
|
// Expand the resulting list for case (all stemdb content
|
||||||
|
// is lowercase)
|
||||||
|
lexp.clear();
|
||||||
|
for (vector<string>::const_iterator it = exp1.begin();
|
||||||
|
it != exp1.end(); it++) {
|
||||||
|
synac.synExpand(*it, lexp);
|
||||||
|
}
|
||||||
|
sort(lexp.begin(), lexp.end());
|
||||||
|
uit = unique(lexp.begin(), lexp.end());
|
||||||
|
lexp.resize(uit - lexp.begin());
|
||||||
|
}
|
||||||
|
LOGDEB(("ExpTerm: case exp of stem: %s\n", stringsToString(lexp).c_str()));
|
||||||
|
|
||||||
|
// Bogus wildcard expand to generate the result
|
||||||
|
exptotermatch:
|
||||||
|
for (vector<string>::const_iterator it = lexp.begin();
|
||||||
|
it != lexp.end(); it++) {
|
||||||
|
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,
|
||||||
|
res, -1, m_field);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Term match entries to vector of terms
|
||||||
|
termmatchtoresult:
|
||||||
|
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
||||||
|
it != res.entries.end(); it++) {
|
||||||
|
oexp.push_back(it->term);
|
||||||
|
}
|
||||||
|
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
|
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
|
||||||
|
@ -1097,9 +1125,11 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||||
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
||||||
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
||||||
//tpcommon.onlygrams(true);
|
//tpcommon.onlygrams(true);
|
||||||
#ifdef RCL_INDEX_STRIPCHARS
|
TermProcPrep tpprep(nxt);
|
||||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
|
if (o_index_stripchars)
|
||||||
#endif
|
#endif
|
||||||
|
nxt = &tpprep;
|
||||||
|
|
||||||
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||||
TextSplit::TXTS_KEEPWILD),
|
TextSplit::TXTS_KEEPWILD),
|
||||||
|
|
|
@ -26,6 +26,8 @@
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <iostream>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include <xapian.h>
|
#include <xapian.h>
|
||||||
|
|
||||||
|
@ -34,18 +36,14 @@
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "synfamily.h"
|
#include "synfamily.h"
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
|
#include "rclconfig.h"
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Expand for one or several languages
|
* Expand for one or several languages
|
||||||
*/
|
*/
|
||||||
bool StemDb::stemExpand(const std::string& langs,
|
bool StemDb::stemExpand(const std::string& langs, const std::string& term,
|
||||||
const std::string& term,
|
|
||||||
vector<string>& result)
|
vector<string>& result)
|
||||||
{
|
{
|
||||||
vector<string> llangs;
|
vector<string> llangs;
|
||||||
|
@ -59,14 +57,17 @@ bool StemDb::stemExpand(const std::string& langs,
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef RCL_INDEX_STRIPCHARS
|
#ifndef RCL_INDEX_STRIPCHARS
|
||||||
for (vector<string>::const_iterator it = llangs.begin();
|
// Expand the unaccented stem
|
||||||
it != llangs.end(); it++) {
|
if (!o_index_stripchars) {
|
||||||
SynTermTransStem stemmer(*it);
|
for (vector<string>::const_iterator it = llangs.begin();
|
||||||
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
|
it != llangs.end(); it++) {
|
||||||
*it, &stemmer);
|
SynTermTransStem stemmer(*it);
|
||||||
string unac;
|
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
|
||||||
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
|
*it, &stemmer);
|
||||||
(void)expander.synExpand(unac, result);
|
string unac;
|
||||||
|
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
|
||||||
|
(void)expander.synExpand(unac, result);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -33,17 +33,12 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "hldata.h"
|
#include "hldata.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
|
||||||
using namespace std;
|
|
||||||
#endif /* NO_NAMESPACES */
|
|
||||||
|
|
||||||
#define MIN(A,B) ((A)<(B)?(A):(B))
|
|
||||||
|
|
||||||
int stringicmp(const string & s1, const string& s2)
|
int stringicmp(const string & s1, const string& s2)
|
||||||
{
|
{
|
||||||
string::const_iterator it1 = s1.begin();
|
string::const_iterator it1 = s1.begin();
|
||||||
|
|
|
@ -224,4 +224,11 @@ public:
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifndef MIN
|
||||||
|
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
|
||||||
|
#endif
|
||||||
|
#ifndef MAX
|
||||||
|
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* _SMALLUT_H_INCLUDED_ */
|
#endif /* _SMALLUT_H_INCLUDED_ */
|
||||||
|
|
|
@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst
|
||||||
daemloglevel = 6
|
daemloglevel = 6
|
||||||
daemlogfilename = /tmp/rclmontrace
|
daemlogfilename = /tmp/rclmontrace
|
||||||
|
|
||||||
|
indexStripChars = 1
|
||||||
|
|
||||||
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
||||||
|
|
||||||
skippedPaths = \
|
skippedPaths = \
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue