ensure that recoll configured with indexStripChars=1 runs as compiled with -DRCL_INDEX_STRIPCHARS
This commit is contained in:
parent
48e9a4f901
commit
e22b347767
17 changed files with 425 additions and 260 deletions
|
@ -23,9 +23,9 @@
|
|||
|
||||
#include <unistd.h>
|
||||
#include <dlfcn.h>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include ASPELL_INCLUDE
|
||||
|
||||
|
@ -33,7 +33,7 @@
|
|||
#include "execmd.h"
|
||||
#include "rclaspell.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
#include "unacpp.h"
|
||||
#include "ptmutex.h"
|
||||
|
||||
// Just a place where we keep the Aspell library entry points together
|
||||
|
@ -260,6 +260,14 @@ public:
|
|||
while (m_db.termWalkNext(m_tit, *m_input)) {
|
||||
if (!Rcl::Db::isSpellingCandidate(*m_input))
|
||||
continue;
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (!o_index_stripchars) {
|
||||
string lower;
|
||||
if (!unacmaybefold(*m_input, lower, "UTF-8", UNACOP_FOLD))
|
||||
continue;
|
||||
m_input->swap(lower);
|
||||
}
|
||||
#endif
|
||||
// Got a non-empty sort-of appropriate term, let's send it to
|
||||
// aspell
|
||||
m_input->append("\n");
|
||||
|
@ -335,17 +343,29 @@ bool Aspell::make_speller(string& reason)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
|
||||
bool Aspell::check(const string &iterm, string& reason)
|
||||
{
|
||||
LOGDEB2(("Aspell::check [%s]\n", term.c_str()));
|
||||
LOGDEB2(("Aspell::check [%s]\n", iterm.c_str()));
|
||||
string mterm(iterm);
|
||||
|
||||
if (!ok() || !make_speller(reason))
|
||||
return false;
|
||||
if (term.empty())
|
||||
if (iterm.empty())
|
||||
return true; //??
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (!o_index_stripchars) {
|
||||
string lower;
|
||||
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
|
||||
LOGERR(("Aspell::check : cant lowercase input\n"));
|
||||
return false;
|
||||
}
|
||||
mterm.swap(lower);
|
||||
}
|
||||
#endif
|
||||
|
||||
int ret = aapi.aspell_speller_check(m_data->m_speller,
|
||||
term.c_str(), term.length());
|
||||
mterm.c_str(), mterm.length());
|
||||
reason.clear();
|
||||
switch (ret) {
|
||||
case 0: return false;
|
||||
|
@ -358,19 +378,31 @@ bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
|
|||
}
|
||||
}
|
||||
|
||||
bool Aspell::suggest(Rcl::Db &db, const string &term,
|
||||
bool Aspell::suggest(Rcl::Db &db, const string &_term,
|
||||
list<string>& suggestions, string& reason)
|
||||
{
|
||||
if (!ok() || !make_speller(reason))
|
||||
return false;
|
||||
if (term.empty())
|
||||
string mterm(_term);
|
||||
if (mterm.empty())
|
||||
return true; //??
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (!o_index_stripchars) {
|
||||
string lower;
|
||||
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
|
||||
LOGERR(("Aspell::check : cant lowercase input\n"));
|
||||
return false;
|
||||
}
|
||||
mterm.swap(lower);
|
||||
}
|
||||
#endif
|
||||
|
||||
AspellCanHaveError *ret;
|
||||
|
||||
const AspellWordList *wl =
|
||||
aapi.aspell_speller_suggest(m_data->m_speller,
|
||||
term.c_str(), term.length());
|
||||
mterm.c_str(), mterm.length());
|
||||
if (wl == 0) {
|
||||
reason = aapi.aspell_speller_error_message(m_data->m_speller);
|
||||
return false;
|
||||
|
@ -385,7 +417,7 @@ bool Aspell::suggest(Rcl::Db &db, const string &term,
|
|||
// ******** This should depend if
|
||||
// stemming is turned on or not for querying *******
|
||||
string sw(word);
|
||||
if (db.termExists(sw) && db.stemDiffers("english", sw, term))
|
||||
if (db.termExists(sw) && db.stemDiffers("english", sw, mterm))
|
||||
suggestions.push_back(word);
|
||||
}
|
||||
aapi.delete_aspell_string_enumeration(els);
|
||||
|
@ -418,7 +450,6 @@ using namespace std;
|
|||
|
||||
static char *thisprog;
|
||||
RclConfig *rclconfig;
|
||||
Rcl::Db rcldb;
|
||||
|
||||
static char usage [] =
|
||||
" -b : build dictionary\n"
|
||||
|
@ -477,7 +508,9 @@ int main(int argc, char **argv)
|
|||
exit(1);
|
||||
}
|
||||
|
||||
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
|
||||
Rcl::Db rcldb(rclconfig);
|
||||
|
||||
if (!rcldb.open(Rcl::Db::DbRO, 0)) {
|
||||
fprintf(stderr, "Could not open database in %s\n", dbdir.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
|
|
@ -37,11 +37,6 @@
|
|||
#include "rclconfig.h"
|
||||
#include "rcldb.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::list;
|
||||
#endif // NO_NAMESPACES
|
||||
|
||||
class AspellData;
|
||||
|
||||
class Aspell {
|
||||
|
@ -53,26 +48,26 @@ class Aspell {
|
|||
bool ok() const;
|
||||
|
||||
/** Find the aspell command and shared library, init function pointers */
|
||||
bool init(string &reason);
|
||||
bool init(std::string &reason);
|
||||
|
||||
/** Build dictionary out of index term list. This is done at the end
|
||||
* of an indexing pass. */
|
||||
bool buildDict(Rcl::Db &db, string &reason);
|
||||
bool buildDict(Rcl::Db &db, std::string &reason);
|
||||
|
||||
/** Check that word is in dictionary. ret==false && !reason.empty() => err*/
|
||||
bool check(Rcl::Db &db, const string& term, string& reason);
|
||||
bool check(const std::string& term, std::string& reason);
|
||||
|
||||
/** Return a list of possible expansions for a given word */
|
||||
bool suggest(Rcl::Db &db, const string& term, list<string> &suggestions,
|
||||
string &reason);
|
||||
bool suggest(Rcl::Db &db, const std::string& term,
|
||||
std::list<std::string> &suggestions, std::string &reason);
|
||||
|
||||
private:
|
||||
string dicPath();
|
||||
std::string dicPath();
|
||||
RclConfig *m_config;
|
||||
string m_lang;
|
||||
std::string m_lang;
|
||||
AspellData *m_data;
|
||||
|
||||
bool make_speller(string& reason);
|
||||
bool make_speller(std::string& reason);
|
||||
};
|
||||
|
||||
#endif /* RCL_USE_ASPELL */
|
||||
|
|
|
@ -15,6 +15,8 @@
|
|||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef TEST_RCLCONFIG
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
|
@ -34,6 +36,7 @@
|
|||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
using namespace std;
|
||||
|
||||
#include "cstr.h"
|
||||
#include "pathut.h"
|
||||
|
@ -45,15 +48,8 @@
|
|||
#include "readfile.h"
|
||||
#include "fstreewalk.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
|
||||
#endif
|
||||
#ifndef MAX
|
||||
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
bool o_index_stripchars;
|
||||
#endif
|
||||
|
||||
bool ParamStale::needrecompute()
|
||||
|
@ -77,6 +73,7 @@ bool ParamStale::needrecompute()
|
|||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void ParamStale::init(RclConfig *rconf, ConfNull *cnf, const string& nm)
|
||||
{
|
||||
parent = rconf;
|
||||
|
@ -239,6 +236,14 @@ bool RclConfig::updateMainConfig()
|
|||
FsTreeWalker::setNoFnmPathname();
|
||||
}
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
static int m_index_stripchars_init = 0;
|
||||
if (!m_index_stripchars_init) {
|
||||
getConfParam("indexStripChars", &o_index_stripchars);
|
||||
m_index_stripchars_init = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -303,5 +303,13 @@ class RclConfig {
|
|||
bool readFieldsConfig(const string& errloc);
|
||||
};
|
||||
|
||||
|
||||
// This global variable defines if we are running with an index
|
||||
// stripped of accents and case or a raw one. Ideally, it should be
|
||||
// constant, but it needs to be initialized from the configuration, so
|
||||
// there is no way to do this. It never changes after initialization
|
||||
// of course. When set, it is supposed to get all of recoll to behave as
|
||||
// if it was compiled with RCL_INDEX_STRIPCHARS
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
extern bool o_index_stripchars;
|
||||
#endif
|
||||
#endif /* _RCLCONFIG_H_INCLUDED_ */
|
||||
|
|
|
@ -197,10 +197,14 @@ void QtGuiResListPager::suggest(const vector<string>uterms,
|
|||
// If the term is in the index, we don't suggest alternatives.
|
||||
// Actually, we may want to check the frequencies and propose something
|
||||
// anyway if a possible variation is much more common (as google does)
|
||||
if (aspell->check(*rcldb, *uit, reason))
|
||||
#warning need to take case and diacs sensibility into account somehow
|
||||
// Maybe use the xapian index instead ? How to retrieve the sensitivity flags ?
|
||||
if (0) {
|
||||
if (aspell->check(*uit, reason))
|
||||
continue;
|
||||
else if (!reason.empty())
|
||||
return;
|
||||
}
|
||||
if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) {
|
||||
LOGERR(("QtGuiResListPager::suggest: aspell failed: %s\n",
|
||||
reason.c_str()));
|
||||
|
@ -336,6 +340,7 @@ ResList::~ResList()
|
|||
QT_TR_NOOP("Open"),
|
||||
QT_TR_NOOP("(show query)"),
|
||||
QT_TR_NOOP("<p><i>Alternate spellings (accents suppressed): </i>"),
|
||||
QT_TR_NOOP("<p><i>Alternate spellings: </i>"),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -79,22 +79,30 @@ class TextSplitPTR : public TextSplit {
|
|||
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
||||
vit != hdata.groups.end(); vit++) {
|
||||
if (vit->size() == 1) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars) {
|
||||
#endif
|
||||
m_terms[vit->front()] = vit - hdata.groups.begin();
|
||||
#else
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
} else {
|
||||
string dumb = vit->front();
|
||||
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||
m_terms[dumb] = vit - hdata.groups.begin();
|
||||
}
|
||||
#endif
|
||||
} else if (vit->size() > 1) {
|
||||
for (vector<string>::const_iterator it = vit->begin();
|
||||
it != vit->end(); it++) {
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars) {
|
||||
#endif
|
||||
m_gterms.insert(*it);
|
||||
#else
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
} else {
|
||||
string dumb = *it;
|
||||
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
|
||||
m_gterms.insert(dumb);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
|
|
@ -320,9 +320,16 @@ void ResListPager::displayPage(RclConfig *config)
|
|||
map<string, vector<string> > spellings;
|
||||
suggest(uterms, spellings);
|
||||
if (!spellings.empty()) {
|
||||
if (o_index_stripchars) {
|
||||
chunk <<
|
||||
trans("<p><i>Alternate spellings (accents suppressed): </i>")
|
||||
<< "<br /><blockquote>";
|
||||
} else {
|
||||
chunk <<
|
||||
trans("<p><i>Alternate spellings: </i>")
|
||||
<< "<br /><blockquote>";
|
||||
|
||||
}
|
||||
|
||||
for (map<string, vector<string> >::const_iterator it0 =
|
||||
spellings.begin(); it0 != spellings.end(); it0++) {
|
||||
|
|
|
@ -116,12 +116,20 @@ static void sigcleanup(int sig)
|
|||
exit(1);
|
||||
}
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
bool o_index_stripchars;
|
||||
#endif
|
||||
|
||||
inline bool has_prefix(const string& trm)
|
||||
{
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars) {
|
||||
#endif
|
||||
return trm.size() && 'A' <= trm[0] && trm[0] <= 'Z';
|
||||
#else
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
} else {
|
||||
return trm.size() > 0 && trm[0] == ':';
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -201,10 +209,22 @@ int main(int argc, char **argv)
|
|||
|
||||
try {
|
||||
db = new Xapian::Database(dbdir);
|
||||
|
||||
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
|
||||
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
// If we have terms with a leading ':' it's a new style,
|
||||
// unstripped index
|
||||
{
|
||||
Xapian::TermIterator term = db->allterms_begin(":");
|
||||
if (term == db->allterms_end())
|
||||
o_index_stripchars = true;
|
||||
else
|
||||
o_index_stripchars = false;
|
||||
cout<<"DB: terms are "<<(o_index_stripchars?"stripped":"raw")<<endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (op_flags & OPT_T) {
|
||||
Xapian::TermIterator term;
|
||||
string printable;
|
||||
|
|
|
@ -63,16 +63,18 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||
// Unaccented stem dbs
|
||||
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
||||
// We can reuse the same stemmer pointers, the objects are stateless.
|
||||
if (!o_index_stripchars) {
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
unacstemdbs.push_back(
|
||||
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
|
||||
stemmers.back().getptr()));
|
||||
unacstemdbs.back().recreate();
|
||||
}
|
||||
|
||||
}
|
||||
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
||||
XapWritableComputableSynFamMember
|
||||
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
||||
if (!o_index_stripchars)
|
||||
diacasedb.recreate();
|
||||
#endif
|
||||
|
||||
|
@ -109,8 +111,10 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||
// is the input to the stem db, and add a synonym from the
|
||||
// stripped term to the cased and accented one, for accent
|
||||
// and case expansion at query time
|
||||
if (!o_index_stripchars) {
|
||||
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
||||
diacasedb.addSynonym(*it);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Create stemming synonym for every language. The input is the
|
||||
|
@ -124,12 +128,15 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|||
// the unaccented term. While this may be incorrect, it is
|
||||
// also necessary for searching in a diacritic-insensitive
|
||||
// way on a raw index
|
||||
if (!o_index_stripchars) {
|
||||
string unac;
|
||||
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
|
||||
if (unac != lower)
|
||||
if (unac != lower) {
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
unacstemdbs[i].addSynonym(unac);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
|
|
|
@ -24,10 +24,13 @@
|
|||
|
||||
namespace Rcl {
|
||||
|
||||
/* A Capitals/Diacritics removal functor for using with
|
||||
XapComputableSynFamMember */
|
||||
/** A Capitals/Diacritics removal functor for using with
|
||||
* XapComputableSynFamMember */
|
||||
class SynTermTransUnac : public SynTermTrans {
|
||||
public:
|
||||
/** Constructor
|
||||
* @param op defines if we remove diacritics, case or both
|
||||
*/
|
||||
SynTermTransUnac(UnacOp op)
|
||||
: m_op(op)
|
||||
{
|
||||
|
@ -43,7 +46,9 @@ public:
|
|||
UnacOp m_op;
|
||||
};
|
||||
|
||||
/** Walk the Xapian term list and create all the expansion dbs in one go */
|
||||
/** Walk the Xapian term list and create all the expansion dbs in one go.
|
||||
*
|
||||
*/
|
||||
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
const std::vector<std::string>& langs);
|
||||
}
|
||||
|
|
|
@ -92,10 +92,11 @@ const string start_of_field_term = "XXST";
|
|||
const string end_of_field_term = "XXND";
|
||||
static const string page_break_term = "XXPG";
|
||||
#else
|
||||
const string start_of_field_term = "XXST/";
|
||||
const string end_of_field_term = "XXND/";
|
||||
static const string page_break_term = "XXPG/";
|
||||
string start_of_field_term;
|
||||
string end_of_field_term;
|
||||
const string page_break_term = "XXPG/";
|
||||
#endif
|
||||
|
||||
// Field name for the unsplit file name. Has to exist in the field file
|
||||
// because of usage in termmatch()
|
||||
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
||||
|
@ -683,6 +684,18 @@ Db::Db(RclConfig *cfp)
|
|||
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
||||
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
||||
{
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (start_of_field_term.empty()) {
|
||||
if (o_index_stripchars) {
|
||||
start_of_field_term = "XXST";
|
||||
end_of_field_term = "XXND";
|
||||
} else {
|
||||
start_of_field_term = "XXST/";
|
||||
end_of_field_term = "XXND/";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
m_ndb = new Native(this);
|
||||
if (m_config) {
|
||||
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
||||
|
@ -886,12 +899,13 @@ int Db::termDocCnt(const string& _term)
|
|||
return -1;
|
||||
|
||||
string term = _term;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars)
|
||||
#endif
|
||||
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (m_stops.isStop(term)) {
|
||||
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
||||
|
@ -1151,13 +1165,17 @@ string Db::getSpellingSuggestion(const string& word)
|
|||
{
|
||||
if (m_ndb == 0)
|
||||
return string();
|
||||
|
||||
string term = word;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars)
|
||||
#endif
|
||||
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
||||
return string();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!isSpellingCandidate(term))
|
||||
return string();
|
||||
return m_ndb->xrdb.get_spelling_suggestion(term);
|
||||
|
@ -1266,9 +1284,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||
TermProc *nxt = &tpidx;
|
||||
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
||||
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
||||
|
||||
TermProcPrep tpprep(nxt);
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars)
|
||||
#endif
|
||||
nxt = &tpprep;
|
||||
|
||||
TextSplitDb splitter(newdocument, nxt);
|
||||
tpidx.setTSD(&splitter);
|
||||
|
@ -1951,12 +1972,15 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
// Get rid of capitals and accents
|
||||
|
||||
string droot = root;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars)
|
||||
#endif
|
||||
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
||||
|
||||
string prefix;
|
||||
|
|
|
@ -129,18 +129,27 @@ extern void *DbUpdWorker(void*);
|
|||
|
||||
inline bool has_prefix(const string& trm)
|
||||
{
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars) {
|
||||
#endif
|
||||
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
|
||||
#else
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
} else {
|
||||
return !trm.empty() && trm[0] == ':';
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
inline string wrap_prefix(const string& pfx)
|
||||
{
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars) {
|
||||
#endif
|
||||
return pfx;
|
||||
#else
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
} else {
|
||||
return cstr_colon + pfx + cstr_colon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -384,9 +393,13 @@ private:
|
|||
string version_string();
|
||||
|
||||
extern const string pathelt_prefix;
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
extern const string start_of_field_term;
|
||||
extern const string end_of_field_term;
|
||||
|
||||
#else
|
||||
extern string start_of_field_term;
|
||||
extern string end_of_field_term;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* _DB_H_INCLUDED_ */
|
||||
|
|
|
@ -79,10 +79,22 @@ static const int original_term_wqf_booster = 10;
|
|||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
#define bufprefix(BUF, L) {(BUF)[0] = L;}
|
||||
#define bpoffs 1
|
||||
#define bpoffs() 1
|
||||
#else
|
||||
#define bufprefix(BUF, L) {(BUF)[0] = ':'; (BUF)[1] = L; (BUF)[2] = ':';}
|
||||
#define bpoffs 3
|
||||
static inline void bufprefix(char *buf, char c)
|
||||
{
|
||||
if (o_index_stripchars) {
|
||||
buf[0] = c;
|
||||
} else {
|
||||
buf[0] = ':';
|
||||
buf[1] = c;
|
||||
buf[2] = ':';
|
||||
}
|
||||
}
|
||||
static inline int bpoffs()
|
||||
{
|
||||
return o_index_stripchars ? 1 : 3;
|
||||
}
|
||||
#endif
|
||||
|
||||
static Xapian::Query
|
||||
|
@ -92,7 +104,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
|||
// only doing %d's !
|
||||
char buf[200];
|
||||
bufprefix(buf, 'D');
|
||||
sprintf(buf+bpoffs, "%04d%02d", y1, m1);
|
||||
sprintf(buf+bpoffs(), "%04d%02d", y1, m1);
|
||||
vector<Xapian::Query> v;
|
||||
|
||||
int d_last = monthdays(m1, y1);
|
||||
|
@ -103,7 +115,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
|||
// Deal with any initial partial month
|
||||
if (d1 > 1 || d_end < d_last) {
|
||||
for ( ; d1 <= d_end ; d1++) {
|
||||
sprintf(buf + 6 + bpoffs, "%02d", d1);
|
||||
sprintf(buf + 6 + bpoffs(), "%02d", d1);
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
} else {
|
||||
|
@ -117,32 +129,32 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
|
|||
|
||||
int m_last = (y1 < y2) ? 12 : m2 - 1;
|
||||
while (++m1 <= m_last) {
|
||||
sprintf(buf + 4 + bpoffs, "%02d", m1);
|
||||
sprintf(buf + 4 + bpoffs(), "%02d", m1);
|
||||
bufprefix(buf, 'M');
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
|
||||
if (y1 < y2) {
|
||||
while (++y1 < y2) {
|
||||
sprintf(buf + bpoffs, "%04d", y1);
|
||||
sprintf(buf + bpoffs(), "%04d", y1);
|
||||
bufprefix(buf, 'Y');
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
sprintf(buf + bpoffs, "%04d", y2);
|
||||
sprintf(buf + bpoffs(), "%04d", y2);
|
||||
bufprefix(buf, 'M');
|
||||
for (m1 = 1; m1 < m2; m1++) {
|
||||
sprintf(buf + 4 + bpoffs, "%02d", m1);
|
||||
sprintf(buf + 4 + bpoffs(), "%02d", m1);
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
}
|
||||
|
||||
sprintf(buf + 2 + bpoffs, "%02d", m2);
|
||||
sprintf(buf + 2 + bpoffs(), "%02d", m2);
|
||||
|
||||
// Deal with any final partial month
|
||||
if (d2 < monthdays(m2, y2)) {
|
||||
bufprefix(buf, 'D');
|
||||
for (d1 = 1 ; d1 <= d2; d1++) {
|
||||
sprintf(buf + 6 + bpoffs, "%02d", d1);
|
||||
sprintf(buf + 6 + bpoffs(), "%02d", d1);
|
||||
v.push_back(Xapian::Query(buf));
|
||||
}
|
||||
} else {
|
||||
|
@ -663,13 +675,13 @@ static void listVector(const string& what, const vector<string>&l)
|
|||
*/
|
||||
void StringToXapianQ::expandTerm(int mods,
|
||||
const string& term,
|
||||
vector<string>& exp, string &sterm,
|
||||
vector<string>& oexp, string &sterm,
|
||||
const string& prefix)
|
||||
{
|
||||
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
||||
mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));
|
||||
sterm.clear();
|
||||
exp.clear();
|
||||
oexp.clear();
|
||||
if (term.empty())
|
||||
return;
|
||||
|
||||
|
@ -693,6 +705,9 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
||||
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
||||
|
||||
if (o_index_stripchars) {
|
||||
diac_sensitive = case_sensitive = false;
|
||||
} else {
|
||||
// If we are working with a raw index, apply the rules for case and
|
||||
// diacritics sensitivity.
|
||||
|
||||
|
@ -703,10 +718,10 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
if (unachasaccents(term))
|
||||
diac_sensitive = true;
|
||||
|
||||
// If any character apart the first is uppercase, we become case-sensitive.
|
||||
// The first character is reserved for turning off stemming. You need to
|
||||
// use a query language modifier to search for Floor in a case-sensitive
|
||||
// way.
|
||||
// If any character apart from the first is uppercase, we become
|
||||
// case-sensitive. The first character is reserved for
|
||||
// turning off stemming. You need to use a query language
|
||||
// modifier to search for Floor in a case-sensitive way.
|
||||
Utf8Iter it(term);
|
||||
it++;
|
||||
if (unachasuppercase(term.substr(it.getBpos())))
|
||||
|
@ -718,12 +733,21 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
|
||||
if (!case_sensitive || !diac_sensitive)
|
||||
noexpansion = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (noexpansion) {
|
||||
sterm = term;
|
||||
exp.push_back(prefix + term);
|
||||
} else {
|
||||
oexp.push_back(prefix + term);
|
||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||
return;
|
||||
}
|
||||
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all",
|
||||
&unacfoldtrans);
|
||||
vector<string> lexp;
|
||||
|
||||
TermMatchResult res;
|
||||
if (haswild) {
|
||||
// Note that if there are wildcards, we do a direct from-index
|
||||
|
@ -732,106 +756,110 @@ void StringToXapianQ::expandTerm(int mods,
|
|||
// synonyms first. To be done later
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
||||
m_field);
|
||||
} else {
|
||||
goto termmatchtoresult;
|
||||
}
|
||||
|
||||
sterm = term;
|
||||
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
|
||||
m_field);
|
||||
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||
|
||||
#else
|
||||
// No stem expansion when diacritic or case sensitivity is
|
||||
// set, it makes no sense (it would mess with the
|
||||
// diacritics anyway if they are not in the stem part).
|
||||
// In these 3 cases, perform appropriate expansion from
|
||||
// the charstripping db, and do a bogus wildcard expansion
|
||||
// (there is no wild card) to generate the result:
|
||||
|
||||
if (o_index_stripchars) {
|
||||
// If the index is stripped, we can only come here if nostemexp is unset
|
||||
// and we just need stem expansion.
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
|
||||
goto termmatchtoresult;
|
||||
}
|
||||
|
||||
// No stem expansion when diacritic or case sensitivity is set, it
|
||||
// makes no sense (it would mess with the diacritics anyway if
|
||||
// they are not in the stem part). In these 3 cases, perform
|
||||
// appropriate expansion from the charstripping db, and do a bogus
|
||||
// wildcard expansion (there is no wild card) to generate the
|
||||
// result:
|
||||
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No expansion whatsoever
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
|
||||
m_field);
|
||||
} else {
|
||||
// Access case and diacritics expansion:
|
||||
vector<string> exp;
|
||||
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
||||
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa,
|
||||
"all", &unacfoldtrans);
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
|
||||
goto termmatchtoresult;
|
||||
}
|
||||
|
||||
if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
// then bogus wildcard expansion for generating result
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
synac.synExpand(term, exp, &foldtrans);
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
||||
-1, m_field);
|
||||
synac.synExpand(term, lexp, &foldtrans);
|
||||
goto exptotermatch;
|
||||
}
|
||||
} else if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case,
|
||||
// then bogus wildcard expansion for generating result
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, exp, &unactrans);
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
||||
-1, m_field);
|
||||
}
|
||||
} else {
|
||||
// Expand for accents and case, then lowercase
|
||||
// result for input to stemdb.
|
||||
synac.synExpand(term, exp);
|
||||
for (unsigned int i = 0; i < exp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(exp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
exp[i] = lower;
|
||||
}
|
||||
sort(exp.begin(), exp.end());
|
||||
vector<string>::iterator uit =
|
||||
unique(exp.begin(), exp.end());
|
||||
exp.resize(uit - exp.begin());
|
||||
LOGDEB(("ExpandTerm: after casediac: %s\n",
|
||||
stringsToString(exp).c_str()));
|
||||
|
||||
if (case_sensitive) {
|
||||
// Expand for accents and case, filtering for same case, then
|
||||
// bogus wildcard expansion for generating result
|
||||
SynTermTransUnac unactrans(UNACOP_UNAC);
|
||||
synac.synExpand(term, lexp, &unactrans);
|
||||
goto exptotermatch;
|
||||
}
|
||||
|
||||
// We are neither accent- nor case- sensitive and may need stem
|
||||
// expansion or not.
|
||||
|
||||
// Expand for accents and case
|
||||
synac.synExpand(term, lexp);
|
||||
LOGDEB(("ExpTerm: casediac: %s\n", stringsToString(lexp).c_str()));
|
||||
if (nostemexp)
|
||||
goto exptotermatch;
|
||||
|
||||
// Need stem expansion. Lowercase the result of accent and case
|
||||
// expansion for input to stemdb.
|
||||
for (unsigned int i = 0; i < lexp.size(); i++) {
|
||||
string lower;
|
||||
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
||||
lexp[i] = lower;
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
{
|
||||
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
|
||||
lexp.resize(uit - lexp.begin());
|
||||
StemDb db(m_db.m_ndb->xrdb);
|
||||
vector<string> exp1;
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
db.stemExpand(m_stemlang, *it, exp1);
|
||||
}
|
||||
LOGDEB(("ExpandTerm: after stem: %s\n",
|
||||
stringsToString(exp1).c_str()));
|
||||
LOGDEB(("ExpTerm: stem: %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
exp.clear();
|
||||
lexp.clear();
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
synac.synExpand(*it, exp);
|
||||
synac.synExpand(*it, lexp);
|
||||
}
|
||||
sort(exp.begin(), exp.end());
|
||||
uit = unique(exp.begin(), exp.end());
|
||||
exp.resize(uit - exp.begin());
|
||||
|
||||
LOGDEB(("ExpandTerm: after case exp of stem: %s\n",
|
||||
stringsToString(exp).c_str()));
|
||||
sort(lexp.begin(), lexp.end());
|
||||
uit = unique(lexp.begin(), lexp.end());
|
||||
lexp.resize(uit - lexp.begin());
|
||||
}
|
||||
LOGDEB(("ExpTerm: case exp of stem: %s\n", stringsToString(lexp).c_str()));
|
||||
|
||||
// Bogus wildcard expand to generate the result
|
||||
for (vector<string>::const_iterator it = exp.begin();
|
||||
it != exp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
|
||||
-1, m_field);
|
||||
}
|
||||
|
||||
}
|
||||
exptotermatch:
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,
|
||||
res, -1, m_field);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Term match entries to vector of terms
|
||||
termmatchtoresult:
|
||||
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
exp.push_back(it->term);
|
||||
}
|
||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(exp).c_str()));
|
||||
oexp.push_back(it->term);
|
||||
}
|
||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||
}
|
||||
|
||||
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
|
||||
|
@ -1097,9 +1125,11 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
||||
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
||||
//tpcommon.onlygrams(true);
|
||||
#ifdef RCL_INDEX_STRIPCHARS
|
||||
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
||||
TermProcPrep tpprep(nxt);
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
if (o_index_stripchars)
|
||||
#endif
|
||||
nxt = &tpprep;
|
||||
|
||||
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
|
|
|
@ -26,6 +26,8 @@
|
|||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
#include <xapian.h>
|
||||
|
||||
|
@ -34,18 +36,14 @@
|
|||
#include "smallut.h"
|
||||
#include "synfamily.h"
|
||||
#include "unacpp.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
#include "rclconfig.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
/**
|
||||
* Expand for one or several languages
|
||||
*/
|
||||
bool StemDb::stemExpand(const std::string& langs,
|
||||
const std::string& term,
|
||||
bool StemDb::stemExpand(const std::string& langs, const std::string& term,
|
||||
vector<string>& result)
|
||||
{
|
||||
vector<string> llangs;
|
||||
|
@ -59,6 +57,8 @@ bool StemDb::stemExpand(const std::string& langs,
|
|||
}
|
||||
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
// Expand the unaccented stem
|
||||
if (!o_index_stripchars) {
|
||||
for (vector<string>::const_iterator it = llangs.begin();
|
||||
it != llangs.end(); it++) {
|
||||
SynTermTransStem stemmer(*it);
|
||||
|
@ -68,6 +68,7 @@ bool StemDb::stemExpand(const std::string& langs,
|
|||
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
|
||||
(void)expander.synExpand(unac, result);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (result.empty())
|
||||
|
|
|
@ -33,17 +33,12 @@
|
|||
#include <string>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
using namespace std;
|
||||
|
||||
#include "smallut.h"
|
||||
#include "utf8iter.h"
|
||||
#include "hldata.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
#define MIN(A,B) ((A)<(B)?(A):(B))
|
||||
|
||||
int stringicmp(const string & s1, const string& s2)
|
||||
{
|
||||
string::const_iterator it1 = s1.begin();
|
||||
|
|
|
@ -224,4 +224,11 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
|
||||
#endif
|
||||
#ifndef MAX
|
||||
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
|
||||
#endif
|
||||
|
||||
#endif /* _SMALLUT_H_INCLUDED_ */
|
||||
|
|
|
@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst
|
|||
daemloglevel = 6
|
||||
daemlogfilename = /tmp/rclmontrace
|
||||
|
||||
indexStripChars = 1
|
||||
|
||||
topdirs = /home/dockes/projets/fulltext/testrecoll/
|
||||
|
||||
skippedPaths = \
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue