Ensure that recoll configured with indexStripChars=1 runs as if compiled with -DRCL_INDEX_STRIPCHARS

This commit is contained in:
Jean-Francois Dockes 2012-09-15 15:16:20 +02:00
parent 48e9a4f901
commit e22b347767
17 changed files with 425 additions and 260 deletions

View file

@ -23,9 +23,9 @@
#include <unistd.h>
#include <dlfcn.h>
#include <iostream>
#include <stdlib.h>
#include <vector>
using namespace std;
#include ASPELL_INCLUDE
@ -33,7 +33,7 @@
#include "execmd.h"
#include "rclaspell.h"
#include "debuglog.h"
#include "unacpp.h"
#include "ptmutex.h"
// Just a place where we keep the Aspell library entry points together
@ -260,6 +260,14 @@ public:
while (m_db.termWalkNext(m_tit, *m_input)) {
if (!Rcl::Db::isSpellingCandidate(*m_input))
continue;
#ifndef RCL_INDEX_STRIPCHARS
if (!o_index_stripchars) {
string lower;
if (!unacmaybefold(*m_input, lower, "UTF-8", UNACOP_FOLD))
continue;
m_input->swap(lower);
}
#endif
// Got a non-empty sort-of appropriate term, let's send it to
// aspell
m_input->append("\n");
@ -335,17 +343,29 @@ bool Aspell::make_speller(string& reason)
return true;
}
bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
bool Aspell::check(const string &iterm, string& reason)
{
LOGDEB2(("Aspell::check [%s]\n", term.c_str()));
LOGDEB2(("Aspell::check [%s]\n", iterm.c_str()));
string mterm(iterm);
if (!ok() || !make_speller(reason))
return false;
if (term.empty())
if (iterm.empty())
return true; //??
#ifndef RCL_INDEX_STRIPCHARS
if (!o_index_stripchars) {
string lower;
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
LOGERR(("Aspell::check : cant lowercase input\n"));
return false;
}
mterm.swap(lower);
}
#endif
int ret = aapi.aspell_speller_check(m_data->m_speller,
term.c_str(), term.length());
mterm.c_str(), mterm.length());
reason.clear();
switch (ret) {
case 0: return false;
@ -358,19 +378,31 @@ bool Aspell::check(Rcl::Db &db, const string &term, string& reason)
}
}
bool Aspell::suggest(Rcl::Db &db, const string &term,
bool Aspell::suggest(Rcl::Db &db, const string &_term,
list<string>& suggestions, string& reason)
{
if (!ok() || !make_speller(reason))
return false;
if (term.empty())
string mterm(_term);
if (mterm.empty())
return true; //??
#ifndef RCL_INDEX_STRIPCHARS
if (!o_index_stripchars) {
string lower;
if (!unacmaybefold(mterm, lower, "UTF-8", UNACOP_FOLD)) {
LOGERR(("Aspell::check : cant lowercase input\n"));
return false;
}
mterm.swap(lower);
}
#endif
AspellCanHaveError *ret;
const AspellWordList *wl =
aapi.aspell_speller_suggest(m_data->m_speller,
term.c_str(), term.length());
mterm.c_str(), mterm.length());
if (wl == 0) {
reason = aapi.aspell_speller_error_message(m_data->m_speller);
return false;
@ -385,7 +417,7 @@ bool Aspell::suggest(Rcl::Db &db, const string &term,
// ******** This should depend if
// stemming is turned on or not for querying *******
string sw(word);
if (db.termExists(sw) && db.stemDiffers("english", sw, term))
if (db.termExists(sw) && db.stemDiffers("english", sw, mterm))
suggestions.push_back(word);
}
aapi.delete_aspell_string_enumeration(els);
@ -418,7 +450,6 @@ using namespace std;
static char *thisprog;
RclConfig *rclconfig;
Rcl::Db rcldb;
static char usage [] =
" -b : build dictionary\n"
@ -477,7 +508,9 @@ int main(int argc, char **argv)
exit(1);
}
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
Rcl::Db rcldb(rclconfig);
if (!rcldb.open(Rcl::Db::DbRO, 0)) {
fprintf(stderr, "Could not open database in %s\n", dbdir.c_str());
exit(1);
}

View file

@ -37,11 +37,6 @@
#include "rclconfig.h"
#include "rcldb.h"
#ifndef NO_NAMESPACES
using std::string;
using std::list;
#endif // NO_NAMESPACES
class AspellData;
class Aspell {
@ -53,26 +48,26 @@ class Aspell {
bool ok() const;
/** Find the aspell command and shared library, init function pointers */
bool init(string &reason);
bool init(std::string &reason);
/** Build dictionary out of index term list. This is done at the end
* of an indexing pass. */
bool buildDict(Rcl::Db &db, string &reason);
bool buildDict(Rcl::Db &db, std::string &reason);
/** Check that word is in dictionary. ret==false && !reason.empty() => err*/
bool check(Rcl::Db &db, const string& term, string& reason);
bool check(const std::string& term, std::string& reason);
/** Return a list of possible expansions for a given word */
bool suggest(Rcl::Db &db, const string& term, list<string> &suggestions,
string &reason);
bool suggest(Rcl::Db &db, const std::string& term,
std::list<std::string> &suggestions, std::string &reason);
private:
string dicPath();
std::string dicPath();
RclConfig *m_config;
string m_lang;
std::string m_lang;
AspellData *m_data;
bool make_speller(string& reason);
bool make_speller(std::string& reason);
};
#endif /* RCL_USE_ASPELL */

View file

@ -15,6 +15,8 @@
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef TEST_RCLCONFIG
#include "autoconfig.h"
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
@ -34,6 +36,7 @@
#include <iostream>
#include <cstdlib>
#include <cstring>
using namespace std;
#include "cstr.h"
#include "pathut.h"
@ -45,15 +48,8 @@
#include "readfile.h"
#include "fstreewalk.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */
#ifndef MIN
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
#endif
#ifndef MAX
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
#ifndef RCL_INDEX_STRIPCHARS
bool o_index_stripchars;
#endif
bool ParamStale::needrecompute()
@ -77,6 +73,7 @@ bool ParamStale::needrecompute()
}
return false;
}
void ParamStale::init(RclConfig *rconf, ConfNull *cnf, const string& nm)
{
parent = rconf;
@ -239,6 +236,14 @@ bool RclConfig::updateMainConfig()
FsTreeWalker::setNoFnmPathname();
}
#ifndef RCL_INDEX_STRIPCHARS
static int m_index_stripchars_init = 0;
if (!m_index_stripchars_init) {
getConfParam("indexStripChars", &o_index_stripchars);
m_index_stripchars_init = 1;
}
#endif
return true;
}

View file

@ -303,5 +303,13 @@ class RclConfig {
bool readFieldsConfig(const string& errloc);
};
// This global variable defines whether we are running with an index
// stripped of accents and case or with a raw one. Ideally, it should be
// constant, but it needs to be initialized from the configuration, so
// there is no way to do this. It never changes after initialization,
// of course. When set, it is supposed to make all of recoll behave as
// if it was compiled with RCL_INDEX_STRIPCHARS.
#ifndef RCL_INDEX_STRIPCHARS
extern bool o_index_stripchars;
#endif
#endif /* _RCLCONFIG_H_INCLUDED_ */

View file

@ -197,10 +197,14 @@ void QtGuiResListPager::suggest(const vector<string>uterms,
// If the term is in the index, we don't suggest alternatives.
// Actually, we may want to check the frequencies and propose something
// anyway if a possible variation is much more common (as google does)
if (aspell->check(*rcldb, *uit, reason))
#warning need to take case and diacs sensibility into account somehow
// Maybe use the xapian index instead ? How to retrieve the sensitivity flags ?
if (0) {
if (aspell->check(*uit, reason))
continue;
else if (!reason.empty())
return;
}
if (!aspell->suggest(*rcldb, *uit, asuggs, reason)) {
LOGERR(("QtGuiResListPager::suggest: aspell failed: %s\n",
reason.c_str()));
@ -336,6 +340,7 @@ ResList::~ResList()
QT_TR_NOOP("Open"),
QT_TR_NOOP("(show query)"),
QT_TR_NOOP("<p><i>Alternate spellings (accents suppressed): </i>"),
QT_TR_NOOP("<p><i>Alternate spellings: </i>"),
};
}

View file

@ -79,22 +79,30 @@ class TextSplitPTR : public TextSplit {
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
vit != hdata.groups.end(); vit++) {
if (vit->size() == 1) {
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars) {
#endif
m_terms[vit->front()] = vit - hdata.groups.begin();
#else
#ifndef RCL_INDEX_STRIPCHARS
} else {
string dumb = vit->front();
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
m_terms[dumb] = vit - hdata.groups.begin();
}
#endif
} else if (vit->size() > 1) {
for (vector<string>::const_iterator it = vit->begin();
it != vit->end(); it++) {
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars) {
#endif
m_gterms.insert(*it);
#else
#ifndef RCL_INDEX_STRIPCHARS
} else {
string dumb = *it;
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
m_gterms.insert(dumb);
}
#endif
}
}

View file

@ -320,9 +320,16 @@ void ResListPager::displayPage(RclConfig *config)
map<string, vector<string> > spellings;
suggest(uterms, spellings);
if (!spellings.empty()) {
if (o_index_stripchars) {
chunk <<
trans("<p><i>Alternate spellings (accents suppressed): </i>")
<< "<br /><blockquote>";
} else {
chunk <<
trans("<p><i>Alternate spellings: </i>")
<< "<br /><blockquote>";
}
for (map<string, vector<string> >::const_iterator it0 =
spellings.begin(); it0 != spellings.end(); it0++) {

View file

@ -116,12 +116,20 @@ static void sigcleanup(int sig)
exit(1);
}
#ifndef RCL_INDEX_STRIPCHARS
bool o_index_stripchars;
#endif
inline bool has_prefix(const string& trm)
{
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars) {
#endif
return trm.size() && 'A' <= trm[0] && trm[0] <= 'Z';
#else
#ifndef RCL_INDEX_STRIPCHARS
} else {
return trm.size() > 0 && trm[0] == ':';
}
#endif
}
@ -201,10 +209,22 @@ int main(int argc, char **argv)
try {
db = new Xapian::Database(dbdir);
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
#ifndef RCL_INDEX_STRIPCHARS
// If we have terms with a leading ':' it's a new style,
// unstripped index
{
Xapian::TermIterator term = db->allterms_begin(":");
if (term == db->allterms_end())
o_index_stripchars = true;
else
o_index_stripchars = false;
cout<<"DB: terms are "<<(o_index_stripchars?"stripped":"raw")<<endl;
}
#endif
if (op_flags & OPT_T) {
Xapian::TermIterator term;
string printable;

View file

@ -63,16 +63,18 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
// Unaccented stem dbs
vector<XapWritableComputableSynFamMember> unacstemdbs;
// We can reuse the same stemmer pointers, the objects are stateless.
if (!o_index_stripchars) {
for (unsigned int i = 0; i < langs.size(); i++) {
unacstemdbs.push_back(
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
stemmers.back().getptr()));
unacstemdbs.back().recreate();
}
}
SynTermTransUnac transunac(UNACOP_UNACFOLD);
XapWritableComputableSynFamMember
diacasedb(wdb, synFamDiCa, "all", &transunac);
if (!o_index_stripchars)
diacasedb.recreate();
#endif
@ -109,8 +111,10 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
// is the input to the stem db, and add a synonym from the
// stripped term to the cased and accented one, for accent
// and case expansion at query time
if (!o_index_stripchars) {
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
diacasedb.addSynonym(*it);
}
#endif
// Create stemming synonym for every language. The input is the
@ -124,12 +128,15 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
// the unaccented term. While this may be incorrect, it is
// also necessary for searching in a diacritic-insensitive
// way on a raw index
if (!o_index_stripchars) {
string unac;
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
if (unac != lower)
if (unac != lower) {
for (unsigned int i = 0; i < langs.size(); i++) {
unacstemdbs[i].addSynonym(unac);
}
}
}
#endif
}
} XCATCHERROR(ermsg);

View file

@ -24,10 +24,13 @@
namespace Rcl {
/* A Capitals/Diacritics removal functor for using with
XapComputableSynFamMember */
/** A Capitals/Diacritics removal functor for using with
* XapComputableSynFamMember */
class SynTermTransUnac : public SynTermTrans {
public:
/** Constructor
* @param op defines if we remove diacritics, case or both
*/
SynTermTransUnac(UnacOp op)
: m_op(op)
{
@ -43,7 +46,9 @@ public:
UnacOp m_op;
};
/** Walk the Xapian term list and create all the expansion dbs in one go */
/** Walk the Xapian term list and create all the expansion dbs in one go.
*
*/
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const std::vector<std::string>& langs);
}

View file

@ -92,10 +92,11 @@ const string start_of_field_term = "XXST";
const string end_of_field_term = "XXND";
static const string page_break_term = "XXPG";
#else
const string start_of_field_term = "XXST/";
const string end_of_field_term = "XXND/";
static const string page_break_term = "XXPG/";
string start_of_field_term;
string end_of_field_term;
const string page_break_term = "XXPG/";
#endif
// Field name for the unsplit file name. Has to exist in the field file
// because of usage in termmatch()
static const string unsplitFilenameFieldName = "rclUnsplitFN";
@ -683,6 +684,18 @@ Db::Db(RclConfig *cfp)
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
m_maxFsOccupPc(0), m_mode(Db::DbRO)
{
#ifndef RCL_INDEX_STRIPCHARS
if (start_of_field_term.empty()) {
if (o_index_stripchars) {
start_of_field_term = "XXST";
end_of_field_term = "XXND";
} else {
start_of_field_term = "XXST/";
end_of_field_term = "XXND/";
}
}
#endif
m_ndb = new Native(this);
if (m_config) {
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
@ -886,12 +899,13 @@ int Db::termDocCnt(const string& _term)
return -1;
string term = _term;
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars)
#endif
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
return 0;
}
#endif
if (m_stops.isStop(term)) {
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
@ -1151,13 +1165,17 @@ string Db::getSpellingSuggestion(const string& word)
{
if (m_ndb == 0)
return string();
string term = word;
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars)
#endif
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
return string();
}
#endif
if (!isSpellingCandidate(term))
return string();
return m_ndb->xrdb.get_spelling_suggestion(term);
@ -1266,9 +1284,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
TermProc *nxt = &tpidx;
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
#ifdef RCL_INDEX_STRIPCHARS
TermProcPrep tpprep(nxt); nxt = &tpprep;
TermProcPrep tpprep(nxt);
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars)
#endif
nxt = &tpprep;
TextSplitDb splitter(newdocument, nxt);
tpidx.setTSD(&splitter);
@ -1951,12 +1972,15 @@ bool Db::termMatch(MatchType typ, const string &lang,
// Get rid of capitals and accents
string droot = root;
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars)
#endif
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
return false;
}
#endif
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
string prefix;

View file

@ -129,18 +129,27 @@ extern void *DbUpdWorker(void*);
inline bool has_prefix(const string& trm)
{
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars) {
#endif
return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
#else
#ifndef RCL_INDEX_STRIPCHARS
} else {
return !trm.empty() && trm[0] == ':';
}
#endif
}
inline string wrap_prefix(const string& pfx)
{
#ifdef RCL_INDEX_STRIPCHARS
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars) {
#endif
return pfx;
#else
#ifndef RCL_INDEX_STRIPCHARS
} else {
return cstr_colon + pfx + cstr_colon;
}
#endif
}
@ -384,9 +393,13 @@ private:
string version_string();
extern const string pathelt_prefix;
#ifdef RCL_INDEX_STRIPCHARS
extern const string start_of_field_term;
extern const string end_of_field_term;
#else
extern string start_of_field_term;
extern string end_of_field_term;
#endif
}
#endif /* _DB_H_INCLUDED_ */

View file

@ -79,10 +79,22 @@ static const int original_term_wqf_booster = 10;
#ifdef RCL_INDEX_STRIPCHARS
#define bufprefix(BUF, L) {(BUF)[0] = L;}
#define bpoffs 1
#define bpoffs() 1
#else
#define bufprefix(BUF, L) {(BUF)[0] = ':'; (BUF)[1] = L; (BUF)[2] = ':';}
#define bpoffs 3
// Write the term prefix for prefix letter c at the start of buf, without
// NUL-terminating it. When o_index_stripchars is set (stripped index) the
// prefix is the bare letter (1 byte); otherwise it is the letter wrapped
// in colons, ":c:" (3 bytes). The matching offset for appending the rest
// of the term is given by bpoffs().
static inline void bufprefix(char *buf, char c)
{
if (o_index_stripchars) {
buf[0] = c;
} else {
buf[0] = ':';
buf[1] = c;
buf[2] = ':';
}
}
// Return the byte length of the prefix written by bufprefix(): 1 when
// o_index_stripchars is set (stripped index), 3 otherwise (":X:" form).
static inline int bpoffs()
{
return o_index_stripchars ? 1 : 3;
}
#endif
static Xapian::Query
@ -92,7 +104,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
// only doing %d's !
char buf[200];
bufprefix(buf, 'D');
sprintf(buf+bpoffs, "%04d%02d", y1, m1);
sprintf(buf+bpoffs(), "%04d%02d", y1, m1);
vector<Xapian::Query> v;
int d_last = monthdays(m1, y1);
@ -103,7 +115,7 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
// Deal with any initial partial month
if (d1 > 1 || d_end < d_last) {
for ( ; d1 <= d_end ; d1++) {
sprintf(buf + 6 + bpoffs, "%02d", d1);
sprintf(buf + 6 + bpoffs(), "%02d", d1);
v.push_back(Xapian::Query(buf));
}
} else {
@ -117,32 +129,32 @@ date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
int m_last = (y1 < y2) ? 12 : m2 - 1;
while (++m1 <= m_last) {
sprintf(buf + 4 + bpoffs, "%02d", m1);
sprintf(buf + 4 + bpoffs(), "%02d", m1);
bufprefix(buf, 'M');
v.push_back(Xapian::Query(buf));
}
if (y1 < y2) {
while (++y1 < y2) {
sprintf(buf + bpoffs, "%04d", y1);
sprintf(buf + bpoffs(), "%04d", y1);
bufprefix(buf, 'Y');
v.push_back(Xapian::Query(buf));
}
sprintf(buf + bpoffs, "%04d", y2);
sprintf(buf + bpoffs(), "%04d", y2);
bufprefix(buf, 'M');
for (m1 = 1; m1 < m2; m1++) {
sprintf(buf + 4 + bpoffs, "%02d", m1);
sprintf(buf + 4 + bpoffs(), "%02d", m1);
v.push_back(Xapian::Query(buf));
}
}
sprintf(buf + 2 + bpoffs, "%02d", m2);
sprintf(buf + 2 + bpoffs(), "%02d", m2);
// Deal with any final partial month
if (d2 < monthdays(m2, y2)) {
bufprefix(buf, 'D');
for (d1 = 1 ; d1 <= d2; d1++) {
sprintf(buf + 6 + bpoffs, "%02d", d1);
sprintf(buf + 6 + bpoffs(), "%02d", d1);
v.push_back(Xapian::Query(buf));
}
} else {
@ -663,13 +675,13 @@ static void listVector(const string& what, const vector<string>&l)
*/
void StringToXapianQ::expandTerm(int mods,
const string& term,
vector<string>& exp, string &sterm,
vector<string>& oexp, string &sterm,
const string& prefix)
{
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));
sterm.clear();
exp.clear();
oexp.clear();
if (term.empty())
return;
@ -693,6 +705,9 @@ void StringToXapianQ::expandTerm(int mods,
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
if (o_index_stripchars) {
diac_sensitive = case_sensitive = false;
} else {
// If we are working with a raw index, apply the rules for case and
// diacritics sensitivity.
@ -703,10 +718,10 @@ void StringToXapianQ::expandTerm(int mods,
if (unachasaccents(term))
diac_sensitive = true;
// If any character apart the first is uppercase, we become case-sensitive.
// The first character is reserved for turning off stemming. You need to
// use a query language modifier to search for Floor in a case-sensitive
// way.
// If any character apart from the first is uppercase, we become
// case-sensitive. The first character is reserved for
// turning off stemming. You need to use a query language
// modifier to search for Floor in a case-sensitive way.
Utf8Iter it(term);
it++;
if (unachasuppercase(term.substr(it.getBpos())))
@ -718,12 +733,21 @@ void StringToXapianQ::expandTerm(int mods,
if (!case_sensitive || !diac_sensitive)
noexpansion = false;
}
#endif
if (noexpansion) {
sterm = term;
exp.push_back(prefix + term);
} else {
oexp.push_back(prefix + term);
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
return;
}
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all",
&unacfoldtrans);
vector<string> lexp;
TermMatchResult res;
if (haswild) {
// Note that if there are wildcards, we do a direct from-index
@ -732,106 +756,110 @@ void StringToXapianQ::expandTerm(int mods,
// synonyms first. To be done later
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
m_field);
} else {
goto termmatchtoresult;
}
sterm = term;
#ifdef RCL_INDEX_STRIPCHARS
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
m_field);
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
#else
// No stem expansion when diacritic or case sensitivity is
// set, it makes no sense (it would mess with the
// diacritics anyway if they are not in the stem part).
// In these 3 cases, perform appropriate expansion from
// the charstripping db, and do a bogus wildcard expansion
// (there is no wild card) to generate the result:
if (o_index_stripchars) {
// If the index is stripped, we can only come here if nostemexp is unset
// and we just need stem expansion.
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
goto termmatchtoresult;
}
// No stem expansion when diacritic or case sensitivity is set, it
// makes no sense (it would mess with the diacritics anyway if
// they are not in the stem part). In these 3 cases, perform
// appropriate expansion from the charstripping db, and do a bogus
// wildcard expansion (there is no wild card) to generate the
// result:
if (diac_sensitive && case_sensitive) {
// No expansion whatsoever
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
m_field);
} else {
// Access case and diacritics expansion:
vector<string> exp;
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa,
"all", &unacfoldtrans);
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
goto termmatchtoresult;
}
if (diac_sensitive) {
// Expand for accents and case, filtering for same accents,
// then bogus wildcard expansion for generating result
SynTermTransUnac foldtrans(UNACOP_FOLD);
synac.synExpand(term, exp, &foldtrans);
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
synac.synExpand(term, lexp, &foldtrans);
goto exptotermatch;
}
} else if (case_sensitive) {
// Expand for accents and case, filtering for same case,
// then bogus wildcard expansion for generating result
SynTermTransUnac unactrans(UNACOP_UNAC);
synac.synExpand(term, exp, &unactrans);
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
}
} else {
// Expand for accents and case, then lowercase
// result for input to stemdb.
synac.synExpand(term, exp);
for (unsigned int i = 0; i < exp.size(); i++) {
string lower;
unacmaybefold(exp[i], lower, "UTF-8", UNACOP_FOLD);
exp[i] = lower;
}
sort(exp.begin(), exp.end());
vector<string>::iterator uit =
unique(exp.begin(), exp.end());
exp.resize(uit - exp.begin());
LOGDEB(("ExpandTerm: after casediac: %s\n",
stringsToString(exp).c_str()));
if (case_sensitive) {
// Expand for accents and case, filtering for same case, then
// bogus wildcard expansion for generating result
SynTermTransUnac unactrans(UNACOP_UNAC);
synac.synExpand(term, lexp, &unactrans);
goto exptotermatch;
}
// We are neither accent- nor case- sensitive and may need stem
// expansion or not.
// Expand for accents and case
synac.synExpand(term, lexp);
LOGDEB(("ExpTerm: casediac: %s\n", stringsToString(lexp).c_str()));
if (nostemexp)
goto exptotermatch;
// Need stem expansion. Lowercase the result of accent and case
// expansion for input to stemdb.
for (unsigned int i = 0; i < lexp.size(); i++) {
string lower;
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
lexp[i] = lower;
}
sort(lexp.begin(), lexp.end());
{
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
lexp.resize(uit - lexp.begin());
StemDb db(m_db.m_ndb->xrdb);
vector<string> exp1;
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
db.stemExpand(m_stemlang, *it, exp1);
}
LOGDEB(("ExpandTerm: after stem: %s\n",
stringsToString(exp1).c_str()));
LOGDEB(("ExpTerm: stem: %s\n", stringsToString(exp1).c_str()));
// Expand the resulting list for case (all stemdb content
// is lowercase)
exp.clear();
lexp.clear();
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
synac.synExpand(*it, exp);
synac.synExpand(*it, lexp);
}
sort(exp.begin(), exp.end());
uit = unique(exp.begin(), exp.end());
exp.resize(uit - exp.begin());
LOGDEB(("ExpandTerm: after case exp of stem: %s\n",
stringsToString(exp).c_str()));
sort(lexp.begin(), lexp.end());
uit = unique(lexp.begin(), lexp.end());
lexp.resize(uit - lexp.begin());
}
LOGDEB(("ExpTerm: case exp of stem: %s\n", stringsToString(lexp).c_str()));
// Bogus wildcard expand to generate the result
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,
-1, m_field);
}
}
exptotermatch:
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,
res, -1, m_field);
}
#endif
}
// Term match entries to vector of terms
termmatchtoresult:
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
it != res.entries.end(); it++) {
exp.push_back(it->term);
}
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(exp).c_str()));
oexp.push_back(it->term);
}
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
}
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
@ -1097,9 +1125,11 @@ bool StringToXapianQ::processUserString(const string &iq,
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true);
#ifdef RCL_INDEX_STRIPCHARS
TermProcPrep tpprep(nxt); nxt = &tpprep;
TermProcPrep tpprep(nxt);
#ifndef RCL_INDEX_STRIPCHARS
if (o_index_stripchars)
#endif
nxt = &tpprep;
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),

View file

@ -26,6 +26,8 @@
#include <algorithm>
#include <map>
#include <iostream>
using namespace std;
#include <xapian.h>
@ -34,18 +36,14 @@
#include "smallut.h"
#include "synfamily.h"
#include "unacpp.h"
#include <iostream>
using namespace std;
#include "rclconfig.h"
namespace Rcl {
/**
* Expand for one or several languages
*/
bool StemDb::stemExpand(const std::string& langs,
const std::string& term,
bool StemDb::stemExpand(const std::string& langs, const std::string& term,
vector<string>& result)
{
vector<string> llangs;
@ -59,6 +57,8 @@ bool StemDb::stemExpand(const std::string& langs,
}
#ifndef RCL_INDEX_STRIPCHARS
// Expand the unaccented stem
if (!o_index_stripchars) {
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
SynTermTransStem stemmer(*it);
@ -68,6 +68,7 @@ bool StemDb::stemExpand(const std::string& langs,
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
(void)expander.synExpand(unac, result);
}
}
#endif
if (result.empty())

View file

@ -33,17 +33,12 @@
#include <string>
#include <iostream>
#include <list>
using namespace std;
#include "smallut.h"
#include "utf8iter.h"
#include "hldata.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */
#define MIN(A,B) ((A)<(B)?(A):(B))
int stringicmp(const string & s1, const string& s2)
{
string::const_iterator it1 = s1.begin();

View file

@ -224,4 +224,11 @@ public:
}
};
#ifndef MIN
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
#endif
#ifndef MAX
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
#endif
#endif /* _SMALLUT_H_INCLUDED_ */

View file

@ -4,6 +4,8 @@ logfilename = /tmp/logrcltst
daemloglevel = 6
daemlogfilename = /tmp/rclmontrace
indexStripChars = 1
topdirs = /home/dockes/projets/fulltext/testrecoll/
skippedPaths = \