Make Recoll optionally sensitive to case and diacritics

This commit is contained in:
Jean-Francois Dockes 2012-09-14 14:34:27 +02:00
parent 7fcfe27952
commit 166624f7f2
30 changed files with 849 additions and 487 deletions

View file

@ -19,6 +19,9 @@
* Management of the auxiliary databases listing stems and their expansion
* terms
*/
#include "autoconfig.h"
#include <unistd.h>
#include <algorithm>
@ -27,13 +30,8 @@
#include <xapian.h>
#include "stemdb.h"
#include "pathut.h"
#include "debuglog.h"
#include "smallut.h"
#include "utf8iter.h"
#include "textsplit.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "synfamily.h"
#include "unacpp.h"
@ -43,140 +41,6 @@ using namespace std;
namespace Rcl {
// Cheap test used to spot words which are probably not natural
// language: returns true for any 7-bit value which is not one of the
// lowercase letters 'a'..'z' (digits, capitals, punctuation, control
// characters). Values of 128 and above are never flagged: handling them
// properly would need full utf-8 decoding, and islower() is avoided
// because its behaviour with 8 bit values is uncertain.
inline static bool p_notlowerascii(unsigned int c)
{
    const bool sevenbit = c < 128;
    const bool lowerletter = c >= 'a' && c <= 'z';
    return sevenbit && !lowerletter;
}
/**
 * Create the stem to parent terms associations for a list of languages.
 *
 * For each language in langs, the corresponding member of the synFamStem
 * synonym family is deleted, recreated and selected as current member.
 * The full term list of the index is then walked once: each term which
 * looks like natural language is stemmed for every language, and a
 * stem -> term entry is added whenever the stem differs from the term.
 *
 * @param wdb   writable index, used both as the source of terms and as
 *              the storage for the synonym family members.
 * @param langs stemming language names, as accepted by Xapian::Stem.
 * @return true on success, false if an error message was set by
 *         XCATCHERROR while building the maps.
 */
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
                        const vector<string>& langs)
{
    LOGDEB(("StemDb::createExpansionDbs\n"));
    Chrono cron;

    // One synonym family accessor per language, reset to an empty member
    vector<XapWritableSynFamily> stemdbs;
    for (unsigned int i = 0; i < langs.size(); i++) {
        stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
        stemdbs[i].deleteMember(langs[i]);
        stemdbs[i].createMember(langs[i]);
        stemdbs[i].setCurrentMemberName(langs[i]);
    }

    // We walk the list of all terms, and stem each. We skip terms which
    // don't look like natural language.
    // If the stem is not identical to the term, we add a synonym entry.

    // Statistics
    int nostem = 0;    // Don't even try: not-alphanum (incomplete for now)
    int stemconst = 0; // Stem == term
    int allsyns = 0;   // Total number of entries created

    string ermsg;
    try {
        vector<Xapian::Stem> stemmers;
        for (unsigned int i = 0; i < langs.size(); i++) {
            stemmers.push_back(Xapian::Stem(langs[i]));
        }

        for (Xapian::TermIterator it = wdb.allterms_begin();
             it != wdb.allterms_end(); ++it) {
            // Xapian::TermIterator::operator*() returns the term by
            // value. Keep a single named copy: iterators or references
            // taken on the temporaries would dangle as soon as the
            // statement creating them ends.
            const string term = *it;

            // If the term has any non-lowercase 7bit char (that is,
            // numbers, capitals and punctuation) don't stem.
            string::const_iterator badchar =
                find_if(term.begin(), term.end(), p_notlowerascii);
            if (badchar != term.end()) {
                ++nostem;
                LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
                         term.c_str(), *badchar));
                continue;
            }

            // Detect and skip CJK terms.
            // We're still sending all other multibyte utf-8 chars to
            // the stemmer, which is not too well defined for
            // xapian<1.0 (very obsolete now), but seems to work
            // anyway. There shouldn't be too many in any case because
            // accents are stripped at this point.
            // The effect of stripping accents on stemming is not good,
            // (e.g: in french partimes -> partim, parti^mes -> part)
            // but fixing the issue would be complicated.
            Utf8Iter utfit(term);
            if (TextSplit::isCJK(*utfit)) {
                // LOGDEB(("stemskipped: Skipping CJK\n"));
                continue;
            }

            // Create stemming synonym for every lang
            for (unsigned int i = 0; i < langs.size(); i++) {
                string stem = stemmers[i](term);
                if (stem == term) {
                    ++stemconst;
                } else {
                    stemdbs[i].addSynonym(stem, term);
                    LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
                             term.c_str(), langs[i].c_str(), stem.c_str()));
                    ++allsyns;
                }
            }
        }
    } XCATCHERROR(ermsg);

    if (!ermsg.empty()) {
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
        return false;
    }

    LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
    LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
            nostem, stemconst, allsyns));
    return true;
}
/**
* Expand term to list of all terms which stem to the same term, for one
* expansion language
*/
bool StemDb::expandOne(const std::string& lang,
const std::string& term,
vector<string>& result)
{
try {
Xapian::Stem stemmer(lang);
string stem = stemmer(term);
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
lang.c_str(), term.c_str(), stem.c_str()));
if (!synExpand(lang, stem, result)) {
// ?
}
// If the user term or stem are not in the list, add them
if (find(result.begin(), result.end(), term) == result.end()) {
result.push_back(term);
}
if (find(result.begin(), result.end(), stem) == result.end()) {
result.push_back(stem);
}
LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
stringsToString(result).c_str()));
} catch (...) {
LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
lang.c_str()));
result.push_back(term);
return false;
}
return true;
}
/**
* Expand for one or several languages
*/
@ -186,14 +50,34 @@ bool StemDb::stemExpand(const std::string& langs,
{
vector<string> llangs;
stringToStrings(langs, llangs);
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
vector<string> oneexp;
expandOne(*it, term, oneexp);
result.insert(result.end(), oneexp.begin(), oneexp.end());
SynTermTransStem stemmer(*it);
XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
(void)expander.synExpand(term, result);
}
#ifndef RCL_INDEX_STRIPCHARS
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
SynTermTransStem stemmer(*it);
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
*it, &stemmer);
string unac;
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
(void)expander.synExpand(unac, result);
}
#endif
if (result.empty())
result.push_back(term);
sort(result.begin(), result.end());
unique(result.begin(), result.end());
vector<string>::iterator uit = unique(result.begin(), result.end());
result.resize(uit - result.begin());
LOGDEB0(("stemExpand:%s: %s -> %s\n", langs.c_str(), term.c_str(),
stringsToString(result).c_str()));
return true;
}