Make Recoll optionally sensitive to case and diacritics

This commit is contained in:
Jean-Francois Dockes 2012-09-14 14:34:27 +02:00
parent 7fcfe27952
commit 166624f7f2
30 changed files with 849 additions and 487 deletions

View file

@ -14,6 +14,8 @@
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <stdio.h>
#include <cstring>
#include <unistd.h>
@ -53,6 +55,7 @@ using namespace std;
#include "cancelcheck.h"
#include "ptmutex.h"
#include "termproc.h"
#include "expansiondbs.h"
#ifndef MAX
#define MAX(A,B) (A>B?A:B)
@ -84,9 +87,15 @@ static const string xapday_prefix = "D";
// Prefixes for the month and year date terms.
static const string xapmonth_prefix("M");
static const string xapyear_prefix("Y");
// Prefix for the path element terms.
const string pathelt_prefix("XP");

// Special terms marking field boundaries and page breaks. The values
// depend on the build: when RCL_INDEX_STRIPCHARS is not defined, each
// special term carries a trailing '/'.
// NOTE(review): the trailing '/' presumably mirrors the format produced
// by wrap_prefix() for such indexes -- confirm against its definition.
#ifdef RCL_INDEX_STRIPCHARS
const string start_of_field_term("XXST");
const string end_of_field_term("XXND");
static const string page_break_term("XXPG");
#else
const string start_of_field_term("XXST/");
const string end_of_field_term("XXND/");
static const string page_break_term("XXPG/");
#endif

// Name of the field holding the unsplit file name. It must be declared
// in the fields configuration file because termMatch() looks it up.
static const string unsplitFilenameFieldName("rclUnsplitFN");
@ -197,7 +206,7 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
{
for (vector<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) {
if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
if (!has_prefix(*qit))
out.push_back(*qit);
}
}
@ -573,7 +582,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
for (term = xrdb.termlist_begin(docid);
term != xrdb.termlist_end(docid); term++) {
// Ignore prefixed terms
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
if (has_prefix(*term))
continue;
if (cutoff-- < 0) {
LOGDEB0(("makeAbstract: max term count cutoff\n"));
@ -652,7 +661,9 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
vabs.push_back(chunk);
chunk.clear();
} else {
chunk += it->second;
if (it->second.compare(end_of_field_term) &&
it->second.compare(start_of_field_term))
chunk += it->second;
}
}
if (!chunk.empty())
@ -874,11 +885,13 @@ int Db::termDocCnt(const string& _term)
if (!m_ndb || !m_ndb->m_isopen)
return -1;
string term;
string term = _term;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
return 0;
}
#endif
if (m_stops.isStop(term)) {
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
@ -994,8 +1007,19 @@ class TextSplitDb : public TextSplitP {
{}
// Reimplement text_to_words to add start and end special terms
virtual bool text_to_words(const string &in);
void setprefix(const string& pref) {prefix = pref;}
void setwdfinc(int i) {wdfinc = i;}
// Update the stored prefix. A non-empty input is stored after being
// passed through wrap_prefix(); an empty input just clears the stored
// value (wrap_prefix() is not called in that case).
void setprefix(const string& pref)
{
    if (!pref.empty()) {
        prefix = wrap_prefix(pref);
        return;
    }
    prefix.clear();
}
// Store i into wdfinc.
// NOTE(review): presumably the within-document-frequency increment
// applied when terms are added to the document -- confirm against the
// rest of the class, which is not visible here.
void setwdfinc(int i)
{
wdfinc = i;
}
friend class TermProcIdx;
@ -1127,11 +1151,13 @@ string Db::getSpellingSuggestion(const string& word)
{
if (m_ndb == 0)
return string();
string term;
string term = word;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
return string();
}
#endif
if (!isSpellingCandidate(term))
return string();
return m_ndb->xrdb.get_spelling_suggestion(term);
@ -1239,8 +1265,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
TermProcIdx tpidx;
TermProc *nxt = &tpidx;
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
#ifdef RCL_INDEX_STRIPCHARS
TermProcPrep tpprep(nxt); nxt = &tpprep;
#endif
TextSplitDb splitter(newdocument, nxt);
tpidx.setTSD(&splitter);
@ -1266,7 +1294,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
vector<string> vpath;
stringToTokens(path, vpath, "/");
splitter.curpos = 0;
newdocument.add_posting(pathelt_prefix,
newdocument.add_posting(wrap_prefix(pathelt_prefix),
splitter.basepos + splitter.curpos++);
for (vector<string>::iterator it = vpath.begin();
it != vpath.end(); it++){
@ -1274,7 +1302,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
// Just truncate it. May still be useful because of wildcards
*it = it->substr(0, 230);
}
newdocument.add_posting(pathelt_prefix + *it,
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
splitter.basepos + splitter.curpos++);
}
}
@ -1319,7 +1347,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
////// Special terms for other metadata. No positions for these.
// Mime type
newdocument.add_term(mimetype_prefix + doc.mimetype);
newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
// Simple file name indexed unsplit for specific "file name"
// searches. This is not the same as a filename: clause inside the
@ -1335,9 +1363,10 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
utf8truncate(fn, 230);
string::size_type pos = fn.rfind('.');
if (pos != string::npos && pos != fn.length() - 1) {
newdocument.add_term(fileext_prefix + fn.substr(pos + 1));
newdocument.add_term(wrap_prefix(fileext_prefix) +
fn.substr(pos + 1));
}
newdocument.add_term(unsplitfilename_prefix + fn);
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
}
}
@ -1356,12 +1385,15 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
struct tm *tm = localtime(&mtime);
char buf[9];
snprintf(buf, 9, "%04d%02d%02d",
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
// Date (YYYYMMDD)
newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf));
// Month (YYYYMM)
buf[6] = '\0';
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
// Year (YYYY)
buf[4] = '\0';
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf));
//////////////////////////////////////////////////////////////////
@ -1834,7 +1866,7 @@ bool Db::maxYearSpan(int *minyear, int *maxyear)
*minyear = 1000000;
*maxyear = -1000000;
TermMatchResult result;
if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear"))
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
return false;
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
it != result.entries.end(); it++) {
@ -1899,30 +1931,32 @@ const string cstr_wildSpecChars = "*?[";
const string cstr_regSpecChars = "(.[{";
// Find all index terms that match a wildcard or regular expression, or,
// when typ is ET_STEM, the terms produced by stem expansion of the root.
// If field is set, we return a list of appropriately prefixed terms (which
// are going to be used to build a Xapian query).
bool Db::termMatch(MatchType typ, const string &lang,
const string &root,
TermMatchResult& res,
int max,
const string& field,
string *prefixp
)
const string& field)
{
if (!m_ndb || !m_ndb->m_isopen)
return false;
Xapian::Database xdb = m_ndb->xdb();
res.clear();
XAPTRY(res.dbdoccount = xdb.get_doccount();
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
if (!m_reason.empty())
return false;
// Get rid of capitals and accents (only when RCL_INDEX_STRIPCHARS is
// defined; otherwise the root is used unchanged)
string droot;
string droot = root;
#ifdef RCL_INDEX_STRIPCHARS
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
return false;
}
#endif
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
string prefix;
@ -1932,17 +1966,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
field.c_str()));
} else {
prefix = ftp->pfx;
prefix = wrap_prefix(ftp->pfx);
}
if (prefixp)
*prefixp = prefix;
}
res.prefix = prefix;
if (typ == ET_STEM) {
if (!stemExpand(lang, root, res, max))
return false;
sort(res.entries.begin(), res.entries.end());
unique(res.entries.begin(), res.entries.end());
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
it != res.entries.end(); it++) {
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
@ -2032,7 +2063,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
TermMatchCmpByTerm tcmp;
sort(res.entries.begin(), res.entries.end(), tcmp);
TermMatchTermEqual teq;
unique(res.entries.begin(), res.entries.end(), teq);
vector<TermMatchEntry>::iterator uit =
unique(res.entries.begin(), res.entries.end(), teq);
res.entries.resize(uit - res.entries.begin());
TermMatchCmpByWcf wcmp;
sort(res.entries.begin(), res.entries.end(), wcmp);
if (max > 0) {