Implement single-term query-time synonyms

This commit is contained in:
Jean-Francois Dockes 2015-08-22 15:11:07 +02:00
parent f4ecd5c29e
commit 16b3396f12
7 changed files with 92 additions and 41 deletions

View file

@ -1144,21 +1144,44 @@ string RclConfig::getMimeIconPath(const string &mtype, const string &apptag)
return path_cat(iconpath, iconname) + ".png"; return path_cat(iconpath, iconname) + ".png";
} }
string RclConfig::getDbDir() const // Return path defined by varname. May be absolute or relative to
// confdir, with default in confdir
string RclConfig::getConfdirPath(const char *varname, const char *dflt) const
{ {
string dbdir; string result;
if (!getConfParam("dbdir", dbdir)) { if (!getConfParam(varname, result)) {
LOGERR(("RclConfig::getDbDir: no db directory in configuration\n")); result = path_cat(getConfDir(), dflt);
} else { } else {
dbdir = path_tildexpand(dbdir); result = path_tildexpand(result);
// If not an absolute path, compute relative to config dir // If not an absolute path, compute relative to config dir
if (dbdir.at(0) != '/') { if (result.at(0) != '/') {
LOGDEB1(("Dbdir not abs, catting with confdir\n")); result = path_cat(getConfDir(), result);
dbdir = path_cat(getConfDir(), dbdir);
} }
} }
LOGDEB1(("RclConfig::getDbDir: dbdir: [%s]\n", dbdir.c_str())); return path_canon(result);
return path_canon(dbdir);
}
// Index database directory: the "dbdir" configuration value resolved by
// getConfdirPath(), or "xapiandb" inside the configuration directory when
// the parameter is not set.
string RclConfig::getDbDir() const
{
    string dbdir = getConfdirPath("dbdir", "xapiandb");
    return dbdir;
}
// Stop list file: the "stoplistfile" configuration value resolved by
// getConfdirPath(), or "stoplist.txt" inside the configuration directory
// when the parameter is not set.
string RclConfig::getStopfile() const
{
    string stopfile = getConfdirPath("stoplistfile", "stoplist.txt");
    return stopfile;
}
// Synonym groups file: the "syngroupsfile" configuration value resolved by
// getConfdirPath(), or "syngroups.txt" inside the configuration directory
// when the parameter is not set.
string RclConfig::getSynGroupsFile() const
{
    string synfile = getConfdirPath("syngroupsfile", "syngroups.txt");
    return synfile;
}
// The index status file changes rapidly, so it can be placed outside of
// the configuration directory (for SSDs; not sure this is really useful).
// The location is resolved by getConfdirPath() from the "idxstatusfile"
// parameter, with "idxstatus.txt" as the default file name.
string RclConfig::getIdxStatusFile() const
{
return getConfdirPath("idxstatusfile", "idxstatus.txt");
} }
void RclConfig::urlrewrite(const string& dbdir, string& url) const void RclConfig::urlrewrite(const string& dbdir, string& url) const
@ -1213,32 +1236,11 @@ bool RclConfig::sourceChanged() const
return false; return false;
} }
// Stop list file: always "stoplist.txt" inside the configuration directory.
string RclConfig::getStopfile() const
{
    const string confdir = getConfDir();
    return path_cat(confdir, "stoplist.txt");
}
string RclConfig::getPidfile() const string RclConfig::getPidfile() const
{ {
return path_cat(getConfDir(), "index.pid"); return path_cat(getConfDir(), "index.pid");
} }
// The index status file changes rapidly, so it can be placed outside of
// the configuration directory (for SSDs; not sure this is really useful).
//
// Returns the value of the "idxstatusfile" parameter after tilde expansion,
// made relative to the configuration directory when it is not absolute, and
// canonicalized. When the parameter is unset or empty, returns
// "idxstatus.txt" inside the configuration directory.
string RclConfig::getIdxStatusFile() const
{
    string path;
    const bool configured = getConfParam("idxstatusfile", path);
    if (configured) {
        path = path_tildexpand(path);
    }
    // An empty value is treated like an unset one: indexing an empty string
    // with at(0) would throw std::out_of_range.
    if (!configured || path.empty()) {
        return path_cat(getConfDir(), "idxstatus.txt");
    }
    // If not an absolute path, compute relative to config dir
    if (path[0] != '/') {
        path = path_cat(getConfDir(), path);
    }
    return path_canon(path);
}
string RclConfig::getWebQueueDir() const string RclConfig::getWebQueueDir() const
{ {
string webqueuedir; string webqueuedir;

View file

@ -171,10 +171,13 @@ class RclConfig {
* need for other status */ * need for other status */
vector<string> getTopdirs() const; vector<string> getTopdirs() const;
string getConfdirPath(const char *varname, const char *dflt) const;
/** Get database directory */ /** Get database directory */
string getDbDir() const; string getDbDir() const;
/** Get stoplist file name */ /** Get stoplist file name */
string getStopfile() const; string getStopfile() const;
/** Get synonym groups file name */
string getSynGroupsFile() const;
/** Get indexing pid file name */ /** Get indexing pid file name */
string getPidfile() const; string getPidfile() const;
/** Get indexing status file name */ /** Get indexing status file name */

View file

@ -63,20 +63,29 @@ SynGroups::~SynGroups()
const int LL = 1024; const int LL = 1024;
SynGroups::SynGroups(const string& fn) SynGroups::SynGroups()
: m(new Internal) : m(new Internal)
{ {
}
bool SynGroups::setfile(const string& fn)
{
LOGDEB(("SynGroups::setfile(%s)\n", fn.c_str()));
if (!m) { if (!m) {
LOGERR(("SynGroups::SynGroups:: new Internal failed: no mem ?\n")); LOGERR(("SynGroups:setfile:: new Internal failed: no mem ?\n"));
return; return false;
} }
// Don't set ok to true.
if (fn.empty())
return true;
ifstream input; ifstream input;
input.open(fn.c_str(), ios::in); input.open(fn.c_str(), ios::in);
if (!input.is_open()) { if (!input.is_open()) {
LOGERR(("SynGroups::SynGroups:: could not open %s errno %d\n", LOGERR(("SynGroups:setfile:: could not open %s errno %d\n",
fn.c_str(), errno)); fn.c_str(), errno));
return; return false;
} }
char cline[LL]; char cline[LL];
@ -91,7 +100,7 @@ SynGroups::SynGroups(const string& fn)
if (!input.good()) { if (!input.good()) {
if (input.bad()) { if (input.bad()) {
LOGDEB(("Parse: input.bad()\n")); LOGDEB(("Parse: input.bad()\n"));
return; return false;
} }
// Must be eof ? But maybe we have a partial line which // Must be eof ? But maybe we have a partial line which
// must be processed. This happens if the last line before // must be processed. This happens if the last line before
@ -130,7 +139,7 @@ SynGroups::SynGroups(const string& fn)
vector<string> words; vector<string> words;
if (!stringToStrings(line, words)) { if (!stringToStrings(line, words)) {
LOGERR(("SynGroups::SynGroups: %s: bad line %d: %s\n", LOGERR(("SynGroups:setfile: %s: bad line %d: %s\n",
fn.c_str(), lnum, line.c_str())); fn.c_str(), lnum, line.c_str()));
continue; continue;
} }
@ -138,7 +147,7 @@ SynGroups::SynGroups(const string& fn)
if (words.empty()) if (words.empty())
continue; continue;
if (words.size() == 1) { if (words.size() == 1) {
LOGDEB(("SynGroups::SynGroups: single term group at line %d ??\n", LOGDEB(("SynGroups:setfile: single term group at line %d ??\n",
lnum)); lnum));
continue; continue;
} }
@ -148,8 +157,11 @@ SynGroups::SynGroups(const string& fn)
it != words.end(); it++) { it != words.end(); it++) {
m->terms[*it] = lnum; m->terms[*it] = lnum;
} }
LOGDEB(("SynGroups::setfile: group: [%s]\n",
stringsToString(m->groups[lnum]).c_str()));
} }
m->ok = true; m->ok = true;
return true;
} }
vector<string> SynGroups::getgroup(const string& term) vector<string> SynGroups::getgroup(const string& term)

View file

@ -26,8 +26,9 @@
// in a group are equivalent. // in a group are equivalent.
class SynGroups { class SynGroups {
public: public:
SynGroups(const std::string& fname); SynGroups();
~SynGroups(); ~SynGroups();
bool setfile(const std::string& fname);
std::vector<std::string> getgroup(const std::string& term); std::vector<std::string> getgroup(const std::string& term);
bool ok(); bool ok();
private: private:

View file

@ -762,6 +762,10 @@ bool Db::open(OpenMode mode, OpenError *error)
} }
if (!m_config->getStopfile().empty()) if (!m_config->getStopfile().empty())
m_stops.setFile(m_config->getStopfile()); m_stops.setFile(m_config->getStopfile());
// Synonyms are only used at query time for now
if (mode == DbRO)
m_syngroups.setfile(m_config->getSynGroupsFile());
string dir = m_config->getDbDir(); string dir = m_config->getDbDir();
string ermsg; string ermsg;
try { try {

View file

@ -29,6 +29,7 @@
#include "rclconfig.h" #include "rclconfig.h"
#include "utf8iter.h" #include "utf8iter.h"
#include "textsplit.h" #include "textsplit.h"
#include "syngroups.h"
using std::string; using std::string;
using std::vector; using std::vector;
@ -480,6 +481,9 @@ private:
* after init */ * after init */
// Stop terms: those don't get indexed. // Stop terms: those don't get indexed.
StopList m_stops; StopList m_stops;
// Synonym groups
SynGroups m_syngroups;
// Truncation length for stored meta fields // Truncation length for stored meta fields
int m_idxMetaStoredLen; int m_idxMetaStoredLen;
// This is how long an abstract we keep or build from beginning of // This is how long an abstract we keep or build from beginning of

View file

@ -235,7 +235,8 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
} }
} else { } else {
// Expansion is STEM or NONE (which may still need case/diac exp) // Expansion is STEM or NONE (which may still need synonyms
// and case/diac exp)
vector<string> lexp; vector<string> lexp;
if (diac_sensitive && case_sensitive) { if (diac_sensitive && case_sensitive) {
@ -273,6 +274,30 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
} }
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str())); LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
lexp.clear();
// Expand the result for synonyms. Note that doing it here
// means that multi-term synonyms will not work
// (e.g. stakhanovist -> "hard at work"). We would have to
// separate the multi-word expansions for our caller to
// add them as phrases to the query. Not impossible, but
// let's keep it at single words for now.
if (m_syngroups.ok()) {
LOGDEB(("ExpTerm: got syngroups\n"));
for (vector<string>::const_iterator it = exp1.begin();
it != exp1.end(); it++) {
vector<string> sg = m_syngroups.getgroup(*it);
if (!sg.empty()) {
LOGDEB(("ExpTerm: syns: %s -> %s\n",
it->c_str(), stringsToString(sg).c_str()));
lexp.insert(lexp.end(), sg.begin(), sg.end());
}
}
sort(lexp.begin(), lexp.end());
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
// Keep result in exp1 for next step
exp1.swap(lexp);
}
// Expand the resulting list for case (all stemdb content // Expand the resulting list for case (all stemdb content
// is lowercase) // is lowercase)
lexp.clear(); lexp.clear();