Implement single-term query-time synonyms

This commit is contained in:
Jean-Francois Dockes 2015-08-22 15:11:07 +02:00
parent f4ecd5c29e
commit 16b3396f12
7 changed files with 92 additions and 41 deletions

View file

@ -1144,21 +1144,44 @@ string RclConfig::getMimeIconPath(const string &mtype, const string &apptag)
return path_cat(iconpath, iconname) + ".png";
}
// Return path defined by varname. May be absolute or relative to
// confdir, with default in confdir.
//
// @param varname name of the configuration variable holding the path.
// @param dflt    file or directory name used, inside the configuration
//                directory, when the variable is not set or is empty.
// @return the resulting path, passed through path_canon().
//
// A configured value goes through path_tildexpand() first; if the result
// does not begin with '/', it is taken as relative to the configuration
// directory.
string RclConfig::getConfdirPath(const char *varname, const char *dflt) const
{
    string result;
    // An empty value is handled like an unset one: indexing its first
    // character with at(0) would throw std::out_of_range.
    if (!getConfParam(varname, result) || result.empty()) {
        result = path_cat(getConfDir(), dflt);
    } else {
        result = path_tildexpand(result);
        // If not an absolute path, compute relative to config dir
        if (result.empty() || result[0] != '/') {
            result = path_cat(getConfDir(), result);
        }
    }
    return path_canon(result);
}
string RclConfig::getDbDir() const
{
string dbdir;
if (!getConfParam("dbdir", dbdir)) {
LOGERR(("RclConfig::getDbDir: no db directory in configuration\n"));
} else {
dbdir = path_tildexpand(dbdir);
// If not an absolute path, compute relative to config dir
if (dbdir.at(0) != '/') {
LOGDEB1(("Dbdir not abs, catting with confdir\n"));
dbdir = path_cat(getConfDir(), dbdir);
return getConfdirPath("dbdir", "xapiandb");
}
// Stop list file name: value of the "stoplistfile" configuration
// variable, defaulting to stoplist.txt in the configuration directory.
string RclConfig::getStopfile() const
{
return getConfdirPath("stoplistfile", "stoplist.txt");
}
LOGDEB1(("RclConfig::getDbDir: dbdir: [%s]\n", dbdir.c_str()));
return path_canon(dbdir);
// Synonym groups file name: value of the "syngroupsfile" configuration
// variable, defaulting to syngroups.txt in the configuration directory.
string RclConfig::getSynGroupsFile() const
{
return getConfdirPath("syngroupsfile", "syngroups.txt");
}
// The index status file is fast changing, so it's possible to put it outside
// of the config directory (for ssds, not sure this is really useful).
// The location is the value of the "idxstatusfile" configuration variable,
// defaulting to idxstatus.txt in the configuration directory.
string RclConfig::getIdxStatusFile() const
{
return getConfdirPath("idxstatusfile", "idxstatus.txt");
}
void RclConfig::urlrewrite(const string& dbdir, string& url) const
@ -1213,32 +1236,11 @@ bool RclConfig::sourceChanged() const
return false;
}
// Stop list file name: always stoplist.txt in the configuration
// directory (no configuration variable is consulted here).
string RclConfig::getStopfile() const
{
return path_cat(getConfDir(), "stoplist.txt");
}
// Indexing pid file name: index.pid in the configuration directory.
string RclConfig::getPidfile() const
{
return path_cat(getConfDir(), "index.pid");
}
// The index status file is fast changing, so it's possible to put it outside
// of the config directory (for ssds, not sure this is really useful).
// The location comes from the "idxstatusfile" configuration variable
// (absolute, or relative to the config directory). When the variable is
// unset or empty, the result is idxstatus.txt inside the config directory.
string RclConfig::getIdxStatusFile() const
{
    string path;
    // An empty value is handled like an unset one: path.at(0) would
    // throw std::out_of_range on an empty string.
    if (!getConfParam("idxstatusfile", path) || path.empty()) {
        return path_cat(getConfDir(), "idxstatus.txt");
    }
    path = path_tildexpand(path);
    // If not an absolute path, compute relative to config dir
    if (path.empty() || path[0] != '/') {
        path = path_cat(getConfDir(), path);
    }
    return path_canon(path);
}
string RclConfig::getWebQueueDir() const
{
string webqueuedir;

View file

@ -171,10 +171,13 @@ class RclConfig {
* need for other status */
vector<string> getTopdirs() const;
string getConfdirPath(const char *varname, const char *dflt) const;
/** Get database directory */
string getDbDir() const;
/** Get stoplist file name */
string getStopfile() const;
/** Get synonym groups file name */
string getSynGroupsFile() const;
/** Get indexing pid file name */
string getPidfile() const;
/** Get indexing status file name */

View file

@ -63,20 +63,29 @@ SynGroups::~SynGroups()
const int LL = 1024;
SynGroups::SynGroups(const string& fn)
SynGroups::SynGroups()
: m(new Internal)
{
if (!m) {
LOGERR(("SynGroups::SynGroups:: new Internal failed: no mem ?\n"));
return;
}
bool SynGroups::setfile(const string& fn)
{
LOGDEB(("SynGroups::setfile(%s)\n", fn.c_str()));
if (!m) {
LOGERR(("SynGroups:setfile:: new Internal failed: no mem ?\n"));
return false;
}
// Don't set ok to true.
if (fn.empty())
return true;
ifstream input;
input.open(fn.c_str(), ios::in);
if (!input.is_open()) {
LOGERR(("SynGroups::SynGroups:: could not open %s errno %d\n",
LOGERR(("SynGroups:setfile:: could not open %s errno %d\n",
fn.c_str(), errno));
return;
return false;
}
char cline[LL];
@ -91,7 +100,7 @@ SynGroups::SynGroups(const string& fn)
if (!input.good()) {
if (input.bad()) {
LOGDEB(("Parse: input.bad()\n"));
return;
return false;
}
// Must be eof ? But maybe we have a partial line which
// must be processed. This happens if the last line before
@ -130,7 +139,7 @@ SynGroups::SynGroups(const string& fn)
vector<string> words;
if (!stringToStrings(line, words)) {
LOGERR(("SynGroups::SynGroups: %s: bad line %d: %s\n",
LOGERR(("SynGroups:setfile: %s: bad line %d: %s\n",
fn.c_str(), lnum, line.c_str()));
continue;
}
@ -138,7 +147,7 @@ SynGroups::SynGroups(const string& fn)
if (words.empty())
continue;
if (words.size() == 1) {
LOGDEB(("SynGroups::SynGroups: single term group at line %d ??\n",
LOGDEB(("SynGroups:setfile: single term group at line %d ??\n",
lnum));
continue;
}
@ -148,8 +157,11 @@ SynGroups::SynGroups(const string& fn)
it != words.end(); it++) {
m->terms[*it] = lnum;
}
LOGDEB(("SynGroups::setfile: group: [%s]\n",
stringsToString(m->groups[lnum]).c_str()));
}
m->ok = true;
return true;
}
vector<string> SynGroups::getgroup(const string& term)

View file

@ -26,8 +26,9 @@
// in a group are equivalent.
class SynGroups {
public:
SynGroups(const std::string& fname);
SynGroups();
~SynGroups();
bool setfile(const std::string& fname);
std::vector<std::string> getgroup(const std::string& term);
bool ok();
private:

View file

@ -762,6 +762,10 @@ bool Db::open(OpenMode mode, OpenError *error)
}
if (!m_config->getStopfile().empty())
m_stops.setFile(m_config->getStopfile());
// Synonyms are only used at query time for now
if (mode == DbRO)
m_syngroups.setfile(m_config->getSynGroupsFile());
string dir = m_config->getDbDir();
string ermsg;
try {

View file

@ -29,6 +29,7 @@
#include "rclconfig.h"
#include "utf8iter.h"
#include "textsplit.h"
#include "syngroups.h"
using std::string;
using std::vector;
@ -480,6 +481,9 @@ private:
* after init */
// Stop terms: those don't get indexed.
StopList m_stops;
// Synonym groups
SynGroups m_syngroups;
// Truncation length for stored meta fields
int m_idxMetaStoredLen;
// This is how long an abstract we keep or build from beginning of

View file

@ -235,7 +235,8 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
}
} else {
// Expansion is STEM or NONE (which may still need case/diac exp)
// Expansion is STEM or NONE (which may still need synonyms
// and case/diac exp)
vector<string> lexp;
if (diac_sensitive && case_sensitive) {
@ -273,6 +274,30 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
}
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
lexp.clear();
// Expand the result for synonyms. Note that doing it here
// means that multi-term synonyms will not work
// (e.g. stakhanovist -> "hard at work"). We would have to
// separate the multi-word expansions for our caller to
// add them as phrases to the query. Not impossible, but
// let's keep it at single words for now.
if (m_syngroups.ok()) {
    LOGDEB(("ExpTerm: got syngroups\n"));
    // Start from the current expansion: a term for which getgroup()
    // returns nothing must stay in the list instead of being dropped
    // when the result is swapped back into exp1 below.
    lexp = exp1;
    for (vector<string>::const_iterator it = exp1.begin();
         it != exp1.end(); ++it) {
        vector<string> sg = m_syngroups.getgroup(*it);
        if (!sg.empty()) {
            LOGDEB(("ExpTerm: syns: %s -> %s\n",
                    it->c_str(), stringsToString(sg).c_str()));
            lexp.insert(lexp.end(), sg.begin(), sg.end());
        }
    }
    // The same word can now appear several times (input terms plus
    // one or more groups): sort and remove duplicates.
    sort(lexp.begin(), lexp.end());
    lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
    // Keep result in exp1 for next step
    exp1.swap(lexp);
}
// Expand the resulting list for case (all stemdb content
// is lowercase)
lexp.clear();