Implement single-term query-time synonyms
This commit is contained in:
parent
f4ecd5c29e
commit
16b3396f12
7 changed files with 92 additions and 41 deletions
|
@ -1144,21 +1144,44 @@ string RclConfig::getMimeIconPath(const string &mtype, const string &apptag)
|
||||||
return path_cat(iconpath, iconname) + ".png";
|
return path_cat(iconpath, iconname) + ".png";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return path defined by varname. May be absolute or relative to
|
||||||
|
// confdir, with default in confdir
|
||||||
|
string RclConfig::getConfdirPath(const char *varname, const char *dflt) const
|
||||||
|
{
|
||||||
|
string result;
|
||||||
|
if (!getConfParam(varname, result)) {
|
||||||
|
result = path_cat(getConfDir(), dflt);
|
||||||
|
} else {
|
||||||
|
result = path_tildexpand(result);
|
||||||
|
// If not an absolute path, compute relative to config dir
|
||||||
|
if (result.at(0) != '/') {
|
||||||
|
result = path_cat(getConfDir(), result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return path_canon(result);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
string RclConfig::getDbDir() const
|
string RclConfig::getDbDir() const
|
||||||
{
|
{
|
||||||
string dbdir;
|
return getConfdirPath("dbdir", "xapiandb");
|
||||||
if (!getConfParam("dbdir", dbdir)) {
|
|
||||||
LOGERR(("RclConfig::getDbDir: no db directory in configuration\n"));
|
|
||||||
} else {
|
|
||||||
dbdir = path_tildexpand(dbdir);
|
|
||||||
// If not an absolute path, compute relative to config dir
|
|
||||||
if (dbdir.at(0) != '/') {
|
|
||||||
LOGDEB1(("Dbdir not abs, catting with confdir\n"));
|
|
||||||
dbdir = path_cat(getConfDir(), dbdir);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string RclConfig::getStopfile() const
|
||||||
|
{
|
||||||
|
return getConfdirPath("stoplistfile", "stoplist.txt");
|
||||||
}
|
}
|
||||||
LOGDEB1(("RclConfig::getDbDir: dbdir: [%s]\n", dbdir.c_str()));
|
|
||||||
return path_canon(dbdir);
|
// Get the synonym groups file name: the value of the "syngroupsfile"
// configuration variable as resolved by getConfdirPath(), with
// "syngroups.txt" as the default name.
string RclConfig::getSynGroupsFile() const
{
    static const char *const confvar = "syngroupsfile";
    static const char *const fallback = "syngroups.txt";
    return getConfdirPath(confvar, fallback);
}
|
||||||
|
|
||||||
|
// The index status file is fast changing, so it's possible to put it outside
|
||||||
|
// of the config directory (for ssds, not sure this is really useful).
|
||||||
|
string RclConfig::getIdxStatusFile() const
|
||||||
|
{
|
||||||
|
return getConfdirPath("idxstatusfile", "idxstatus.txt");
|
||||||
}
|
}
|
||||||
|
|
||||||
void RclConfig::urlrewrite(const string& dbdir, string& url) const
|
void RclConfig::urlrewrite(const string& dbdir, string& url) const
|
||||||
|
@ -1213,32 +1236,11 @@ bool RclConfig::sourceChanged() const
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
string RclConfig::getStopfile() const
|
|
||||||
{
|
|
||||||
return path_cat(getConfDir(), "stoplist.txt");
|
|
||||||
}
|
|
||||||
string RclConfig::getPidfile() const
|
string RclConfig::getPidfile() const
|
||||||
{
|
{
|
||||||
return path_cat(getConfDir(), "index.pid");
|
return path_cat(getConfDir(), "index.pid");
|
||||||
}
|
}
|
||||||
|
|
||||||
// The index status file is fast changing, so it's possible to put it outside
|
|
||||||
// of the config directory (for ssds, not sure this is really useful).
|
|
||||||
string RclConfig::getIdxStatusFile() const
|
|
||||||
{
|
|
||||||
string path;
|
|
||||||
if (!getConfParam("idxstatusfile", path)) {
|
|
||||||
return path_cat(getConfDir(), "idxstatus.txt");
|
|
||||||
} else {
|
|
||||||
path = path_tildexpand(path);
|
|
||||||
// If not an absolute path, compute relative to config dir
|
|
||||||
if (path.at(0) != '/') {
|
|
||||||
path = path_cat(getConfDir(), path);
|
|
||||||
}
|
|
||||||
return path_canon(path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
string RclConfig::getWebQueueDir() const
|
string RclConfig::getWebQueueDir() const
|
||||||
{
|
{
|
||||||
string webqueuedir;
|
string webqueuedir;
|
||||||
|
|
|
@ -171,10 +171,13 @@ class RclConfig {
|
||||||
* need for other status */
|
* need for other status */
|
||||||
vector<string> getTopdirs() const;
|
vector<string> getTopdirs() const;
|
||||||
|
|
||||||
|
string getConfdirPath(const char *varname, const char *dflt) const;
|
||||||
/** Get database directory */
|
/** Get database directory */
|
||||||
string getDbDir() const;
|
string getDbDir() const;
|
||||||
/** Get stoplist file name */
|
/** Get stoplist file name */
|
||||||
string getStopfile() const;
|
string getStopfile() const;
|
||||||
|
/** Get synonym groups file name */
|
||||||
|
string getSynGroupsFile() const;
|
||||||
/** Get indexing pid file name */
|
/** Get indexing pid file name */
|
||||||
string getPidfile() const;
|
string getPidfile() const;
|
||||||
/** Get indexing status file name */
|
/** Get indexing status file name */
|
||||||
|
|
|
@ -63,20 +63,29 @@ SynGroups::~SynGroups()
|
||||||
|
|
||||||
const int LL = 1024;
|
const int LL = 1024;
|
||||||
|
|
||||||
SynGroups::SynGroups(const string& fn)
|
SynGroups::SynGroups()
|
||||||
: m(new Internal)
|
: m(new Internal)
|
||||||
{
|
{
|
||||||
if (!m) {
|
|
||||||
LOGERR(("SynGroups::SynGroups:: new Internal failed: no mem ?\n"));
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool SynGroups::setfile(const string& fn)
|
||||||
|
{
|
||||||
|
LOGDEB(("SynGroups::setfile(%s)\n", fn.c_str()));
|
||||||
|
if (!m) {
|
||||||
|
LOGERR(("SynGroups:setfile:: new Internal failed: no mem ?\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Don't set ok to true.
|
||||||
|
if (fn.empty())
|
||||||
|
return true;
|
||||||
|
|
||||||
ifstream input;
|
ifstream input;
|
||||||
input.open(fn.c_str(), ios::in);
|
input.open(fn.c_str(), ios::in);
|
||||||
if (!input.is_open()) {
|
if (!input.is_open()) {
|
||||||
LOGERR(("SynGroups::SynGroups:: could not open %s errno %d\n",
|
LOGERR(("SynGroups:setfile:: could not open %s errno %d\n",
|
||||||
fn.c_str(), errno));
|
fn.c_str(), errno));
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
char cline[LL];
|
char cline[LL];
|
||||||
|
@ -91,7 +100,7 @@ SynGroups::SynGroups(const string& fn)
|
||||||
if (!input.good()) {
|
if (!input.good()) {
|
||||||
if (input.bad()) {
|
if (input.bad()) {
|
||||||
LOGDEB(("Parse: input.bad()\n"));
|
LOGDEB(("Parse: input.bad()\n"));
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
// Must be eof ? But maybe we have a partial line which
|
// Must be eof ? But maybe we have a partial line which
|
||||||
// must be processed. This happens if the last line before
|
// must be processed. This happens if the last line before
|
||||||
|
@ -130,7 +139,7 @@ SynGroups::SynGroups(const string& fn)
|
||||||
|
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
if (!stringToStrings(line, words)) {
|
if (!stringToStrings(line, words)) {
|
||||||
LOGERR(("SynGroups::SynGroups: %s: bad line %d: %s\n",
|
LOGERR(("SynGroups:setfile: %s: bad line %d: %s\n",
|
||||||
fn.c_str(), lnum, line.c_str()));
|
fn.c_str(), lnum, line.c_str()));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -138,7 +147,7 @@ SynGroups::SynGroups(const string& fn)
|
||||||
if (words.empty())
|
if (words.empty())
|
||||||
continue;
|
continue;
|
||||||
if (words.size() == 1) {
|
if (words.size() == 1) {
|
||||||
LOGDEB(("SynGroups::SynGroups: single term group at line %d ??\n",
|
LOGDEB(("SynGroups:setfile: single term group at line %d ??\n",
|
||||||
lnum));
|
lnum));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -148,8 +157,11 @@ SynGroups::SynGroups(const string& fn)
|
||||||
it != words.end(); it++) {
|
it != words.end(); it++) {
|
||||||
m->terms[*it] = lnum;
|
m->terms[*it] = lnum;
|
||||||
}
|
}
|
||||||
|
LOGDEB(("SynGroups::setfile: group: [%s]\n",
|
||||||
|
stringsToString(m->groups[lnum]).c_str()));
|
||||||
}
|
}
|
||||||
m->ok = true;
|
m->ok = true;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<string> SynGroups::getgroup(const string& term)
|
vector<string> SynGroups::getgroup(const string& term)
|
||||||
|
|
|
@ -26,8 +26,9 @@
|
||||||
// in a group are equivalent.
|
// in a group are equivalent.
|
||||||
class SynGroups {
|
class SynGroups {
|
||||||
public:
|
public:
|
||||||
SynGroups(const std::string& fname);
|
SynGroups();
|
||||||
~SynGroups();
|
~SynGroups();
|
||||||
|
bool setfile(const std::string& fname);
|
||||||
std::vector<std::string> getgroup(const std::string& term);
|
std::vector<std::string> getgroup(const std::string& term);
|
||||||
bool ok();
|
bool ok();
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -762,6 +762,10 @@ bool Db::open(OpenMode mode, OpenError *error)
|
||||||
}
|
}
|
||||||
if (!m_config->getStopfile().empty())
|
if (!m_config->getStopfile().empty())
|
||||||
m_stops.setFile(m_config->getStopfile());
|
m_stops.setFile(m_config->getStopfile());
|
||||||
|
// Synonyms are only used at query time for now
|
||||||
|
if (mode == DbRO)
|
||||||
|
m_syngroups.setfile(m_config->getSynGroupsFile());
|
||||||
|
|
||||||
string dir = m_config->getDbDir();
|
string dir = m_config->getDbDir();
|
||||||
string ermsg;
|
string ermsg;
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -29,6 +29,7 @@
|
||||||
#include "rclconfig.h"
|
#include "rclconfig.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
|
#include "syngroups.h"
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
@ -480,6 +481,9 @@ private:
|
||||||
* after init */
|
* after init */
|
||||||
// Stop terms: those don't get indexed.
|
// Stop terms: those don't get indexed.
|
||||||
StopList m_stops;
|
StopList m_stops;
|
||||||
|
// Synonym groups
|
||||||
|
SynGroups m_syngroups;
|
||||||
|
|
||||||
// Truncation length for stored meta fields
|
// Truncation length for stored meta fields
|
||||||
int m_idxMetaStoredLen;
|
int m_idxMetaStoredLen;
|
||||||
// This is how long an abstract we keep or build from beginning of
|
// This is how long an abstract we keep or build from beginning of
|
||||||
|
|
|
@ -235,7 +235,8 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// Expansion is STEM or NONE (which may still need case/diac exp)
|
// Expansion is STEM or NONE (which may still need synonyms
|
||||||
|
// and case/diac exp)
|
||||||
|
|
||||||
vector<string> lexp;
|
vector<string> lexp;
|
||||||
if (diac_sensitive && case_sensitive) {
|
if (diac_sensitive && case_sensitive) {
|
||||||
|
@ -273,6 +274,30 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
||||||
}
|
}
|
||||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||||
|
|
||||||
|
lexp.clear();
|
||||||
|
// Expand the result for synonyms. Note that doing it here
|
||||||
|
// means that multi-term synonyms will not work
|
||||||
|
// (e.g. stakhanovist -> "hard at work"). We would have to
|
||||||
|
// separate the multi-word expansions for our caller to
|
||||||
|
// add them as phrases to the query. Not impossible, but
|
||||||
|
// let's keep it at single words for now.
|
||||||
|
if (m_syngroups.ok()) {
|
||||||
|
LOGDEB(("ExpTerm: got syngroups\n"));
|
||||||
|
for (vector<string>::const_iterator it = exp1.begin();
|
||||||
|
it != exp1.end(); it++) {
|
||||||
|
vector<string> sg = m_syngroups.getgroup(*it);
|
||||||
|
if (!sg.empty()) {
|
||||||
|
LOGDEB(("ExpTerm: syns: %s -> %s\n",
|
||||||
|
it->c_str(), stringsToString(sg).c_str()));
|
||||||
|
lexp.insert(lexp.end(), sg.begin(), sg.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort(lexp.begin(), lexp.end());
|
||||||
|
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||||
|
// Keep result in exp1 for next step
|
||||||
|
exp1.swap(lexp);
|
||||||
|
}
|
||||||
|
|
||||||
// Expand the resulting list for case (all stemdb content
|
// Expand the resulting list for case (all stemdb content
|
||||||
// is lowercase)
|
// is lowercase)
|
||||||
lexp.clear();
|
lexp.clear();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue