Implement single-term query-time synonyms
This commit is contained in:
parent
f4ecd5c29e
commit
16b3396f12
7 changed files with 92 additions and 41 deletions
|
@ -1144,21 +1144,44 @@ string RclConfig::getMimeIconPath(const string &mtype, const string &apptag)
|
|||
return path_cat(iconpath, iconname) + ".png";
|
||||
}
|
||||
|
||||
string RclConfig::getDbDir() const
|
||||
// Return the path configured under varname. The value may be absolute or
|
||||
// relative to confdir; when varname is not set, dflt inside confdir is used.
|
||||
string RclConfig::getConfdirPath(const char *varname, const char *dflt) const
|
||||
{
|
||||
string dbdir;
|
||||
if (!getConfParam("dbdir", dbdir)) {
|
||||
LOGERR(("RclConfig::getDbDir: no db directory in configuration\n"));
|
||||
string result;
|
||||
if (!getConfParam(varname, result)) {
|
||||
result = path_cat(getConfDir(), dflt);
|
||||
} else {
|
||||
dbdir = path_tildexpand(dbdir);
|
||||
result = path_tildexpand(result);
|
||||
// If not an absolute path, compute relative to config dir
|
||||
if (dbdir.at(0) != '/') {
|
||||
LOGDEB1(("Dbdir not abs, catting with confdir\n"));
|
||||
dbdir = path_cat(getConfDir(), dbdir);
|
||||
if (result.at(0) != '/') {
|
||||
result = path_cat(getConfDir(), result);
|
||||
}
|
||||
}
|
||||
LOGDEB1(("RclConfig::getDbDir: dbdir: [%s]\n", dbdir.c_str()));
|
||||
return path_canon(dbdir);
|
||||
return path_canon(result);
|
||||
|
||||
}
|
||||
|
||||
// Return the index database directory: the "dbdir" configuration value
// resolved through getConfdirPath(), with "xapiandb" passed as the default name.
string RclConfig::getDbDir() const
|
||||
{
|
||||
return getConfdirPath("dbdir", "xapiandb");
|
||||
}
|
||||
|
||||
// Return the stop list file path: the "stoplistfile" configuration value
// resolved through getConfdirPath(), with "stoplist.txt" passed as the default name.
string RclConfig::getStopfile() const
|
||||
{
|
||||
return getConfdirPath("stoplistfile", "stoplist.txt");
|
||||
}
|
||||
|
||||
// Return the synonym groups file path: the "syngroupsfile" configuration value
// resolved through getConfdirPath(), with "syngroups.txt" passed as the default name.
string RclConfig::getSynGroupsFile() const
|
||||
{
|
||||
return getConfdirPath("syngroupsfile", "syngroups.txt");
|
||||
}
|
||||
|
||||
// Return the index status file path: the "idxstatusfile" configuration value
// resolved through getConfdirPath(), with "idxstatus.txt" passed as the default name.
// The index status file changes quickly, so it can be placed outside
|
||||
// of the config directory (meant for SSDs; it is unclear whether this really helps).
|
||||
string RclConfig::getIdxStatusFile() const
|
||||
{
|
||||
return getConfdirPath("idxstatusfile", "idxstatus.txt");
|
||||
}
|
||||
|
||||
void RclConfig::urlrewrite(const string& dbdir, string& url) const
|
||||
|
@ -1213,32 +1236,11 @@ bool RclConfig::sourceChanged() const
|
|||
return false;
|
||||
}
|
||||
|
||||
// Return the stop list file path: getConfDir() and "stoplist.txt" joined
// with path_cat(). No configuration parameter is consulted in this version.
string RclConfig::getStopfile() const
|
||||
{
|
||||
return path_cat(getConfDir(), "stoplist.txt");
|
||||
}
|
||||
// Return the indexing pid file path: getConfDir() and "index.pid" joined
// with path_cat().
string RclConfig::getPidfile() const
|
||||
{
|
||||
return path_cat(getConfDir(), "index.pid");
|
||||
}
|
||||
|
||||
// Return the index status file path.
// The index status file changes quickly, so it can be placed outside
|
||||
// of the config directory (meant for SSDs; it is unclear whether this really helps).
|
||||
string RclConfig::getIdxStatusFile() const
|
||||
{
|
||||
string path;
|
||||
// No "idxstatusfile" setting: use idxstatus.txt joined to the config dir
if (!getConfParam("idxstatusfile", path)) {
|
||||
return path_cat(getConfDir(), "idxstatus.txt");
|
||||
} else {
|
||||
// Expand a leading tilde in the configured value
path = path_tildexpand(path);
|
||||
// If not an absolute path, compute relative to config dir
|
||||
// NOTE(review): if path is a std::string, at(0) throws std::out_of_range
// when the configured value is empty -- confirm this cannot happen.
if (path.at(0) != '/') {
|
||||
path = path_cat(getConfDir(), path);
|
||||
}
|
||||
// Only the configured (non-default) path goes through path_canon()
return path_canon(path);
|
||||
}
|
||||
}
|
||||
|
||||
string RclConfig::getWebQueueDir() const
|
||||
{
|
||||
string webqueuedir;
|
||||
|
|
|
@ -171,10 +171,13 @@ class RclConfig {
|
|||
* need for other status */
|
||||
vector<string> getTopdirs() const;
|
||||
|
||||
string getConfdirPath(const char *varname, const char *dflt) const;
|
||||
/** Get database directory */
|
||||
string getDbDir() const;
|
||||
/** Get stoplist file name */
|
||||
string getStopfile() const;
|
||||
/** Get synonym groups file name */
|
||||
string getSynGroupsFile() const;
|
||||
/** Get indexing pid file name */
|
||||
string getPidfile() const;
|
||||
/** Get indexing status file name */
|
||||
|
|
|
@ -63,20 +63,29 @@ SynGroups::~SynGroups()
|
|||
|
||||
const int LL = 1024;
|
||||
|
||||
SynGroups::SynGroups(const string& fn)
|
||||
// Default constructor: allocates the private Internal data holder. No
// synonyms file is read here; see setfile().
SynGroups::SynGroups()
|
||||
: m(new Internal)
|
||||
{
|
||||
}
|
||||
|
||||
bool SynGroups::setfile(const string& fn)
|
||||
{
|
||||
LOGDEB(("SynGroups::setfile(%s)\n", fn.c_str()));
|
||||
if (!m) {
|
||||
LOGERR(("SynGroups::SynGroups:: new Internal failed: no mem ?\n"));
|
||||
return;
|
||||
LOGERR(("SynGroups:setfile:: new Internal failed: no mem ?\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Don't set ok to true.
|
||||
if (fn.empty())
|
||||
return true;
|
||||
|
||||
ifstream input;
|
||||
input.open(fn.c_str(), ios::in);
|
||||
if (!input.is_open()) {
|
||||
LOGERR(("SynGroups::SynGroups:: could not open %s errno %d\n",
|
||||
LOGERR(("SynGroups:setfile:: could not open %s errno %d\n",
|
||||
fn.c_str(), errno));
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
char cline[LL];
|
||||
|
@ -91,7 +100,7 @@ SynGroups::SynGroups(const string& fn)
|
|||
if (!input.good()) {
|
||||
if (input.bad()) {
|
||||
LOGDEB(("Parse: input.bad()\n"));
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
// Must be eof ? But maybe we have a partial line which
|
||||
// must be processed. This happens if the last line before
|
||||
|
@ -130,7 +139,7 @@ SynGroups::SynGroups(const string& fn)
|
|||
|
||||
vector<string> words;
|
||||
if (!stringToStrings(line, words)) {
|
||||
LOGERR(("SynGroups::SynGroups: %s: bad line %d: %s\n",
|
||||
LOGERR(("SynGroups:setfile: %s: bad line %d: %s\n",
|
||||
fn.c_str(), lnum, line.c_str()));
|
||||
continue;
|
||||
}
|
||||
|
@ -138,7 +147,7 @@ SynGroups::SynGroups(const string& fn)
|
|||
if (words.empty())
|
||||
continue;
|
||||
if (words.size() == 1) {
|
||||
LOGDEB(("SynGroups::SynGroups: single term group at line %d ??\n",
|
||||
LOGDEB(("SynGroups:setfile: single term group at line %d ??\n",
|
||||
lnum));
|
||||
continue;
|
||||
}
|
||||
|
@ -148,8 +157,11 @@ SynGroups::SynGroups(const string& fn)
|
|||
it != words.end(); it++) {
|
||||
m->terms[*it] = lnum;
|
||||
}
|
||||
LOGDEB(("SynGroups::setfile: group: [%s]\n",
|
||||
stringsToString(m->groups[lnum]).c_str()));
|
||||
}
|
||||
m->ok = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
vector<string> SynGroups::getgroup(const string& term)
|
||||
|
|
|
@ -26,8 +26,9 @@
|
|||
// in a group are equivalent.
|
||||
class SynGroups {
|
||||
public:
|
||||
SynGroups(const std::string& fname);
|
||||
SynGroups();
|
||||
~SynGroups();
|
||||
bool setfile(const std::string& fname);
|
||||
std::vector<std::string> getgroup(const std::string& term);
|
||||
bool ok();
|
||||
private:
|
||||
|
|
|
@ -762,6 +762,10 @@ bool Db::open(OpenMode mode, OpenError *error)
|
|||
}
|
||||
if (!m_config->getStopfile().empty())
|
||||
m_stops.setFile(m_config->getStopfile());
|
||||
// Synonyms are only used at query time for now
|
||||
if (mode == DbRO)
|
||||
m_syngroups.setfile(m_config->getSynGroupsFile());
|
||||
|
||||
string dir = m_config->getDbDir();
|
||||
string ermsg;
|
||||
try {
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "rclconfig.h"
|
||||
#include "utf8iter.h"
|
||||
#include "textsplit.h"
|
||||
#include "syngroups.h"
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
@ -480,6 +481,9 @@ private:
|
|||
* after init */
|
||||
// Stop terms: those don't get indexed.
|
||||
StopList m_stops;
|
||||
// Synonym groups
|
||||
SynGroups m_syngroups;
|
||||
|
||||
// Truncation length for stored meta fields
|
||||
int m_idxMetaStoredLen;
|
||||
// This is how long an abstract we keep or build from beginning of
|
||||
|
|
|
@ -235,7 +235,8 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
|||
}
|
||||
|
||||
} else {
|
||||
// Expansion is STEM or NONE (which may still need case/diac exp)
|
||||
// Expansion is STEM or NONE (which may still need synonyms
|
||||
// and case/diac exp)
|
||||
|
||||
vector<string> lexp;
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
|
@ -273,6 +274,30 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
|
|||
}
|
||||
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
||||
|
||||
lexp.clear();
|
||||
// Expand the result for synonyms. Note that doing it here
|
||||
// means that multi-term synonyms will not work
|
||||
// (e.g. stakhanovist -> "hard at work"). We would have to
|
||||
// separate the multi-word expansions for our caller to
|
||||
// add them as phrases to the query. Not impossible, but
|
||||
// let's keep it at single words for now.
|
||||
if (m_syngroups.ok()) {
|
||||
LOGDEB(("ExpTerm: got syngroups\n"));
|
||||
for (vector<string>::const_iterator it = exp1.begin();
|
||||
it != exp1.end(); it++) {
|
||||
vector<string> sg = m_syngroups.getgroup(*it);
|
||||
if (!sg.empty()) {
|
||||
LOGDEB(("ExpTerm: syns: %s -> %s\n",
|
||||
it->c_str(), stringsToString(sg).c_str()));
|
||||
lexp.insert(lexp.end(), sg.begin(), sg.end());
|
||||
}
|
||||
}
|
||||
sort(lexp.begin(), lexp.end());
|
||||
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
||||
// Keep result in exp1 for next step
|
||||
exp1.swap(lexp);
|
||||
}
|
||||
|
||||
// Expand the resulting list for case (all stemdb content
|
||||
// is lowercase)
|
||||
lexp.clear();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue