diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp
index 298a1702..fe26e9d4 100644
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -1144,21 +1144,44 @@ string RclConfig::getMimeIconPath(const string &mtype, const string &apptag)
     return path_cat(iconpath, iconname) + ".png";
 }
 
-string RclConfig::getDbDir() const
+// Return path defined by varname. May be absolute or relative to
+// confdir. Defaults to dflt inside confdir if unset or set to empty
+string RclConfig::getConfdirPath(const char *varname, const char *dflt) const
 {
-    string dbdir;
-    if (!getConfParam("dbdir", dbdir)) {
-        LOGERR(("RclConfig::getDbDir: no db directory in configuration\n"));
+    string result;
+    if (!getConfParam(varname, result) || result.empty()) {
+        result = path_cat(getConfDir(), dflt);
     } else {
-        dbdir = path_tildexpand(dbdir);
+        result = path_tildexpand(result);
         // If not an absolute path, compute relative to config dir
-        if (dbdir.at(0) != '/') {
-            LOGDEB1(("Dbdir not abs, catting with confdir\n"));
-            dbdir = path_cat(getConfDir(), dbdir);
+        if (result.empty() || result[0] != '/') {
+            result = path_cat(getConfDir(), result);
         }
     }
-    LOGDEB1(("RclConfig::getDbDir: dbdir: [%s]\n", dbdir.c_str()));
-    return path_canon(dbdir);
+    return path_canon(result);
+
+}
+
+string RclConfig::getDbDir() const
+{
+    return getConfdirPath("dbdir", "xapiandb");
+}
+
+string RclConfig::getStopfile() const
+{
+    return getConfdirPath("stoplistfile", "stoplist.txt");
+}
+
+string RclConfig::getSynGroupsFile() const
+{
+    return getConfdirPath("syngroupsfile", "syngroups.txt");
+}
+
+// The index status file is fast changing, so it's possible to put it outside
+// of the config directory (for ssds, not sure this is really useful).
+string RclConfig::getIdxStatusFile() const
+{
+    return getConfdirPath("idxstatusfile", "idxstatus.txt");
 }
 
 void RclConfig::urlrewrite(const string& dbdir, string& url) const
@@ -1213,32 +1236,11 @@ bool RclConfig::sourceChanged() const
     return false;
 }
 
-string RclConfig::getStopfile() const
-{
-    return path_cat(getConfDir(), "stoplist.txt");
-}
 string RclConfig::getPidfile() const
 {
     return path_cat(getConfDir(), "index.pid");
 }
 
-// The index status file is fast changing, so it's possible to put it outside
-// of the config directory (for ssds, not sure this is really useful).
-string RclConfig::getIdxStatusFile() const
-{
-    string path;
-    if (!getConfParam("idxstatusfile", path)) {
-        return path_cat(getConfDir(), "idxstatus.txt");
-    } else {
-        path = path_tildexpand(path);
-        // If not an absolute path, compute relative to config dir
-        if (path.at(0) != '/') {
-            path = path_cat(getConfDir(), path);
-        }
-        return path_canon(path);
-    }
-}
-
 string RclConfig::getWebQueueDir() const
 {
     string webqueuedir;
diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h
index be26eaa1..4a371d50 100644
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@@ -171,10 +171,15 @@ class RclConfig {
      * need for other status */
     vector<string> getTopdirs() const;
 
+    /** Get the path configured by varname: absolute, or relative to the
+     * configuration directory. Defaults to dflt inside the latter */
+    string getConfdirPath(const char *varname, const char *dflt) const;
     /** Get database directory */
     string getDbDir() const;
     /** Get stoplist file name */
     string getStopfile() const;
+    /** Get synonym groups file name */
+    string getSynGroupsFile() const;
     /** Get indexing pid file name */
     string getPidfile() const;
     /** Get indexing status file name */
diff --git a/src/common/syngroups.cpp b/src/common/syngroups.cpp
index e72cf122..e6da110b 100644
--- a/src/common/syngroups.cpp
+++ b/src/common/syngroups.cpp
@@ -63,20 +63,29 @@ SynGroups::~SynGroups()
 
 const int LL = 1024;
 
-SynGroups::SynGroups(const string& fn)
+SynGroups::SynGroups()
     : m(new Internal)
 {
+}
+
+bool SynGroups::setfile(const string& fn)
+{
+    LOGDEB(("SynGroups::setfile(%s)\n", fn.c_str()));
     if (!m) {
-        LOGERR(("SynGroups::SynGroups:: new Internal failed: no mem ?\n"));
-        return;
+        LOGERR(("SynGroups:setfile:: new Internal failed: no mem ?\n"));
+        return false;
     }
 
+    // Don't set ok to true.
+    if (fn.empty())
+        return true;
+
     ifstream input;
     input.open(fn.c_str(), ios::in);
     if (!input.is_open()) {
-        LOGERR(("SynGroups::SynGroups:: could not open %s errno %d\n",
+        LOGERR(("SynGroups:setfile:: could not open %s errno %d\n",
                 fn.c_str(), errno));
-        return;
+        return false;
     }
 
     char cline[LL];
@@ -91,7 +100,7 @@ SynGroups::SynGroups(const string& fn)
         if (!input.good()) {
             if (input.bad()) {
                 LOGDEB(("Parse: input.bad()\n"));
-                return;
+                return false;
             }
             // Must be eof ? But maybe we have a partial line which
             // must be processed. This happens if the last line before
@@ -130,7 +139,7 @@ SynGroups::SynGroups(const string& fn)
 
         vector<string> words;
         if (!stringToStrings(line, words)) {
-            LOGERR(("SynGroups::SynGroups: %s: bad line %d: %s\n",
+            LOGERR(("SynGroups:setfile: %s: bad line %d: %s\n",
                     fn.c_str(), lnum, line.c_str()));
             continue;
         }
@@ -138,7 +147,7 @@ SynGroups::SynGroups(const string& fn)
         if (words.empty())
             continue;
         if (words.size() == 1) {
-            LOGDEB(("SynGroups::SynGroups: single term group at line %d ??\n",
+            LOGDEB(("SynGroups:setfile: single term group at line %d ??\n",
                     lnum));
             continue;
         }
@@ -148,8 +157,11 @@ SynGroups::SynGroups(const string& fn)
              it != words.end(); it++) {
             m->terms[*it] = lnum;
         }
+        LOGDEB(("SynGroups::setfile: group: [%s]\n",
+                stringsToString(m->groups[lnum]).c_str()));
     }
     m->ok = true;
+    return true;
 }
 
 vector<string> SynGroups::getgroup(const string& term)
diff --git a/src/common/syngroups.h b/src/common/syngroups.h
index 9f3c523b..62a71523 100644
--- a/src/common/syngroups.h
+++ b/src/common/syngroups.h
@@ -26,8 +26,11 @@
 // in a group are equivalent.
 class SynGroups {
 public:
-    SynGroups(const std::string& fname);
+    SynGroups();
     ~SynGroups();
+    /** Load the synonym groups from fname. Returns false on failure
+     * (e.g. the file can't be opened). An empty fname is not an error. */
+    bool setfile(const std::string& fname);
     std::vector<std::string> getgroup(const std::string& term);
     bool ok();
 private:
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 40d0e083..ef92b82a 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -762,6 +762,10 @@ bool Db::open(OpenMode mode, OpenError *error)
     }
     if (!m_config->getStopfile().empty())
         m_stops.setFile(m_config->getStopfile());
+    // Synonyms are only used at query time for now
+    if (mode == DbRO)
+        m_syngroups.setfile(m_config->getSynGroupsFile());
+
     string dir = m_config->getDbDir();
     string ermsg;
     try {
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 1d73ab4e..70e87dc6 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -29,6 +29,7 @@
 #include "rclconfig.h"
 #include "utf8iter.h"
 #include "textsplit.h"
+#include "syngroups.h"
 
 using std::string;
 using std::vector;
@@ -480,6 +481,9 @@ private:
      * after init */
     // Stop terms: those don't get indexed.
     StopList m_stops;
+    // Synonym groups
+    SynGroups m_syngroups;
+
     // Truncation length for stored meta fields
     int m_idxMetaStoredLen;
     // This is how long an abstract we keep or build from beginning of
diff --git a/src/rcldb/rclterms.cpp b/src/rcldb/rclterms.cpp
index 132e509d..8d0e4abd 100644
--- a/src/rcldb/rclterms.cpp
+++ b/src/rcldb/rclterms.cpp
@@ -235,7 +235,8 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
         }
 
     } else {
-        // Expansion is STEM or NONE (which may still need case/diac exp)
+        // Expansion is STEM or NONE (which may still need synonyms
+        // and case/diac exp)
 
         vector<string> lexp;
         if (diac_sensitive && case_sensitive) {
@@ -273,6 +274,32 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
             }
             LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
 
+            // Expand the result for synonyms. Note that doing it here
+            // means that multi-term synonyms will not work
+            // (e.g. stakhanovist -> "hard at work"). We would have to
+            // separate the multi-word expansions for our caller to
+            // add them as phrases to the query. Not impossible, but
+            // let's keep it at single words for now.
+            if (m_syngroups.ok()) {
+                LOGDEB(("ExpTerm: got syngroups\n"));
+                // Start from the current terms: those which belong to no
+                // synonym group must be kept in the expansion.
+                lexp = exp1;
+                for (vector<string>::const_iterator it = exp1.begin();
+                     it != exp1.end(); it++) {
+                    vector<string> sg = m_syngroups.getgroup(*it);
+                    if (!sg.empty()) {
+                        LOGDEB(("ExpTerm: syns: %s -> %s\n",
+                                it->c_str(), stringsToString(sg).c_str()));
+                        lexp.insert(lexp.end(), sg.begin(), sg.end());
+                    }
+                }
+                sort(lexp.begin(), lexp.end());
+                lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
+                // Keep result in exp1 for next step
+                exp1.swap(lexp);
+            }
+
             // Expand the resulting list for case (all stemdb content
             // is lowercase)
             lexp.clear();