/* Copyright (C) 2004 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "autoconfig.h" #include #include #include #include #include #include #include #include #include using namespace std; #include "xapian.h" #include "rclconfig.h" #include "debuglog.h" #include "rcldb.h" #include "rcldb_p.h" #include "stemdb.h" #include "textsplit.h" #include "transcode.h" #include "unacpp.h" #include "conftree.h" #include "pathut.h" #include "smallut.h" #include "utf8iter.h" #include "searchdata.h" #include "rclquery.h" #include "rclquery_p.h" #include "md5.h" #include "rclversion.h" #include "cancelcheck.h" #include "ptmutex.h" #include "termproc.h" #include "expansiondbs.h" #include "rclinit.h" // Recoll index format version is stored in user metadata. When this change, // we can't open the db and will have to reindex. static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY"); static const string cstr_RCL_IDX_VERSION("1"); static const string cstr_mbreaks("rclmbreaks"); namespace Rcl { // Some prefixes that we could get from the fields file, but are not going // to ever change. static const string fileext_prefix = "XE"; const string mimetype_prefix = "T"; static const string xapday_prefix = "D"; static const string xapmonth_prefix = "M"; static const string xapyear_prefix = "Y"; const string pathelt_prefix = "XP"; const string udi_prefix("Q"); const string parent_prefix("F"); // Special terms to mark begin/end of field (for anchored searches), and // page breaks string start_of_field_term; string end_of_field_term; const string page_break_term = "XXPG/"; // Special term to mark documents with children. const string has_children_term("XXC/"); // Field name for the unsplit file name. Has to exist in the field file // because of usage in termmatch() const string unsplitFilenameFieldName = "rclUnsplitFN"; static const string unsplitfilename_prefix = "XSFS"; // Empty string md5s static const string cstr_md5empty("d41d8cd98f00b204e9800998ecf8427e"); static const int MB = 1024 * 1024; string version_string(){ return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") + string(Xapian::version_string()); } // Synthetic abstract marker (to discriminate from abstract actually // found in document) static const string cstr_syntAbs("?!#@"); // Compute the unique term used to link documents to their origin. // "Q" + external udi static inline string make_uniterm(const string& udi) { string uniterm(wrap_prefix(udi_prefix)); uniterm.append(udi); return uniterm; } // Compute parent term used to link documents to their parent document (if any) // "F" + parent external udi static inline string make_parentterm(const string& udi) { // I prefer to be in possible conflict with omega than with // user-defined fields (Xxxx) that we also allow. "F" is currently // not used by omega (2008-07) string pterm(wrap_prefix(parent_prefix)); pterm.append(udi); return pterm; } Db::Native::Native(Db *db) : m_rcldb(db), m_isopen(false), m_iswritable(false), m_noversionwrite(false) #ifdef IDX_THREADS , m_wqueue("DbUpd", m_rcldb->m_config->getThrConf(RclConfig::ThrDbWrite).first), m_loglevel(4), m_totalworkns(0LL), m_havewriteq(false) #endif // IDX_THREADS { LOGDEB1(("Native::Native: me %p\n", this)); } Db::Native::~Native() { LOGDEB1(("Native::~Native: me %p\n", this)); #ifdef IDX_THREADS if (m_havewriteq) { void *status = m_wqueue.setTerminateAndWait(); LOGDEB2(("Native::~Native: worker status %ld\n", long(status))); } #endif // IDX_THREADS } #ifdef IDX_THREADS void *DbUpdWorker(void* vdbp) { recoll_threadinit(); Db::Native *ndbp = (Db::Native *)vdbp; WorkQueue *tqp = &(ndbp->m_wqueue); DebugLog::getdbl()->setloglevel(ndbp->m_loglevel); DbUpdTask *tsk = 0; for (;;) { size_t qsz = -1; if (!tqp->take(&tsk, &qsz)) { tqp->workerExit(); return (void*)1; } bool status = false; switch (tsk->op) { case DbUpdTask::AddOrUpdate: LOGDEB(("DbUpdWorker: got add/update task, ql %d\n", int(qsz))); status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen); break; case DbUpdTask::Delete: LOGDEB(("DbUpdWorker: got delete task, ql %d\n", int(qsz))); status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm); break; case DbUpdTask::PurgeOrphans: LOGDEB(("DbUpdWorker: got orphans purge task, ql %d\n", int(qsz))); status = ndbp->purgeFileWrite(true, tsk->udi, tsk->uniterm); break; default: LOGERR(("DbUpdWorker: unknown op %d !!\n", tsk->op)); break; } if (!status) { LOGERR(("DbUpdWorker: xxWrite failed\n")); tqp->workerExit(); delete tsk; return (void*)0; } delete tsk; } } void Db::Native::maybeStartThreads() { m_loglevel = DebugLog::getdbl()->getlevel(); m_havewriteq = false; const RclConfig *cnf = m_rcldb->m_config; int writeqlen = cnf->getThrConf(RclConfig::ThrDbWrite).first; int writethreads = cnf->getThrConf(RclConfig::ThrDbWrite).second; if (writethreads > 1) { LOGINFO(("RclDb: write threads count was forced down to 1\n")); writethreads = 1; } if (writeqlen >= 0 && writethreads > 0) { if (!m_wqueue.start(writethreads, DbUpdWorker, this)) { LOGERR(("Db::Db: Worker start failed\n")); return; } m_havewriteq = true; } LOGDEB(("RclDb:: threads: haveWriteQ %d, wqlen %d wqts %d\n", m_havewriteq, writeqlen, writethreads)); } #endif // IDX_THREADS /* See comment in class declaration: return all subdocuments of a * document given by its unique id. */ bool Db::Native::subDocs(const string &udi, int idxi, vector& docids) { LOGDEB2(("subDocs: [%s]\n", uniterm.c_str())); string pterm = make_parentterm(udi); vector candidates; XAPTRY(docids.clear(); candidates.insert(candidates.begin(), xrdb.postlist_begin(pterm), xrdb.postlist_end(pterm)), xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { LOGERR(("Rcl::Db::subDocs: %s\n", m_rcldb->m_reason.c_str())); return false; } else { for (unsigned int i = 0; i < candidates.size(); i++) { if (whatDbIdx(candidates[i]) == (size_t)idxi) { docids.push_back(candidates[i]); } } LOGDEB0(("Db::Native::subDocs: returning %d ids\n", docids.size())); return true; } } bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi) { Xapian::TermIterator xit; XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(wrap_prefix(udi_prefix)), xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { LOGERR(("xdocToUdi: xapian error: %s\n", m_rcldb->m_reason.c_str())); return false; } if (xit != xdoc.termlist_end()) { udi = *xit; if (!udi.empty()) { udi = udi.substr(wrap_prefix(udi_prefix).size()); return true; } } return false; } // Clear term from document if its frequency is 0. This should // probably be done by Xapian when the freq goes to 0 when removing a // posting, but we have to do it ourselves bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term) { LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str())); // Find the term Xapian::TermIterator xit; XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);, xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n", term.c_str(), m_rcldb->m_reason.c_str())); return false; } if (xit == xdoc.termlist_end() || term.compare(*xit)) { LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n", term.c_str(), xit == xdoc.termlist_end() ? "EOL":(*xit).c_str())); return false; } // Clear the term if its frequency is 0 if (xit.get_wdf() == 0) { LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str())); XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n", term.c_str(), m_rcldb->m_reason.c_str())); } } return true; } // Holder for term + pos struct DocPosting { DocPosting(string t, Xapian::termpos ps) : term(t), pos(ps) {} string term; Xapian::termpos pos; }; // Clear all terms for given field for given document. // The terms to be cleared are all those with the appropriate // prefix. We also remove the postings for the unprefixed terms (that // is, we undo what we did when indexing). bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx, Xapian::termcount wdfdec) { LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n", pfx.c_str(), unsigned(xdoc.get_docid()))); vector eraselist; string wrapd = wrap_prefix(pfx); m_rcldb->m_reason.clear(); for (int tries = 0; tries < 2; tries++) { try { Xapian::TermIterator xit; xit = xdoc.termlist_begin(); xit.skip_to(wrapd); while (xit != xdoc.termlist_end() && !(*xit).compare(0, wrapd.size(), wrapd)) { LOGDEB1(("Db::clearfield: erasing for [%s]\n", (*xit).c_str())); Xapian::PositionIterator posit; for (posit = xit.positionlist_begin(); posit != xit.positionlist_end(); posit++) { eraselist.push_back(DocPosting(*xit, *posit)); eraselist.push_back(DocPosting(strip_prefix(*xit), *posit)); } xit++; } } catch (const Xapian::DatabaseModifiedError &e) { m_rcldb->m_reason = e.get_msg(); xrdb.reopen(); continue; } XCATCHERROR(m_rcldb->m_reason); break; } if (!m_rcldb->m_reason.empty()) { LOGERR(("Db::clearField: failed building erase list: %s\n", m_rcldb->m_reason.c_str())); return false; } // Now remove the found positions, and the terms if the wdf is 0 for (vector::const_iterator it = eraselist.begin(); it != eraselist.end(); it++) { LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n", it->term.c_str(), int(it->pos))); XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);, xwdb,m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { // Not that this normally fails for non-prefixed XXST and // ND, don't make a fuss LOGDEB1(("Db::clearFiedl: remove_posting failed for [%s],%d: %s\n", it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str())); } clearDocTermIfWdf0(xdoc, it->term); } return true; } // Check if doc given by udi is indexed by term bool Db::Native::hasTerm(const string& udi, int idxi, const string& term) { LOGDEB2(("Native::hasTerm: udi [%s] term [%s]\n",udi.c_str(),term.c_str())); Xapian::Document xdoc; if (getDoc(udi, idxi, xdoc)) { Xapian::TermIterator xit; XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);, xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { LOGERR(("Rcl::Native::hasTerm: %s\n", m_rcldb->m_reason.c_str())); return false; } if (xit != xdoc.termlist_end() && !term.compare(*xit)) { return true; } } return false; } // Retrieve Xapian document, given udi. There may be several identical udis // if we are using multiple indexes. Xapian::docid Db::Native::getDoc(const string& udi, int idxi, Xapian::Document& xdoc) { string uniterm = make_uniterm(udi); for (int tries = 0; tries < 2; tries++) { try { Xapian::PostingIterator docid; for (docid = xrdb.postlist_begin(uniterm); docid != xrdb.postlist_end(uniterm); docid++) { xdoc = xrdb.get_document(*docid); if (whatDbIdx(*docid) == (size_t)idxi) return *docid; } // Udi not in Db. return 0; } catch (const Xapian::DatabaseModifiedError &e) { m_rcldb->m_reason = e.get_msg(); xrdb.reopen(); continue; } XCATCHERROR(m_rcldb->m_reason); break; } LOGERR(("Db::Native::getDoc: Xapian error: %s\n", m_rcldb->m_reason.c_str())); return 0; } // Turn data record from db into document fields bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc) { LOGDEB2(("Db::dbDataToRclDoc: data:\n%s\n", data.c_str())); ConfSimple parms(data); if (!parms.ok()) return false; doc.xdocid = docid; doc.haspages = hasPages(docid); // Compute what index this comes from, and check for path translations string dbdir = m_rcldb->m_basedir; doc.idxi = 0; if (!m_rcldb->m_extraDbs.empty()) { unsigned int idxi = whatDbIdx(docid); // idxi is in [0, extraDbs.size()]. 0 is for the main index, // idxi-1 indexes into the additional dbs array. if (idxi) { dbdir = m_rcldb->m_extraDbs[idxi - 1]; doc.idxi = idxi; } } parms.get(Doc::keyurl, doc.idxurl); doc.url = doc.idxurl; m_rcldb->m_config->urlrewrite(dbdir, doc.url); if (!doc.url.compare(doc.idxurl)) doc.idxurl.clear(); // Special cases: parms.get(Doc::keytp, doc.mimetype); parms.get(Doc::keyfmt, doc.fmtime); parms.get(Doc::keydmt, doc.dmtime); parms.get(Doc::keyoc, doc.origcharset); parms.get(cstr_caption, doc.meta[Doc::keytt]); parms.get(Doc::keyabs, doc.meta[Doc::keyabs]); // Possibly remove synthetic abstract indicator (if it's there, we // used to index the beginning of the text as abstract). doc.syntabs = false; if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) { doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length()); doc.syntabs = true; } parms.get(Doc::keyipt, doc.ipath); parms.get(Doc::keypcs, doc.pcbytes); parms.get(Doc::keyfs, doc.fbytes); parms.get(Doc::keyds, doc.dbytes); parms.get(Doc::keysig, doc.sig); // Normal key/value pairs: vector keys = parms.getNames(string()); for (vector::const_iterator it = keys.begin(); it != keys.end(); it++) { if (doc.meta.find(*it) == doc.meta.end()) parms.get(*it, doc.meta[*it]); } doc.meta[Doc::keyurl] = doc.url; doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime; return true; } bool Db::Native::hasPages(Xapian::docid docid) { string ermsg; Xapian::PositionIterator pos; XAPTRY(pos = xrdb.positionlist_begin(docid, page_break_term); if (pos != xrdb.positionlist_end(docid, page_break_term)) { return true; }, xrdb, ermsg); if (!ermsg.empty()) { LOGERR(("Db::Native::hasPages: xapian error: %s\n", ermsg.c_str())); } return false; } // Return the positions list for the page break term bool Db::Native::getPagePositions(Xapian::docid docid, vector& vpos) { vpos.clear(); // Need to retrieve the document record to check for multiple page breaks // that we store there for lack of better place map mbreaksmap; try { Xapian::Document xdoc = xrdb.get_document(docid); string data = xdoc.get_data(); Doc doc; string mbreaks; if (dbDataToRclDoc(docid, data, doc) && doc.getmeta(cstr_mbreaks, &mbreaks)) { vector values; stringToTokens(mbreaks, values, ","); for (unsigned int i = 0; i < values.size() - 1; i += 2) { int pos = atoi(values[i].c_str()) + baseTextPosition; int incr = atoi(values[i+1].c_str()); mbreaksmap[pos] = incr; } } } catch (...) { } string qterm = page_break_term; Xapian::PositionIterator pos; try { for (pos = xrdb.positionlist_begin(docid, qterm); pos != xrdb.positionlist_end(docid, qterm); pos++) { int ipos = *pos; if (ipos < int(baseTextPosition)) { LOGDEB(("getPagePositions: got page position %d not in body\n", ipos)); // Not in text body. Strange... continue; } map::iterator it = mbreaksmap.find(ipos); if (it != mbreaksmap.end()) { LOGDEB1(("getPagePositions: found multibreak at %d incr %d\n", ipos, it->second)); for (int i = 0 ; i < it->second; i++) vpos.push_back(ipos); } vpos.push_back(ipos); } } catch (...) { // Term does not occur. No problem. } return true; } int Db::Native::getPageNumberForPosition(const vector& pbreaks, unsigned int pos) { if (pos < baseTextPosition) // Not in text body return -1; vector::const_iterator it = upper_bound(pbreaks.begin(), pbreaks.end(), pos); return it - pbreaks.begin() + 1; } bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, Xapian::Document& newdocument, size_t textlen) { #ifdef IDX_THREADS Chrono chron; PTMutexLocker lock(m_mutex); #endif // Check file system full every mbyte of indexed text. It's a bit wasteful // to do this after having prepared the document, but it needs to be in // the single-threaded section. if (m_rcldb->m_maxFsOccupPc > 0 && (m_rcldb->m_occFirstCheck || (m_rcldb->m_curtxtsz - m_rcldb->m_occtxtsz) / MB >= 1)) { LOGDEB(("Db::add: checking file system usage\n")); int pc; m_rcldb->m_occFirstCheck = 0; if (fsocc(m_rcldb->m_basedir, &pc) && pc >= m_rcldb->m_maxFsOccupPc) { LOGERR(("Db::add: stop indexing: file system " "%d%% full > max %d%%\n", pc, m_rcldb->m_maxFsOccupPc)); return false; } m_rcldb->m_occtxtsz = m_rcldb->m_curtxtsz; } const char *fnc = udi.c_str(); string ermsg; // Add db entry or update existing entry: try { Xapian::docid did = xwdb.replace_document(uniterm, newdocument); if (did < m_rcldb->updated.size()) { m_rcldb->updated[did] = true; LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc)); } else { LOGINFO(("Db::add: docid %d added [%s]\n", did, fnc)); } } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db::add: replace_document failed: %s\n", ermsg.c_str())); ermsg.erase(); // FIXME: is this ever actually needed? try { xwdb.add_document(newdocument); LOGDEB(("Db::add: %s added (failed re-seek for duplicate)\n", fnc)); } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db::add: add_document failed: %s\n", ermsg.c_str())); return false; } } // Test if we're over the flush threshold (limit memory usage): bool ret = m_rcldb->maybeflush(textlen); #ifdef IDX_THREADS m_totalworkns += chron.nanos(); #endif return ret; } bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi, const string& uniterm) { #if defined(IDX_THREADS) // We need a mutex even if we have a write queue (so we can only // be called by a single thread) to protect about multiple acces // to xrdb from subDocs() which is also called from needupdate() // (called from outside the write thread ! PTMutexLocker lock(m_mutex); #endif // IDX_THREADS string ermsg; try { Xapian::PostingIterator docid = xwdb.postlist_begin(uniterm); if (docid == xwdb.postlist_end(uniterm)) { return true; } if (m_rcldb->m_flushMb > 0) { Xapian::termcount trms = xwdb.get_doclength(*docid); m_rcldb->maybeflush(trms * 5); } string sig; if (orphansOnly) { Xapian::Document doc = xwdb.get_document(*docid); sig = doc.get_value(VALUE_SIG); if (sig.empty()) { LOGINFO(("purgeFileWrite: got empty sig\n")); return false; } } else { LOGDEB(("purgeFile: delete docid %d\n", *docid)); xwdb.delete_document(*docid); } vector docids; subDocs(udi, 0, docids); LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size())); for (vector::iterator it = docids.begin(); it != docids.end(); it++) { if (m_rcldb->m_flushMb > 0) { Xapian::termcount trms = xwdb.get_doclength(*it); m_rcldb->maybeflush(trms * 5); } string subdocsig; if (orphansOnly) { Xapian::Document doc = xwdb.get_document(*it); subdocsig = doc.get_value(VALUE_SIG); if (subdocsig.empty()) { LOGINFO(("purgeFileWrite: got empty sig for subdoc??\n")); continue; } } if (!orphansOnly || sig != subdocsig) { LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it)); xwdb.delete_document(*it); } } return true; } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db::purgeFileWrite: %s\n", ermsg.c_str())); } return false; } /* Rcl::Db methods ///////////////////////////////// */ bool Db::o_inPlaceReset; Db::Db(const RclConfig *cfp) : m_ndb(0), m_mode(Db::DbRO), m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1), m_idxAbsTruncLen(250), m_synthAbsLen(250), m_synthAbsWordCtxLen(4), m_flushMb(-1), m_maxFsOccupPc(0) { m_config = new RclConfig(*cfp); if (start_of_field_term.empty()) { if (o_index_stripchars) { start_of_field_term = "XXST"; end_of_field_term = "XXND"; } else { start_of_field_term = "XXST/"; end_of_field_term = "XXND/"; } } m_ndb = new Native(this); if (m_config) { m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc); m_config->getConfParam("idxflushmb", &m_flushMb); } } Db::~Db() { LOGDEB2(("Db::~Db\n")); if (m_ndb == 0) return; LOGDEB(("Db::~Db: isopen %d m_iswritable %d\n", m_ndb->m_isopen, m_ndb->m_iswritable)); i_close(true); delete m_config; } vector Db::getStemmerNames() { vector res; stringToStrings(Xapian::Stem::get_available_languages(), res); return res; } bool Db::open(OpenMode mode, OpenError *error) { if (error) *error = DbOpenMainDb; if (m_ndb == 0 || m_config == 0) { m_reason = "Null configuration or Xapian Db"; return false; } LOGDEB(("Db::open: m_isopen %d m_iswritable %d mode %d\n", m_ndb->m_isopen, m_ndb->m_iswritable, mode)); if (m_ndb->m_isopen) { // We used to return an error here but I see no reason to if (!close()) return false; } if (!m_config->getStopfile().empty()) m_stops.setFile(m_config->getStopfile()); string dir = m_config->getDbDir(); string ermsg; try { switch (mode) { case DbUpd: case DbTrunc: { int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN : Xapian::DB_CREATE_OR_OVERWRITE; m_ndb->xwdb = Xapian::WritableDatabase(dir, action); // If db is empty, write the data format version at once // to avoid stupid error messages: if (m_ndb->xwdb.get_doccount() == 0) m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION); m_ndb->m_iswritable = true; #ifdef IDX_THREADS m_ndb->maybeStartThreads(); #endif // We open a readonly object in all cases (possibly in // addition to the r/w one) because some operations // are faster when performed through a Database: no // forced flushes on allterms_begin(), ie, used in // subDocs() m_ndb->xrdb = Xapian::Database(dir); LOGDEB(("Db::open: lastdocid: %d\n", m_ndb->xwdb.get_lastdocid())); LOGDEB2(("Db::open: resetting updated\n")); updated.resize(m_ndb->xwdb.get_lastdocid() + 1); for (unsigned int i = 0; i < updated.size(); i++) updated[i] = false; } break; case DbRO: default: m_ndb->m_iswritable = false; m_ndb->xrdb = Xapian::Database(dir); for (vector::iterator it = m_extraDbs.begin(); it != m_extraDbs.end(); it++) { if (error) *error = DbOpenExtraDb; LOGDEB(("Db::Open: adding query db [%s]\n", it->c_str())); // An error here used to be non-fatal (1.13 and older) // but I can't see why m_ndb->xrdb.add_database(Xapian::Database(*it)); } break; } if (error) *error = DbOpenMainDb; // Check index format version. Must not try to check a just created or // truncated db if (mode != DbTrunc && m_ndb->xrdb.get_doccount() > 0) { string version = m_ndb->xrdb.get_metadata(cstr_RCL_IDX_VERSION_KEY); if (version.compare(cstr_RCL_IDX_VERSION)) { m_ndb->m_noversionwrite = true; LOGERR(("Rcl::Db::open: file index [%s], software [%s]\n", version.c_str(), cstr_RCL_IDX_VERSION.c_str())); throw Xapian::DatabaseError("Recoll index version mismatch", "", ""); } } m_mode = mode; m_ndb->m_isopen = true; m_basedir = dir; if (error) *error = DbOpenNoError; return true; } XCATCHERROR(ermsg); m_reason = ermsg; LOGERR(("Db::open: exception while opening [%s]: %s\n", dir.c_str(), ermsg.c_str())); return false; } // Note: xapian has no close call, we delete and recreate the db bool Db::close() { LOGDEB1(("Db::close()\n")); return i_close(false); } bool Db::i_close(bool final) { if (m_ndb == 0) return false; LOGDEB(("Db::i_close(%d): m_isopen %d m_iswritable %d\n", final, m_ndb->m_isopen, m_ndb->m_iswritable)); if (m_ndb->m_isopen == false && !final) return true; string ermsg; try { bool w = m_ndb->m_iswritable; if (w) { #ifdef IDX_THREADS waitUpdIdle(); #endif if (!m_ndb->m_noversionwrite) m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION); LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n")); } deleteZ(m_ndb); if (w) LOGDEB(("Rcl::Db:close() xapian close done.\n")); if (final) { return true; } m_ndb = new Native(this); if (m_ndb) { return true; } LOGERR(("Rcl::Db::close(): cant recreate db object\n")); return false; } XCATCHERROR(ermsg); LOGERR(("Db:close: exception while deleting db: %s\n", ermsg.c_str())); return false; } // Reopen the db with a changed list of additional dbs bool Db::adjustdbs() { if (m_mode != DbRO) { LOGERR(("Db::adjustdbs: mode not RO\n")); return false; } if (m_ndb && m_ndb->m_isopen) { if (!close()) return false; if (!open(m_mode)) { return false; } } return true; } int Db::docCnt() { int res = -1; if (!m_ndb || !m_ndb->m_isopen) return -1; XAPTRY(res = m_ndb->xrdb.get_doccount(), m_ndb->xrdb, m_reason); if (!m_reason.empty()) { LOGERR(("Db::docCnt: got error: %s\n", m_reason.c_str())); return -1; } return res; } int Db::termDocCnt(const string& _term) { int res = -1; if (!m_ndb || !m_ndb->m_isopen) return -1; string term = _term; if (o_index_stripchars) if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str())); return 0; } if (m_stops.isStop(term)) { LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str())); return 0; } XAPTRY(res = m_ndb->xrdb.get_termfreq(term), m_ndb->xrdb, m_reason); if (!m_reason.empty()) { LOGERR(("Db::termDocCnt: got error: %s\n", m_reason.c_str())); return -1; } return res; } bool Db::addQueryDb(const string &_dir) { string dir = _dir; LOGDEB0(("Db::addQueryDb: ndb %p iswritable %d db [%s]\n", m_ndb, (m_ndb)?m_ndb->m_iswritable:0, dir.c_str())); if (!m_ndb) return false; if (m_ndb->m_iswritable) return false; dir = path_canon(dir); if (find(m_extraDbs.begin(), m_extraDbs.end(), dir) == m_extraDbs.end()) { m_extraDbs.push_back(dir); } return adjustdbs(); } bool Db::rmQueryDb(const string &dir) { if (!m_ndb) return false; if (m_ndb->m_iswritable) return false; if (dir.empty()) { m_extraDbs.clear(); } else { vector::iterator it = find(m_extraDbs.begin(), m_extraDbs.end(), dir); if (it != m_extraDbs.end()) { m_extraDbs.erase(it); } } return adjustdbs(); } // Determining what index a doc result comes from is based on the // modulo of the docid against the db count. Ref: // http://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID size_t Db::whatDbIdx(const Doc& doc) { return m_ndb->whatDbIdx(doc.xdocid); } size_t Db::Native::whatDbIdx(Xapian::docid id) { LOGDEB1(("Db::whatDbIdx: xdocid %lu, %u extraDbs\n", (unsigned long)id, m_extraDbs.size())); if (id == 0) return (size_t)-1; if (m_rcldb->m_extraDbs.size() == 0) return 0; return (id - 1) % (m_rcldb->m_extraDbs.size() + 1); } bool Db::testDbDir(const string &dir, bool *stripped_p) { string aerr; bool mstripped = true; LOGDEB(("Db::testDbDir: [%s]\n", dir.c_str())); try { Xapian::Database db(dir); // If we have terms with a leading ':' it's an // unstripped index Xapian::TermIterator term = db.allterms_begin(":"); if (term == db.allterms_end()) mstripped = true; else mstripped = false; } XCATCHERROR(aerr); if (!aerr.empty()) { LOGERR(("Db::Open: error while trying to open database " "from [%s]: %s\n", dir.c_str(), aerr.c_str())); return false; } if (stripped_p) *stripped_p = mstripped; return true; } bool Db::isopen() { if (m_ndb == 0) return false; return m_ndb->m_isopen; } // Try to translate field specification into field prefix. bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp) { if (m_config && m_config->getFieldTraits(fld, ftpp)) return true; *ftpp = 0; return false; } // The splitter breaks text into words and adds postings to the Xapian // document. We use a single object to split all of the document // fields and position jumps to separate fields class TextSplitDb : public TextSplitP { public: Xapian::Document &doc; // Xapian document // Base for document section. Gets large increment when we change // sections, to avoid cross-section proximity matches. Xapian::termpos basepos; // Current relative position. This is the remembered value from // the splitter callback. The term position is reset for each call // to text_to_words(), so that the last value of curpos is the // section size (last relative term position), and this is what // gets added to basepos in addition to the inter-section increment // to compute the first position of the next section. Xapian::termpos curpos; TextSplitDb(Xapian::Document &d, TermProc *prc) : TextSplitP(prc), doc(d), basepos(1), curpos(0), wdfinc(1) {} // Reimplement text_to_words to insert the begin and end anchor terms. virtual bool text_to_words(const string &in) { bool ret = false; string ermsg; try { // Index the possibly prefixed start term. doc.add_posting(prefix + start_of_field_term, basepos, wdfinc); ++basepos; } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); goto out; } if (!TextSplitP::text_to_words(in)) { LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n")); goto out; } try { // Index the possibly prefixed end term. doc.add_posting(prefix + end_of_field_term, basepos + curpos + 1, wdfinc); ++basepos; } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); goto out; } ret = true; out: basepos += curpos + 100; return true; } void setprefix(const string& pref) { if (pref.empty()) prefix.clear(); else prefix = wrap_prefix(pref); } void setwdfinc(int i) { wdfinc = i; } friend class TermProcIdx; private: // If prefix is set, we also add a posting for the prefixed terms // (ie: for titles, add postings for both "term" and "Sterm") string prefix; // Some fields have more weight int wdfinc; }; class TermProcIdx : public TermProc { public: TermProcIdx() : TermProc(0), m_ts(0), m_lastpagepos(0), m_pageincr(0) {} void setTSD(TextSplitDb *ts) {m_ts = ts;} bool takeword(const std::string &term, int pos, int, int) { // Compute absolute position (pos is relative to current segment), // and remember relative. m_ts->curpos = pos; pos += m_ts->basepos; // Don't try to add empty term Xapian doesnt like it... Safety check // this should not happen. if (term.empty()) return true; string ermsg; try { // Index without prefix, using the field-specific weighting LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str())); m_ts->doc.add_posting(term, pos, m_ts->wdfinc); #ifdef TESTING_XAPIAN_SPELL if (Db::isSpellingCandidate(term)) { m_ts->db.add_spelling(term); } #endif // Index the prefixed term. if (!m_ts->prefix.empty()) { m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc); } return true; } XCATCHERROR(ermsg); LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); return false; } void newpage(int pos) { pos += m_ts->basepos; if (pos < int(baseTextPosition)) { LOGDEB(("newpage: not in body\n", pos)); return; } m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos); if (pos == m_lastpagepos) { m_pageincr++; LOGDEB2(("newpage: same pos, pageincr %d lastpagepos %d\n", m_pageincr, m_lastpagepos)); } else { LOGDEB2(("newpage: pos change, pageincr %d lastpagepos %d\n", m_pageincr, m_lastpagepos)); if (m_pageincr > 0) { // Remember the multiple page break at this position unsigned int relpos = m_lastpagepos - baseTextPosition; LOGDEB2(("Remembering multiple page break. Relpos %u cnt %d\n", relpos, m_pageincr)); m_pageincrvec.push_back(pair(relpos, m_pageincr)); } m_pageincr = 0; } m_lastpagepos = pos; } virtual bool flush() { if (m_pageincr > 0) { unsigned int relpos = m_lastpagepos - baseTextPosition; LOGDEB2(("Remembering multiple page break. Position %u cnt %d\n", relpos, m_pageincr)); m_pageincrvec.push_back(pair(relpos, m_pageincr)); m_pageincr = 0; } return TermProc::flush(); } TextSplitDb *m_ts; // Auxiliary page breaks data for positions with multiple page breaks. int m_lastpagepos; // increment of page breaks at same pos. Normally 0, 1.. when several // breaks at the same pos int m_pageincr; vector > m_pageincrvec; }; #ifdef TESTING_XAPIAN_SPELL string Db::getSpellingSuggestion(const string& word) { if (m_ndb == 0) return string(); string term = word; if (o_index_stripchars) if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) { LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str())); return string(); } if (!isSpellingCandidate(term)) return string(); return m_ndb->xrdb.get_spelling_suggestion(term); } #endif // Let our user set the parameters for abstract processing void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen) { LOGDEB1(("Db::setAbstractParams: trunc %d syntlen %d ctxlen %d\n", idxtrunc, syntlen, syntctxlen)); if (idxtrunc > 0) m_idxAbsTruncLen = idxtrunc; if (syntlen > 0) m_synthAbsLen = syntlen; if (syntctxlen > 0) m_synthAbsWordCtxLen = syntctxlen; } static const string cstr_nc("\n\r\x0c\\"); #define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";} // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, // date, mime type etc. , create the document data record (more // metadata), and update database bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) { LOGDEB(("Db::add: udi [%s] parent [%s]\n", udi.c_str(), parent_udi.c_str())); if (m_ndb == 0) return false; Xapian::Document newdocument; // The term processing pipeline: TermProcIdx tpidx; TermProc *nxt = &tpidx; TermProcStop tpstop(nxt, m_stops);nxt = &tpstop; //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon; TermProcPrep tpprep(nxt); if (o_index_stripchars) nxt = &tpprep; TextSplitDb splitter(newdocument, nxt); tpidx.setTSD(&splitter); // Udi unique term: this is used for file existence/uptodate // checks, and unique id for the replace_document() call. string uniterm = make_uniterm(udi); if (doc.onlyxattr) { // Only updating an existing doc with new extended attributes // data. Need to read the old doc and its data record // first. This is so different from the normal processing that // it uses a fully separate code path (with some duplication // unfortunately) if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument)) return false; } else { // If the ipath is like a path, index the last element. This is // for compound documents like zip and chm for which the filter // uses the file path as ipath. if (!doc.ipath.empty() && doc.ipath.find_first_not_of("0123456789") != string::npos) { string utf8ipathlast; // There is no way in hell we could have an idea of the // charset here, so let's hope it's ascii or utf-8. We call // transcode to strip the bad chars and pray if (transcode(path_getsimple(doc.ipath), utf8ipathlast, "UTF-8", "UTF-8")) { splitter.text_to_words(utf8ipathlast); } } // Split and index the path from the url for path-based filtering { string path = url_gpath(doc.url); vector vpath; stringToTokens(path, vpath, "/"); // If vpath is not /, the last elt is the file/dir name, not a // part of the path. if (vpath.size()) vpath.resize(vpath.size()-1); splitter.curpos = 0; newdocument.add_posting(wrap_prefix(pathelt_prefix), splitter.basepos + splitter.curpos++); for (vector::iterator it = vpath.begin(); it != vpath.end(); it++){ if (it->length() > 230) { // Just truncate it. May still be useful because of wildcards *it = it->substr(0, 230); } newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, splitter.basepos + splitter.curpos++); } } // Index textual metadata. These are all indexed as text with // positions, as we may want to do phrase searches with them (this // makes no sense for keywords by the way). // // The order has no importance, and we set a position gap of 100 // between fields to avoid false proximity matches. map::iterator meta_it; for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { if (!meta_it->second.empty()) { const FieldTraits *ftp; // We don't test for an empty prefix here. Some fields are part // of the internal conf with an empty prefix (ie: abstract). if (!fieldToTraits(meta_it->first, &ftp)) { LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n", meta_it->first.c_str())); continue; } LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc, meta_it->second.c_str())); splitter.setprefix(ftp->pfx); splitter.setwdfinc(ftp->wdfinc); if (!splitter.text_to_words(meta_it->second)) LOGDEB(("Db::addOrUpdate: split failed for %s\n", meta_it->first.c_str())); } } splitter.setprefix(string()); splitter.setwdfinc(1); if (splitter.curpos < baseTextPosition) splitter.basepos = baseTextPosition; // Split and index body text LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str())); #ifdef TEXTSPLIT_STATS splitter.resetStats(); #endif if (!splitter.text_to_words(doc.text)) LOGDEB(("Db::addOrUpdate: split failed for main text\n")); #ifdef TEXTSPLIT_STATS // Reject bad data. unrecognized base64 text is characterized by // high avg word length and high variation (because there are // word-splitters like +/ inside the data). TextSplit::Stats::Values v = splitter.getStats(); // v.avglen > 15 && v.sigma > 12 if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) { LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats " "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n", v.count, v.avglen, v.sigma, doc.url.c_str(), doc.ipath.c_str(), doc.text.c_str())); return true; } #endif ////// Special terms for other metadata. No positions for these. // Mime type newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype); // Simple file name indexed unsplit for specific "file name" // searches. This is not the same as a filename: clause inside the // query language. // We also add a term for the filename extension if any. string utf8fn; if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) { string fn; if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) { // We should truncate after extracting the extension, but this is // a pathological case anyway if (fn.size() > 230) utf8truncate(fn, 230); string::size_type pos = fn.rfind('.'); if (pos != string::npos && pos != fn.length() - 1) { newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + fn.substr(pos + 1)); } newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0); } } newdocument.add_boolean_term(uniterm); // Parent term. This is used to find all descendents, mostly // to delete them when the parent goes away if (!parent_udi.empty()) { newdocument.add_boolean_term(make_parentterm(parent_udi)); } // Dates etc. time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : doc.dmtime.c_str()); struct tm *tm = localtime(&mtime); char buf[9]; snprintf(buf, 9, "%04d%02d%02d", tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); // Date (YYYYMMDD) newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); // Month (YYYYMM) buf[6] = '\0'; newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf)); // Year (YYYY) buf[4] = '\0'; newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); ////////////////////////////////////////////////////////////////// // Document data record. omindex has the following nl separated fields: // - url // - sample // - caption (title limited to 100 chars) // - mime type // // The title, author, abstract and keywords fields are special, // they always get stored in the document data // record. Configurable other fields can be, too. // // We truncate stored fields abstract, title and keywords to // reasonable lengths and suppress newlines (so that the data // record can keep a simple syntax) string record; RECORD_APPEND(record, Doc::keyurl, doc.url); RECORD_APPEND(record, Doc::keytp, doc.mimetype); // We left-zero-pad the times so that they are lexico-sortable leftzeropad(doc.fmtime, 11); RECORD_APPEND(record, Doc::keyfmt, doc.fmtime); if (!doc.dmtime.empty()) { leftzeropad(doc.dmtime, 11); RECORD_APPEND(record, Doc::keydmt, doc.dmtime); } RECORD_APPEND(record, Doc::keyoc, doc.origcharset); if (doc.fbytes.empty()) doc.fbytes = doc.pcbytes; if (!doc.fbytes.empty()) { RECORD_APPEND(record, Doc::keyfs, doc.fbytes); leftzeropad(doc.fbytes, 12); newdocument.add_value(VALUE_SIZE, doc.fbytes); } if (doc.haschildren) { newdocument.add_boolean_term(has_children_term); } if (!doc.pcbytes.empty()) RECORD_APPEND(record, Doc::keypcs, doc.pcbytes); char sizebuf[30]; sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); RECORD_APPEND(record, Doc::keyds, sizebuf); // Note that we add the signature both as a value and in the data record if (!doc.sig.empty()) { RECORD_APPEND(record, Doc::keysig, doc.sig); newdocument.add_value(VALUE_SIG, doc.sig); } if (!doc.ipath.empty()) RECORD_APPEND(record, Doc::keyipt, doc.ipath); doc.meta[Doc::keytt] = neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc); if (!doc.meta[Doc::keytt].empty()) RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]); trimstring(doc.meta[Doc::keykw], " \t\r\n"); doc.meta[Doc::keykw] = neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc); // No need to explicitly append the keywords, this will be done by // the "stored" loop // If abstract is empty, we make up one with the beginning of the // document. This is then not indexed, but part of the doc data so // that we can return it to a query without having to decode the // original file. bool syntabs = false; // Note that the map accesses by operator[] create empty entries if they // don't exist yet. trimstring(doc.meta[Doc::keyabs], " \t\r\n"); if (doc.meta[Doc::keyabs].empty()) { syntabs = true; if (!doc.text.empty()) doc.meta[Doc::keyabs] = cstr_syntAbs + neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc); } else { doc.meta[Doc::keyabs] = neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen), cstr_nc); } const set& stored = m_config->getStoredFields(); for (set::const_iterator it = stored.begin(); it != stored.end(); it++) { string nm = m_config->fieldCanon(*it); if (!doc.meta[nm].empty()) { string value = neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc); RECORD_APPEND(record, nm, value); } } // If empty pages (multiple break at same pos) were recorded, save // them (this is because we have no way to record them in the // Xapian list if (!tpidx.m_pageincrvec.empty()) { ostringstream multibreaks; for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) { if (i != 0) multibreaks << ","; multibreaks << tpidx.m_pageincrvec[i].first << "," << tpidx.m_pageincrvec[i].second; } RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str()); } // If the file's md5 was computed, add value and term. // The value is optionally used for query result duplicate elimination, // and the term to find the duplicates. // We don't do this for empty docs. const string *md5; if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() && md5->compare(cstr_md5empty)) { string digest; MD5HexScan(*md5, digest); newdocument.add_value(VALUE_MD5, digest); newdocument.add_boolean_term(wrap_prefix("XM") + *md5); } LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str())); newdocument.set_data(record); } #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, newdocument, doc.text.length()); if (!m_ndb->m_wqueue.put(tp)) { LOGERR(("Db::addOrUpdate:Cant queue task\n")); return false; } else { return true; } } #endif return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument, doc.text.length()); } bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, Doc &doc, Xapian::Document& xdoc) { LOGDEB0(("Db::docToXdocXattrOnly\n")); PTMutexLocker lock(m_mutex); // Read existing document and its data record if (getDoc(udi, 0, xdoc) == 0) { LOGERR(("docToXdocXattrOnly: existing doc not found\n")); return false; } string data; XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str())); return false; } // Clear the term lists for the incoming fields and index the new values map::iterator meta_it; for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { const FieldTraits *ftp; if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) { LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n", meta_it->first.c_str())); continue; } // Clear the previous terms for the field clearField(xdoc, ftp->pfx, ftp->wdfinc); LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n", meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc, meta_it->second.c_str())); splitter->setprefix(ftp->pfx); splitter->setwdfinc(ftp->wdfinc); if (!splitter->text_to_words(meta_it->second)) LOGDEB(("Db::xattrOnly: split failed for %s\n", meta_it->first.c_str())); } xdoc.add_value(VALUE_SIG, doc.sig); // Parse current data record into a dict for ease of processing ConfSimple datadic(data); if (!datadic.ok()) { LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n")); return false; } // For each "stored" field, check if set in doc metadata and // update the value if it is const set& stored = m_rcldb->m_config->getStoredFields(); for (set::const_iterator it = stored.begin(); it != stored.end(); it++) { string nm = m_rcldb->m_config->fieldCanon(*it); if (doc.getmeta(nm, 0)) { string value = neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc); datadic.set(nm, value, ""); } } // Recreate the record. We want to do this with the local RECORD_APPEND // method for consistency in format, instead of using ConfSimple print vector names = datadic.getNames(""); data.clear(); for (vector::const_iterator it = names.begin(); it != names.end(); it++) { string value; datadic.get(*it, value, ""); RECORD_APPEND(data, *it, value); } RECORD_APPEND(data, Doc::keysig, doc.sig); xdoc.set_data(data); return true; } #ifdef IDX_THREADS void Db::waitUpdIdle() { if (m_ndb->m_iswritable && m_ndb->m_havewriteq) { Chrono chron; m_ndb->m_wqueue.waitIdle(); // We flush here just for correct measurement of the thread work time string ermsg; try { m_ndb->xwdb.flush(); } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db::waitUpdIdle: flush() failed: %s\n", ermsg.c_str())); } m_ndb->m_totalworkns += chron.nanos(); LOGINFO(("Db::waitUpdIdle: total xapian work %lld mS\n", m_ndb->m_totalworkns/1000000)); } } #endif // Flush when idxflushmbs is reached bool Db::maybeflush(off_t moretext) { if (m_flushMb > 0) { m_curtxtsz += moretext; if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) { LOGDEB(("Db::add/delete: txt size >= %d Mb, flushing\n", m_flushMb)); return doFlush(); } } return true; } bool Db::doFlush() { if (!m_ndb) { LOGERR(("Db::doFLush: no ndb??\n")); return false; } string ermsg; try { m_ndb->xwdb.flush(); } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db::doFlush: flush() failed: %s\n", ermsg.c_str())); return false; } m_flushtxtsz = m_curtxtsz; return true; } // Test if doc given by udi has changed since last indexed (test sigs) bool Db::needUpdate(const string &udi, const string& sig, bool *existed) { if (m_ndb == 0) return false; // If we are doing an in place or full reset, no need to // test. Note that there is no need to update the existence map // either, it will be done when updating the index if (o_inPlaceReset || m_mode == DbTrunc) { // For in place reset, pretend the doc existed, to enable subdoc purge if (existed) *existed = o_inPlaceReset; return true; } string uniterm = make_uniterm(udi); string ermsg; #ifdef IDX_THREADS // Need to protect against interaction with the doc update/insert // thread which also updates the existence map, and even multiple // accesses to the readonly Xapian::Database are not allowed // anyway PTMutexLocker lock(m_ndb->m_mutex); #endif // We look up the document indexed by the uniterm. This is either // the actual document file, or, for a multi-document file, the // pseudo-doc we create to stand for the file itself. // We try twice in case database needs to be reopened. for (int tries = 0; tries < 2; tries++) { try { // Get the doc or pseudo-doc Xapian::PostingIterator docid = m_ndb->xrdb.postlist_begin(uniterm); if (docid == m_ndb->xrdb.postlist_end(uniterm)) { // If no document exist with this path, we do need update LOGDEB(("Db::needUpdate:yes (new): [%s]\n", uniterm.c_str())); if (existed) *existed = false; return true; } Xapian::Document doc = m_ndb->xrdb.get_document(*docid); if (existed) *existed = true; // Retrieve old file/doc signature from value string osig = doc.get_value(VALUE_SIG); LOGDEB2(("Db::needUpdate: oldsig [%s] new [%s]\n", osig.c_str(), sig.c_str())); // Compare new/old sig if (sig != osig) { LOGDEB(("Db::needUpdate:yes: olsig [%s] new [%s] [%s]\n", osig.c_str(), sig.c_str(), uniterm.c_str())); // Db is not up to date. Let's index the file return true; } LOGDEB(("Db::needUpdate:no: [%s]\n", uniterm.c_str())); // Up to date. // Set the uptodate flag for doc / pseudo doc if (m_mode != DbRO) { updated[*docid] = true; // Set the existence flag for all the subdocs (if any) vector docids; if (!m_ndb->subDocs(udi, 0, docids)) { LOGERR(("Rcl::Db::needUpdate: can't get subdocs\n")); return true; } for (vector::iterator it = docids.begin(); it != docids.end(); it++) { if (*it < updated.size()) { LOGDEB2(("Db::needUpdate: docid %d set\n", *it)); updated[*it] = true; } } } return false; } catch (const Xapian::DatabaseModifiedError &e) { LOGDEB(("Db::needUpdate: got modified error. reopen/retry\n")); m_reason = e.get_msg(); m_ndb->xrdb.reopen(); continue; } XCATCHERROR(m_reason); break; } LOGERR(("Db::needUpdate: error while checking existence: %s\n", m_reason.c_str())); return true; } // Return existing stem db languages vector Db::getStemLangs() { LOGDEB(("Db::getStemLang\n")); vector langs; if (m_ndb == 0 || m_ndb->m_isopen == false) return langs; StemDb db(m_ndb->xrdb); db.getMembers(langs); return langs; } /** * Delete stem db for given language */ bool Db::deleteStemDb(const string& lang) { LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str())); if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) return false; XapWritableSynFamily db(m_ndb->xwdb, synFamStem); return db.deleteMember(lang); } /** * Create database of stem to parents associations for a given language. * We walk the list of all terms, stem them, and create another Xapian db * with documents indexed by a single term (the stem), and with the list of * parent terms in the document data. */ bool Db::createStemDbs(const vector& langs) { LOGDEB(("Db::createStemDbs\n")); if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) { LOGERR(("createStemDb: db not open or not writable\n")); return false; } return createExpansionDbs(m_ndb->xwdb, langs); } /** * This is called at the end of an indexing session, to delete the * documents for files that are no longer there. This can ONLY be called * after a full file-system tree walk, else the file existence flags will * be wrong. */ bool Db::purge() { LOGDEB(("Db::purge\n")); if (m_ndb == 0) return false; LOGDEB(("Db::purge: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen, m_ndb->m_iswritable)); if (m_ndb->m_isopen == false || m_ndb->m_iswritable == false) return false; #ifdef IDX_THREADS // If we manage our own write queue, make sure it's drained and closed if (m_ndb->m_havewriteq) m_ndb->m_wqueue.setTerminateAndWait(); // else we need to lock out other top level threads. This is just // a precaution as they should have been waited for by the top // level actor at this point PTMutexLocker lock(m_ndb->m_mutex, m_ndb->m_havewriteq); #endif // IDX_THREADS // For xapian versions up to 1.0.1, deleting a non-existant // document would trigger an exception that would discard any // pending update. This could lose both previous added documents // or deletions. Adding the flush before the delete pass ensured // that any added document would go to the index. Kept here // because it doesn't really hurt. try { m_ndb->xwdb.flush(); } catch (...) { LOGERR(("Db::purge: 1st flush failed\n")); } // Walk the document array and delete any xapian document whose // flag is not set (we did not see its source during indexing). int purgecount = 0; for (Xapian::docid docid = 1; docid < updated.size(); ++docid) { if (!updated[docid]) { if ((purgecount+1) % 100 == 0) { try { CancelCheck::instance().checkCancel(); } catch(CancelExcept) { LOGINFO(("Db::purge: partially cancelled\n")); break; } } try { if (m_flushMb > 0) { // We use an average term length of 5 for // estimating the doc sizes which is probably not // accurate but gives rough consistency with what // we do for add/update. I should fetch the doc // size from the data record, but this would be // bad for performance. Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid); maybeflush(trms * 5); } m_ndb->xwdb.delete_document(docid); LOGDEB(("Db::purge: deleted document #%d\n", docid)); } catch (const Xapian::DocNotFoundError &) { LOGDEB0(("Db::purge: document #%d not found\n", docid)); } catch (const Xapian::Error &e) { LOGERR(("Db::purge: document #%d: %s\n", docid, e.get_msg().c_str())); } catch (...) { LOGERR(("Db::purge: document #%d: unknown error\n", docid)); } purgecount++; } } try { m_ndb->xwdb.flush(); } catch (...) { LOGERR(("Db::purge: 2nd flush failed\n")); } return true; } // Test for doc existence. bool Db::docExists(const string& uniterm) { #ifdef IDX_THREADS // Need to protect read db against multiaccess. PTMutexLocker lock(m_ndb->m_mutex); #endif string ermsg; try { Xapian::PostingIterator docid = m_ndb->xrdb.postlist_begin(uniterm); if (docid == m_ndb->xrdb.postlist_end(uniterm)) { return false; } else { return true; } } XCATCHERROR(ermsg); if (!ermsg.empty()) { LOGERR(("Db::docExists(%s) %s\n", uniterm.c_str(), ermsg.c_str())); } return false; } /* Delete document(s) for given unique identifier (doc and descendents) */ bool Db::purgeFile(const string &udi, bool *existed) { LOGDEB(("Db:purgeFile: [%s]\n", udi.c_str())); if (m_ndb == 0 || !m_ndb->m_iswritable) return false; string uniterm = make_uniterm(udi); bool exists = docExists(uniterm); if (existed) *existed = exists; if (!exists) return true; #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm, Xapian::Document(), (size_t)-1); if (!m_ndb->m_wqueue.put(tp)) { LOGERR(("Db::purgeFile:Cant queue task\n")); return false; } else { return true; } } #endif /* We get there is IDX_THREADS is not defined or there is no queue */ return m_ndb->purgeFileWrite(false, udi, uniterm); } /* Delete subdocs with an out of date sig. We do this to purge obsolete subdocs during a partial update where no general purge will be done */ bool Db::purgeOrphans(const string &udi) { LOGDEB(("Db:purgeOrphans: [%s]\n", udi.c_str())); if (m_ndb == 0 || !m_ndb->m_iswritable) return false; string uniterm = make_uniterm(udi); #ifdef IDX_THREADS if (m_ndb->m_havewriteq) { DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm, Xapian::Document(), (size_t)-1); if (!m_ndb->m_wqueue.put(tp)) { LOGERR(("Db::purgeFile:Cant queue task\n")); return false; } else { return true; } } #endif /* We get there is IDX_THREADS is not defined or there is no queue */ return m_ndb->purgeFileWrite(true, udi, uniterm); } bool Db::dbStats(DbStats& res) { if (!m_ndb || !m_ndb->m_isopen) return false; Xapian::Database xdb = m_ndb->xrdb; XAPTRY(res.dbdoccount = xdb.get_doccount(); res.dbavgdoclen = xdb.get_avlength(); res.mindoclen = xdb.get_doclength_lower_bound(); res.maxdoclen = xdb.get_doclength_upper_bound(); , xdb, m_reason); if (!m_reason.empty()) return false; return true; } // Retrieve document defined by Unique doc identifier. This is used // by the GUI history feature and by open parent/getenclosing // ! The return value is always true except for fatal errors. Document // existence should be tested by looking at doc.pc bool Db::getDoc(const string &udi, const Doc& idxdoc, Doc &doc) { LOGDEB(("Db:getDoc: [%s]\n", udi.c_str())); if (m_ndb == 0) return false; // Initialize what we can in any case. If this is history, caller // will make partial display in case of error doc.meta[Rcl::Doc::keyrr] = "100%"; doc.pc = 100; Xapian::Document xdoc; Xapian::docid docid; int idxi = idxdoc.idxi; if ((docid = m_ndb->getDoc(udi, idxi, xdoc))) { string data = xdoc.get_data(); doc.meta[Rcl::Doc::keyudi] = udi; return m_ndb->dbDataToRclDoc(docid, data, doc); } else { // Document found in history no longer in the // database. We return true (because their might be // other ok docs further) but indicate the error with // pc = -1 doc.pc = -1; LOGINFO(("Db:getDoc: no such doc in index: [%s]\n", udi.c_str())); return true; } } bool Db::hasSubDocs(const Doc &idoc) { if (m_ndb == 0) return false; string inudi; if (!idoc.getmeta(Doc::keyudi, &inudi) || inudi.empty()) { LOGERR(("Db::hasSubDocs: no input udi or empty\n")); return false; } vector docids; if (!m_ndb->subDocs(inudi, idoc.idxi, docids)) { LOGDEB(("Db:getSubDocs: lower level subdocs failed\n")); return false; } if (!docids.empty()) return true; // Check if doc has an has_children term if (m_ndb->hasTerm(inudi, idoc.idxi, has_children_term)) return true; return false; } // Retrieve all subdocuments of a given one, which may not be a file-level // one (in which case, we have to retrieve this first, then filter the ipaths) bool Db::getSubDocs(const Doc &idoc, vector& subdocs) { if (m_ndb == 0) return false; string inudi; if (!idoc.getmeta(Doc::keyudi, &inudi) || inudi.empty()) { LOGERR(("Db::getSubDocs: no input udi or empty\n")); return false; } string rootudi; string ipath = idoc.ipath; if (ipath.empty()) { // File-level doc. Use it as root rootudi = inudi; } else { // See if we have a parent term Xapian::Document xdoc; if (!m_ndb->getDoc(inudi, idoc.idxi, xdoc)) { LOGERR(("Db::getSubDocs: can't get Xapian document\n")); return false; } Xapian::TermIterator xit; XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(wrap_prefix(parent_prefix)), m_ndb->xrdb, m_reason); if (!m_reason.empty()) { LOGERR(("Db::getSubDocs: xapian error: %s\n", m_reason.c_str())); return false; } if (xit == xdoc.termlist_end()) { LOGERR(("Db::getSubDocs: parent term not found\n")); return false; } rootudi = strip_prefix(*xit); } LOGDEB(("Db::getSubDocs: root: [%s]\n", rootudi.c_str())); // Retrieve all subdoc xapian ids for the root vector docids; if (!m_ndb->subDocs(rootudi, idoc.idxi, docids)) { LOGDEB(("Db:getSubDocs: lower level subdocs failed\n")); return false; } // Retrieve doc, filter, and build output list for (int tries = 0; tries < 2; tries++) { try { for (vector::const_iterator it = docids.begin(); it != docids.end(); it++) { Xapian::Document xdoc = m_ndb->xrdb.get_document(*it); string data = xdoc.get_data(); string docudi; m_ndb->xdocToUdi(xdoc, docudi); Doc doc; doc.meta[Doc::keyudi] = docudi; doc.meta[Doc::keyrr] = "100%"; doc.pc = 100; if (!m_ndb->dbDataToRclDoc(*it, data, doc)) { LOGERR(("Db::getSubDocs: doc conversion error\n")); return false; } if (ipath.empty() || doc.ipath.find(ipath) == 0) subdocs.push_back(doc); } return true; } catch (const Xapian::DatabaseModifiedError &e) { m_reason = e.get_msg(); m_ndb->xrdb.reopen(); continue; } XCATCHERROR(m_reason); break; } LOGERR(("Db::getSubDocs: Xapian error: %s\n", m_reason.c_str())); return false; } } // End namespace Rcl