/* Copyright (C) 2005 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "autoconfig.h" #include #include #include #include #include "cstr.h" #include "pathut.h" #include "debuglog.h" #include "fstreewalk.h" #include "beaglequeue.h" #include "beaglequeuecache.h" #include "circache.h" #include "smallut.h" #include "fileudi.h" #include "internfile.h" #include "wipedir.h" #include "indexer.h" #include "readfile.h" #include "conftree.h" #include "transcode.h" #include "cancelcheck.h" #include #include #include using namespace std; #include // Beagle creates a file named .xxx (where xxx is the name for the main file // in the queue), to hold external metadata (http or created by Beagle). // This class reads the .xxx, dotfile, and turns it into an Rcl::Doc holder class BeagleDotFile { public: BeagleDotFile(RclConfig *conf, const string& fn) : m_conf(conf), m_fn(fn) {} // Read input line, strip it of eol and return as c++ string bool readLine(string& line) { static const int LL = 2048; char cline[LL]; cline[0] = 0; m_input.getline(cline, LL-1); if (!m_input.good()) { if (m_input.bad()) { LOGERR(("beagleDotFileRead: input.bad()\n")); } return false; } int ll = strlen(cline); while (ll > 0 && (cline[ll-1] == '\n' || cline[ll-1] == '\r')) { cline[ll-1] = 0; ll--; } line.assign(cline, ll); LOGDEB2(("BeagleDotFile:readLine: [%s]\n", line.c_str())); return true; } // Process a beagle dot file and set interesting stuff in the doc bool toDoc(Rcl::Doc& doc) { string line; m_input.open(m_fn.c_str(), ios::in); if (!m_input.good()) { LOGERR(("BeagleDotFile: open failed for [%s]\n", m_fn.c_str())); return false; } // Read the 3 first lines: // - url // - hit type: we only know about Bookmark and WebHistory for now // - content-type. if (!readLine(line)) return false; doc.url = line; if (!readLine(line)) return false; doc.meta[Rcl::Doc::keybght] = line; if (!readLine(line)) return false; doc.mimetype = line; // We set the bookmarks mtype as html (the text is empty // anyway), so that the html viewer will be called on 'Open' bool isbookmark = false; if (!stringlowercmp("bookmark", doc.meta[Rcl::Doc::keybght])) { isbookmark = true; doc.mimetype = "text/html"; } string confstr; string ss(" "); // Read the rest: fields and keywords. We do a little // massaging of the input lines, then use a ConfSimple to // parse, and finally insert the key/value pairs into the doc // meta[] array for (;;) { if (!readLine(line)) { // Eof hopefully break; } if (line.find("t:") != 0) continue; line = line.substr(2); confstr += line + "\n"; } ConfSimple fields(confstr, 1); vector names = fields.getNames(cstr_null); for (vector::iterator it = names.begin(); it != names.end(); it++) { string value; fields.get(*it, value, cstr_null); if (!value.compare("undefined") || !value.compare("null")) continue; string *valuep = &value; string cvalue; if (isbookmark) { // It appears that bookmarks are stored in the users' // locale charset (not too sure). No idea what to do // for other types, would have to check the plugin. string charset = m_conf->getDefCharset(true); transcode(value, cvalue, charset, "UTF-8"); valuep = &cvalue; } string caname = m_conf->fieldCanon(*it); doc.meta[caname].append(ss + *valuep); } // Finally build the confsimple that we will save to the // cache, from the doc fields. This could also be done in // parallel with the doc.meta build above, but simpler this // way. We need it because not all interesting doc fields are // in the meta array (ie: mimetype, url), and we want // something homogenous and easy to save. for (map::const_iterator it = doc.meta.begin(); it != doc.meta.end(); it++) { m_fields.set((*it).first, (*it).second, cstr_null); } m_fields.set(cstr_url, doc.url, cstr_null); m_fields.set(cstr_bgc_mimetype, doc.mimetype, cstr_null); return true; } RclConfig *m_conf; ConfSimple m_fields; string m_fn; ifstream m_input; }; // Initialize. Compute paths and create a temporary directory that will be // used by internfile() BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db, DbIxStatusUpdater *updfunc) : m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc), m_nocacheindex(false) { if (!m_config->getConfParam("webqueuedir", m_queuedir)) m_queuedir = "~/.recollweb/ToIndex/"; m_queuedir = path_tildexpand(m_queuedir); path_catslash(m_queuedir); m_cache = new BeagleQueueCache(cnf); } BeagleQueueIndexer::~BeagleQueueIndexer() { LOGDEB(("BeagleQueueIndexer::~\n")); deleteZ(m_cache); } // Index document stored in the cache. bool BeagleQueueIndexer::indexFromCache(const string& udi) { if (!m_db) return false; CancelCheck::instance().checkCancel(); Rcl::Doc dotdoc; string data; string hittype; if (!m_cache || !m_cache->getFromCache(udi, dotdoc, data, &hittype)) { LOGERR(("BeagleQueueIndexer::indexFromCache: cache failed\n")); return false; } if (hittype.empty()) { LOGERR(("BeagleIndexer::index: cc entry has no hit type\n")); return false; } if (!stringlowercmp("bookmark", hittype)) { // Just index the dotdoc dotdoc.meta[Rcl::Doc::keybcknd] = "BGL"; return m_db->addOrUpdate(udi, cstr_null, dotdoc); } else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) || (dotdoc.mimetype.compare("text/html") && dotdoc.mimetype.compare(cstr_textplain))) { LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n", dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str())); return true; } else { Rcl::Doc doc; FileInterner interner(data, m_config, m_tmpdir, FileInterner::FIF_doUseInputMimetype, dotdoc.mimetype); FileInterner::Status fis; try { fis = interner.internfile(doc); } catch (CancelExcept) { LOGERR(("BeagleQueueIndexer: interrupted\n")); return false; } if (fis != FileInterner::FIDone) { LOGERR(("BeagleQueueIndexer: bad status from internfile\n")); return false; } doc.mimetype = dotdoc.mimetype; doc.fmtime = dotdoc.fmtime; doc.url = dotdoc.url; doc.pcbytes = dotdoc.pcbytes; doc.sig.clear(); doc.meta[Rcl::Doc::keybcknd] = "BGL"; return m_db->addOrUpdate(udi, cstr_null, doc); } } void BeagleQueueIndexer::updstatus(const string& udi) { if (m_updater) { ++(m_updater->status.docsdone); if (m_updater->status.dbtotdocs < m_updater->status.docsdone) m_updater->status.dbtotdocs = m_updater->status.docsdone; m_updater->status.fn = udi; m_updater->update(); } } bool BeagleQueueIndexer::index() { if (!m_db) return false; LOGDEB(("BeagleQueueIndexer::processqueue: [%s]\n", m_queuedir.c_str())); m_config->setKeyDir(m_queuedir); if (!makepath(m_queuedir)) { LOGERR(("BeagleQueueIndexer:: can't create queuedir [%s] errno %d\n", m_queuedir.c_str(), errno)); return false; } if (!m_cache || !m_cache->cc()) { LOGERR(("BeagleQueueIndexer: cache initialization failed\n")); return false; } CirCache *cc = m_cache->cc(); // First check/index files found in the cache. If the index was reset, // this actually does work, else it sets the existence flags (avoid // purging). We don't do this when called from indexFiles if (!m_nocacheindex) { bool eof; if (!cc->rewind(eof)) { // rewind can return eof if the cache is empty if (!eof) return false; } while (cc->next(eof)) { string udi; if (!cc->getCurrentUdi(udi)) { LOGERR(("BeagleQueueIndexer:: cache file damaged\n")); break; } if (udi.empty()) continue; if (m_db->needUpdate(udi, cstr_null)) { try { // indexFromCache does a CirCache::get(). We could // arrange to use a getCurrent() instead, would be more // efficient indexFromCache(udi); updstatus(udi); } catch (CancelExcept) { LOGERR(("BeagleQueueIndexer: interrupted\n")); return false; } } } } // Finally index the queue FsTreeWalker walker(FsTreeWalker::FtwNoRecurse); walker.addSkippedName(".*"); FsTreeWalker::Status status = walker.walk(m_queuedir, *this); LOGDEB(("BeagleQueueIndexer::processqueue: done: status %d\n", status)); return true; } // Index a list of files (sent by the real time monitor) bool BeagleQueueIndexer::indexFiles(list& files) { LOGDEB(("BeagleQueueIndexer::indexFiles\n")); if (!m_db) { LOGERR(("BeagleQueueIndexer::indexfiles no db??\n")); return false; } for (list::iterator it = files.begin(); it != files.end();) { if (it->empty()) {//?? it++; continue; } string father = path_getfather(*it); if (father.compare(m_queuedir)) { LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nq)\n", it->c_str())); it++; continue; } // Pb: we are often called with the dot file, before the // normal file exists, and sometimes never called for the // normal file afterwards (ie for bookmarks where the normal // file is empty). So we perform a normal queue run at the end // of the function to catch older stuff. Still this is not // perfect, sometimes some files will not be indexed before // the next run. string fn = path_getsimple(*it); if (fn.empty() || fn.at(0) == '.') { it++; continue; } struct stat st; if (lstat(it->c_str(), &st) != 0) { LOGERR(("BeagleQueueIndexer::indexfiles: cant stat [%s]\n", it->c_str())); it++; continue; } if (!S_ISREG(st.st_mode)) { LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nr)\n", it->c_str())); it++; continue; } processone(*it, &st, FsTreeWalker::FtwRegular); it = files.erase(it); } m_nocacheindex = true; index(); // Note: no need to reset nocacheindex, we're in the monitor now return true; } FsTreeWalker::Status BeagleQueueIndexer::processone(const string &path, const struct stat *stp, FsTreeWalker::CbFlag flg) { if (!m_db) //?? return FsTreeWalker::FtwError; bool dounlink = false; if (flg != FsTreeWalker::FtwRegular) return FsTreeWalker::FtwOk; string dotpath = path_cat(path_getfather(path), string(".") + path_getsimple(path)); LOGDEB(("BeagleQueueIndexer: prc1: [%s]\n", path.c_str())); BeagleDotFile dotfile(m_config, dotpath); Rcl::Doc dotdoc; string udi, udipath; if (!dotfile.toDoc(dotdoc)) goto out; //dotdoc.dump(1); // Have to use the hit type for the udi, because the same url can exist // as a bookmark or a page. udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url)); make_udi(udipath, cstr_null, udi); LOGDEB(("BeagleQueueIndexer: prc1: udi [%s]\n", udi.c_str())); char ascdate[30]; sprintf(ascdate, "%ld", long(stp->st_mtime)); // We only process bookmarks or text/html and text/plain files. if (!stringlowercmp("bookmark", dotdoc.meta[Rcl::Doc::keybght])) { // For bookmarks, we just index the doc that was built from the // metadata. if (dotdoc.fmtime.empty()) dotdoc.fmtime = ascdate; char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); dotdoc.pcbytes = cbuf; // Document signature for up to date checks: none. dotdoc.sig.clear(); dotdoc.meta[Rcl::Doc::keybcknd] = "BGL"; if (!m_db->addOrUpdate(udi, cstr_null, dotdoc)) return FsTreeWalker::FtwError; } else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) || (dotdoc.mimetype.compare("text/html") && dotdoc.mimetype.compare(cstr_textplain))) { LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n", dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str())); // Unlink them anyway dounlink = true; goto out; } else { Rcl::Doc doc; // Store the dotdoc fields in the future doc. In case someone wants // to use beagle-generated fields like beagle:inurl doc.meta = dotdoc.meta; FileInterner interner(path, stp, m_config, m_tmpdir, FileInterner::FIF_doUseInputMimetype, &dotdoc.mimetype); FileInterner::Status fis; try { fis = interner.internfile(doc); } catch (CancelExcept) { LOGERR(("BeagleQueueIndexer: interrupted\n")); goto out; } if (fis != FileInterner::FIDone && fis != FileInterner::FIAgain) { LOGERR(("BeagleQueueIndexer: bad status from internfile\n")); // TOBEDONE: internfile can return FIAgain here if it is // paging a big text file, we should loop. Means we're // only indexing the first page for text/plain files // bigger than the page size (dlft: 1MB) for now. goto out; } if (doc.fmtime.empty()) doc.fmtime = ascdate; char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); doc.pcbytes = cbuf; // Document signature for up to date checks: none. doc.sig.clear(); doc.url = dotdoc.url; doc.meta[Rcl::Doc::keybcknd] = "BGL"; if (!m_db->addOrUpdate(udi, cstr_null, doc)) return FsTreeWalker::FtwError; } // Copy to cache { // doc fields not in meta, needing saving to the cache dotfile.m_fields.set("fmtime", dotdoc.fmtime, cstr_null); // fbytes is used for historical reasons, should be pcbytes, but makes // no sense to change. dotfile.m_fields.set(cstr_fbytes, dotdoc.pcbytes, cstr_null); dotfile.m_fields.set("udi", udi, cstr_null); string fdata; file_to_string(path, fdata); if (!m_cache || !m_cache->cc()) { LOGERR(("BeagleQueueIndexer: cache initialization failed\n")); goto out; } if (!m_cache->cc()->put(udi, &dotfile.m_fields, fdata, 0)) { LOGERR(("BeagleQueueIndexer::prc1: cache_put failed; %s\n", m_cache->cc()->getReason().c_str())); goto out; } } updstatus(udi); dounlink = true; out: if (dounlink) { unlink(path.c_str()); unlink(dotpath.c_str()); } return FsTreeWalker::FtwOk; }