Remove improper assertion use from beagle cache handling code
This commit is contained in:
commit
60d4843576
811 changed files with 338532 additions and 0 deletions
495
src/index/beaglequeue.cpp
Normal file
495
src/index/beaglequeue.cpp
Normal file
|
@ -0,0 +1,495 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "cstr.h"
|
||||
#include "pathut.h"
|
||||
#include "debuglog.h"
|
||||
#include "fstreewalk.h"
|
||||
#include "beaglequeue.h"
|
||||
#include "beaglequeuecache.h"
|
||||
#include "circache.h"
|
||||
#include "smallut.h"
|
||||
#include "fileudi.h"
|
||||
#include "internfile.h"
|
||||
#include "wipedir.h"
|
||||
#include "indexer.h"
|
||||
#include "readfile.h"
|
||||
#include "conftree.h"
|
||||
#include "transcode.h"
|
||||
#include "cancelcheck.h"
|
||||
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
using namespace std;
|
||||
|
||||
#include <sys/stat.h>
|
||||
|
||||
|
||||
// Beagle creates a file named .xxx (where xxx is the name for the main file
|
||||
// in the queue), to hold external metadata (http or created by Beagle).
|
||||
// This class reads the .xxx, dotfile, and turns it into an Rcl::Doc holder
|
||||
class BeagleDotFile {
|
||||
public:
|
||||
BeagleDotFile(RclConfig *conf, const string& fn)
|
||||
: m_conf(conf), m_fn(fn)
|
||||
{}
|
||||
|
||||
// Read input line, strip it of eol and return as c++ string
|
||||
bool readLine(string& line)
|
||||
{
|
||||
static const int LL = 2048;
|
||||
char cline[LL];
|
||||
cline[0] = 0;
|
||||
m_input.getline(cline, LL-1);
|
||||
if (!m_input.good()) {
|
||||
if (m_input.bad()) {
|
||||
LOGERR(("beagleDotFileRead: input.bad()\n"));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
int ll = strlen(cline);
|
||||
while (ll > 0 && (cline[ll-1] == '\n' || cline[ll-1] == '\r')) {
|
||||
cline[ll-1] = 0;
|
||||
ll--;
|
||||
}
|
||||
line.assign(cline, ll);
|
||||
LOGDEB2(("BeagleDotFile:readLine: [%s]\n", line.c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Process a beagle dot file and set interesting stuff in the doc
|
||||
bool toDoc(Rcl::Doc& doc)
|
||||
{
|
||||
string line;
|
||||
|
||||
m_input.open(m_fn.c_str(), ios::in);
|
||||
if (!m_input.good()) {
|
||||
LOGERR(("BeagleDotFile: open failed for [%s]\n", m_fn.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read the 3 first lines:
|
||||
// - url
|
||||
// - hit type: we only know about Bookmark and WebHistory for now
|
||||
// - content-type.
|
||||
if (!readLine(line))
|
||||
return false;
|
||||
doc.url = line;
|
||||
if (!readLine(line))
|
||||
return false;
|
||||
doc.meta[Rcl::Doc::keybght] = line;
|
||||
if (!readLine(line))
|
||||
return false;
|
||||
doc.mimetype = line;
|
||||
|
||||
// We set the bookmarks mtype as html (the text is empty
|
||||
// anyway), so that the html viewer will be called on 'Open'
|
||||
bool isbookmark = false;
|
||||
if (!stringlowercmp("bookmark", doc.meta[Rcl::Doc::keybght])) {
|
||||
isbookmark = true;
|
||||
doc.mimetype = "text/html";
|
||||
}
|
||||
|
||||
string confstr;
|
||||
string ss(" ");
|
||||
// Read the rest: fields and keywords. We do a little
|
||||
// massaging of the input lines, then use a ConfSimple to
|
||||
// parse, and finally insert the key/value pairs into the doc
|
||||
// meta[] array
|
||||
for (;;) {
|
||||
if (!readLine(line)) {
|
||||
// Eof hopefully
|
||||
break;
|
||||
}
|
||||
if (line.find("t:") != 0)
|
||||
continue;
|
||||
line = line.substr(2);
|
||||
confstr += line + "\n";
|
||||
}
|
||||
ConfSimple fields(confstr, 1);
|
||||
vector<string> names = fields.getNames(cstr_null);
|
||||
for (vector<string>::iterator it = names.begin();
|
||||
it != names.end(); it++) {
|
||||
string value;
|
||||
fields.get(*it, value, cstr_null);
|
||||
if (!value.compare("undefined") || !value.compare("null"))
|
||||
continue;
|
||||
|
||||
string *valuep = &value;
|
||||
string cvalue;
|
||||
if (isbookmark) {
|
||||
// It appears that bookmarks are stored in the users'
|
||||
// locale charset (not too sure). No idea what to do
|
||||
// for other types, would have to check the plugin.
|
||||
string charset = m_conf->getDefCharset(true);
|
||||
transcode(value, cvalue, charset, "UTF-8");
|
||||
valuep = &cvalue;
|
||||
}
|
||||
|
||||
string caname = m_conf->fieldCanon(*it);
|
||||
doc.meta[caname].append(ss + *valuep);
|
||||
}
|
||||
|
||||
// Finally build the confsimple that we will save to the
|
||||
// cache, from the doc fields. This could also be done in
|
||||
// parallel with the doc.meta build above, but simpler this
|
||||
// way. We need it because not all interesting doc fields are
|
||||
// in the meta array (ie: mimetype, url), and we want
|
||||
// something homogenous and easy to save.
|
||||
for (map<string,string>::const_iterator it = doc.meta.begin();
|
||||
it != doc.meta.end(); it++) {
|
||||
m_fields.set((*it).first, (*it).second, cstr_null);
|
||||
}
|
||||
m_fields.set(cstr_url, doc.url, cstr_null);
|
||||
m_fields.set(cstr_bgc_mimetype, doc.mimetype, cstr_null);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
RclConfig *m_conf;
|
||||
ConfSimple m_fields;
|
||||
string m_fn;
|
||||
ifstream m_input;
|
||||
};
|
||||
|
||||
// Initialize. Compute paths and create a temporary directory that will be
|
||||
// used by internfile()
|
||||
BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
||||
DbIxStatusUpdater *updfunc)
|
||||
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
|
||||
m_nocacheindex(false)
|
||||
{
|
||||
|
||||
if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
|
||||
m_queuedir = path_tildexpand("~/.beagle/ToIndex/");
|
||||
path_catslash(m_queuedir);
|
||||
m_cache = new BeagleQueueCache(cnf);
|
||||
}
|
||||
|
||||
BeagleQueueIndexer::~BeagleQueueIndexer()
|
||||
{
|
||||
LOGDEB(("BeagleQueueIndexer::~\n"));
|
||||
deleteZ(m_cache);
|
||||
}
|
||||
|
||||
// Index document stored in the cache.
|
||||
bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
||||
{
|
||||
if (!m_db)
|
||||
return false;
|
||||
|
||||
CancelCheck::instance().checkCancel();
|
||||
|
||||
Rcl::Doc dotdoc;
|
||||
string data;
|
||||
string hittype;
|
||||
|
||||
if (!m_cache || !m_cache->getFromCache(udi, dotdoc, data, &hittype)) {
|
||||
LOGERR(("BeagleQueueIndexer::indexFromCache: cache failed\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (hittype.empty()) {
|
||||
LOGERR(("BeagleIndexer::index: cc entry has no hit type\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!stringlowercmp("bookmark", hittype)) {
|
||||
// Just index the dotdoc
|
||||
dotdoc.meta[Rcl::Doc::keybcknd] = "BGL";
|
||||
return m_db->addOrUpdate(udi, cstr_null, dotdoc);
|
||||
} else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) ||
|
||||
(dotdoc.mimetype.compare("text/html") &&
|
||||
dotdoc.mimetype.compare(cstr_textplain))) {
|
||||
LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
|
||||
dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str()));
|
||||
return true;
|
||||
} else {
|
||||
Rcl::Doc doc;
|
||||
FileInterner interner(data, m_config, m_tmpdir,
|
||||
FileInterner::FIF_doUseInputMimetype,
|
||||
dotdoc.mimetype);
|
||||
FileInterner::Status fis;
|
||||
try {
|
||||
fis = interner.internfile(doc);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR(("BeagleQueueIndexer: interrupted\n"));
|
||||
return false;
|
||||
}
|
||||
if (fis != FileInterner::FIDone) {
|
||||
LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
doc.mimetype = dotdoc.mimetype;
|
||||
doc.fmtime = dotdoc.fmtime;
|
||||
doc.url = dotdoc.url;
|
||||
doc.pcbytes = dotdoc.pcbytes;
|
||||
doc.sig.clear();
|
||||
doc.meta[Rcl::Doc::keybcknd] = "BGL";
|
||||
return m_db->addOrUpdate(udi, cstr_null, doc);
|
||||
}
|
||||
}
|
||||
|
||||
void BeagleQueueIndexer::updstatus(const string& udi)
|
||||
{
|
||||
if (m_updater) {
|
||||
++(m_updater->status.docsdone);
|
||||
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
|
||||
m_updater->status.dbtotdocs = m_updater->status.docsdone;
|
||||
m_updater->status.fn = udi;
|
||||
m_updater->update();
|
||||
}
|
||||
}
|
||||
|
||||
bool BeagleQueueIndexer::index()
|
||||
{
|
||||
if (!m_db)
|
||||
return false;
|
||||
LOGDEB(("BeagleQueueIndexer::processqueue: [%s]\n", m_queuedir.c_str()));
|
||||
m_config->setKeyDir(m_queuedir);
|
||||
if (!m_cache || !m_cache->cc()) {
|
||||
LOGERR(("BeagleQueueIndexer: cache initialization failed\n"));
|
||||
return false;
|
||||
}
|
||||
CirCache *cc = m_cache->cc();
|
||||
// First check/index files found in the cache. If the index was reset,
|
||||
// this actually does work, else it sets the existence flags (avoid
|
||||
// purging). We don't do this when called from indexFiles
|
||||
if (!m_nocacheindex) {
|
||||
bool eof;
|
||||
if (!cc->rewind(eof)) {
|
||||
// rewind can return eof if the cache is empty
|
||||
if (!eof)
|
||||
return false;
|
||||
}
|
||||
while (cc->next(eof)) {
|
||||
string udi;
|
||||
if (!cc->getCurrentUdi(udi)) {
|
||||
LOGERR(("BeagleQueueIndexer:: cache file damaged\n"));
|
||||
break;
|
||||
}
|
||||
if (udi.empty())
|
||||
continue;
|
||||
if (m_db->needUpdate(udi, cstr_null)) {
|
||||
try {
|
||||
// indexFromCache does a CirCache::get(). We could
|
||||
// arrange to use a getCurrent() instead, would be more
|
||||
// efficient
|
||||
indexFromCache(udi);
|
||||
updstatus(udi);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR(("BeagleQueueIndexer: interrupted\n"));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Finally index the queue
|
||||
FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
|
||||
walker.addSkippedName(".*");
|
||||
FsTreeWalker::Status status =walker.walk(m_queuedir, *this);
|
||||
LOGDEB(("BeagleQueueIndexer::processqueue: done: status %d\n", status));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Index a list of files (sent by the real time monitor)
|
||||
bool BeagleQueueIndexer::indexFiles(list<string>& files)
|
||||
{
|
||||
LOGDEB(("BeagleQueueIndexer::indexFiles\n"));
|
||||
|
||||
if (!m_db) {
|
||||
LOGERR(("BeagleQueueIndexer::indexfiles no db??\n"));
|
||||
return false;
|
||||
}
|
||||
for (list<string>::iterator it = files.begin(); it != files.end();) {
|
||||
if (it->empty()) {//??
|
||||
it++; continue;
|
||||
}
|
||||
string father = path_getfather(*it);
|
||||
if (father.compare(m_queuedir)) {
|
||||
LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nq)\n",
|
||||
it->c_str()));
|
||||
it++; continue;
|
||||
}
|
||||
// Pb: we are often called with the dot file, before the
|
||||
// normal file exists, and sometimes never called for the
|
||||
// normal file afterwards (ie for bookmarks where the normal
|
||||
// file is empty). So we perform a normal queue run at the end
|
||||
// of the function to catch older stuff. Still this is not
|
||||
// perfect, sometimes some files will not be indexed before
|
||||
// the next run.
|
||||
string fn = path_getsimple(*it);
|
||||
if (fn.empty() || fn.at(0) == '.') {
|
||||
it++; continue;
|
||||
}
|
||||
struct stat st;
|
||||
if (lstat(it->c_str(), &st) != 0) {
|
||||
LOGERR(("BeagleQueueIndexer::indexfiles: cant stat [%s]\n",
|
||||
it->c_str()));
|
||||
it++; continue;
|
||||
}
|
||||
if (!S_ISREG(st.st_mode)) {
|
||||
LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nr)\n",
|
||||
it->c_str()));
|
||||
it++; continue;
|
||||
}
|
||||
|
||||
processone(*it, &st, FsTreeWalker::FtwRegular);
|
||||
it = files.erase(it);
|
||||
}
|
||||
m_nocacheindex = true;
|
||||
index();
|
||||
// Note: no need to reset nocacheindex, we're in the monitor now
|
||||
return true;
|
||||
}
|
||||
|
||||
FsTreeWalker::Status
|
||||
BeagleQueueIndexer::processone(const string &path,
|
||||
const struct stat *stp,
|
||||
FsTreeWalker::CbFlag flg)
|
||||
{
|
||||
if (!m_db) //??
|
||||
return FsTreeWalker::FtwError;
|
||||
|
||||
bool dounlink = false;
|
||||
|
||||
if (flg != FsTreeWalker::FtwRegular)
|
||||
return FsTreeWalker::FtwOk;
|
||||
|
||||
string dotpath = path_cat(path_getfather(path),
|
||||
string(".") + path_getsimple(path));
|
||||
LOGDEB(("BeagleQueueIndexer: prc1: [%s]\n", path.c_str()));
|
||||
|
||||
BeagleDotFile dotfile(m_config, dotpath);
|
||||
Rcl::Doc dotdoc;
|
||||
string udi, udipath;
|
||||
if (!dotfile.toDoc(dotdoc))
|
||||
goto out;
|
||||
//dotdoc.dump(1);
|
||||
|
||||
// Have to use the hit type for the udi, because the same url can exist
|
||||
// as a bookmark or a page.
|
||||
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
|
||||
make_udi(udipath, cstr_null, udi);
|
||||
|
||||
LOGDEB(("BeagleQueueIndexer: prc1: udi [%s]\n", udi.c_str()));
|
||||
char ascdate[30];
|
||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||
|
||||
// We only process bookmarks or text/html and text/plain files.
|
||||
if (!stringlowercmp("bookmark", dotdoc.meta[Rcl::Doc::keybght])) {
|
||||
// For bookmarks, we just index the doc that was built from the
|
||||
// metadata.
|
||||
if (dotdoc.fmtime.empty())
|
||||
dotdoc.fmtime = ascdate;
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, OFFTPC, stp->st_size);
|
||||
dotdoc.pcbytes = cbuf;
|
||||
|
||||
// Document signature for up to date checks: none.
|
||||
dotdoc.sig.clear();
|
||||
|
||||
dotdoc.meta[Rcl::Doc::keybcknd] = "BGL";
|
||||
if (!m_db->addOrUpdate(udi, cstr_null, dotdoc))
|
||||
return FsTreeWalker::FtwError;
|
||||
|
||||
} else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) ||
|
||||
(dotdoc.mimetype.compare("text/html") &&
|
||||
dotdoc.mimetype.compare(cstr_textplain))) {
|
||||
LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
|
||||
dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str()));
|
||||
// Unlink them anyway
|
||||
dounlink = true;
|
||||
goto out;
|
||||
} else {
|
||||
Rcl::Doc doc;
|
||||
// Store the dotdoc fields in the future doc. In case someone wants
|
||||
// to use beagle-generated fields like beagle:inurl
|
||||
doc.meta = dotdoc.meta;
|
||||
|
||||
FileInterner interner(path, stp, m_config, m_tmpdir,
|
||||
FileInterner::FIF_doUseInputMimetype,
|
||||
&dotdoc.mimetype);
|
||||
FileInterner::Status fis;
|
||||
try {
|
||||
fis = interner.internfile(doc);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR(("BeagleQueueIndexer: interrupted\n"));
|
||||
goto out;
|
||||
}
|
||||
if (fis != FileInterner::FIDone && fis != FileInterner::FIAgain) {
|
||||
LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
|
||||
// TOBEDONE: internfile can return FIAgain here if it is
|
||||
// paging a big text file, we should loop. Means we're
|
||||
// only indexing the first page for text/plain files
|
||||
// bigger than the page size (dlft: 1MB) for now.
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (doc.fmtime.empty())
|
||||
doc.fmtime = ascdate;
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, OFFTPC, stp->st_size);
|
||||
doc.pcbytes = cbuf;
|
||||
// Document signature for up to date checks: none.
|
||||
doc.sig.clear();
|
||||
doc.url = dotdoc.url;
|
||||
|
||||
doc.meta[Rcl::Doc::keybcknd] = "BGL";
|
||||
if (!m_db->addOrUpdate(udi, cstr_null, doc))
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
|
||||
// Copy to cache
|
||||
{
|
||||
// doc fields not in meta, needing saving to the cache
|
||||
dotfile.m_fields.set("fmtime", dotdoc.fmtime, cstr_null);
|
||||
// fbytes is used for historical reasons, should be pcbytes, but makes
|
||||
// no sense to change.
|
||||
dotfile.m_fields.set(cstr_fbytes, dotdoc.pcbytes, cstr_null);
|
||||
dotfile.m_fields.set("udi", udi, cstr_null);
|
||||
string fdata;
|
||||
file_to_string(path, fdata);
|
||||
if (!m_cache || !m_cache->cc()) {
|
||||
LOGERR(("BeagleQueueIndexer: cache initialization failed\n"));
|
||||
goto out;
|
||||
}
|
||||
if (!m_cache->cc()->put(udi, &dotfile.m_fields, fdata, 0)) {
|
||||
LOGERR(("BeagleQueueIndexer::prc1: cache_put failed; %s\n",
|
||||
m_cache->cc()->getReason().c_str()));
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
updstatus(udi);
|
||||
dounlink = true;
|
||||
out:
|
||||
if (dounlink) {
|
||||
unlink(path.c_str());
|
||||
unlink(dotpath.c_str());
|
||||
}
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue