cleaned up processing of metadata from diverse origins (doc,extattrs,localfields)

This commit is contained in:
Jean-Francois Dockes 2013-01-29 14:33:57 +01:00
parent 89996fde05
commit 6423c3a91b
9 changed files with 83 additions and 47 deletions

View file

@ -50,7 +50,7 @@ $headAndBody = 1;
# Look up an image tag name in $tagMap.
#
# The keys of the hash referenced by $tagMap are used as case-insensitive
# regular expressions, tested against the tag name passed as first argument.
# Returns the value stored under the first key that matches, or undef when
# no key matches.
sub xapianTag {
    my $imgtag = shift;
    # each() keeps a per-hash cursor which is NOT rewound when the loop is
    # left early through "return": the next call would resume after the
    # previous match and could miss entries. Calling keys() in void context
    # resets the cursor, so every call scans the whole map.
    keys %{$tagMap};
    while ( ( $tagre, $xapiantag) = each %{$tagMap} ) {
        return $xapiantag if $imgtag =~ /$tagre/i;
        # NOTE(review): any name accepted by the anchored test below is also
        # accepted by the unanchored test above, so this second return looks
        # redundant as written - confirm which of the two is intended.
        return $xapiantag if $imgtag =~ /^$tagre$/i;
    }
    return undef;
}

View file

@ -134,6 +134,12 @@ awk 'BEGIN'\
$0 = part1 mid part2
}
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
# "Subject" metadata field is more like an HTML "description"
if(doescape == 0 && $0 ~ /<meta ?name="Subject"/){
gsub(/="Subject"/, "=\"Description\"", $0)
}
if ($0 == "<pre>"){
# Begin of body text. need to escape some chars from now on as
# pdftotext sometimes doesnt do it

View file

@ -47,8 +47,10 @@
#include "rclinit.h"
#include "execmd.h"
// When using extended attributes, we have to use the ctime.
// This is quite an expensive price to pay...
// When using extended attributes, we have to use the ctime, because
// this is all that gets set when the attributes are modified. This
// is quite an expensive price to pay...
#ifdef RCL_USE_XATTR
#define RCL_STTIME st_ctime
#else
@ -420,23 +422,22 @@ void FsIndexer::localfieldsfromconf()
vector<string> nmlst = attrs.getNames(cstr_null);
for (vector<string>::const_iterator it = nmlst.begin();
it != nmlst.end(); it++) {
attrs.get(*it, m_localfields[*it]);
string nm = m_config->fieldCanon(*it);
attrs.get(*it, m_localfields[nm]);
LOGDEB2(("FsIndexer::localfieldsfromconf: [%s]->[%s]\n",
nm.c_str(), m_localfields[nm].c_str()));
}
}
//
// Copy the local fields into the document metadata.
//
// fields: field name -> value pairs. The names are expected to be canonic
//         already (see localfieldsfromconf()).
// doc:    document whose meta map receives the values.
//
// Being chosen by the user, localfields override any value already stored
// under the same name (e.g. one coming from the filter).
void FsIndexer::setlocalfields(const map<string, string>& fields, Rcl::Doc& doc)
{
    for (map<string, string>::const_iterator it = fields.begin();
         it != fields.end(); ++it) {
        // Unconditional assignment: no test against the existing contents
        // of doc.meta, so that the user's choice always wins.
        doc.meta[it->first] = it->second;
    }
}
// Metadata gathering commands
@ -484,7 +485,9 @@ void FsIndexer::reapmetadata(const vector<MDReaper>& reapers, const string& fn,
}
string output;
if (ExecCmd::backtick(cmd, output)) {
doc.meta[rp->fieldname] += string(" ") + output;
// addmeta() creates or appends. fieldname is already
// canonic (see above)
doc.addmeta(rp->fieldname, output);
}
}
}
@ -534,7 +537,7 @@ void *FsIndexerInternfileWorker(void * fsp)
TempDir tmpdir;
RclConfig myconf(*(fip->m_stableconfig));
InternfileTask *tsk;
InternfileTask *tsk = 0;
for (;;) {
if (!tqp->take(&tsk)) {
tqp->workerExit();

View file

@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
code &= ~(IN_ISDIR|IN_ONESHOT);
switch (code) {
case IN_ACCESS: return "IN_ACCESS";
case IN_MODIFY: return "IN_MODIFY";
case IN_ATTRIB: return "IN_ATTRIB";
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
case IN_CLOSE: return "IN_CLOSE";
case IN_OPEN: return "IN_OPEN";
case IN_MOVED_FROM: return "IN_MOVED_FROM";
case IN_MOVED_TO: return "IN_MOVED_TO";
case IN_MOVE: return "IN_MOVE";
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
case IN_CREATE: return "IN_CREATE";
case IN_DELETE: return "IN_DELETE";
case IN_DELETE_SELF: return "IN_DELETE_SELF";
case IN_MOVE_SELF: return "IN_MOVE_SELF";
case IN_UNMOUNT: return "IN_UNMOUNT";
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
case IN_IGNORED: return "IN_IGNORED";
case IN_MODIFY: return "IN_MODIFY";
case IN_MOVE: return "IN_MOVE";
case IN_MOVED_FROM: return "IN_MOVED_FROM";
case IN_MOVED_TO: return "IN_MOVED_TO";
case IN_MOVE_SELF: return "IN_MOVE_SELF";
case IN_OPEN: return "IN_OPEN";
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
case IN_UNMOUNT: return "IN_UNMOUNT";
default: {
static char msg[50];
sprintf(msg, "Unknown event 0x%x", code);
@ -599,6 +599,12 @@ bool RclIntf::addWatch(const string& path, bool)
// CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs
uint32_t mask = IN_MODIFY | IN_CREATE
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
#ifdef RCL_USE_XATTR
// It seems that IN_ATTRIB is not needed to receive extattr
// modification events, which is a bit weird because only ctime is
// set.
// | IN_ATTRIB
#endif
#ifdef IN_DONT_FOLLOW
| IN_DONT_FOLLOW
#endif
@ -692,7 +698,7 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
eraseWatchSubTree(m_idtopath, ev.m_path);
}
// IN_ATTRIB apparently not needed, see comment above
if (evp->mask & (IN_MODIFY)) {
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {

View file

@ -80,6 +80,7 @@ static string colon_restore(const string& in)
#ifdef RCL_USE_XATTR
void FileInterner::reapXAttrs(const string& path)
{
LOGDEB2(("FileInterner::reapXAttrs: [%s]\n", path.c_str()));
vector<string> xnames;
if (!pxattr::list(path, &xnames)) {
LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno));
@ -98,6 +99,8 @@ void FileInterner::reapXAttrs(const string& path)
}
// Encode should we ?
m_XAttrsFields[mit->second] = value;
LOGDEB2(("FileInterner::reapXAttrs: got [%s] -> [%s]\n",
mit->second.c_str(), value.c_str()));
}
}
}
@ -626,7 +629,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
it->first == cstr_dj_keycharset) {
// don't need/want these.
} else {
doc.meta[it->first] = it->second;
doc.addmeta(m_cfg->fieldCanon(it->first), it->second);
}
}
if (doc.meta[Rcl::Doc::keyabs].empty() &&
@ -659,10 +662,12 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
#ifdef RCL_USE_XATTR
// Set fields from extended file attributes.
// These can be overridden by values from inside the file
// These can be later augmented by values from inside the file
for (map<string,string>::const_iterator it = m_XAttrsFields.begin();
it != m_XAttrsFields.end(); it++) {
doc.meta[it->first] = it->second;
LOGDEB1(("Internfile:: setting [%s] from xattrs value [%s]\n",
m_cfg->fieldCanon(it->first).c_str(), it->second.c_str()));
doc.meta[m_cfg->fieldCanon(it->first)] = it->second;
}
#endif //RCL_USE_XATTR

View file

@ -232,19 +232,22 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
ConfSimple parms(data);
if (!parms.ok())
return false;
// Special cases:
parms.get(Doc::keyurl, doc.url);
parms.get(Doc::keytp, doc.mimetype);
parms.get(Doc::keyfmt, doc.fmtime);
parms.get(Doc::keydmt, doc.dmtime);
parms.get(Doc::keyoc, doc.origcharset);
parms.get(cstr_caption, doc.meta[Doc::keytt]);
parms.get(Doc::keykw, doc.meta[Doc::keykw]);
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
// Possibly remove synthetic abstract indicator (if it's there, we
// used to index the beginning of the text as abstract).
doc.syntabs = false;
if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
doc.meta[Doc::keyabs] =
doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
doc.syntabs = true;
}
parms.get(Doc::keyipt, doc.ipath);
@ -254,7 +257,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
parms.get(Doc::keysig, doc.sig);
doc.xdocid = docid;
// Other, not predefined meta fields:
// Normal key/value pairs:
vector<string> keys = parms.getNames(string());
for (vector<string>::const_iterator it = keys.begin();
it != keys.end(); it++) {
@ -1073,8 +1076,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
trimstring(doc.meta[Doc::keykw], " \t\r\n");
doc.meta[Doc::keykw] =
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
if (!doc.meta[Doc::keykw].empty())
RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
// No need to explicitly append the keywords, this will be done by
// the "stored" loop
// If abstract is empty, we make up one with the beginning of the
// document. This is then not indexed, but part of the doc data so
@ -1094,16 +1097,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
cstr_nc);
}
if (!doc.meta[Doc::keyabs].empty())
RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
const set<string>& stored = m_config->getStoredFields();
for (set<string>::const_iterator it = stored.begin();
it != stored.end(); it++) {
string nm = m_config->fieldCanon(*it);
if (!doc.meta[*it].empty()) {
if (!doc.meta[nm].empty()) {
string value =
neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
RECORD_APPEND(record, nm, value);
}
}

View file

@ -170,13 +170,29 @@ class Doc {
}
}
// Create entry or append text to existing entry.
bool addmeta(const string& nm, const string& value)
{
map<string,string>::iterator mit = meta.find(nm);
if (mit == meta.end()) {
meta[nm] = value;
} else if (mit->second.empty()) {
mit->second = value;
} else {
mit->second += string(" - ") + value;
}
return true;
}
void dump(bool dotext=false) const;
// The official names for recoll native fields when used in a text
// context (ie: the python interface duplicates some of the fixed
// fields in the meta array, these are the names used). Defined in
// rcldoc.cpp. For fields stored in the meta[] array (ie, title,
// author), filters _must_ use these values
// rcldoc.cpp. Fields stored in the meta[] array (ie, title,
// author), _must_ use these canonical values, not aliases. This is
// enforced in internfile.cpp and misc other bits of metadata-gathering
// code
static const string keyurl; // url
static const string keyfn; // file name
static const string keyipt; // ipath

View file

@ -64,15 +64,12 @@ recipient = XTO
[stored]
############################
# Some fields are stored in the document data record inside the index and
# can be returned in result lists. There is no necessity that stored fields
# should be indexed (have a prefix in the preceding section) (example:
# "url", but this one doesn't need to be listed here, it's stored by hard
# code)
# can be displayed in result lists. There is no necessity that stored fields
# should be indexed (have a prefix in the preceding section). Example: "url"
#
# Some fields are stored by default, don't add them here:
# caption, keywords, abstract, mimetype, url
# caption, mimetype, url
# Only canonical names should be used here, not aliases.
# "author" used to be stored by default, now set here as optional
# "rclaptg" is used for viewer specialization (depending on local config)
# "rclbes" defines the backend type (ie normal fs, firefox cache). Should
# probably be hardcoded, don't remove it
@ -81,6 +78,8 @@ recipient=
rclaptg=
rclbes=
filename=
keywords=
abstract=
[aliases]
##########################

View file

@ -29,4 +29,4 @@ localfields = rclaptg=gnuinfo
mhmboxquirks = tbird
[/home/dockes/projets/fulltext/testrecoll/cjk]
localfields= keyword = ckjtsthuniique: blabla= "some string"
localfields= ; keyword = ckjtsthuniique; blabla= "some string"