cleaned up processing of metadata from diverse origins (doc,extattrs,localfields)

This commit is contained in:
Jean-Francois Dockes 2013-01-29 14:33:57 +01:00
parent 89996fde05
commit 6423c3a91b
9 changed files with 83 additions and 47 deletions

View file

@ -50,7 +50,7 @@ $headAndBody = 1;
sub xapianTag { sub xapianTag {
my $imgtag = shift; my $imgtag = shift;
while ( ( $tagre, $xapiantag) = each %{$tagMap} ) { while ( ( $tagre, $xapiantag) = each %{$tagMap} ) {
return $xapiantag if $imgtag =~ /$tagre/i; return $xapiantag if $imgtag =~ /^$tagre$/i;
} }
return undef; return undef;
} }

View file

@ -132,7 +132,13 @@ awk 'BEGIN'\
gsub(/>/, "\\>", mid) gsub(/>/, "\\>", mid)
mid = "<title>" mid "</title>" mid = "<title>" mid "</title>"
$0 = part1 mid part2 $0 = part1 mid part2
} }
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
# "Subject" metadata field is more like an HTML "description"
if(doescape == 0 && $0 ~ /<meta ?name="Subject"/){
gsub(/="Subject"/, "=\"Description\"", $0)
}
if ($0 == "<pre>"){ if ($0 == "<pre>"){
# Begin of body text. need to escape some chars from now on as # Begin of body text. need to escape some chars from now on as

View file

@ -47,8 +47,10 @@
#include "rclinit.h" #include "rclinit.h"
#include "execmd.h" #include "execmd.h"
// When using extended attributes, we have to use the ctime.
// This is quite an expensive price to pay... // When using extended attributes, we have to use the ctime, because
// this is all that gets set when the attributes are modified. This
// is quite an expensive price to pay...
#ifdef RCL_USE_XATTR #ifdef RCL_USE_XATTR
#define RCL_STTIME st_ctime #define RCL_STTIME st_ctime
#else #else
@ -420,22 +422,21 @@ void FsIndexer::localfieldsfromconf()
vector<string> nmlst = attrs.getNames(cstr_null); vector<string> nmlst = attrs.getNames(cstr_null);
for (vector<string>::const_iterator it = nmlst.begin(); for (vector<string>::const_iterator it = nmlst.begin();
it != nmlst.end(); it++) { it != nmlst.end(); it++) {
attrs.get(*it, m_localfields[*it]); string nm = m_config->fieldCanon(*it);
attrs.get(*it, m_localfields[nm]);
LOGDEB2(("FsIndexer::localfieldsfromconf: [%s]->[%s]\n",
nm.c_str(), m_localfields[nm].c_str()));
} }
} }
//
void FsIndexer::setlocalfields(const map<string, string>& fields, Rcl::Doc& doc) void FsIndexer::setlocalfields(const map<string, string>& fields, Rcl::Doc& doc)
{ {
for (map<string, string>::const_iterator it = fields.begin(); for (map<string, string>::const_iterator it = fields.begin();
it != fields.end(); it++) { it != fields.end(); it++) {
// Should local fields override those coming from the document // Being chosen by the user, localfields override values from
// ? I think not, but not too sure. We could also chose to // the filter. The key is already canonic (see
// concatenate the values ? // localfieldsfromconf())
if (doc.meta.find(it->second) == doc.meta.end()) { doc.meta[it->first] = it->second;
doc.meta[it->first] = it->second;
}
} }
} }
@ -484,7 +485,9 @@ void FsIndexer::reapmetadata(const vector<MDReaper>& reapers, const string& fn,
} }
string output; string output;
if (ExecCmd::backtick(cmd, output)) { if (ExecCmd::backtick(cmd, output)) {
doc.meta[rp->fieldname] += string(" ") + output; // addmeta() creates or appends. fieldname is already
// canonic (see above)
doc.addmeta(rp->fieldname, output);
} }
} }
} }
@ -534,7 +537,7 @@ void *FsIndexerInternfileWorker(void * fsp)
TempDir tmpdir; TempDir tmpdir;
RclConfig myconf(*(fip->m_stableconfig)); RclConfig myconf(*(fip->m_stableconfig));
InternfileTask *tsk; InternfileTask *tsk = 0;
for (;;) { for (;;) {
if (!tqp->take(&tsk)) { if (!tqp->take(&tsk)) {
tqp->workerExit(); tqp->workerExit();

View file

@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
code &= ~(IN_ISDIR|IN_ONESHOT); code &= ~(IN_ISDIR|IN_ONESHOT);
switch (code) { switch (code) {
case IN_ACCESS: return "IN_ACCESS"; case IN_ACCESS: return "IN_ACCESS";
case IN_MODIFY: return "IN_MODIFY";
case IN_ATTRIB: return "IN_ATTRIB"; case IN_ATTRIB: return "IN_ATTRIB";
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
case IN_CLOSE: return "IN_CLOSE"; case IN_CLOSE: return "IN_CLOSE";
case IN_OPEN: return "IN_OPEN"; case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
case IN_MOVED_FROM: return "IN_MOVED_FROM"; case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
case IN_MOVED_TO: return "IN_MOVED_TO";
case IN_MOVE: return "IN_MOVE";
case IN_CREATE: return "IN_CREATE"; case IN_CREATE: return "IN_CREATE";
case IN_DELETE: return "IN_DELETE"; case IN_DELETE: return "IN_DELETE";
case IN_DELETE_SELF: return "IN_DELETE_SELF"; case IN_DELETE_SELF: return "IN_DELETE_SELF";
case IN_MOVE_SELF: return "IN_MOVE_SELF";
case IN_UNMOUNT: return "IN_UNMOUNT";
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
case IN_IGNORED: return "IN_IGNORED"; case IN_IGNORED: return "IN_IGNORED";
case IN_MODIFY: return "IN_MODIFY";
case IN_MOVE: return "IN_MOVE";
case IN_MOVED_FROM: return "IN_MOVED_FROM";
case IN_MOVED_TO: return "IN_MOVED_TO";
case IN_MOVE_SELF: return "IN_MOVE_SELF";
case IN_OPEN: return "IN_OPEN";
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
case IN_UNMOUNT: return "IN_UNMOUNT";
default: { default: {
static char msg[50]; static char msg[50];
sprintf(msg, "Unknown event 0x%x", code); sprintf(msg, "Unknown event 0x%x", code);
@ -599,6 +599,12 @@ bool RclIntf::addWatch(const string& path, bool)
// CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs // CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs
uint32_t mask = IN_MODIFY | IN_CREATE uint32_t mask = IN_MODIFY | IN_CREATE
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE | IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
#ifdef RCL_USE_XATTR
// It seems that IN_ATTRIB is not needed to receive extattr
// modification events, which is a bit weird because only ctime is
// set.
// | IN_ATTRIB
#endif
#ifdef IN_DONT_FOLLOW #ifdef IN_DONT_FOLLOW
| IN_DONT_FOLLOW | IN_DONT_FOLLOW
#endif #endif
@ -692,7 +698,7 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
eraseWatchSubTree(m_idtopath, ev.m_path); eraseWatchSubTree(m_idtopath, ev.m_path);
} }
// IN_ATTRIB apparently not needed, see comment above
if (evp->mask & (IN_MODIFY)) { if (evp->mask & (IN_MODIFY)) {
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY; ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) { } else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {

View file

@ -80,6 +80,7 @@ static string colon_restore(const string& in)
#ifdef RCL_USE_XATTR #ifdef RCL_USE_XATTR
void FileInterner::reapXAttrs(const string& path) void FileInterner::reapXAttrs(const string& path)
{ {
LOGDEB2(("FileInterner::reapXAttrs: [%s]\n", path.c_str()));
vector<string> xnames; vector<string> xnames;
if (!pxattr::list(path, &xnames)) { if (!pxattr::list(path, &xnames)) {
LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno)); LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno));
@ -98,6 +99,8 @@ void FileInterner::reapXAttrs(const string& path)
} }
// Encode should we ? // Encode should we ?
m_XAttrsFields[mit->second] = value; m_XAttrsFields[mit->second] = value;
LOGDEB2(("FileInterner::reapXAttrs: got [%s] -> [%s]\n",
mit->second.c_str(), value.c_str()));
} }
} }
} }
@ -626,7 +629,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
it->first == cstr_dj_keycharset) { it->first == cstr_dj_keycharset) {
// don't need/want these. // don't need/want these.
} else { } else {
doc.meta[it->first] = it->second; doc.addmeta(m_cfg->fieldCanon(it->first), it->second);
} }
} }
if (doc.meta[Rcl::Doc::keyabs].empty() && if (doc.meta[Rcl::Doc::keyabs].empty() &&
@ -659,10 +662,12 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
#ifdef RCL_USE_XATTR #ifdef RCL_USE_XATTR
// Set fields from extended file attributes. // Set fields from extended file attributes.
// These can be overriden by values from inside the file // These can be later augmented by values from inside the file
for (map<string,string>::const_iterator it = m_XAttrsFields.begin(); for (map<string,string>::const_iterator it = m_XAttrsFields.begin();
it != m_XAttrsFields.end(); it++) { it != m_XAttrsFields.end(); it++) {
doc.meta[it->first] = it->second; LOGDEB1(("Internfile:: setting [%s] from xattrs value [%s]\n",
m_cfg->fieldCanon(it->first).c_str(), it->second.c_str()));
doc.meta[m_cfg->fieldCanon(it->first)] = it->second;
} }
#endif //RCL_USE_XATTR #endif //RCL_USE_XATTR

View file

@ -232,19 +232,22 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
ConfSimple parms(data); ConfSimple parms(data);
if (!parms.ok()) if (!parms.ok())
return false; return false;
// Special cases:
parms.get(Doc::keyurl, doc.url); parms.get(Doc::keyurl, doc.url);
parms.get(Doc::keytp, doc.mimetype); parms.get(Doc::keytp, doc.mimetype);
parms.get(Doc::keyfmt, doc.fmtime); parms.get(Doc::keyfmt, doc.fmtime);
parms.get(Doc::keydmt, doc.dmtime); parms.get(Doc::keydmt, doc.dmtime);
parms.get(Doc::keyoc, doc.origcharset); parms.get(Doc::keyoc, doc.origcharset);
parms.get(cstr_caption, doc.meta[Doc::keytt]); parms.get(cstr_caption, doc.meta[Doc::keytt]);
parms.get(Doc::keykw, doc.meta[Doc::keykw]);
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]); parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
// Possibly remove synthetic abstract indicator (if it's there, we // Possibly remove synthetic abstract indicator (if it's there, we
// used to index the beginning of the text as abstract). // used to index the beginning of the text as abstract).
doc.syntabs = false; doc.syntabs = false;
if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) { if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length()); doc.meta[Doc::keyabs] =
doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
doc.syntabs = true; doc.syntabs = true;
} }
parms.get(Doc::keyipt, doc.ipath); parms.get(Doc::keyipt, doc.ipath);
@ -254,7 +257,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
parms.get(Doc::keysig, doc.sig); parms.get(Doc::keysig, doc.sig);
doc.xdocid = docid; doc.xdocid = docid;
// Other, not predefined meta fields: // Normal key/value pairs:
vector<string> keys = parms.getNames(string()); vector<string> keys = parms.getNames(string());
for (vector<string>::const_iterator it = keys.begin(); for (vector<string>::const_iterator it = keys.begin();
it != keys.end(); it++) { it != keys.end(); it++) {
@ -1073,8 +1076,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
trimstring(doc.meta[Doc::keykw], " \t\r\n"); trimstring(doc.meta[Doc::keykw], " \t\r\n");
doc.meta[Doc::keykw] = doc.meta[Doc::keykw] =
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc); neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
if (!doc.meta[Doc::keykw].empty()) // No need to explicitly append the keywords, this will be done by
RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]); // the "stored" loop
// If abstract is empty, we make up one with the beginning of the // If abstract is empty, we make up one with the beginning of the
// document. This is then not indexed, but part of the doc data so // document. This is then not indexed, but part of the doc data so
@ -1094,16 +1097,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen), neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
cstr_nc); cstr_nc);
} }
if (!doc.meta[Doc::keyabs].empty())
RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
const set<string>& stored = m_config->getStoredFields(); const set<string>& stored = m_config->getStoredFields();
for (set<string>::const_iterator it = stored.begin(); for (set<string>::const_iterator it = stored.begin();
it != stored.end(); it++) { it != stored.end(); it++) {
string nm = m_config->fieldCanon(*it); string nm = m_config->fieldCanon(*it);
if (!doc.meta[*it].empty()) { if (!doc.meta[nm].empty()) {
string value = string value =
neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc); neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
RECORD_APPEND(record, nm, value); RECORD_APPEND(record, nm, value);
} }
} }

View file

@ -170,13 +170,29 @@ class Doc {
} }
} }
// Set a metadata field, or extend it if it already holds text.
// A missing or empty entry for nm simply takes value; otherwise value
// is appended to the current text after a " - " separator.
// Always returns true.
bool addmeta(const string& nm, const string& value)
{
    // operator[] creates an empty entry when nm is not yet present,
    // so the "missing" and "empty" cases are handled identically.
    string& current = meta[nm];
    if (current.empty()) {
        current = value;
        return true;
    }
    current += string(" - ") + value;
    return true;
}
void dump(bool dotext=false) const; void dump(bool dotext=false) const;
// The official names for recoll native fields when used in a text // The official names for recoll native fields when used in a text
// context (ie: the python interface duplicates some of the fixed // context (ie: the python interface duplicates some of the fixed
// fields in the meta array, these are the names used). Defined in // fields in the meta array, these are the names used). Defined in
// rcldoc.cpp. For fields stored in the meta[] array (ie, title, // rcldoc.cpp. Fields stored in the meta[] array (ie, title,
// author), filters _must_ use these values // author), _must_ use these canonical values, not aliases. This is
// enforced in internfile.cpp and misc other bits of metadata-gathering
// code
static const string keyurl; // url static const string keyurl; // url
static const string keyfn; // file name static const string keyfn; // file name
static const string keyipt; // ipath static const string keyipt; // ipath

View file

@ -64,15 +64,12 @@ recipient = XTO
[stored] [stored]
############################ ############################
# Some fields are stored in the document data record inside the index and # Some fields are stored in the document data record inside the index and
# can be returned in result lists. There is no necessity that stored fields # can be displayed in result lists. There is no necessity that stored fields
# should be indexed (have a prefix in the preceding section) (example: # should be indexed (have a prefix in the preceding section). Example: "url"
# "url", but this one doesn't need to be listed here, it's stored by hard
# code)
# #
# Some fields are stored by default, don't add them here: # Some fields are stored by default, don't add them here:
# caption, keywords, abstract, mimetype, url # caption, mimetype, url
# Only canonical names should be used here, not aliases. # Only canonical names should be used here, not aliases.
# "author" used to be stored by default, now set here as optional
# "rclaptg" is used for viewer specialization (depending on local config) # "rclaptg" is used for viewer specialization (depending on local config)
# "rclbes" defines the backend type (ie normal fs, firefox cache). Should # "rclbes" defines the backend type (ie normal fs, firefox cache). Should
# probably be hardcoded, don't remove it # probably be hardcoded, don't remove it
@ -81,6 +78,8 @@ recipient=
rclaptg= rclaptg=
rclbes= rclbes=
filename= filename=
keywords=
abstract=
[aliases] [aliases]
########################## ##########################

View file

@ -29,4 +29,4 @@ localfields = rclaptg=gnuinfo
mhmboxquirks = tbird mhmboxquirks = tbird
[/home/dockes/projets/fulltext/testrecoll/cjk] [/home/dockes/projets/fulltext/testrecoll/cjk]
localfields= keyword = ckjtsthuniique: blabla= "some string" localfields= ; keyword = ckjtsthuniique; blabla= "some string"