Cleaned up processing of metadata from diverse origins (doc, extattrs, localfields)
This commit is contained in:
parent
89996fde05
commit
6423c3a91b
9 changed files with 83 additions and 47 deletions
|
@ -50,7 +50,7 @@ $headAndBody = 1;
|
|||
sub xapianTag {
|
||||
my $imgtag = shift;
|
||||
while ( ( $tagre, $xapiantag) = each %{$tagMap} ) {
|
||||
return $xapiantag if $imgtag =~ /$tagre/i;
|
||||
return $xapiantag if $imgtag =~ /^$tagre$/i;
|
||||
}
|
||||
return undef;
|
||||
}
|
||||
|
|
|
@ -134,6 +134,12 @@ awk 'BEGIN'\
|
|||
$0 = part1 mid part2
|
||||
}
|
||||
|
||||
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
||||
# "Subject" metadata field is more like an HTML "description"
|
||||
if(doescape == 0 && $0 ~ /<meta ?name="Subject"/){
|
||||
gsub(/="Subject"/, "=\"Description\"", $0)
|
||||
}
|
||||
|
||||
if ($0 == "<pre>"){
|
||||
# Beginning of body text. We need to escape some chars from now on as
|
||||
# pdftotext sometimes does not do it
|
||||
|
|
|
@ -47,8 +47,10 @@
|
|||
#include "rclinit.h"
|
||||
#include "execmd.h"
|
||||
|
||||
// When using extended attributes, we have to use the ctime.
|
||||
// This is quite an expensive price to pay...
|
||||
|
||||
// When using extended attributes, we have to use the ctime, because
|
||||
// this is all that gets set when the attributes are modified. This
|
||||
// is quite an expensive price to pay...
|
||||
#ifdef RCL_USE_XATTR
|
||||
#define RCL_STTIME st_ctime
|
||||
#else
|
||||
|
@ -420,24 +422,23 @@ void FsIndexer::localfieldsfromconf()
|
|||
vector<string> nmlst = attrs.getNames(cstr_null);
|
||||
for (vector<string>::const_iterator it = nmlst.begin();
|
||||
it != nmlst.end(); it++) {
|
||||
attrs.get(*it, m_localfields[*it]);
|
||||
string nm = m_config->fieldCanon(*it);
|
||||
attrs.get(*it, m_localfields[nm]);
|
||||
LOGDEB2(("FsIndexer::localfieldsfromconf: [%s]->[%s]\n",
|
||||
nm.c_str(), m_localfields[nm].c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
void FsIndexer::setlocalfields(const map<string, string>& fields, Rcl::Doc& doc)
|
||||
{
|
||||
for (map<string, string>::const_iterator it = fields.begin();
|
||||
it != fields.end(); it++) {
|
||||
// Should local fields override those coming from the document
|
||||
// ? I think not, but not too sure. We could also choose to
|
||||
// concatenate the values ?
|
||||
if (doc.meta.find(it->second) == doc.meta.end()) {
|
||||
// Being chosen by the user, localfields override values from
|
||||
// the filter. The key is already canonic (see
|
||||
// localfieldsfromconf())
|
||||
doc.meta[it->first] = it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Metadata gathering commands
|
||||
void FsIndexer::mdreapersfromconf()
|
||||
|
@ -484,7 +485,9 @@ void FsIndexer::reapmetadata(const vector<MDReaper>& reapers, const string& fn,
|
|||
}
|
||||
string output;
|
||||
if (ExecCmd::backtick(cmd, output)) {
|
||||
doc.meta[rp->fieldname] += string(" ") + output;
|
||||
// addmeta() creates or appends. fieldname is already
|
||||
// canonic (see above)
|
||||
doc.addmeta(rp->fieldname, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -534,7 +537,7 @@ void *FsIndexerInternfileWorker(void * fsp)
|
|||
TempDir tmpdir;
|
||||
RclConfig myconf(*(fip->m_stableconfig));
|
||||
|
||||
InternfileTask *tsk;
|
||||
InternfileTask *tsk = 0;
|
||||
for (;;) {
|
||||
if (!tqp->take(&tsk)) {
|
||||
tqp->workerExit();
|
||||
|
|
|
@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
|
|||
code &= ~(IN_ISDIR|IN_ONESHOT);
|
||||
switch (code) {
|
||||
case IN_ACCESS: return "IN_ACCESS";
|
||||
case IN_MODIFY: return "IN_MODIFY";
|
||||
case IN_ATTRIB: return "IN_ATTRIB";
|
||||
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||
case IN_CLOSE: return "IN_CLOSE";
|
||||
case IN_OPEN: return "IN_OPEN";
|
||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||
case IN_MOVE: return "IN_MOVE";
|
||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
||||
case IN_CREATE: return "IN_CREATE";
|
||||
case IN_DELETE: return "IN_DELETE";
|
||||
case IN_DELETE_SELF: return "IN_DELETE_SELF";
|
||||
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
||||
case IN_UNMOUNT: return "IN_UNMOUNT";
|
||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||
case IN_IGNORED: return "IN_IGNORED";
|
||||
case IN_MODIFY: return "IN_MODIFY";
|
||||
case IN_MOVE: return "IN_MOVE";
|
||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
||||
case IN_OPEN: return "IN_OPEN";
|
||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||
case IN_UNMOUNT: return "IN_UNMOUNT";
|
||||
default: {
|
||||
static char msg[50];
|
||||
sprintf(msg, "Unknown event 0x%x", code);
|
||||
|
@ -599,6 +599,12 @@ bool RclIntf::addWatch(const string& path, bool)
|
|||
// CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs
|
||||
uint32_t mask = IN_MODIFY | IN_CREATE
|
||||
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
|
||||
#ifdef RCL_USE_XATTR
|
||||
// It seems that IN_ATTRIB is not needed to receive extattr
|
||||
// modification events, which is a bit weird because only ctime is
|
||||
// set.
|
||||
// | IN_ATTRIB
|
||||
#endif
|
||||
#ifdef IN_DONT_FOLLOW
|
||||
| IN_DONT_FOLLOW
|
||||
#endif
|
||||
|
@ -692,7 +698,7 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
|
|||
eraseWatchSubTree(m_idtopath, ev.m_path);
|
||||
}
|
||||
|
||||
|
||||
// IN_ATTRIB apparently not needed, see comment above
|
||||
if (evp->mask & (IN_MODIFY)) {
|
||||
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
|
||||
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
|
||||
|
|
|
@ -80,6 +80,7 @@ static string colon_restore(const string& in)
|
|||
#ifdef RCL_USE_XATTR
|
||||
void FileInterner::reapXAttrs(const string& path)
|
||||
{
|
||||
LOGDEB2(("FileInterner::reapXAttrs: [%s]\n", path.c_str()));
|
||||
vector<string> xnames;
|
||||
if (!pxattr::list(path, &xnames)) {
|
||||
LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno));
|
||||
|
@ -98,6 +99,8 @@ void FileInterner::reapXAttrs(const string& path)
|
|||
}
|
||||
// Should we encode the value?
|
||||
m_XAttrsFields[mit->second] = value;
|
||||
LOGDEB2(("FileInterner::reapXAttrs: got [%s] -> [%s]\n",
|
||||
mit->second.c_str(), value.c_str()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -626,7 +629,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
|||
it->first == cstr_dj_keycharset) {
|
||||
// don't need/want these.
|
||||
} else {
|
||||
doc.meta[it->first] = it->second;
|
||||
doc.addmeta(m_cfg->fieldCanon(it->first), it->second);
|
||||
}
|
||||
}
|
||||
if (doc.meta[Rcl::Doc::keyabs].empty() &&
|
||||
|
@ -659,10 +662,12 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
|
|||
|
||||
#ifdef RCL_USE_XATTR
|
||||
// Set fields from extended file attributes.
|
||||
// These can be overridden by values from inside the file
|
||||
// These can be later augmented by values from inside the file
|
||||
for (map<string,string>::const_iterator it = m_XAttrsFields.begin();
|
||||
it != m_XAttrsFields.end(); it++) {
|
||||
doc.meta[it->first] = it->second;
|
||||
LOGDEB1(("Internfile:: setting [%s] from xattrs value [%s]\n",
|
||||
m_cfg->fieldCanon(it->first).c_str(), it->second.c_str()));
|
||||
doc.meta[m_cfg->fieldCanon(it->first)] = it->second;
|
||||
}
|
||||
#endif //RCL_USE_XATTR
|
||||
|
||||
|
|
|
@ -232,19 +232,22 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
|||
ConfSimple parms(data);
|
||||
if (!parms.ok())
|
||||
return false;
|
||||
|
||||
// Special cases:
|
||||
parms.get(Doc::keyurl, doc.url);
|
||||
parms.get(Doc::keytp, doc.mimetype);
|
||||
parms.get(Doc::keyfmt, doc.fmtime);
|
||||
parms.get(Doc::keydmt, doc.dmtime);
|
||||
parms.get(Doc::keyoc, doc.origcharset);
|
||||
parms.get(cstr_caption, doc.meta[Doc::keytt]);
|
||||
parms.get(Doc::keykw, doc.meta[Doc::keykw]);
|
||||
|
||||
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
|
||||
// Possibly remove synthetic abstract indicator (if it's there, we
|
||||
// used to index the beginning of the text as abstract).
|
||||
doc.syntabs = false;
|
||||
if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
|
||||
doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
|
||||
doc.meta[Doc::keyabs] =
|
||||
doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
|
||||
doc.syntabs = true;
|
||||
}
|
||||
parms.get(Doc::keyipt, doc.ipath);
|
||||
|
@ -254,7 +257,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
|||
parms.get(Doc::keysig, doc.sig);
|
||||
doc.xdocid = docid;
|
||||
|
||||
// Other, not predefined meta fields:
|
||||
// Normal key/value pairs:
|
||||
vector<string> keys = parms.getNames(string());
|
||||
for (vector<string>::const_iterator it = keys.begin();
|
||||
it != keys.end(); it++) {
|
||||
|
@ -1073,8 +1076,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||
trimstring(doc.meta[Doc::keykw], " \t\r\n");
|
||||
doc.meta[Doc::keykw] =
|
||||
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
|
||||
if (!doc.meta[Doc::keykw].empty())
|
||||
RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
|
||||
// No need to explicitly append the keywords, this will be done by
|
||||
// the "stored" loop
|
||||
|
||||
// If abstract is empty, we make up one with the beginning of the
|
||||
// document. This is then not indexed, but part of the doc data so
|
||||
|
@ -1094,16 +1097,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
||||
cstr_nc);
|
||||
}
|
||||
if (!doc.meta[Doc::keyabs].empty())
|
||||
RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
|
||||
|
||||
const set<string>& stored = m_config->getStoredFields();
|
||||
for (set<string>::const_iterator it = stored.begin();
|
||||
it != stored.end(); it++) {
|
||||
string nm = m_config->fieldCanon(*it);
|
||||
if (!doc.meta[*it].empty()) {
|
||||
if (!doc.meta[nm].empty()) {
|
||||
string value =
|
||||
neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
|
||||
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
|
||||
RECORD_APPEND(record, nm, value);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -170,13 +170,29 @@ class Doc {
|
|||
}
|
||||
}
|
||||
|
||||
// Create entry or append text to existing entry.
// If no entry named nm exists in meta, one is created holding value.
// If the entry exists but is empty, it is set to value. Otherwise
// value is appended to the existing text, separated by " - ".
// @param nm    key of the meta[] entry to create or extend
// @param value text to store or append
// @return always true
|
||||
bool addmeta(const string& nm, const string& value)
|
||||
{
|
||||
map<string,string>::iterator mit = meta.find(nm);
|
||||
if (mit == meta.end()) {
|
||||
// No entry yet: create it
|
meta[nm] = value;
|
||||
} else if (mit->second.empty()) {
|
||||
// Entry exists but holds no text: plain assignment, no separator
|
mit->second = value;
|
||||
} else {
|
||||
// Entry already has text: keep it and append the new value
|
mit->second += string(" - ") + value;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void dump(bool dotext=false) const;
|
||||
|
||||
// The official names for recoll native fields when used in a text
|
||||
// context (ie: the python interface duplicates some of the fixed
|
||||
// fields in the meta array, these are the names used). Defined in
|
||||
// rcldoc.cpp. For fields stored in the meta[] array (ie, title,
|
||||
// author), filters _must_ use these values
|
||||
// rcldoc.cpp. Fields stored in the meta[] array (ie, title,
|
||||
// author), _must_ use these canonical values, not aliases. This is
|
||||
// enforced in internfile.cpp and misc other bits of metadata-gathering
|
||||
// code
|
||||
static const string keyurl; // url
|
||||
static const string keyfn; // file name
|
||||
static const string keyipt; // ipath
|
||||
|
|
|
@ -64,15 +64,12 @@ recipient = XTO
|
|||
[stored]
|
||||
############################
|
||||
# Some fields are stored in the document data record inside the index and
|
||||
# can be returned in result lists. There is no necessity that stored fields
|
||||
# should be indexed (have a prefix in the preceding section) (example:
|
||||
# "url", but this one doesn't need to be listed here, it's stored by hard
|
||||
# code)
|
||||
# can be displayed in result lists. There is no necessity that stored fields
|
||||
# should be indexed (have a prefix in the preceding section). Example: "url"
|
||||
#
|
||||
# Some fields are stored by default, don't add them here:
|
||||
# caption, keywords, abstract, mimetype, url
|
||||
# caption, mimetype, url
|
||||
# Only canonical names should be used here, not aliases.
|
||||
# "author" used to be stored by default, now set here as optional
|
||||
# "rclaptg" is used for viewer specialization (depending on local config)
|
||||
# "rclbes" defines the backend type (ie normal fs, firefox cache). Should
|
||||
# probably be hardcoded, don't remove it
|
||||
|
@ -81,6 +78,8 @@ recipient=
|
|||
rclaptg=
|
||||
rclbes=
|
||||
filename=
|
||||
keywords=
|
||||
abstract=
|
||||
|
||||
[aliases]
|
||||
##########################
|
||||
|
|
|
@ -29,4 +29,4 @@ localfields = rclaptg=gnuinfo
|
|||
mhmboxquirks = tbird
|
||||
|
||||
[/home/dockes/projets/fulltext/testrecoll/cjk]
|
||||
localfields= keyword = ckjtsthuniique: blabla= "some string"
|
||||
localfields= ; keyword = ckjtsthuniique; blabla= "some string"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue