cleaned up processing of metadata from diverse origins (doc,extattrs,localfields)
This commit is contained in:
parent
89996fde05
commit
6423c3a91b
9 changed files with 83 additions and 47 deletions
|
@ -50,7 +50,7 @@ $headAndBody = 1;
|
||||||
sub xapianTag {
|
sub xapianTag {
|
||||||
my $imgtag = shift;
|
my $imgtag = shift;
|
||||||
while ( ( $tagre, $xapiantag) = each %{$tagMap} ) {
|
while ( ( $tagre, $xapiantag) = each %{$tagMap} ) {
|
||||||
return $xapiantag if $imgtag =~ /$tagre/i;
|
return $xapiantag if $imgtag =~ /^$tagre$/i;
|
||||||
}
|
}
|
||||||
return undef;
|
return undef;
|
||||||
}
|
}
|
||||||
|
|
|
@ -134,6 +134,12 @@ awk 'BEGIN'\
|
||||||
$0 = part1 mid part2
|
$0 = part1 mid part2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Recoll treats "Subject" as a "title" element (based on emails). The PDF
|
||||||
|
# "Subject" metadata field is more like an HTML "description"
|
||||||
|
if(doescape == 0 && $0 ~ /<meta ?name="Subject"/){
|
||||||
|
gsub(/="Subject"/, "=\"Description\"", $0)
|
||||||
|
}
|
||||||
|
|
||||||
if ($0 == "<pre>"){
|
if ($0 == "<pre>"){
|
||||||
# Beginning of body text. Need to escape some chars from now on as
|
# Beginning of body text. Need to escape some chars from now on as
|
||||||
# pdftotext sometimes doesn't do it
|
# pdftotext sometimes doesn't do it
|
||||||
|
|
|
@ -47,8 +47,10 @@
|
||||||
#include "rclinit.h"
|
#include "rclinit.h"
|
||||||
#include "execmd.h"
|
#include "execmd.h"
|
||||||
|
|
||||||
// When using extended attributes, we have to use the ctime.
|
|
||||||
// This is quite an expensive price to pay...
|
// When using extended attributes, we have to use the ctime, because
|
||||||
|
// this is all that gets set when the attributes are modified. This
|
||||||
|
// is quite an expensive price to pay...
|
||||||
#ifdef RCL_USE_XATTR
|
#ifdef RCL_USE_XATTR
|
||||||
#define RCL_STTIME st_ctime
|
#define RCL_STTIME st_ctime
|
||||||
#else
|
#else
|
||||||
|
@ -420,23 +422,22 @@ void FsIndexer::localfieldsfromconf()
|
||||||
vector<string> nmlst = attrs.getNames(cstr_null);
|
vector<string> nmlst = attrs.getNames(cstr_null);
|
||||||
for (vector<string>::const_iterator it = nmlst.begin();
|
for (vector<string>::const_iterator it = nmlst.begin();
|
||||||
it != nmlst.end(); it++) {
|
it != nmlst.end(); it++) {
|
||||||
attrs.get(*it, m_localfields[*it]);
|
string nm = m_config->fieldCanon(*it);
|
||||||
|
attrs.get(*it, m_localfields[nm]);
|
||||||
|
LOGDEB2(("FsIndexer::localfieldsfromconf: [%s]->[%s]\n",
|
||||||
|
nm.c_str(), m_localfields[nm].c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//
|
|
||||||
void FsIndexer::setlocalfields(const map<string, string>& fields, Rcl::Doc& doc)
|
void FsIndexer::setlocalfields(const map<string, string>& fields, Rcl::Doc& doc)
|
||||||
{
|
{
|
||||||
for (map<string, string>::const_iterator it = fields.begin();
|
for (map<string, string>::const_iterator it = fields.begin();
|
||||||
it != fields.end(); it++) {
|
it != fields.end(); it++) {
|
||||||
// Should local fields override those coming from the document
|
// Being chosen by the user, localfields override values from
|
||||||
// ? I think not, but not too sure. We could also choose to
|
// the filter. The key is already canonic (see
|
||||||
// concatenate the values ?
|
// localfieldsfromconf())
|
||||||
if (doc.meta.find(it->second) == doc.meta.end()) {
|
|
||||||
doc.meta[it->first] = it->second;
|
doc.meta[it->first] = it->second;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metadata gathering commands
|
// Metadata gathering commands
|
||||||
|
@ -484,7 +485,9 @@ void FsIndexer::reapmetadata(const vector<MDReaper>& reapers, const string& fn,
|
||||||
}
|
}
|
||||||
string output;
|
string output;
|
||||||
if (ExecCmd::backtick(cmd, output)) {
|
if (ExecCmd::backtick(cmd, output)) {
|
||||||
doc.meta[rp->fieldname] += string(" ") + output;
|
// addmeta() creates or appends. fieldname is already
|
||||||
|
// canonic (see above)
|
||||||
|
doc.addmeta(rp->fieldname, output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -534,7 +537,7 @@ void *FsIndexerInternfileWorker(void * fsp)
|
||||||
TempDir tmpdir;
|
TempDir tmpdir;
|
||||||
RclConfig myconf(*(fip->m_stableconfig));
|
RclConfig myconf(*(fip->m_stableconfig));
|
||||||
|
|
||||||
InternfileTask *tsk;
|
InternfileTask *tsk = 0;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
if (!tqp->take(&tsk)) {
|
if (!tqp->take(&tsk)) {
|
||||||
tqp->workerExit();
|
tqp->workerExit();
|
||||||
|
|
|
@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
|
||||||
code &= ~(IN_ISDIR|IN_ONESHOT);
|
code &= ~(IN_ISDIR|IN_ONESHOT);
|
||||||
switch (code) {
|
switch (code) {
|
||||||
case IN_ACCESS: return "IN_ACCESS";
|
case IN_ACCESS: return "IN_ACCESS";
|
||||||
case IN_MODIFY: return "IN_MODIFY";
|
|
||||||
case IN_ATTRIB: return "IN_ATTRIB";
|
case IN_ATTRIB: return "IN_ATTRIB";
|
||||||
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
|
||||||
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
|
||||||
case IN_CLOSE: return "IN_CLOSE";
|
case IN_CLOSE: return "IN_CLOSE";
|
||||||
case IN_OPEN: return "IN_OPEN";
|
case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
|
||||||
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
|
||||||
case IN_MOVED_TO: return "IN_MOVED_TO";
|
|
||||||
case IN_MOVE: return "IN_MOVE";
|
|
||||||
case IN_CREATE: return "IN_CREATE";
|
case IN_CREATE: return "IN_CREATE";
|
||||||
case IN_DELETE: return "IN_DELETE";
|
case IN_DELETE: return "IN_DELETE";
|
||||||
case IN_DELETE_SELF: return "IN_DELETE_SELF";
|
case IN_DELETE_SELF: return "IN_DELETE_SELF";
|
||||||
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
|
||||||
case IN_UNMOUNT: return "IN_UNMOUNT";
|
|
||||||
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
|
||||||
case IN_IGNORED: return "IN_IGNORED";
|
case IN_IGNORED: return "IN_IGNORED";
|
||||||
|
case IN_MODIFY: return "IN_MODIFY";
|
||||||
|
case IN_MOVE: return "IN_MOVE";
|
||||||
|
case IN_MOVED_FROM: return "IN_MOVED_FROM";
|
||||||
|
case IN_MOVED_TO: return "IN_MOVED_TO";
|
||||||
|
case IN_MOVE_SELF: return "IN_MOVE_SELF";
|
||||||
|
case IN_OPEN: return "IN_OPEN";
|
||||||
|
case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
|
||||||
|
case IN_UNMOUNT: return "IN_UNMOUNT";
|
||||||
default: {
|
default: {
|
||||||
static char msg[50];
|
static char msg[50];
|
||||||
sprintf(msg, "Unknown event 0x%x", code);
|
sprintf(msg, "Unknown event 0x%x", code);
|
||||||
|
@ -599,6 +599,12 @@ bool RclIntf::addWatch(const string& path, bool)
|
||||||
// CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs
|
// CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs
|
||||||
uint32_t mask = IN_MODIFY | IN_CREATE
|
uint32_t mask = IN_MODIFY | IN_CREATE
|
||||||
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
|
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
|
||||||
|
#ifdef RCL_USE_XATTR
|
||||||
|
// It seems that IN_ATTRIB is not needed to receive extattr
|
||||||
|
// modification events, which is a bit weird because only ctime is
|
||||||
|
// set.
|
||||||
|
// | IN_ATTRIB
|
||||||
|
#endif
|
||||||
#ifdef IN_DONT_FOLLOW
|
#ifdef IN_DONT_FOLLOW
|
||||||
| IN_DONT_FOLLOW
|
| IN_DONT_FOLLOW
|
||||||
#endif
|
#endif
|
||||||
|
@ -692,7 +698,7 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
|
||||||
eraseWatchSubTree(m_idtopath, ev.m_path);
|
eraseWatchSubTree(m_idtopath, ev.m_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IN_ATTRIB apparently not needed, see comment above
|
||||||
if (evp->mask & (IN_MODIFY)) {
|
if (evp->mask & (IN_MODIFY)) {
|
||||||
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
|
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
|
||||||
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
|
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
|
||||||
|
|
|
@ -80,6 +80,7 @@ static string colon_restore(const string& in)
|
||||||
#ifdef RCL_USE_XATTR
|
#ifdef RCL_USE_XATTR
|
||||||
void FileInterner::reapXAttrs(const string& path)
|
void FileInterner::reapXAttrs(const string& path)
|
||||||
{
|
{
|
||||||
|
LOGDEB2(("FileInterner::reapXAttrs: [%s]\n", path.c_str()));
|
||||||
vector<string> xnames;
|
vector<string> xnames;
|
||||||
if (!pxattr::list(path, &xnames)) {
|
if (!pxattr::list(path, &xnames)) {
|
||||||
LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno));
|
LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno));
|
||||||
|
@ -98,6 +99,8 @@ void FileInterner::reapXAttrs(const string& path)
|
||||||
}
|
}
|
||||||
// Should we encode the value?
|
// Should we encode the value?
|
||||||
m_XAttrsFields[mit->second] = value;
|
m_XAttrsFields[mit->second] = value;
|
||||||
|
LOGDEB2(("FileInterner::reapXAttrs: got [%s] -> [%s]\n",
|
||||||
|
mit->second.c_str(), value.c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -626,7 +629,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
||||||
it->first == cstr_dj_keycharset) {
|
it->first == cstr_dj_keycharset) {
|
||||||
// don't need/want these.
|
// don't need/want these.
|
||||||
} else {
|
} else {
|
||||||
doc.meta[it->first] = it->second;
|
doc.addmeta(m_cfg->fieldCanon(it->first), it->second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (doc.meta[Rcl::Doc::keyabs].empty() &&
|
if (doc.meta[Rcl::Doc::keyabs].empty() &&
|
||||||
|
@ -659,10 +662,12 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
|
||||||
|
|
||||||
#ifdef RCL_USE_XATTR
|
#ifdef RCL_USE_XATTR
|
||||||
// Set fields from extended file attributes.
|
// Set fields from extended file attributes.
|
||||||
// These can be overridden by values from inside the file
|
// These can be later augmented by values from inside the file
|
||||||
for (map<string,string>::const_iterator it = m_XAttrsFields.begin();
|
for (map<string,string>::const_iterator it = m_XAttrsFields.begin();
|
||||||
it != m_XAttrsFields.end(); it++) {
|
it != m_XAttrsFields.end(); it++) {
|
||||||
doc.meta[it->first] = it->second;
|
LOGDEB1(("Internfile:: setting [%s] from xattrs value [%s]\n",
|
||||||
|
m_cfg->fieldCanon(it->first).c_str(), it->second.c_str()));
|
||||||
|
doc.meta[m_cfg->fieldCanon(it->first)] = it->second;
|
||||||
}
|
}
|
||||||
#endif //RCL_USE_XATTR
|
#endif //RCL_USE_XATTR
|
||||||
|
|
||||||
|
|
|
@ -232,19 +232,22 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
||||||
ConfSimple parms(data);
|
ConfSimple parms(data);
|
||||||
if (!parms.ok())
|
if (!parms.ok())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
// Special cases:
|
||||||
parms.get(Doc::keyurl, doc.url);
|
parms.get(Doc::keyurl, doc.url);
|
||||||
parms.get(Doc::keytp, doc.mimetype);
|
parms.get(Doc::keytp, doc.mimetype);
|
||||||
parms.get(Doc::keyfmt, doc.fmtime);
|
parms.get(Doc::keyfmt, doc.fmtime);
|
||||||
parms.get(Doc::keydmt, doc.dmtime);
|
parms.get(Doc::keydmt, doc.dmtime);
|
||||||
parms.get(Doc::keyoc, doc.origcharset);
|
parms.get(Doc::keyoc, doc.origcharset);
|
||||||
parms.get(cstr_caption, doc.meta[Doc::keytt]);
|
parms.get(cstr_caption, doc.meta[Doc::keytt]);
|
||||||
parms.get(Doc::keykw, doc.meta[Doc::keykw]);
|
|
||||||
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
|
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
|
||||||
// Possibly remove synthetic abstract indicator (if it's there, we
|
// Possibly remove synthetic abstract indicator (if it's there, we
|
||||||
// used to index the beginning of the text as abstract).
|
// used to index the beginning of the text as abstract).
|
||||||
doc.syntabs = false;
|
doc.syntabs = false;
|
||||||
if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
|
if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
|
||||||
doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
|
doc.meta[Doc::keyabs] =
|
||||||
|
doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
|
||||||
doc.syntabs = true;
|
doc.syntabs = true;
|
||||||
}
|
}
|
||||||
parms.get(Doc::keyipt, doc.ipath);
|
parms.get(Doc::keyipt, doc.ipath);
|
||||||
|
@ -254,7 +257,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
||||||
parms.get(Doc::keysig, doc.sig);
|
parms.get(Doc::keysig, doc.sig);
|
||||||
doc.xdocid = docid;
|
doc.xdocid = docid;
|
||||||
|
|
||||||
// Other, not predefined meta fields:
|
// Normal key/value pairs:
|
||||||
vector<string> keys = parms.getNames(string());
|
vector<string> keys = parms.getNames(string());
|
||||||
for (vector<string>::const_iterator it = keys.begin();
|
for (vector<string>::const_iterator it = keys.begin();
|
||||||
it != keys.end(); it++) {
|
it != keys.end(); it++) {
|
||||||
|
@ -1073,8 +1076,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||||
trimstring(doc.meta[Doc::keykw], " \t\r\n");
|
trimstring(doc.meta[Doc::keykw], " \t\r\n");
|
||||||
doc.meta[Doc::keykw] =
|
doc.meta[Doc::keykw] =
|
||||||
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
|
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
|
||||||
if (!doc.meta[Doc::keykw].empty())
|
// No need to explicitly append the keywords, this will be done by
|
||||||
RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
|
// the "stored" loop
|
||||||
|
|
||||||
// If abstract is empty, we make up one with the beginning of the
|
// If abstract is empty, we make up one with the beginning of the
|
||||||
// document. This is then not indexed, but part of the doc data so
|
// document. This is then not indexed, but part of the doc data so
|
||||||
|
@ -1094,16 +1097,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||||
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
||||||
cstr_nc);
|
cstr_nc);
|
||||||
}
|
}
|
||||||
if (!doc.meta[Doc::keyabs].empty())
|
|
||||||
RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
|
|
||||||
|
|
||||||
const set<string>& stored = m_config->getStoredFields();
|
const set<string>& stored = m_config->getStoredFields();
|
||||||
for (set<string>::const_iterator it = stored.begin();
|
for (set<string>::const_iterator it = stored.begin();
|
||||||
it != stored.end(); it++) {
|
it != stored.end(); it++) {
|
||||||
string nm = m_config->fieldCanon(*it);
|
string nm = m_config->fieldCanon(*it);
|
||||||
if (!doc.meta[*it].empty()) {
|
if (!doc.meta[nm].empty()) {
|
||||||
string value =
|
string value =
|
||||||
neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
|
neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
|
||||||
RECORD_APPEND(record, nm, value);
|
RECORD_APPEND(record, nm, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -170,13 +170,29 @@ class Doc {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create entry or append text to existing entry.
|
||||||
|
bool addmeta(const string& nm, const string& value)
|
||||||
|
{
|
||||||
|
map<string,string>::iterator mit = meta.find(nm);
|
||||||
|
if (mit == meta.end()) {
|
||||||
|
meta[nm] = value;
|
||||||
|
} else if (mit->second.empty()) {
|
||||||
|
mit->second = value;
|
||||||
|
} else {
|
||||||
|
mit->second += string(" - ") + value;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void dump(bool dotext=false) const;
|
void dump(bool dotext=false) const;
|
||||||
|
|
||||||
// The official names for recoll native fields when used in a text
|
// The official names for recoll native fields when used in a text
|
||||||
// context (ie: the python interface duplicates some of the fixed
|
// context (ie: the python interface duplicates some of the fixed
|
||||||
// fields in the meta array, these are the names used). Defined in
|
// fields in the meta array, these are the names used). Defined in
|
||||||
// rcldoc.cpp. For fields stored in the meta[] array (ie, title,
|
// rcldoc.cpp. Fields stored in the meta[] array (ie, title,
|
||||||
// author), filters _must_ use these values
|
// author), _must_ use these canonical values, not aliases. This is
|
||||||
|
// enforced in internfile.cpp and misc other bits of metadata-gathering
|
||||||
|
// code
|
||||||
static const string keyurl; // url
|
static const string keyurl; // url
|
||||||
static const string keyfn; // file name
|
static const string keyfn; // file name
|
||||||
static const string keyipt; // ipath
|
static const string keyipt; // ipath
|
||||||
|
|
|
@ -64,15 +64,12 @@ recipient = XTO
|
||||||
[stored]
|
[stored]
|
||||||
############################
|
############################
|
||||||
# Some fields are stored in the document data record inside the index and
|
# Some fields are stored in the document data record inside the index and
|
||||||
# can be returned in result lists. There is no necessity that stored fields
|
# can be displayed in result lists. There is no necessity that stored fields
|
||||||
# should be indexed (have a prefix in the preceding section) (example:
|
# should be indexed (have a prefix in the preceding section). Example: "url"
|
||||||
# "url", but this one doesn't need to be listed here, it's stored by hard
|
|
||||||
# code)
|
|
||||||
#
|
#
|
||||||
# Some fields are stored by default, don't add them here:
|
# Some fields are stored by default, don't add them here:
|
||||||
# caption, keywords, abstract, mimetype, url
|
# caption, mimetype, url
|
||||||
# Only canonical names should be used here, not aliases.
|
# Only canonical names should be used here, not aliases.
|
||||||
# "author" used to be stored by default, now set here as optional
|
|
||||||
# "rclaptg" is used for viewer specialization (depending on local config)
|
# "rclaptg" is used for viewer specialization (depending on local config)
|
||||||
# "rclbes" defines the backend type (ie normal fs, firefox cache). Should
|
# "rclbes" defines the backend type (ie normal fs, firefox cache). Should
|
||||||
# probably be hardcoded, don't remove it
|
# probably be hardcoded, don't remove it
|
||||||
|
@ -81,6 +78,8 @@ recipient=
|
||||||
rclaptg=
|
rclaptg=
|
||||||
rclbes=
|
rclbes=
|
||||||
filename=
|
filename=
|
||||||
|
keywords=
|
||||||
|
abstract=
|
||||||
|
|
||||||
[aliases]
|
[aliases]
|
||||||
##########################
|
##########################
|
||||||
|
|
|
@ -29,4 +29,4 @@ localfields = rclaptg=gnuinfo
|
||||||
mhmboxquirks = tbird
|
mhmboxquirks = tbird
|
||||||
|
|
||||||
[/home/dockes/projets/fulltext/testrecoll/cjk]
|
[/home/dockes/projets/fulltext/testrecoll/cjk]
|
||||||
localfields= keyword = ckjtsthuniique: blabla= "some string"
|
localfields= ; keyword = ckjtsthuniique; blabla= "some string"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue