diff --git a/src/filters/rclimg b/src/filters/rclimg
index 468ba84f..e6c4bedc 100755
--- a/src/filters/rclimg
+++ b/src/filters/rclimg
@@ -50,7 +50,7 @@ $headAndBody = 1;
sub xapianTag {
my $imgtag = shift;
- while ( ( $tagre, $xapiantag) = each %{$tagMap} ) {
- return $xapiantag if $imgtag =~ /$tagre/i;
+ foreach my $tagre (keys %{$tagMap}) { # keys, not each(): returning out of an each() loop leaves the hash iterator mid-way, so the next call would skip entries
+ return $tagMap->{$tagre} if $imgtag =~ /^$tagre$/i; # anchored: the whole tag name must match
}
return undef;
}
diff --git a/src/filters/rclpdf b/src/filters/rclpdf
index 8acd5f96..fa81cc06 100755
--- a/src/filters/rclpdf
+++ b/src/filters/rclpdf
@@ -132,7 +132,13 @@ awk 'BEGIN'\
gsub(/>/, "\\>", mid)
mid = "
" mid ""
$0 = part1 mid part2
- }
+ }
+
+ # Recoll treats "Subject" as a "title" element (based on emails). The PDF
+ # "Subject" metadata field is more like an HTML "description"
+ if(doescape == 0 && $0 ~ /"){
# Begin of body text. need to escape some chars from now on as
diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp
index f7b8a20b..b8cf2293 100644
--- a/src/index/fsindexer.cpp
+++ b/src/index/fsindexer.cpp
@@ -47,8 +47,10 @@
#include "rclinit.h"
#include "execmd.h"
-// When using extended attributes, we have to use the ctime.
-// This is quite an expensive price to pay...
+
+// When using extended attributes, we have to use the ctime, because
+// this is all that gets set when the attributes are modified. This
+// is quite an expensive price to pay...
#ifdef RCL_USE_XATTR
#define RCL_STTIME st_ctime
#else
@@ -420,22 +422,21 @@ void FsIndexer::localfieldsfromconf()
vector nmlst = attrs.getNames(cstr_null);
for (vector::const_iterator it = nmlst.begin();
it != nmlst.end(); it++) {
- attrs.get(*it, m_localfields[*it]);
+ string nm = m_config->fieldCanon(*it);
+ attrs.get(*it, m_localfields[nm]);
+ LOGDEB2(("FsIndexer::localfieldsfromconf: [%s]->[%s]\n",
+ nm.c_str(), m_localfields[nm].c_str()));
}
}
-
-//
void FsIndexer::setlocalfields(const map& fields, Rcl::Doc& doc)
{
for (map::const_iterator it = fields.begin();
it != fields.end(); it++) {
- // Should local fields override those coming from the document
- // ? I think not, but not too sure. We could also chose to
- // concatenate the values ?
- if (doc.meta.find(it->second) == doc.meta.end()) {
- doc.meta[it->first] = it->second;
- }
+ // Local fields are chosen by the user, so they override any value
+ // the filter set for the same key. Keys are already canonical
+ // (see localfieldsfromconf()).
+ doc.meta[it->first] = it->second;
}
}
@@ -484,7 +485,9 @@ void FsIndexer::reapmetadata(const vector& reapers, const string& fn,
}
string output;
if (ExecCmd::backtick(cmd, output)) {
- doc.meta[rp->fieldname] += string(" ") + output;
+ // addmeta() creates or appends. fieldname is already
+ // canonical (see above)
+ doc.addmeta(rp->fieldname, output);
}
}
}
@@ -534,7 +537,7 @@ void *FsIndexerInternfileWorker(void * fsp)
TempDir tmpdir;
RclConfig myconf(*(fip->m_stableconfig));
- InternfileTask *tsk;
+ InternfileTask *tsk = 0;
for (;;) {
if (!tqp->take(&tsk)) {
tqp->workerExit();
diff --git a/src/index/rclmonrcv.cpp b/src/index/rclmonrcv.cpp
index 89e837da..d961b600 100644
--- a/src/index/rclmonrcv.cpp
+++ b/src/index/rclmonrcv.cpp
@@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code)
code &= ~(IN_ISDIR|IN_ONESHOT);
switch (code) {
case IN_ACCESS: return "IN_ACCESS";
- case IN_MODIFY: return "IN_MODIFY";
case IN_ATTRIB: return "IN_ATTRIB";
- case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
- case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
case IN_CLOSE: return "IN_CLOSE";
- case IN_OPEN: return "IN_OPEN";
- case IN_MOVED_FROM: return "IN_MOVED_FROM";
- case IN_MOVED_TO: return "IN_MOVED_TO";
- case IN_MOVE: return "IN_MOVE";
+ case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE";
+ case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE";
case IN_CREATE: return "IN_CREATE";
case IN_DELETE: return "IN_DELETE";
case IN_DELETE_SELF: return "IN_DELETE_SELF";
- case IN_MOVE_SELF: return "IN_MOVE_SELF";
- case IN_UNMOUNT: return "IN_UNMOUNT";
- case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
case IN_IGNORED: return "IN_IGNORED";
+ case IN_MODIFY: return "IN_MODIFY";
+ case IN_MOVE: return "IN_MOVE";
+ case IN_MOVED_FROM: return "IN_MOVED_FROM";
+ case IN_MOVED_TO: return "IN_MOVED_TO";
+ case IN_MOVE_SELF: return "IN_MOVE_SELF";
+ case IN_OPEN: return "IN_OPEN";
+ case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW";
+ case IN_UNMOUNT: return "IN_UNMOUNT";
default: {
static char msg[50];
sprintf(msg, "Unknown event 0x%x", code);
@@ -599,6 +599,12 @@ bool RclIntf::addWatch(const string& path, bool)
// CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs
uint32_t mask = IN_MODIFY | IN_CREATE
| IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE
+#ifdef RCL_USE_XATTR
+ // It seems that IN_ATTRIB is not needed to receive extattr
+ // modification events, which is a bit weird because only ctime is
+ // set.
+ // | IN_ATTRIB
+#endif
#ifdef IN_DONT_FOLLOW
| IN_DONT_FOLLOW
#endif
@@ -692,7 +698,7 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs)
eraseWatchSubTree(m_idtopath, ev.m_path);
}
-
+ // IN_ATTRIB apparently not needed, see comment above
if (evp->mask & (IN_MODIFY)) {
ev.m_etyp = RclMonEvent::RCLEVT_MODIFY;
} else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) {
diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp
index 5500c42a..ddfb29db 100644
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@@ -80,6 +80,7 @@ static string colon_restore(const string& in)
#ifdef RCL_USE_XATTR
void FileInterner::reapXAttrs(const string& path)
{
+ LOGDEB2(("FileInterner::reapXAttrs: [%s]\n", path.c_str()));
vector xnames;
if (!pxattr::list(path, &xnames)) {
LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno));
@@ -98,6 +99,8 @@ void FileInterner::reapXAttrs(const string& path)
}
// Encode should we ?
m_XAttrsFields[mit->second] = value;
+ LOGDEB2(("FileInterner::reapXAttrs: got [%s] -> [%s]\n",
+ mit->second.c_str(), value.c_str()));
}
}
}
@@ -626,7 +629,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
it->first == cstr_dj_keycharset) {
// don't need/want these.
} else {
- doc.meta[it->first] = it->second;
+ doc.addmeta(m_cfg->fieldCanon(it->first), it->second);
}
}
if (doc.meta[Rcl::Doc::keyabs].empty() &&
@@ -659,10 +662,12 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
#ifdef RCL_USE_XATTR
// Set fields from extended file attributes.
- // These can be overriden by values from inside the file
+ // These can be later augmented by values from inside the file
for (map::const_iterator it = m_XAttrsFields.begin();
it != m_XAttrsFields.end(); it++) {
- doc.meta[it->first] = it->second;
+ LOGDEB1(("Internfile:: setting [%s] from xattrs value [%s]\n",
+ m_cfg->fieldCanon(it->first).c_str(), it->second.c_str()));
+ doc.meta[m_cfg->fieldCanon(it->first)] = it->second;
}
#endif //RCL_USE_XATTR
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 68f6a81f..1dc92c67 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -232,19 +232,22 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
ConfSimple parms(data);
if (!parms.ok())
return false;
+
+ // Special cases:
parms.get(Doc::keyurl, doc.url);
parms.get(Doc::keytp, doc.mimetype);
parms.get(Doc::keyfmt, doc.fmtime);
parms.get(Doc::keydmt, doc.dmtime);
parms.get(Doc::keyoc, doc.origcharset);
parms.get(cstr_caption, doc.meta[Doc::keytt]);
- parms.get(Doc::keykw, doc.meta[Doc::keykw]);
+
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
// Possibly remove synthetic abstract indicator (if it's there, we
// used to index the beginning of the text as abstract).
doc.syntabs = false;
if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
- doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
+ doc.meta[Doc::keyabs] =
+ doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
doc.syntabs = true;
}
parms.get(Doc::keyipt, doc.ipath);
@@ -254,7 +257,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
parms.get(Doc::keysig, doc.sig);
doc.xdocid = docid;
- // Other, not predefined meta fields:
+ // Normal key/value pairs:
vector keys = parms.getNames(string());
for (vector::const_iterator it = keys.begin();
it != keys.end(); it++) {
@@ -1073,8 +1076,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
trimstring(doc.meta[Doc::keykw], " \t\r\n");
doc.meta[Doc::keykw] =
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
- if (!doc.meta[Doc::keykw].empty())
- RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
+ // No need to explicitly append the keywords, this will be done by
+ // the "stored" loop
// If abstract is empty, we make up one with the beginning of the
// document. This is then not indexed, but part of the doc data so
@@ -1094,16 +1097,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
cstr_nc);
}
- if (!doc.meta[Doc::keyabs].empty())
- RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
const set& stored = m_config->getStoredFields();
for (set::const_iterator it = stored.begin();
it != stored.end(); it++) {
string nm = m_config->fieldCanon(*it);
- if (!doc.meta[*it].empty()) {
+ if (!doc.meta[nm].empty()) {
string value =
- neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
+ neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
RECORD_APPEND(record, nm, value);
}
}
diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h
index 42744d80..a929298b 100644
--- a/src/rcldb/rcldoc.h
+++ b/src/rcldb/rcldoc.h
@@ -170,13 +170,29 @@ class Doc {
}
}
+ // Create entry or append text to existing entry, " - " separated.
+ // An empty value never appends, so no dangling separator is stored.
+ bool addmeta(const string& nm, const string& value)
+ {
+ // operator[] creates the (empty) entry when nm is not yet present
+ string& cur = meta[nm];
+ if (cur.empty()) {
+ cur = value;
+ } else if (!value.empty()) {
+ cur += string(" - ") + value;
+ }
+ return true;
+ }
+
void dump(bool dotext=false) const;
// The official names for recoll native fields when used in a text
// context (ie: the python interface duplicates some of the fixed
// fields in the meta array, these are the names used). Defined in
- // rcldoc.cpp. For fields stored in the meta[] array (ie, title,
- // author), filters _must_ use these values
+ // rcldoc.cpp. Fields stored in the meta[] array (e.g. title,
+ // author) _must_ use these canonical values, not aliases. This is
+ // enforced in internfile.cpp and miscellaneous other bits of
+ // metadata-gathering code.
static const string keyurl; // url
static const string keyfn; // file name
static const string keyipt; // ipath
diff --git a/src/sampleconf/fields b/src/sampleconf/fields
index 6ed59e32..fd1a319d 100644
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@@ -64,15 +64,12 @@ recipient = XTO
[stored]
############################
# Some fields are stored in the document data record inside the index and
-# can be returned in result lists. There is no necessity that stored fields
-# should be indexed (have a prefix in the preceding section) (example:
-# "url", but this one doesn't need to be listed here, it's stored by hard
-# code)
+# can be displayed in result lists. Stored fields do not need to be
+# indexed (have a prefix in the preceding section). Example: "url"
#
# Some fields are stored by default, don't add them here:
-# caption, keywords, abstract, mimetype, url
+# caption, mimetype, url
# Only canonical names should be used here, not aliases.
-# "author" used to be stored by default, now set here as optional
# "rclaptg" is used for viewer specialization (depending on local config)
# "rclbes" defines the backend type (ie normal fs, firefox cache). Should
# probably be hardcoded, don't remove it
@@ -81,6 +78,8 @@ recipient=
rclaptg=
rclbes=
filename=
+keywords=
+abstract=
[aliases]
##########################
diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf
index f7708c35..d2885db6 100644
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@@ -29,4 +29,4 @@ localfields = rclaptg=gnuinfo
mhmboxquirks = tbird
[/home/dockes/projets/fulltext/testrecoll/cjk]
-localfields= keyword = ckjtsthuniique: blabla= "some string"
+localfields= ; keyword = ckjtsthuniique; blabla= "some string"