diff --git a/src/filters/rclimg b/src/filters/rclimg index 468ba84f..e6c4bedc 100755 --- a/src/filters/rclimg +++ b/src/filters/rclimg @@ -50,7 +50,7 @@ $headAndBody = 1; sub xapianTag { my $imgtag = shift; while ( ( $tagre, $xapiantag) = each %{$tagMap} ) { - return $xapiantag if $imgtag =~ /$tagre/i; + return $xapiantag if $imgtag =~ /^$tagre$/i; } return undef; } diff --git a/src/filters/rclpdf b/src/filters/rclpdf index 8acd5f96..fa81cc06 100755 --- a/src/filters/rclpdf +++ b/src/filters/rclpdf @@ -132,7 +132,13 @@ awk 'BEGIN'\ gsub(/>/, "\\>", mid) mid = "" mid "" $0 = part1 mid part2 - } + } + + # Recoll treats "Subject" as a "title" element (based on emails). The PDF + # "Subject" metadata field is more like an HTML "description" + if(doescape == 0 && $0 ~ /"){ # Begin of body text. need to escape some chars from now on as diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index f7b8a20b..b8cf2293 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -47,8 +47,10 @@ #include "rclinit.h" #include "execmd.h" -// When using extended attributes, we have to use the ctime. -// This is quite an expensive price to pay... + +// When using extended attributes, we have to use the ctime, because +// this is all that gets set when the attributes are modified. This +// is quite an expensive price to pay... #ifdef RCL_USE_XATTR #define RCL_STTIME st_ctime #else @@ -420,22 +422,21 @@ void FsIndexer::localfieldsfromconf() vector nmlst = attrs.getNames(cstr_null); for (vector::const_iterator it = nmlst.begin(); it != nmlst.end(); it++) { - attrs.get(*it, m_localfields[*it]); + string nm = m_config->fieldCanon(*it); + attrs.get(*it, m_localfields[nm]); + LOGDEB2(("FsIndexer::localfieldsfromconf: [%s]->[%s]\n", + nm.c_str(), m_localfields[nm].c_str())); } } - -// void FsIndexer::setlocalfields(const map& fields, Rcl::Doc& doc) { for (map::const_iterator it = fields.begin(); it != fields.end(); it++) { - // Should local fields override those coming from the document - // ? I think not, but not too sure. We could also chose to - // concatenate the values ? - if (doc.meta.find(it->second) == doc.meta.end()) { - doc.meta[it->first] = it->second; - } + // Being chosen by the user, localfields override values from + // the filter. The key is already canonic (see + // localfieldsfromconf()) + doc.meta[it->first] = it->second; } } @@ -484,7 +485,9 @@ void FsIndexer::reapmetadata(const vector& reapers, const string& fn, } string output; if (ExecCmd::backtick(cmd, output)) { - doc.meta[rp->fieldname] += string(" ") + output; + // addmeta() creates or appends. fieldname is already + // canonic (see above) + doc.addmeta(rp->fieldname, output); } } } @@ -534,7 +537,7 @@ void *FsIndexerInternfileWorker(void * fsp) TempDir tmpdir; RclConfig myconf(*(fip->m_stableconfig)); - InternfileTask *tsk; + InternfileTask *tsk = 0; for (;;) { if (!tqp->take(&tsk)) { tqp->workerExit(); diff --git a/src/index/rclmonrcv.cpp b/src/index/rclmonrcv.cpp index 89e837da..d961b600 100644 --- a/src/index/rclmonrcv.cpp +++ b/src/index/rclmonrcv.cpp @@ -567,22 +567,22 @@ const char *RclIntf::event_name(int code) code &= ~(IN_ISDIR|IN_ONESHOT); switch (code) { case IN_ACCESS: return "IN_ACCESS"; - case IN_MODIFY: return "IN_MODIFY"; case IN_ATTRIB: return "IN_ATTRIB"; - case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE"; - case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE"; case IN_CLOSE: return "IN_CLOSE"; - case IN_OPEN: return "IN_OPEN"; - case IN_MOVED_FROM: return "IN_MOVED_FROM"; - case IN_MOVED_TO: return "IN_MOVED_TO"; - case IN_MOVE: return "IN_MOVE"; + case IN_CLOSE_NOWRITE: return "IN_CLOSE_NOWRITE"; + case IN_CLOSE_WRITE: return "IN_CLOSE_WRITE"; case IN_CREATE: return "IN_CREATE"; case IN_DELETE: return "IN_DELETE"; case IN_DELETE_SELF: return "IN_DELETE_SELF"; - case IN_MOVE_SELF: return "IN_MOVE_SELF"; - case IN_UNMOUNT: return "IN_UNMOUNT"; - case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW"; case IN_IGNORED: return "IN_IGNORED"; + case IN_MODIFY: return "IN_MODIFY"; + case IN_MOVE: return "IN_MOVE"; + case IN_MOVED_FROM: return "IN_MOVED_FROM"; + case IN_MOVED_TO: return "IN_MOVED_TO"; + case IN_MOVE_SELF: return "IN_MOVE_SELF"; + case IN_OPEN: return "IN_OPEN"; + case IN_Q_OVERFLOW: return "IN_Q_OVERFLOW"; + case IN_UNMOUNT: return "IN_UNMOUNT"; default: { static char msg[50]; sprintf(msg, "Unknown event 0x%x", code); @@ -599,6 +599,12 @@ bool RclIntf::addWatch(const string& path, bool) // CLOSE_WRITE is covered through MODIFY. CREATE is needed for mkdirs uint32_t mask = IN_MODIFY | IN_CREATE | IN_MOVED_FROM | IN_MOVED_TO | IN_DELETE +#ifdef RCL_USE_XATTR + // It seems that IN_ATTRIB is not needed to receive extattr + // modification events, which is a bit weird because only ctime is + // set. + // | IN_ATTRIB +#endif #ifdef IN_DONT_FOLLOW | IN_DONT_FOLLOW #endif @@ -692,7 +698,7 @@ bool RclIntf::getEvent(RclMonEvent& ev, int msecs) eraseWatchSubTree(m_idtopath, ev.m_path); } - + // IN_ATTRIB apparently not needed, see comment above if (evp->mask & (IN_MODIFY)) { ev.m_etyp = RclMonEvent::RCLEVT_MODIFY; } else if (evp->mask & (IN_DELETE | IN_MOVED_FROM)) { diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 5500c42a..ddfb29db 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -80,6 +80,7 @@ static string colon_restore(const string& in) #ifdef RCL_USE_XATTR void FileInterner::reapXAttrs(const string& path) { + LOGDEB2(("FileInterner::reapXAttrs: [%s]\n", path.c_str())); vector xnames; if (!pxattr::list(path, &xnames)) { LOGERR(("FileInterner::reapXattrs: pxattr::list: errno %d\n", errno)); @@ -98,6 +99,8 @@ void FileInterner::reapXAttrs(const string& path) } // Encode should we ? m_XAttrsFields[mit->second] = value; + LOGDEB2(("FileInterner::reapXAttrs: got [%s] -> [%s]\n", + mit->second.c_str(), value.c_str())); } } } @@ -626,7 +629,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) it->first == cstr_dj_keycharset) { // don't need/want these. } else { - doc.meta[it->first] = it->second; + doc.addmeta(m_cfg->fieldCanon(it->first), it->second); } } if (doc.meta[Rcl::Doc::keyabs].empty() && @@ -659,10 +662,12 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const #ifdef RCL_USE_XATTR // Set fields from extended file attributes. - // These can be overriden by values from inside the file + // These can be later augmented by values from inside the file for (map::const_iterator it = m_XAttrsFields.begin(); it != m_XAttrsFields.end(); it++) { - doc.meta[it->first] = it->second; + LOGDEB1(("Internfile:: setting [%s] from xattrs value [%s]\n", + m_cfg->fieldCanon(it->first).c_str(), it->second.c_str())); + doc.meta[m_cfg->fieldCanon(it->first)] = it->second; } #endif //RCL_USE_XATTR diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 68f6a81f..1dc92c67 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -232,19 +232,22 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, ConfSimple parms(data); if (!parms.ok()) return false; + + // Special cases: parms.get(Doc::keyurl, doc.url); parms.get(Doc::keytp, doc.mimetype); parms.get(Doc::keyfmt, doc.fmtime); parms.get(Doc::keydmt, doc.dmtime); parms.get(Doc::keyoc, doc.origcharset); parms.get(cstr_caption, doc.meta[Doc::keytt]); - parms.get(Doc::keykw, doc.meta[Doc::keykw]); + parms.get(Doc::keyabs, doc.meta[Doc::keyabs]); // Possibly remove synthetic abstract indicator (if it's there, we // used to index the beginning of the text as abstract). doc.syntabs = false; if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) { - doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length()); + doc.meta[Doc::keyabs] = + doc.meta[Doc::keyabs].substr(cstr_syntAbs.length()); doc.syntabs = true; } parms.get(Doc::keyipt, doc.ipath); @@ -254,7 +257,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, parms.get(Doc::keysig, doc.sig); doc.xdocid = docid; - // Other, not predefined meta fields: + // Normal key/value pairs: vector keys = parms.getNames(string()); for (vector::const_iterator it = keys.begin(); it != keys.end(); it++) { @@ -1073,8 +1076,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) trimstring(doc.meta[Doc::keykw], " \t\r\n"); doc.meta[Doc::keykw] = neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc); - if (!doc.meta[Doc::keykw].empty()) - RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]); + // No need to explicitly append the keywords, this will be done by + // the "stored" loop // If abstract is empty, we make up one with the beginning of the // document. This is then not indexed, but part of the doc data so @@ -1094,16 +1097,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen), cstr_nc); } - if (!doc.meta[Doc::keyabs].empty()) - RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]); const set& stored = m_config->getStoredFields(); for (set::const_iterator it = stored.begin(); it != stored.end(); it++) { string nm = m_config->fieldCanon(*it); - if (!doc.meta[*it].empty()) { + if (!doc.meta[nm].empty()) { string value = - neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc); + neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc); RECORD_APPEND(record, nm, value); } } diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index 42744d80..a929298b 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -170,13 +170,29 @@ class Doc { } } + // Create entry or append text to existing entry. + bool addmeta(const string& nm, const string& value) + { + map::iterator mit = meta.find(nm); + if (mit == meta.end()) { + meta[nm] = value; + } else if (mit->second.empty()) { + mit->second = value; + } else { + mit->second += string(" - ") + value; + } + return true; + } + void dump(bool dotext=false) const; // The official names for recoll native fields when used in a text // context (ie: the python interface duplicates some of the fixed // fields in the meta array, these are the names used). Defined in - // rcldoc.cpp. For fields stored in the meta[] array (ie, title, - // author), filters _must_ use these values + // rcldoc.cpp. Fields stored in the meta[] array (ie, title, + // author), _must_ use these canonical values, not aliases. This is + // enforced in internfile.cpp and misc other bits of metadata-gathering + // code static const string keyurl; // url static const string keyfn; // file name static const string keyipt; // ipath diff --git a/src/sampleconf/fields b/src/sampleconf/fields index 6ed59e32..fd1a319d 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -64,15 +64,12 @@ recipient = XTO [stored] ############################ # Some fields are stored in the document data record inside the index and -# can be returned in result lists. There is no necessity that stored fields -# should be indexed (have a prefix in the preceding section) (example: -# "url", but this one doesn't need to be listed here, it's stored by hard -# code) +# can be displayed in result lists. There is no necessity that stored fields +# should be indexed (have a prefix in the preceding section). Example: "url" # # Some fields are stored by default, don't add them here: -# caption, keywords, abstract, mimetype, url +# caption, mimetype, url # Only canonical names should be used here, not aliases. -# "author" used to be stored by default, now set here as optional # "rclaptg" is used for viewer specialization (depending on local config) # "rclbes" defines the backend type (ie normal fs, firefox cache). Should # probably be hardcoded, don't remove it @@ -81,6 +78,8 @@ recipient= rclaptg= rclbes= filename= +keywords= +abstract= [aliases] ########################## diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf index f7708c35..d2885db6 100644 --- a/tests/config/recoll.conf +++ b/tests/config/recoll.conf @@ -29,4 +29,4 @@ localfields = rclaptg=gnuinfo mhmboxquirks = tbird [/home/dockes/projets/fulltext/testrecoll/cjk] -localfields= keyword = ckjtsthuniique: blabla= "some string" +localfields= ; keyword = ckjtsthuniique; blabla= "some string"