factored out common charset handling code in exec and execm, cleaned up charset and textplain handling in mh_mail
This commit is contained in:
parent
8be8a00a79
commit
52804fef6c
9 changed files with 82 additions and 72 deletions
|
@ -48,6 +48,7 @@ DEF_CSTR(fbytes, "fbytes");
|
||||||
DEF_CSTR(fileu, "file://");
|
DEF_CSTR(fileu, "file://");
|
||||||
DEF_CSTR(fmtime, "fmtime");
|
DEF_CSTR(fmtime, "fmtime");
|
||||||
DEF_CSTR(iso_8859_1, "ISO-8859-1");
|
DEF_CSTR(iso_8859_1, "ISO-8859-1");
|
||||||
|
DEF_CSTR(utf8, "UTF-8");
|
||||||
DEF_CSTR(minwilds, "*?[");
|
DEF_CSTR(minwilds, "*?[");
|
||||||
DEF_CSTR(newline, "\n");
|
DEF_CSTR(newline, "\n");
|
||||||
DEF_CSTR(null, "");
|
DEF_CSTR(null, "");
|
||||||
|
|
|
@ -143,30 +143,37 @@ bool MimeHandlerExec::next_document()
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MimeHandlerExec::finaldetails()
|
void MimeHandlerExec::handle_cs(const string& mt, const string& icharset)
|
||||||
{
|
{
|
||||||
m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
|
string charset(icharset);
|
||||||
|
|
||||||
// cfgFilterOutputCharset comes from the mimeconf filter
|
// cfgFilterOutputCharset comes from the mimeconf filter
|
||||||
// definition line If the value is "default", we use the charset
|
// definition line and defaults to UTF-8 if empty. If the value is
|
||||||
// value defined in recoll.conf (which may vary depending on
|
// "default", we use the default input charset value defined in
|
||||||
// directory)
|
// recoll.conf (which may vary depending on directory)
|
||||||
string& charset = m_metaData[cstr_dj_keycharset];
|
if (charset.empty()) {
|
||||||
charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
|
charset = cfgFilterOutputCharset.empty() ? cstr_utf8 :
|
||||||
|
cfgFilterOutputCharset;
|
||||||
if (!stringlowercmp("default", charset)) {
|
if (!stringlowercmp("default", charset)) {
|
||||||
charset = m_dfltInputCharset;
|
charset = m_dfltInputCharset;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// The output mime type is html except if defined otherwise in the filter
|
m_metaData[cstr_dj_keyorigcharset] = charset;
|
||||||
// definition.
|
|
||||||
string& mt = m_metaData[cstr_dj_keymt];
|
|
||||||
mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
|
||||||
cfgFilterOutputMtype;
|
|
||||||
|
|
||||||
// If this is text/plain transcode_to/check utf-8
|
// If this is text/plain transcode_to/check utf-8
|
||||||
if (!mt.compare(cstr_textplain)) {
|
if (!mt.compare(cstr_textplain)) {
|
||||||
(void)txtdcode("mh_exec");
|
(void)txtdcode("mh_exec/m");
|
||||||
|
} else {
|
||||||
|
m_metaData[cstr_dj_keycharset] = charset;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void MimeHandlerExec::finaldetails()
|
||||||
|
{
|
||||||
|
// The default output mime type is html, but it may be defined
|
||||||
|
// otherwise in the filter definition.
|
||||||
|
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||||
|
cfgFilterOutputMtype;
|
||||||
|
|
||||||
string md5, xmd5, reason;
|
string md5, xmd5, reason;
|
||||||
if (MD5File(m_fn, md5, &reason)) {
|
if (MD5File(m_fn, md5, &reason)) {
|
||||||
|
@ -175,4 +182,6 @@ void MimeHandlerExec::finaldetails()
|
||||||
LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n",
|
LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n",
|
||||||
m_fn.c_str(), reason.c_str()));
|
m_fn.c_str(), reason.c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
handle_cs(m_metaData[cstr_dj_keymt]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -77,6 +77,13 @@ protected:
|
||||||
string m_fn;
|
string m_fn;
|
||||||
string m_ipath;
|
string m_ipath;
|
||||||
|
|
||||||
|
// Set up the character set metadata fields and possibly transcode
|
||||||
|
// text/plain output.
|
||||||
|
// @param charset when called from mh_execm, a possible explicit
|
||||||
|
// value from the filter (else the data will come from the config)
|
||||||
|
virtual void handle_cs(const string& mt, const string& charset = string());
|
||||||
|
|
||||||
|
private:
|
||||||
virtual void finaldetails();
|
virtual void finaldetails();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -120,7 +120,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
|
||||||
ibuf.c_str()));
|
ibuf.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
|
LOGDEB(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
|
||||||
if (len / 1024 > m_maxmemberkb) {
|
if (len / 1024 > m_maxmemberkb) {
|
||||||
LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
|
LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
|
||||||
return false;
|
return false;
|
||||||
|
@ -290,27 +290,15 @@ bool MimeHandlerExecMultiple::next_document()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Charset. For many document types it doesn't matter. For text
|
handle_cs(m_metaData[cstr_dj_keymt], charset);
|
||||||
// and html it does. We supply a default from the configuration.
|
|
||||||
if (charset.empty()) {
|
|
||||||
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
|
||||||
cfgFilterOutputCharset;
|
|
||||||
if (!stringlowercmp("default", charset)) {
|
|
||||||
charset = m_dfltInputCharset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
m_metaData[cstr_dj_keyorigcharset] = charset;
|
|
||||||
m_metaData[cstr_dj_keycharset] = charset;
|
|
||||||
|
|
||||||
if (!m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
|
|
||||||
(void)txtdcode("mh_execm");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (eofnext_received)
|
if (eofnext_received)
|
||||||
m_havedoc = false;
|
m_havedoc = false;
|
||||||
|
|
||||||
LOGDEB0(("MHExecMultiple: returning %d bytes of content,"
|
LOGDEB0(("MHExecMultiple: returning %d bytes of content,"
|
||||||
" mtype [%s] charset [%s]\n", m_metaData[cstr_dj_keycontent].size(),
|
" mtype [%s] charset [%s]\n",
|
||||||
m_metaData[cstr_dj_keymt].c_str(), m_metaData[cstr_dj_keycharset].c_str()));
|
m_metaData[cstr_dj_keycontent].size(),
|
||||||
|
m_metaData[cstr_dj_keymt].c_str(),
|
||||||
|
m_metaData[cstr_dj_keycharset].c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -164,7 +164,7 @@ bool MimeHandlerHtml::next_document()
|
||||||
|
|
||||||
m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
|
m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
|
||||||
m_metaData[cstr_dj_keycontent] = result.dump;
|
m_metaData[cstr_dj_keycontent] = result.dump;
|
||||||
m_metaData[cstr_dj_keycharset] = "utf-8";
|
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||||
// Avoid setting empty values which would crush ones possibly inherited
|
// Avoid setting empty values which would crush ones possibly inherited
|
||||||
// from parent (if we're an attachment)
|
// from parent (if we're an attachment)
|
||||||
if (!result.dmtime.empty())
|
if (!result.dmtime.empty())
|
||||||
|
|
|
@ -242,39 +242,27 @@ bool MimeHandlerMail::processAttach()
|
||||||
MHMailAttach *att = m_attachments[m_idx];
|
MHMailAttach *att = m_attachments[m_idx];
|
||||||
|
|
||||||
m_metaData[cstr_dj_keymt] = att->m_contentType;
|
m_metaData[cstr_dj_keymt] = att->m_contentType;
|
||||||
|
m_metaData[cstr_dj_keyorigcharset] = att->m_charset;
|
||||||
m_metaData[cstr_dj_keycharset] = att->m_charset;
|
m_metaData[cstr_dj_keycharset] = att->m_charset;
|
||||||
m_metaData[cstr_dj_keyfn] = att->m_filename;
|
m_metaData[cstr_dj_keyfn] = att->m_filename;
|
||||||
// Change the title to something helpful
|
|
||||||
m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")";
|
m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")";
|
||||||
LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n",
|
LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n",
|
||||||
att->m_contentType.c_str(),
|
att->m_contentType.c_str(),
|
||||||
att->m_charset.c_str(),
|
att->m_charset.c_str(),
|
||||||
att->m_filename.c_str()));
|
att->m_filename.c_str()));
|
||||||
|
|
||||||
|
// Erase current content and replace
|
||||||
m_metaData[cstr_dj_keycontent] = string();
|
m_metaData[cstr_dj_keycontent] = string();
|
||||||
string& body = m_metaData[cstr_dj_keycontent];
|
string& body = m_metaData[cstr_dj_keycontent];
|
||||||
att->m_part->getBody(body, 0, att->m_part->bodylength);
|
att->m_part->getBody(body, 0, att->m_part->bodylength);
|
||||||
|
{
|
||||||
string decoded;
|
string decoded;
|
||||||
const string *bdp;
|
const string *bdp;
|
||||||
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
|
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (bdp != &body)
|
if (bdp != &body)
|
||||||
body = decoded;
|
body.swap(decoded);
|
||||||
|
|
||||||
// Special case for text/plain content. Internfile should deal
|
|
||||||
// with this but it expects text/plain to be utf-8 already, so we
|
|
||||||
// handle the transcoding if needed
|
|
||||||
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
|
||||||
string utf8;
|
|
||||||
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
|
|
||||||
LOGERR((" processAttach: transcode to utf-8 failed "
|
|
||||||
"for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
|
|
||||||
// can't transcode at all -> data is garbage just erase it
|
|
||||||
body.clear();
|
|
||||||
} else {
|
|
||||||
body = utf8;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special case for application/octet-stream: try to better
|
// Special case for application/octet-stream: try to better
|
||||||
|
@ -287,6 +275,22 @@ bool MimeHandlerMail::processAttach()
|
||||||
m_metaData[cstr_dj_keymt] = mt;
|
m_metaData[cstr_dj_keymt] = mt;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Special case for text/plain content. Internfile should deal
|
||||||
|
// with this but it expects text/plain to be utf-8 already, so we
|
||||||
|
// handle the transcoding if needed
|
||||||
|
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
||||||
|
string utf8;
|
||||||
|
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
|
||||||
|
LOGERR((" processAttach: transcode to utf-8 failed for charset "
|
||||||
|
"[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
|
||||||
|
// can't transcode at all -> data is garbage just erase it
|
||||||
|
body.clear();
|
||||||
|
} else {
|
||||||
|
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||||
|
body.swap(utf8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Ipath
|
// Ipath
|
||||||
char nbuf[20];
|
char nbuf[20];
|
||||||
sprintf(nbuf, "%d", m_idx);
|
sprintf(nbuf, "%d", m_idx);
|
||||||
|
@ -527,11 +531,13 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||||
|
|
||||||
// "Simple" part.
|
// "Simple" part.
|
||||||
LOGDEB2(("walkmime: simple part\n"));
|
LOGDEB2(("walkmime: simple part\n"));
|
||||||
// Normally the default charset is us-ascii. But it happens that
|
// Normally the default charset is us-ascii. But it happens that 8
|
||||||
// 8 bit chars exist in a message that is stated as us-ascii. Ie the
|
// bit chars exist in a message that is stated as us-ascii. Ie the
|
||||||
// mailer used by yahoo support ('KANA') does this. We could convert
|
// mailer used by yahoo support ('KANA') does this. We could
|
||||||
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
|
// convert to iso-8859 only if the transfer-encoding is 8 bit, or
|
||||||
// actual 8 bit chars, but what the heck, let's use 8859-1 as default
|
// test for actual 8 bit chars, but what the heck, let's use
|
||||||
|
// 8859-1 (actually CP1252 which is compatible, but with more
|
||||||
|
// useful chars) as default.
|
||||||
string charset;
|
string charset;
|
||||||
it = content_type.params.find(cstr_mail_charset);
|
it = content_type.params.find(cstr_mail_charset);
|
||||||
if (it != content_type.params.end())
|
if (it != content_type.params.end())
|
||||||
|
@ -544,7 +550,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||||
!stringlowercmp("unknown", charset) ) {
|
!stringlowercmp("unknown", charset) ) {
|
||||||
m_config->getConfParam("maildefcharset", charset);
|
m_config->getConfParam("maildefcharset", charset);
|
||||||
if (charset.empty())
|
if (charset.empty())
|
||||||
charset = "iso-8859-1";
|
charset = "CP1252";
|
||||||
}
|
}
|
||||||
|
|
||||||
// Content transfer encoding
|
// Content transfer encoding
|
||||||
|
@ -609,8 +615,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||||
body = decoded;
|
body = decoded;
|
||||||
|
|
||||||
// Handle html stripping and transcoding to utf8
|
// Handle html stripping and transcoding to utf8
|
||||||
string utf8;
|
|
||||||
const string *putf8 = 0;
|
|
||||||
if (!stringlowercmp("text/html", content_type.value)) {
|
if (!stringlowercmp("text/html", content_type.value)) {
|
||||||
MimeHandlerHtml mh(m_config, "text/html");
|
MimeHandlerHtml mh(m_config, "text/html");
|
||||||
mh.set_property(Dijon::Filter::OPERATING_MODE,
|
mh.set_property(Dijon::Filter::OPERATING_MODE,
|
||||||
|
@ -623,9 +627,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||||
if (it != mh.get_meta_data().end())
|
if (it != mh.get_meta_data().end())
|
||||||
out += it->second;
|
out += it->second;
|
||||||
} else {
|
} else {
|
||||||
|
string utf8;
|
||||||
// Transcode to utf-8
|
// Transcode to utf-8
|
||||||
LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
|
LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
|
||||||
if (!transcode(body, utf8, charset, "UTF-8")) {
|
if (!transcode(body, utf8, charset, cstr_utf8)) {
|
||||||
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
|
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
|
||||||
charset.c_str()));
|
charset.c_str()));
|
||||||
out += body;
|
out += body;
|
||||||
|
|
|
@ -288,8 +288,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||||
|
|
||||||
out:
|
out:
|
||||||
if (h) {
|
if (h) {
|
||||||
string charset = cfg->getDefCharset();
|
h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
|
||||||
h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
|
||||||
}
|
}
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
|
|
|
@ -109,6 +109,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
// This only makes sense if the contents are currently text/plain
|
// This only makes sense if the contents are currently text/plain
|
||||||
|
// It converts from keyorigcharset to UTF-8 and sets keycharset.
|
||||||
bool txtdcode(const string& who);
|
bool txtdcode(const string& who);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
|
@ -40,7 +40,7 @@ application/x-lzma = uncompress rcluncomp unxz %f %t
|
||||||
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
||||||
# important for you than catching all data,
|
# important for you than catching all data,
|
||||||
application/msword = exec rcldoc
|
application/msword = exec rcldoc
|
||||||
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
|
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
|
||||||
# You can also use wvware directly but it's much slower.
|
# You can also use wvware directly but it's much slower.
|
||||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||||
|
|
||||||
|
@ -52,8 +52,8 @@ application/vnd.ms-office = exec rcldoc
|
||||||
application/ogg = execm rclaudio
|
application/ogg = execm rclaudio
|
||||||
application/pdf = exec rclpdf
|
application/pdf = exec rclpdf
|
||||||
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
|
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
|
||||||
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain
|
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain
|
||||||
application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain
|
application/vnd.ms-powerpoint = exec catppt -d utf-8;mimetype=text/plain
|
||||||
application/vn.oasis.opendocument.txt = exec rclsoff
|
application/vn.oasis.opendocument.txt = exec rclsoff
|
||||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
||||||
exec rclopxml
|
exec rclopxml
|
||||||
|
@ -81,7 +81,7 @@ application/vnd.wordperfect = exec wpd2html;mimetype=text/html
|
||||||
application/x-abiword = exec rclabw
|
application/x-abiword = exec rclabw
|
||||||
application/x-awk = internal text/plain
|
application/x-awk = internal text/plain
|
||||||
application/x-chm = execm rclchm
|
application/x-chm = execm rclchm
|
||||||
application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8
|
application/x-dia-diagram = execm rcldia;mimetype=text/plain
|
||||||
application/x-dvi = exec rcldvi
|
application/x-dvi = exec rcldvi
|
||||||
application/x-flac = execm rclaudio
|
application/x-flac = execm rclaudio
|
||||||
application/x-gnuinfo = execm rclinfo
|
application/x-gnuinfo = execm rclinfo
|
||||||
|
@ -109,7 +109,7 @@ image/vnd.djvu = exec rcldjvu
|
||||||
image/svg+xml = exec rclsvg
|
image/svg+xml = exec rclsvg
|
||||||
image/x-xcf = execm rclimg
|
image/x-xcf = execm rclimg
|
||||||
message/rfc822 = internal
|
message/rfc822 = internal
|
||||||
text/calendar = execm rclics;mimetype=text/plain;charset=utf-8
|
text/calendar = execm rclics;mimetype=text/plain
|
||||||
text/html = internal
|
text/html = internal
|
||||||
text/plain = internal
|
text/plain = internal
|
||||||
text/rtf = exec unrtf --nopict --html;mimetype=text/html
|
text/rtf = exec unrtf --nopict --html;mimetype=text/html
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue