factored out common charset handling code in exec and execm, cleaned up charset and textplain handling in mh_mail
This commit is contained in:
parent
8be8a00a79
commit
52804fef6c
9 changed files with 82 additions and 72 deletions
|
@ -48,6 +48,7 @@ DEF_CSTR(fbytes, "fbytes");
|
|||
DEF_CSTR(fileu, "file://");
|
||||
DEF_CSTR(fmtime, "fmtime");
|
||||
DEF_CSTR(iso_8859_1, "ISO-8859-1");
|
||||
DEF_CSTR(utf8, "UTF-8");
|
||||
DEF_CSTR(minwilds, "*?[");
|
||||
DEF_CSTR(newline, "\n");
|
||||
DEF_CSTR(null, "");
|
||||
|
|
|
@ -143,30 +143,37 @@ bool MimeHandlerExec::next_document()
|
|||
return true;
|
||||
}
|
||||
|
||||
void MimeHandlerExec::finaldetails()
|
||||
void MimeHandlerExec::handle_cs(const string& mt, const string& icharset)
|
||||
{
|
||||
m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
|
||||
string charset(icharset);
|
||||
|
||||
// cfgFilterOutputCharset comes from the mimeconf filter
|
||||
// definition line If the value is "default", we use the charset
|
||||
// value defined in recoll.conf (which may vary depending on
|
||||
// directory)
|
||||
string& charset = m_metaData[cstr_dj_keycharset];
|
||||
charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
charset = m_dfltInputCharset;
|
||||
// definition line and defaults to UTF-8 if empty. If the value is
|
||||
// "default", we use the default input charset value defined in
|
||||
// recoll.conf (which may vary depending on directory)
|
||||
if (charset.empty()) {
|
||||
charset = cfgFilterOutputCharset.empty() ? cstr_utf8 :
|
||||
cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
charset = m_dfltInputCharset;
|
||||
}
|
||||
}
|
||||
|
||||
// The output mime type is html except if defined otherwise in the filter
|
||||
// definition.
|
||||
string& mt = m_metaData[cstr_dj_keymt];
|
||||
mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||
cfgFilterOutputMtype;
|
||||
m_metaData[cstr_dj_keyorigcharset] = charset;
|
||||
|
||||
// If this is text/plain transcode_to/check utf-8
|
||||
if (!mt.compare(cstr_textplain)) {
|
||||
(void)txtdcode("mh_exec");
|
||||
(void)txtdcode("mh_exec/m");
|
||||
} else {
|
||||
m_metaData[cstr_dj_keycharset] = charset;
|
||||
}
|
||||
}
|
||||
|
||||
void MimeHandlerExec::finaldetails()
|
||||
{
|
||||
// The default output mime type is html, but it may be defined
|
||||
// otherwise in the filter definition.
|
||||
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||
cfgFilterOutputMtype;
|
||||
|
||||
string md5, xmd5, reason;
|
||||
if (MD5File(m_fn, md5, &reason)) {
|
||||
|
@ -175,4 +182,6 @@ void MimeHandlerExec::finaldetails()
|
|||
LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n",
|
||||
m_fn.c_str(), reason.c_str()));
|
||||
}
|
||||
|
||||
handle_cs(m_metaData[cstr_dj_keymt]);
|
||||
}
|
||||
|
|
|
@ -77,6 +77,13 @@ protected:
|
|||
string m_fn;
|
||||
string m_ipath;
|
||||
|
||||
// Set up the character set metadata fields and possibly transcode
|
||||
// text/plain output.
|
||||
// @param charset when called from mh_execm, a possible explicit
|
||||
// value from the filter (else the data will come from the config)
|
||||
virtual void handle_cs(const string& mt, const string& charset = string());
|
||||
|
||||
private:
|
||||
virtual void finaldetails();
|
||||
};
|
||||
|
||||
|
|
|
@ -120,7 +120,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
|
|||
ibuf.c_str()));
|
||||
return false;
|
||||
}
|
||||
LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
|
||||
LOGDEB(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
|
||||
if (len / 1024 > m_maxmemberkb) {
|
||||
LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
|
||||
return false;
|
||||
|
@ -290,27 +290,15 @@ bool MimeHandlerExecMultiple::next_document()
|
|||
}
|
||||
}
|
||||
|
||||
// Charset. For many document types it doesn't matter. For text
|
||||
// and html it does. We supply a default from the configuration.
|
||||
if (charset.empty()) {
|
||||
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||
cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
charset = m_dfltInputCharset;
|
||||
}
|
||||
}
|
||||
m_metaData[cstr_dj_keyorigcharset] = charset;
|
||||
m_metaData[cstr_dj_keycharset] = charset;
|
||||
handle_cs(m_metaData[cstr_dj_keymt], charset);
|
||||
|
||||
if (!m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
|
||||
(void)txtdcode("mh_execm");
|
||||
}
|
||||
|
||||
if (eofnext_received)
|
||||
m_havedoc = false;
|
||||
|
||||
LOGDEB0(("MHExecMultiple: returning %d bytes of content,"
|
||||
" mtype [%s] charset [%s]\n", m_metaData[cstr_dj_keycontent].size(),
|
||||
m_metaData[cstr_dj_keymt].c_str(), m_metaData[cstr_dj_keycharset].c_str()));
|
||||
" mtype [%s] charset [%s]\n",
|
||||
m_metaData[cstr_dj_keycontent].size(),
|
||||
m_metaData[cstr_dj_keymt].c_str(),
|
||||
m_metaData[cstr_dj_keycharset].c_str()));
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -164,7 +164,7 @@ bool MimeHandlerHtml::next_document()
|
|||
|
||||
m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
|
||||
m_metaData[cstr_dj_keycontent] = result.dump;
|
||||
m_metaData[cstr_dj_keycharset] = "utf-8";
|
||||
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||
// Avoid setting empty values which would crush ones possibly inherited
|
||||
// from parent (if we're an attachment)
|
||||
if (!result.dmtime.empty())
|
||||
|
|
|
@ -242,39 +242,27 @@ bool MimeHandlerMail::processAttach()
|
|||
MHMailAttach *att = m_attachments[m_idx];
|
||||
|
||||
m_metaData[cstr_dj_keymt] = att->m_contentType;
|
||||
m_metaData[cstr_dj_keyorigcharset] = att->m_charset;
|
||||
m_metaData[cstr_dj_keycharset] = att->m_charset;
|
||||
m_metaData[cstr_dj_keyfn] = att->m_filename;
|
||||
// Change the title to something helpful
|
||||
m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")";
|
||||
LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n",
|
||||
att->m_contentType.c_str(),
|
||||
att->m_charset.c_str(),
|
||||
att->m_filename.c_str()));
|
||||
|
||||
// Erase current content and replace
|
||||
m_metaData[cstr_dj_keycontent] = string();
|
||||
string& body = m_metaData[cstr_dj_keycontent];
|
||||
att->m_part->getBody(body, 0, att->m_part->bodylength);
|
||||
string decoded;
|
||||
const string *bdp;
|
||||
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
|
||||
return false;
|
||||
}
|
||||
if (bdp != &body)
|
||||
body = decoded;
|
||||
|
||||
// Special case for text/plain content. Internfile should deal
|
||||
// with this but it expects text/plain to be utf-8 already, so we
|
||||
// handle the transcoding if needed
|
||||
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
||||
string utf8;
|
||||
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
|
||||
LOGERR((" processAttach: transcode to utf-8 failed "
|
||||
"for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
|
||||
// can't transcode at all -> data is garbage just erase it
|
||||
body.clear();
|
||||
} else {
|
||||
body = utf8;
|
||||
{
|
||||
string decoded;
|
||||
const string *bdp;
|
||||
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
|
||||
return false;
|
||||
}
|
||||
if (bdp != &body)
|
||||
body.swap(decoded);
|
||||
}
|
||||
|
||||
// Special case for application/octet-stream: try to better
|
||||
|
@ -287,6 +275,22 @@ bool MimeHandlerMail::processAttach()
|
|||
m_metaData[cstr_dj_keymt] = mt;
|
||||
}
|
||||
|
||||
// Special case for text/plain content. Internfile should deal
|
||||
// with this but it expects text/plain to be utf-8 already, so we
|
||||
// handle the transcoding if needed
|
||||
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
||||
string utf8;
|
||||
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
|
||||
LOGERR((" processAttach: transcode to utf-8 failed for charset "
|
||||
"[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
|
||||
// can't transcode at all -> data is garbage just erase it
|
||||
body.clear();
|
||||
} else {
|
||||
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||
body.swap(utf8);
|
||||
}
|
||||
}
|
||||
|
||||
// Ipath
|
||||
char nbuf[20];
|
||||
sprintf(nbuf, "%d", m_idx);
|
||||
|
@ -527,11 +531,13 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
|||
|
||||
// "Simple" part.
|
||||
LOGDEB2(("walkmime: simple part\n"));
|
||||
// Normally the default charset is us-ascii. But it happens that
|
||||
// 8 bit chars exist in a message that is stated as us-ascii. Ie the
|
||||
// mailer used by yahoo support ('KANA') does this. We could convert
|
||||
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
|
||||
// actual 8 bit chars, but what the heck, let's use 8859-1 as default
|
||||
// Normally the default charset is us-ascii. But it happens that 8
|
||||
// bit chars exist in a message that is stated as us-ascii. Ie the
|
||||
// mailer used by yahoo support ('KANA') does this. We could
|
||||
// convert to iso-8859 only if the transfer-encoding is 8 bit, or
|
||||
// test for actual 8 bit chars, but what the heck, let's use
|
||||
// 8859-1 (actually CP1252 which is compatible, but with more
|
||||
// useful chars) as default.
|
||||
string charset;
|
||||
it = content_type.params.find(cstr_mail_charset);
|
||||
if (it != content_type.params.end())
|
||||
|
@ -544,7 +550,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
|||
!stringlowercmp("unknown", charset) ) {
|
||||
m_config->getConfParam("maildefcharset", charset);
|
||||
if (charset.empty())
|
||||
charset = "iso-8859-1";
|
||||
charset = "CP1252";
|
||||
}
|
||||
|
||||
// Content transfer encoding
|
||||
|
@ -609,8 +615,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
|||
body = decoded;
|
||||
|
||||
// Handle html stripping and transcoding to utf8
|
||||
string utf8;
|
||||
const string *putf8 = 0;
|
||||
if (!stringlowercmp("text/html", content_type.value)) {
|
||||
MimeHandlerHtml mh(m_config, "text/html");
|
||||
mh.set_property(Dijon::Filter::OPERATING_MODE,
|
||||
|
@ -623,9 +627,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
|||
if (it != mh.get_meta_data().end())
|
||||
out += it->second;
|
||||
} else {
|
||||
string utf8;
|
||||
// Transcode to utf-8
|
||||
LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
|
||||
if (!transcode(body, utf8, charset, "UTF-8")) {
|
||||
if (!transcode(body, utf8, charset, cstr_utf8)) {
|
||||
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
|
||||
charset.c_str()));
|
||||
out += body;
|
||||
|
|
|
@ -288,8 +288,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
|||
|
||||
out:
|
||||
if (h) {
|
||||
string charset = cfg->getDefCharset();
|
||||
h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||
h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
|
|
@ -109,6 +109,7 @@ public:
|
|||
}
|
||||
|
||||
// This only makes sense if the contents are currently text/plain
|
||||
// It converts from keyorigcharset to UTF-8 and sets keycharset.
|
||||
bool txtdcode(const string& who);
|
||||
|
||||
protected:
|
||||
|
|
|
@ -40,7 +40,7 @@ application/x-lzma = uncompress rcluncomp unxz %f %t
|
|||
# The default is now again to use rcldoc. Use raw antiword if speed is more
|
||||
# important for you than catching all data,
|
||||
application/msword = exec rcldoc
|
||||
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
|
||||
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
|
||||
# You can also use wvware directly but it's much slower.
|
||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||
|
||||
|
@ -52,8 +52,8 @@ application/vnd.ms-office = exec rcldoc
|
|||
application/ogg = execm rclaudio
|
||||
application/pdf = exec rclpdf
|
||||
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
|
||||
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain
|
||||
application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain
|
||||
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain
|
||||
application/vnd.ms-powerpoint = exec catppt -d utf-8;mimetype=text/plain
|
||||
application/vn.oasis.opendocument.txt = exec rclsoff
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
|
||||
exec rclopxml
|
||||
|
@ -81,7 +81,7 @@ application/vnd.wordperfect = exec wpd2html;mimetype=text/html
|
|||
application/x-abiword = exec rclabw
|
||||
application/x-awk = internal text/plain
|
||||
application/x-chm = execm rclchm
|
||||
application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8
|
||||
application/x-dia-diagram = execm rcldia;mimetype=text/plain
|
||||
application/x-dvi = exec rcldvi
|
||||
application/x-flac = execm rclaudio
|
||||
application/x-gnuinfo = execm rclinfo
|
||||
|
@ -109,7 +109,7 @@ image/vnd.djvu = exec rcldjvu
|
|||
image/svg+xml = exec rclsvg
|
||||
image/x-xcf = execm rclimg
|
||||
message/rfc822 = internal
|
||||
text/calendar = execm rclics;mimetype=text/plain;charset=utf-8
|
||||
text/calendar = execm rclics;mimetype=text/plain
|
||||
text/html = internal
|
||||
text/plain = internal
|
||||
text/rtf = exec unrtf --nopict --html;mimetype=text/html
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue