factored out common charset handling code in exec and execm, cleaned up charset and textplain handling in mh_mail

This commit is contained in:
Jean-Francois Dockes 2012-10-06 12:14:04 +02:00
parent 8be8a00a79
commit 52804fef6c
9 changed files with 82 additions and 72 deletions

View file

@ -48,6 +48,7 @@ DEF_CSTR(fbytes, "fbytes");
DEF_CSTR(fileu, "file://");
DEF_CSTR(fmtime, "fmtime");
DEF_CSTR(iso_8859_1, "ISO-8859-1");
DEF_CSTR(utf8, "UTF-8");
DEF_CSTR(minwilds, "*?[");
DEF_CSTR(newline, "\n");
DEF_CSTR(null, "");

View file

@ -143,30 +143,37 @@ bool MimeHandlerExec::next_document()
return true;
}
void MimeHandlerExec::finaldetails()
void MimeHandlerExec::handle_cs(const string& mt, const string& icharset)
{
m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
string charset(icharset);
// cfgFilterOutputCharset comes from the mimeconf filter
// definition line. If the value is "default", we use the charset
// value defined in recoll.conf (which may vary depending on
// directory)
string& charset = m_metaData[cstr_dj_keycharset];
charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
// definition line and defaults to UTF-8 if empty. If the value is
// "default", we use the default input charset value defined in
// recoll.conf (which may vary depending on directory)
if (charset.empty()) {
charset = cfgFilterOutputCharset.empty() ? cstr_utf8 :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
}
}
// The output mime type is html except if defined otherwise in the filter
// definition.
string& mt = m_metaData[cstr_dj_keymt];
mt = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype;
m_metaData[cstr_dj_keyorigcharset] = charset;
// If this is text/plain transcode_to/check utf-8
if (!mt.compare(cstr_textplain)) {
(void)txtdcode("mh_exec");
(void)txtdcode("mh_exec/m");
} else {
m_metaData[cstr_dj_keycharset] = charset;
}
}
void MimeHandlerExec::finaldetails()
{
// The default output mime type is html, but it may be defined
// otherwise in the filter definition.
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype;
string md5, xmd5, reason;
if (MD5File(m_fn, md5, &reason)) {
@ -175,4 +182,6 @@ void MimeHandlerExec::finaldetails()
LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n",
m_fn.c_str(), reason.c_str()));
}
handle_cs(m_metaData[cstr_dj_keymt]);
}

View file

@ -77,6 +77,13 @@ protected:
string m_fn;
string m_ipath;
// Set up the character set metadata fields and possibly transcode
// text/plain output.
// @param charset when called from mh_execm, a possible explicit
// value from the filter (else the data will come from the config)
virtual void handle_cs(const string& mt, const string& charset = string());
private:
virtual void finaldetails();
};

View file

@ -120,7 +120,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
ibuf.c_str()));
return false;
}
LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
LOGDEB(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
if (len / 1024 > m_maxmemberkb) {
LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
return false;
@ -290,27 +290,15 @@ bool MimeHandlerExecMultiple::next_document()
}
}
// Charset. For many document types it doesn't matter. For text
// and html it does. We supply a default from the configuration.
if (charset.empty()) {
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
}
}
m_metaData[cstr_dj_keyorigcharset] = charset;
m_metaData[cstr_dj_keycharset] = charset;
if (!m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
(void)txtdcode("mh_execm");
}
handle_cs(m_metaData[cstr_dj_keymt], charset);
if (eofnext_received)
m_havedoc = false;
LOGDEB0(("MHExecMultiple: returning %d bytes of content,"
" mtype [%s] charset [%s]\n", m_metaData[cstr_dj_keycontent].size(),
m_metaData[cstr_dj_keymt].c_str(), m_metaData[cstr_dj_keycharset].c_str()));
" mtype [%s] charset [%s]\n",
m_metaData[cstr_dj_keycontent].size(),
m_metaData[cstr_dj_keymt].c_str(),
m_metaData[cstr_dj_keycharset].c_str()));
return true;
}

View file

@ -164,7 +164,7 @@ bool MimeHandlerHtml::next_document()
m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
m_metaData[cstr_dj_keycontent] = result.dump;
m_metaData[cstr_dj_keycharset] = "utf-8";
m_metaData[cstr_dj_keycharset] = cstr_utf8;
// Avoid setting empty values which would crush ones possibly inherited
// from parent (if we're an attachment)
if (!result.dmtime.empty())

View file

@ -242,39 +242,27 @@ bool MimeHandlerMail::processAttach()
MHMailAttach *att = m_attachments[m_idx];
m_metaData[cstr_dj_keymt] = att->m_contentType;
m_metaData[cstr_dj_keyorigcharset] = att->m_charset;
m_metaData[cstr_dj_keycharset] = att->m_charset;
m_metaData[cstr_dj_keyfn] = att->m_filename;
// Change the title to something helpful
m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")";
LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n",
att->m_contentType.c_str(),
att->m_charset.c_str(),
att->m_filename.c_str()));
// Erase current content and replace
m_metaData[cstr_dj_keycontent] = string();
string& body = m_metaData[cstr_dj_keycontent];
att->m_part->getBody(body, 0, att->m_part->bodylength);
string decoded;
const string *bdp;
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
return false;
}
if (bdp != &body)
body = decoded;
// Special case for text/plain content. Internfile should deal
// with this but it expects text/plain to be utf-8 already, so we
// handle the transcoding if needed
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
string utf8;
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
LOGERR((" processAttach: transcode to utf-8 failed "
"for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
// can't transcode at all -> data is garbage just erase it
body.clear();
} else {
body = utf8;
{
string decoded;
const string *bdp;
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
return false;
}
if (bdp != &body)
body.swap(decoded);
}
// Special case for application/octet-stream: try to better
@ -287,6 +275,22 @@ bool MimeHandlerMail::processAttach()
m_metaData[cstr_dj_keymt] = mt;
}
// Special case for text/plain content. Internfile should deal
// with this but it expects text/plain to be utf-8 already, so we
// handle the transcoding if needed
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
string utf8;
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
LOGERR((" processAttach: transcode to utf-8 failed for charset "
"[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
// can't transcode at all -> data is garbage just erase it
body.clear();
} else {
m_metaData[cstr_dj_keycharset] = cstr_utf8;
body.swap(utf8);
}
}
// Ipath
char nbuf[20];
sprintf(nbuf, "%d", m_idx);
@ -527,11 +531,13 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
// "Simple" part.
LOGDEB2(("walkmime: simple part\n"));
// Normally the default charset is us-ascii. But it happens that
// 8 bit chars exist in a message that is stated as us-ascii. E.g. the
// mailer used by yahoo support ('KANA') does this. We could convert
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
// actual 8 bit chars, but what the heck, let's use 8859-1 as default
// Normally the default charset is us-ascii. But it happens that 8
// bit chars exist in a message that is stated as us-ascii. E.g. the
// mailer used by yahoo support ('KANA') does this. We could
// convert to iso-8859 only if the transfer-encoding is 8 bit, or
// test for actual 8 bit chars, but what the heck, let's use
// 8859-1 (actually CP1252 which is compatible, but with more
// useful chars) as default.
string charset;
it = content_type.params.find(cstr_mail_charset);
if (it != content_type.params.end())
@ -544,7 +550,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
!stringlowercmp("unknown", charset) ) {
m_config->getConfParam("maildefcharset", charset);
if (charset.empty())
charset = "iso-8859-1";
charset = "CP1252";
}
// Content transfer encoding
@ -609,8 +615,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
body = decoded;
// Handle html stripping and transcoding to utf8
string utf8;
const string *putf8 = 0;
if (!stringlowercmp("text/html", content_type.value)) {
MimeHandlerHtml mh(m_config, "text/html");
mh.set_property(Dijon::Filter::OPERATING_MODE,
@ -623,9 +627,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
if (it != mh.get_meta_data().end())
out += it->second;
} else {
string utf8;
// Transcode to utf-8
LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
if (!transcode(body, utf8, charset, "UTF-8")) {
if (!transcode(body, utf8, charset, cstr_utf8)) {
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
charset.c_str()));
out += body;

View file

@ -288,8 +288,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
out:
if (h) {
string charset = cfg->getDefCharset();
h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
}
return h;
}

View file

@ -109,6 +109,7 @@ public:
}
// This only makes sense if the contents are currently text/plain
// It converts from keyorigcharset to UTF-8 and sets keycharset.
bool txtdcode(const string& who);
protected:

View file

@ -40,7 +40,7 @@ application/x-lzma = uncompress rcluncomp unxz %f %t
# The default is now again to use rcldoc. Use raw antiword if speed is more
# important for you than catching all data.
application/msword = exec rcldoc
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
# You can also use wvware directly but it's much slower.
# application/msword = exec wvWare --charset=utf-8 --nographics
@ -52,8 +52,8 @@ application/vnd.ms-office = exec rcldoc
application/ogg = execm rclaudio
application/pdf = exec rclpdf
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain
application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain
application/vnd.ms-powerpoint = exec catppt -d utf-8;mimetype=text/plain
application/vn.oasis.opendocument.txt = exec rclsoff
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
exec rclopxml
@ -81,7 +81,7 @@ application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = exec rclabw
application/x-awk = internal text/plain
application/x-chm = execm rclchm
application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8
application/x-dia-diagram = execm rcldia;mimetype=text/plain
application/x-dvi = exec rcldvi
application/x-flac = execm rclaudio
application/x-gnuinfo = execm rclinfo
@ -109,7 +109,7 @@ image/vnd.djvu = exec rcldjvu
image/svg+xml = exec rclsvg
image/x-xcf = execm rclimg
message/rfc822 = internal
text/calendar = execm rclics;mimetype=text/plain;charset=utf-8
text/calendar = execm rclics;mimetype=text/plain
text/html = internal
text/plain = internal
text/rtf = exec unrtf --nopict --html;mimetype=text/html