factored out common charset handling code in exec and execm, cleaned up charset and textplain handling in mh_mail

This commit is contained in:
Jean-Francois Dockes 2012-10-06 12:14:04 +02:00
parent 8be8a00a79
commit 52804fef6c
9 changed files with 82 additions and 72 deletions

View file

@ -48,6 +48,7 @@ DEF_CSTR(fbytes, "fbytes");
DEF_CSTR(fileu, "file://"); DEF_CSTR(fileu, "file://");
DEF_CSTR(fmtime, "fmtime"); DEF_CSTR(fmtime, "fmtime");
DEF_CSTR(iso_8859_1, "ISO-8859-1"); DEF_CSTR(iso_8859_1, "ISO-8859-1");
DEF_CSTR(utf8, "UTF-8");
DEF_CSTR(minwilds, "*?["); DEF_CSTR(minwilds, "*?[");
DEF_CSTR(newline, "\n"); DEF_CSTR(newline, "\n");
DEF_CSTR(null, ""); DEF_CSTR(null, "");

View file

@ -143,30 +143,37 @@ bool MimeHandlerExec::next_document()
return true; return true;
} }
void MimeHandlerExec::finaldetails() void MimeHandlerExec::handle_cs(const string& mt, const string& icharset)
{ {
m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; string charset(icharset);
// cfgFilterOutputCharset comes from the mimeconf filter // cfgFilterOutputCharset comes from the mimeconf filter
// definition line. If the value is "default", we use the charset // definition line and defaults to UTF-8 if empty. If the value is
// value defined in recoll.conf (which may vary depending on // "default", we use the default input charset value defined in
// directory) // recoll.conf (which may vary depending on directory)
string& charset = m_metaData[cstr_dj_keycharset]; if (charset.empty()) {
charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset; charset = cfgFilterOutputCharset.empty() ? cstr_utf8 :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) { if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset; charset = m_dfltInputCharset;
} }
}
// The output mime type is html except if defined otherwise in the filter m_metaData[cstr_dj_keyorigcharset] = charset;
// definition.
string& mt = m_metaData[cstr_dj_keymt];
mt = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype;
// If this is text/plain transcode_to/check utf-8 // If this is text/plain transcode_to/check utf-8
if (!mt.compare(cstr_textplain)) { if (!mt.compare(cstr_textplain)) {
(void)txtdcode("mh_exec"); (void)txtdcode("mh_exec/m");
} else {
m_metaData[cstr_dj_keycharset] = charset;
} }
}
void MimeHandlerExec::finaldetails()
{
// The default output mime type is html, but it may be defined
// otherwise in the filter definition.
m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype;
string md5, xmd5, reason; string md5, xmd5, reason;
if (MD5File(m_fn, md5, &reason)) { if (MD5File(m_fn, md5, &reason)) {
@ -175,4 +182,6 @@ void MimeHandlerExec::finaldetails()
LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n", LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n",
m_fn.c_str(), reason.c_str())); m_fn.c_str(), reason.c_str()));
} }
handle_cs(m_metaData[cstr_dj_keymt]);
} }

View file

@ -77,6 +77,13 @@ protected:
string m_fn; string m_fn;
string m_ipath; string m_ipath;
// Set up the character set metadata fields and possibly transcode
// text/plain output.
// @param charset when called from mh_execm, a possible explicit
// value from the filter (else the data will come from the config)
virtual void handle_cs(const string& mt, const string& charset = string());
private:
virtual void finaldetails(); virtual void finaldetails();
}; };

View file

@ -120,7 +120,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
ibuf.c_str())); ibuf.c_str()));
return false; return false;
} }
LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len)); LOGDEB(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
if (len / 1024 > m_maxmemberkb) { if (len / 1024 > m_maxmemberkb) {
LOGERR(("MHExecMultiple: data len > maxmemberkb\n")); LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
return false; return false;
@ -290,27 +290,15 @@ bool MimeHandlerExecMultiple::next_document()
} }
} }
// Charset. For many document types it doesn't matter. For text handle_cs(m_metaData[cstr_dj_keymt], charset);
// and html it does. We supply a default from the configuration.
if (charset.empty()) {
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
}
}
m_metaData[cstr_dj_keyorigcharset] = charset;
m_metaData[cstr_dj_keycharset] = charset;
if (!m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
(void)txtdcode("mh_execm");
}
if (eofnext_received) if (eofnext_received)
m_havedoc = false; m_havedoc = false;
LOGDEB0(("MHExecMultiple: returning %d bytes of content," LOGDEB0(("MHExecMultiple: returning %d bytes of content,"
" mtype [%s] charset [%s]\n", m_metaData[cstr_dj_keycontent].size(), " mtype [%s] charset [%s]\n",
m_metaData[cstr_dj_keymt].c_str(), m_metaData[cstr_dj_keycharset].c_str())); m_metaData[cstr_dj_keycontent].size(),
m_metaData[cstr_dj_keymt].c_str(),
m_metaData[cstr_dj_keycharset].c_str()));
return true; return true;
} }

View file

@ -164,7 +164,7 @@ bool MimeHandlerHtml::next_document()
m_metaData[cstr_dj_keyorigcharset] = result.get_charset(); m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
m_metaData[cstr_dj_keycontent] = result.dump; m_metaData[cstr_dj_keycontent] = result.dump;
m_metaData[cstr_dj_keycharset] = "utf-8"; m_metaData[cstr_dj_keycharset] = cstr_utf8;
// Avoid setting empty values which would crush ones possibly inherited // Avoid setting empty values which would crush ones possibly inherited
// from parent (if we're an attachment) // from parent (if we're an attachment)
if (!result.dmtime.empty()) if (!result.dmtime.empty())

View file

@ -242,39 +242,27 @@ bool MimeHandlerMail::processAttach()
MHMailAttach *att = m_attachments[m_idx]; MHMailAttach *att = m_attachments[m_idx];
m_metaData[cstr_dj_keymt] = att->m_contentType; m_metaData[cstr_dj_keymt] = att->m_contentType;
m_metaData[cstr_dj_keyorigcharset] = att->m_charset;
m_metaData[cstr_dj_keycharset] = att->m_charset; m_metaData[cstr_dj_keycharset] = att->m_charset;
m_metaData[cstr_dj_keyfn] = att->m_filename; m_metaData[cstr_dj_keyfn] = att->m_filename;
// Change the title to something helpful
m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")"; m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")";
LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n", LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n",
att->m_contentType.c_str(), att->m_contentType.c_str(),
att->m_charset.c_str(), att->m_charset.c_str(),
att->m_filename.c_str())); att->m_filename.c_str()));
// Erase current content and replace
m_metaData[cstr_dj_keycontent] = string(); m_metaData[cstr_dj_keycontent] = string();
string& body = m_metaData[cstr_dj_keycontent]; string& body = m_metaData[cstr_dj_keycontent];
att->m_part->getBody(body, 0, att->m_part->bodylength); att->m_part->getBody(body, 0, att->m_part->bodylength);
{
string decoded; string decoded;
const string *bdp; const string *bdp;
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) { if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
return false; return false;
} }
if (bdp != &body) if (bdp != &body)
body = decoded; body.swap(decoded);
// Special case for text/plain content. Internfile should deal
// with this but it expects text/plain to be utf-8 already, so we
// handle the transcoding if needed
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
string utf8;
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
LOGERR((" processAttach: transcode to utf-8 failed "
"for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
// can't transcode at all -> data is garbage just erase it
body.clear();
} else {
body = utf8;
}
} }
// Special case for application/octet-stream: try to better // Special case for application/octet-stream: try to better
@ -287,6 +275,22 @@ bool MimeHandlerMail::processAttach()
m_metaData[cstr_dj_keymt] = mt; m_metaData[cstr_dj_keymt] = mt;
} }
// Special case for text/plain content. Internfile should deal
// with this but it expects text/plain to be utf-8 already, so we
// handle the transcoding if needed
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
string utf8;
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
LOGERR((" processAttach: transcode to utf-8 failed for charset "
"[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
// can't transcode at all -> data is garbage just erase it
body.clear();
} else {
m_metaData[cstr_dj_keycharset] = cstr_utf8;
body.swap(utf8);
}
}
// Ipath // Ipath
char nbuf[20]; char nbuf[20];
sprintf(nbuf, "%d", m_idx); sprintf(nbuf, "%d", m_idx);
@ -527,11 +531,13 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
// "Simple" part. // "Simple" part.
LOGDEB2(("walkmime: simple part\n")); LOGDEB2(("walkmime: simple part\n"));
// Normally the default charset is us-ascii. But it happens that // Normally the default charset is us-ascii. But it happens that 8
// 8 bit chars exist in a message that is stated as us-ascii. Ie the // bit chars exist in a message that is stated as us-ascii. Ie the
// mailer used by yahoo support ('KANA') does this. We could convert // mailer used by yahoo support ('KANA') does this. We could
// to iso-8859 only if the transfer-encoding is 8 bit, or test for // convert to iso-8859 only if the transfer-encoding is 8 bit, or
// actual 8 bit chars, but what the heck, let's use 8859-1 as default // test for actual 8 bit chars, but what the heck, let's use
// 8859-1 (actually CP1252 which is compatible, but with more
// useful chars) as default.
string charset; string charset;
it = content_type.params.find(cstr_mail_charset); it = content_type.params.find(cstr_mail_charset);
if (it != content_type.params.end()) if (it != content_type.params.end())
@ -544,7 +550,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
!stringlowercmp("unknown", charset) ) { !stringlowercmp("unknown", charset) ) {
m_config->getConfParam("maildefcharset", charset); m_config->getConfParam("maildefcharset", charset);
if (charset.empty()) if (charset.empty())
charset = "iso-8859-1"; charset = "CP1252";
} }
// Content transfer encoding // Content transfer encoding
@ -609,8 +615,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
body = decoded; body = decoded;
// Handle html stripping and transcoding to utf8 // Handle html stripping and transcoding to utf8
string utf8;
const string *putf8 = 0;
if (!stringlowercmp("text/html", content_type.value)) { if (!stringlowercmp("text/html", content_type.value)) {
MimeHandlerHtml mh(m_config, "text/html"); MimeHandlerHtml mh(m_config, "text/html");
mh.set_property(Dijon::Filter::OPERATING_MODE, mh.set_property(Dijon::Filter::OPERATING_MODE,
@ -623,9 +627,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
if (it != mh.get_meta_data().end()) if (it != mh.get_meta_data().end())
out += it->second; out += it->second;
} else { } else {
string utf8;
// Transcode to utf-8 // Transcode to utf-8
LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str())); LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
if (!transcode(body, utf8, charset, "UTF-8")) { if (!transcode(body, utf8, charset, cstr_utf8)) {
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
charset.c_str())); charset.c_str()));
out += body; out += body;

View file

@ -288,8 +288,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
out: out:
if (h) { if (h) {
string charset = cfg->getDefCharset(); h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
} }
return h; return h;
} }

View file

@ -109,6 +109,7 @@ public:
} }
// This only makes sense if the contents are currently text/plain // This only makes sense if the contents are currently text/plain
// It converts from keyorigcharset to UTF-8 and sets keycharset.
bool txtdcode(const string& who); bool txtdcode(const string& who);
protected: protected:

View file

@ -40,7 +40,7 @@ application/x-lzma = uncompress rcluncomp unxz %f %t
# The default is now again to use rcldoc. Use raw antiword if speed is more # The default is now again to use rcldoc. Use raw antiword if speed is more
# important for you than catching all data, # important for you than catching all data,
application/msword = exec rcldoc application/msword = exec rcldoc
#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8 #application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
# You can also use wvware directly but it's much slower. # You can also use wvware directly but it's much slower.
# application/msword = exec wvWare --charset=utf-8 --nographics # application/msword = exec wvWare --charset=utf-8 --nographics
@ -52,8 +52,8 @@ application/vnd.ms-office = exec rcldoc
application/ogg = execm rclaudio application/ogg = execm rclaudio
application/pdf = exec rclpdf application/pdf = exec rclpdf
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;charset=utf-8;mimetype=text/plain application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain
application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain application/vnd.ms-powerpoint = exec catppt -d utf-8;mimetype=text/plain
application/vn.oasis.opendocument.txt = exec rclsoff application/vn.oasis.opendocument.txt = exec rclsoff
application/vnd.openxmlformats-officedocument.wordprocessingml.document = \ application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
exec rclopxml exec rclopxml
@ -81,7 +81,7 @@ application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = exec rclabw application/x-abiword = exec rclabw
application/x-awk = internal text/plain application/x-awk = internal text/plain
application/x-chm = execm rclchm application/x-chm = execm rclchm
application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8 application/x-dia-diagram = execm rcldia;mimetype=text/plain
application/x-dvi = exec rcldvi application/x-dvi = exec rcldvi
application/x-flac = execm rclaudio application/x-flac = execm rclaudio
application/x-gnuinfo = execm rclinfo application/x-gnuinfo = execm rclinfo
@ -109,7 +109,7 @@ image/vnd.djvu = exec rcldjvu
image/svg+xml = exec rclsvg image/svg+xml = exec rclsvg
image/x-xcf = execm rclimg image/x-xcf = execm rclimg
message/rfc822 = internal message/rfc822 = internal
text/calendar = execm rclics;mimetype=text/plain;charset=utf-8 text/calendar = execm rclics;mimetype=text/plain
text/html = internal text/html = internal
text/plain = internal text/plain = internal
text/rtf = exec unrtf --nopict --html;mimetype=text/html text/rtf = exec unrtf --nopict --html;mimetype=text/html