factored out common charset handling code in exec and execm, cleaned up charset and textplain handling in mh_mail

2012-10-06 12:14:04 +02:00 · 2012-10-06 12:14:04 +02:00 · 52804fef6c
commit 52804fef6c
parent 8be8a00a79
9 changed files with 82 additions and 72 deletions
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@ -48,6 +48,7 @@ DEF_CSTR(fbytes, "fbytes");
 DEF_CSTR(fileu, "file://");
 DEF_CSTR(fmtime, "fmtime");
 DEF_CSTR(iso_8859_1, "ISO-8859-1");
 DEF_CSTR(utf8, "UTF-8");
 DEF_CSTR(minwilds, "*?[");
 DEF_CSTR(newline, "\n");
 DEF_CSTR(null, "");
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@ -143,30 +143,37 @@ bool MimeHandlerExec::next_document()
    return true;
 }
-void MimeHandlerExec::finaldetails()
+void MimeHandlerExec::handle_cs(const string& mt, const string& icharset)
 {
-    m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
+    string charset(icharset);
    // cfgFilterOutputCharset comes from the mimeconf filter
-    // definition line If the value is "default", we use the charset
+    // definition line and defaults to UTF-8 if empty. If the value is
-    // value defined in recoll.conf (which may vary depending on
+    // "default", we use the default input charset value defined in
-    // directory)
+    // recoll.conf (which may vary depending on directory)
-    string& charset = m_metaData[cstr_dj_keycharset];
+    if (charset.empty()) {
-    charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
+	charset = cfgFilterOutputCharset.empty() ? cstr_utf8 : 
 	    cfgFilterOutputCharset;
 	if (!stringlowercmp("default", charset)) {
 	    charset = m_dfltInputCharset;
 	}
-
+    }
-    // The output mime type is html except if defined otherwise in the filter
+    m_metaData[cstr_dj_keyorigcharset] = charset;
    // definition.
    string& mt = m_metaData[cstr_dj_keymt];
    mt = cfgFilterOutputMtype.empty() ? "text/html" : 
 	cfgFilterOutputMtype;
    // If this is text/plain transcode_to/check utf-8
    if (!mt.compare(cstr_textplain)) {
-	(void)txtdcode("mh_exec");
+	(void)txtdcode("mh_exec/m");
    } else {
 	m_metaData[cstr_dj_keycharset] = charset;
    }
 }
 void MimeHandlerExec::finaldetails()
 {
    // The default output mime type is html, but it may be defined
    // otherwise in the filter definition.
    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : 
 	cfgFilterOutputMtype;
    string md5, xmd5, reason;
    if (MD5File(m_fn, md5, &reason)) {
@ -175,4 +182,6 @@ void MimeHandlerExec::finaldetails()
 	LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n", 
 		m_fn.c_str(), reason.c_str()));
    }
    handle_cs(m_metaData[cstr_dj_keymt]);
 }
--- a/src/internfile/mh_exec.h
+++ b/src/internfile/mh_exec.h
@ -77,6 +77,13 @@ protected:
    string m_fn;
    string m_ipath;
    // Set up the character set metadata fields and possibly transcode
    // text/plain output. 
    // @param charset when called from mh_execm, a possible explicit
    //       value from the filter (else the data will come from the config)
    virtual void handle_cs(const string& mt, const string& charset = string());
 private:
    virtual void finaldetails();
 };
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@ -120,7 +120,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
                ibuf.c_str()));
        return false;
    }
-    LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
+    LOGDEB(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
    if (len / 1024 > m_maxmemberkb) {
        LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
        return false;
@ -290,27 +290,15 @@ bool MimeHandlerExecMultiple::next_document()
        }
    }
-    // Charset. For many document types it doesn't matter. For text
+    handle_cs(m_metaData[cstr_dj_keymt], charset);
    // and html it does. We supply a default from the configuration. 
    if (charset.empty()) {
 	charset = cfgFilterOutputCharset.empty() ? "utf-8" : 
 	    cfgFilterOutputCharset;
 	if (!stringlowercmp("default", charset)) {
 	    charset = m_dfltInputCharset;
 	}
    }
    m_metaData[cstr_dj_keyorigcharset] = charset;
    m_metaData[cstr_dj_keycharset] = charset;
    if (!m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
 	(void)txtdcode("mh_execm");
    }
    if (eofnext_received)
        m_havedoc = false;
    LOGDEB0(("MHExecMultiple: returning %d bytes of content,"
-	    " mtype [%s] charset [%s]\n", m_metaData[cstr_dj_keycontent].size(), 
+	    " mtype [%s] charset [%s]\n", 
-     m_metaData[cstr_dj_keymt].c_str(), m_metaData[cstr_dj_keycharset].c_str()));
+	     m_metaData[cstr_dj_keycontent].size(), 
 	     m_metaData[cstr_dj_keymt].c_str(), 
 	     m_metaData[cstr_dj_keycharset].c_str()));
    return true;
 }
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@ -164,7 +164,7 @@ bool MimeHandlerHtml::next_document()
    m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
    m_metaData[cstr_dj_keycontent] = result.dump;
-    m_metaData[cstr_dj_keycharset] = "utf-8";
+    m_metaData[cstr_dj_keycharset] = cstr_utf8;
    // Avoid setting empty values which would crush ones possibly inherited
    // from parent (if we're an attachment)
    if (!result.dmtime.empty())
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -242,39 +242,27 @@ bool MimeHandlerMail::processAttach()
    MHMailAttach *att = m_attachments[m_idx];
    m_metaData[cstr_dj_keymt] = att->m_contentType;
    m_metaData[cstr_dj_keyorigcharset] = att->m_charset;
    m_metaData[cstr_dj_keycharset] = att->m_charset;
    m_metaData[cstr_dj_keyfn] = att->m_filename;
    // Change the title to something helpful
    m_metaData[cstr_dj_keytitle] = att->m_filename + "  (" + m_subject + ")";
    LOGDEB1(("  processAttach:ct [%s] cs [%s] fn [%s]\n", 
 	    att->m_contentType.c_str(),
 	    att->m_charset.c_str(),
 	    att->m_filename.c_str()));
    // Erase current content and replace
    m_metaData[cstr_dj_keycontent] = string();
    string& body = m_metaData[cstr_dj_keycontent];
    att->m_part->getBody(body, 0, att->m_part->bodylength);
    {
 	string decoded;
 	const string *bdp;
 	if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
 	    return false;
 	}
 	if (bdp != &body)
-	body = decoded;
+	    body.swap(decoded);
    // Special case for text/plain content. Internfile should deal
    // with this but it expects text/plain to be utf-8 already, so we
    // handle the transcoding if needed
    if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
 	string utf8;
 	if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
 	    LOGERR(("  processAttach: transcode to utf-8 failed "
 		    "for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
 	    // can't transcode at all -> data is garbage just erase it
 	    body.clear();
 	} else {
 	    body = utf8;
 	}
    }
    // Special case for application/octet-stream: try to better
@ -287,6 +275,22 @@ bool MimeHandlerMail::processAttach()
 	    m_metaData[cstr_dj_keymt] = mt;
    }
    // Special case for text/plain content. Internfile should deal
    // with this but it expects text/plain to be utf-8 already, so we
    // handle the transcoding if needed
    if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
 	string utf8;
 	if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
 	    LOGERR(("  processAttach: transcode to utf-8 failed for charset "
 		    "[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
 	    // can't transcode at all -> data is garbage just erase it
 	    body.clear();
 	} else {
 	    m_metaData[cstr_dj_keycharset] = cstr_utf8;
 	    body.swap(utf8);
 	}
    }
    // Ipath
    char nbuf[20];
    sprintf(nbuf, "%d", m_idx);
@ -527,11 +531,13 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
    // "Simple" part. 
    LOGDEB2(("walkmime: simple  part\n"));
-    // Normally the default charset is us-ascii. But it happens that
+    // Normally the default charset is us-ascii. But it happens that 8
-    // 8 bit chars exist in a message that is stated as us-ascii. Ie the 
+    // bit chars exist in a message that is stated as us-ascii. Ie the
-    // mailer used by yahoo support ('KANA') does this. We could convert 
+    // mailer used by yahoo support ('KANA') does this. We could
-    // to iso-8859 only if the transfer-encoding is 8 bit, or test for
+    // convert to iso-8859 only if the transfer-encoding is 8 bit, or
-    // actual 8 bit chars, but what the heck, le'ts use 8859-1 as default
+    // test for actual 8 bit chars, but what the heck, let's use
    // 8859-1 (actually CP1252 which is compatible, but with more
    // useful chars) as default.
    string charset;
    it = content_type.params.find(cstr_mail_charset);
    if (it != content_type.params.end())
@ -544,7 +550,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	!stringlowercmp("unknown", charset) ) {
        m_config->getConfParam("maildefcharset", charset);
        if (charset.empty())
-            charset = "iso-8859-1";
+            charset = "CP1252";
    }
    // Content transfer encoding
@ -609,8 +615,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	body = decoded;
    // Handle html stripping and transcoding to utf8
    string utf8;
    const string *putf8 = 0;
    if (!stringlowercmp("text/html", content_type.value)) {
 	MimeHandlerHtml mh(m_config, "text/html");
 	mh.set_property(Dijon::Filter::OPERATING_MODE, 
@ -623,9 +627,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	if (it != mh.get_meta_data().end())
 	    out += it->second;
    } else {
 	string utf8;
 	// Transcode to utf-8 
 	LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
-	if (!transcode(body, utf8, charset, "UTF-8")) {
+	if (!transcode(body, utf8, charset, cstr_utf8)) {
 	    LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
 		    charset.c_str()));
 	    out += body;
--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@ -288,8 +288,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
 out:
    if (h) {
-	string charset = cfg->getDefCharset();
+	h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
 	h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
    }
    return h;
 }
--- a/src/internfile/mimehandler.h
+++ b/src/internfile/mimehandler.h
@ -109,6 +109,7 @@ public:
    }
    // This only makes sense if the contents are currently text/plain
    // It converts from keyorigcharset to UTF-8 and sets keycharset.
    bool txtdcode(const string& who);
 protected:
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -40,7 +40,7 @@ application/x-lzma = uncompress rcluncomp unxz %f %t
 # The default is now again to use rcldoc. Use raw antiword if speed is more
 # important for you than catching all data.
 application/msword = exec rcldoc
-#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
+#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
 # You can also use wvware directly but it's much slower.
 # application/msword = exec wvWare --charset=utf-8 --nographics
@ -52,8 +52,8 @@ application/vnd.ms-office = exec rcldoc
 application/ogg = execm rclaudio
 application/pdf = exec rclpdf
 application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
-application/vnd.ms-excel = exec xls2csv -c "	" -d utf-8;charset=utf-8;mimetype=text/plain
+application/vnd.ms-excel = exec xls2csv -c "	" -d utf-8;mimetype=text/plain
-application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain
+application/vnd.ms-powerpoint = exec catppt -d utf-8;mimetype=text/plain
 application/vn.oasis.opendocument.txt = exec rclsoff
 application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
 exec rclopxml
@ -81,7 +81,7 @@ application/vnd.wordperfect = exec wpd2html;mimetype=text/html
 application/x-abiword = exec rclabw
 application/x-awk = internal text/plain
 application/x-chm = execm rclchm
-application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8
+application/x-dia-diagram = execm rcldia;mimetype=text/plain
 application/x-dvi = exec rcldvi
 application/x-flac = execm rclaudio
 application/x-gnuinfo = execm rclinfo
@ -109,7 +109,7 @@ image/vnd.djvu = exec rcldjvu
 image/svg+xml = exec rclsvg
 image/x-xcf = execm rclimg
 message/rfc822 = internal
-text/calendar = execm rclics;mimetype=text/plain;charset=utf-8
+text/calendar = execm rclics;mimetype=text/plain
 text/html  = internal 
 text/plain = internal 
 text/rtf = exec unrtf --nopict --html;mimetype=text/html