For plain text files, try alternate decode from 8bit charset when decode from UTF-8 fails

2012-10-06 15:12:49 +02:00 · 2012-10-06 15:12:49 +02:00 · 822848b31c
commit 822848b31c
parent 52804fef6c
6 changed files with 122 additions and 48 deletions
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@ -49,6 +49,7 @@ DEF_CSTR(fileu, "file://");
 DEF_CSTR(fmtime, "fmtime");
 DEF_CSTR(iso_8859_1, "ISO-8859-1");
 DEF_CSTR(utf8, "UTF-8");
+DEF_CSTR(cp1252, "CP1252");
 DEF_CSTR(minwilds, "*?[");
 DEF_CSTR(newline, "\n");
 DEF_CSTR(null, "");
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -344,7 +344,7 @@ list<string> RclConfig::getTopdirs()
 //  If defcharset was set (from the config or a previous call, this
 //   is done in setKeydir), use it.
 //  Else, try to guess it from the locale
-//  Use iso8859-1 as ultimate default
+//  Use cp1252 (as a superset of iso8859-1) as ultimate default
 //
 // For filenames, same thing except that we do not use the config file value
 // (only the locale).
@ -372,9 +372,8 @@ const string& RclConfig::getDefCharset(bool filename)
 	    ) {
 	    localecharset = string(cp);
 	} else {
-	    // Note: it seems that all versions of iconv will take
-	    // iso-8859. Some won't take iso8859
-	    localecharset = string(cstr_iso_8859_1);
+	    // Use cp1252 instead of iso-8859-1, it's a superset.
+	    localecharset = string(cstr_cp1252);
 	}
 	LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n",
 		localecharset.c_str()));
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -279,16 +279,8 @@ bool MimeHandlerMail::processAttach()
    // with this but it expects text/plain to be utf-8 already, so we
    // handle the transcoding if needed
    if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
-	string utf8;
-	if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
-	    LOGERR(("  processAttach: transcode to utf-8 failed for charset "
-		    "[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
- 	    // can't transcode at all -> data is garbage just erase it
+	if (!txtdcode("MimeHandlerMail::processAttach"))
 	    body.clear();
-	} else {
-	    m_metaData[cstr_dj_keycharset] = cstr_utf8;
-	    body.swap(utf8);
-	}
    }

    // Ipath
@ -320,32 +312,32 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
    // Handle some headers. 
    string& text = m_metaData[cstr_dj_keycontent];
    Binc::HeaderItem hi;
-    string transcoded;
+    string decoded;
    if (doc->h.getFirstHeader("From", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (preview())
 	    text += string("From: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keyauthor] = transcoded;
+	    m_metaData[cstr_dj_keyauthor] = decoded;
 	}
    }
    if (doc->h.getFirstHeader("To", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (preview())
 	    text += string("To: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keyrecipient] = transcoded;
+	    m_metaData[cstr_dj_keyrecipient] = decoded;
 	}
    }
    if (doc->h.getFirstHeader("Cc", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (preview())
 	    text += string("Cc: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keyrecipient] += " " + transcoded;
+	    m_metaData[cstr_dj_keyrecipient] += " " + decoded;
 	}
    }
    if (doc->h.getFirstHeader("Message-Id", hi)) {
@ -355,31 +347,31 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
 	}
    }
    if (doc->h.getFirstHeader("Date", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (depth == 1) {
-	    time_t t = rfc2822DateToUxTime(transcoded);
+	    time_t t = rfc2822DateToUxTime(decoded);
 	    if (t != (time_t)-1) {
 		char ascuxtime[100];
 		sprintf(ascuxtime, "%ld", (long)t);
 		m_metaData[cstr_dj_keymd] = ascuxtime;
 	    } else {
 		// Leave mtime field alone, ftime will be used instead.
-		LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
+		LOGDEB(("rfc2822Date...: failed: [%s]\n", decoded.c_str()));
 	    }
 	}
 	if (preview())
 	    text += string("Date: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
    }
    if (doc->h.getFirstHeader("Subject", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keytitle] = transcoded;
-	    m_subject = transcoded;
+	    m_metaData[cstr_dj_keytitle] = decoded;
+	    m_subject = decoded;
 	}
 	if (preview())
 	    text += string("Subject: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
    }

    // Check for the presence of configured additional headers and possibly
@ -597,22 +589,23 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
    }

    // We are dealing with an inline part of text/plain or text/html
-    // type There may be several such parts, which is why we don't
-    // just return a text or html subdoc and let the filter stack
-    // work: we want to concatenate them in place instead
+    // type. We can't just return a text or html subdoc and let the
+    // filter stack work: this would create another subdocument, but
+    // we want instead to decode a body part of this message document.

    LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
 	     doc->getBodyStartOffset(), doc->getBodyLength()));
    string body;
    doc->getBody(body, 0, doc->bodylength);
-
+    {
 	string decoded;
 	const string *bdp;
 	if (!decodeBody(cte, body, decoded, &bdp)) {
 	    LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n"));
 	}
 	if (bdp != &body)
-	body = decoded;
+	    body.swap(decoded);
+    }

    // Handle html stripping and transcoding to utf8
    if (!stringlowercmp("text/html", content_type.value)) {
--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@ -15,10 +15,80 @@
 */
 #include "autoconfig.h"

+#include <tr1/unordered_map>
+using std::tr1::unordered_map;
+
 #include "cstr.h"
 #include "transcode.h"
 #include "mimehandler.h"
 #include "debuglog.h"
+#include "smallut.h"
+
+// Flat list of (key, charset name) string pairs: each even index holds
+// a two-letter code, and the odd index which follows holds the name of
+// the 8-bit charset to try for it. alternate_decode() loads the pairs
+// into a map and looks up a key derived from the LC_CTYPE locale name.
+static const char *vcountry_to_code[] = {
+    "fr", "windows-1252",
+    "al", "windows-1252", 
+    "dk", "windows-1252",
+    "en", "windows-1252",
+    "de", "windows-1252",
+    "is", "windows-1252",
+    "my", "windows-1252",
+    "ie", "windows-1252",
+    "gb", "windows-1252",
+    "it", "windows-1252",
+    "lu", "windows-1252",
+    "no", "windows-1252",
+    "pt", "windows-1252",
+    "es", "windows-1252",
+    "se", "windows-1252",
+    "ba", "iso-8859-2",
+    "hr", "iso-8859-2",
+    "cz", "iso-8859-2",
+    "hu", "iso-8859-2",
+    "pl", "iso-8859-2",
+    "rs", "iso-8859-2",
+    "sk", "iso-8859-2",
+    "si", "iso-8859-2",
+    "gr", "iso-8859-7",
+    "il", "iso-8859-8",
+    "tr", "iso-8859-9",
+    "th", "iso-8859-11",
+    "lv", "iso-8859-13",
+    "lt", "iso-8859-13",
+};
+
+
+// Called after decoding from utf-8 failed. Handle the common case
+// where this is a good old 8bit-encoded text document left-over when
+// the locale was switched to utf-8. We try to guess a charset
+// according to the current LC_CTYPE locale name and use it. This is a
+// very rough heuristic, but may be better than discarding the data.
+//
+// The locale name is expected to look like lang_TERRITORY[.codeset][@mod]
+// (e.g. "fr_FR.UTF-8"). The language part is looked up first in
+// vcountry_to_code. If it has no entry, the lowercased territory part
+// is tried: the table holds country codes, and these often differ
+// from the language code ("cs_CZ", "da_DK", "el_GR"...).
+//
+// in: the text which could not be decoded as utf-8.
+// out: receives the text transcoded to utf-8.
+// Returns false if the locale name can't be obtained or has no '_',
+// or if neither part has a table entry. Else returns the result of
+// transcode().
+static bool alternate_decode(const string& in, string& out)
+{
+    static unordered_map<string, string> country_to_code;
+    if (country_to_code.empty()) {
+	for (unsigned int i = 0; 
+	     i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
+	    country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
+	}
+    }
+
+    // setlocale() may return a null pointer, and constructing a string
+    // from a null pointer is undefined behaviour: check before use.
+    const char *cp = setlocale(LC_CTYPE, 0);
+    if (cp == 0)
+	return false;
+    string locale(cp);
+    LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
+    string::size_type under = locale.find_first_of("_");
+    if (under == string::npos)
+	return false;
+
+    unordered_map<string,string>::const_iterator it = 
+	country_to_code.find(locale.substr(0, under));
+    if (it == country_to_code.end()) {
+	// No entry for the language part: retry with the territory,
+	// which ends at the codeset ('.') or modifier ('@') if any.
+	string::size_type tend = locale.find_first_of(".@", under + 1);
+	string territory = (tend == string::npos) ? 
+	    locale.substr(under + 1) :
+	    locale.substr(under + 1, tend - under - 1);
+	// Plain ASCII lowercasing: the result must not depend on the
+	// very locale we are examining.
+	for (string::size_type i = 0; i < territory.size(); i++) {
+	    if (territory[i] >= 'A' && territory[i] <= 'Z')
+		territory[i] += 'a' - 'A';
+	}
+	it = country_to_code.find(territory);
+    }
+    if (it == country_to_code.end())
+	return false;
+    const string& code = it->second;
+
+    LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
+	    code.c_str()));
+    return transcode(in, out, code, cstr_utf8);
+}

 bool RecollFilter::txtdcode(const string& who)
 {
@ -33,17 +103,24 @@ bool RecollFilter::txtdcode(const string& who)
    LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n", 
 	     who.c_str(), itext.size(), ocs.c_str()));
    int ecnt;
-    bool ret;
    string otext;
-    if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) || 
-	ecnt > int(itext.size() / 4)) {
+    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
+    if (!ret || ecnt > int(itext.size() / 100)) {
 	LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
 		"for input charset [%s] ret %d ecnt %d\n", 
 		who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
+
+	if (samecharset(ocs, cstr_utf8)) {
+	    ret = alternate_decode(itext, otext);
+	}
+	if (!ret) {
 	    itext.erase();
 	    return false;
 	}
+    }
+
    itext.swap(otext);
-    m_metaData[cstr_dj_keycharset] = "UTF-8";
+    m_metaData[cstr_dj_keycharset] = cstr_utf8;
    return true;
 }
+
--- a/tests/mail/mail.sh
+++ b/tests/mail/mail.sh
@ -12,6 +12,8 @@ initvariables $0
  recollq '"Dear Corporate Administrator"'
  recollq TestTbirdWithoutEmptyLine
  recollq TestTbirdWithEmptyLine
+  recollq Utf8attachaccentueaccentue
+
 ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
--- a/tests/mail/mail.txt
+++ b/tests/mail/mail.txt
@ -11,3 +11,5 @@ message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/badMail.edi
 message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent]	[Pronote: salut les genies.]	568	bytes	
 1 results
 message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent]	[De mieux en mieux]	557	bytes	
+1 results
+message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/outmail]	[Message avec attachement textplain utf8]	1733	bytes