For plain text files, try alternate decode from 8bit charset when decode from UTF-8 fails

2012-10-06 15:12:49 +02:00 · 2012-10-06 15:12:49 +02:00 · 822848b31c
commit 822848b31c
parent 52804fef6c
6 changed files with 122 additions and 48 deletions
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@ -49,6 +49,7 @@ DEF_CSTR(fileu, "file://");
 DEF_CSTR(fmtime, "fmtime");
 DEF_CSTR(iso_8859_1, "ISO-8859-1");
 DEF_CSTR(utf8, "UTF-8");
 DEF_CSTR(cp1252, "CP1252");
 DEF_CSTR(minwilds, "*?[");
 DEF_CSTR(newline, "\n");
 DEF_CSTR(null, "");
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -344,7 +344,7 @@ list<string> RclConfig::getTopdirs()
 //  If defcharset was set (from the config or a previous call, this
 //   is done in setKeydir), use it.
 //  Else, try to guess it from the locale
-//  Use iso8859-1 as ultimate default
+//  Use cp1252 (as a superset of iso8859-1) as ultimate default
 //
 // For filenames, same thing except that we do not use the config file value
 // (only the locale).
@ -372,9 +372,8 @@ const string& RclConfig::getDefCharset(bool filename)
 	    ) {
 	    localecharset = string(cp);
 	} else {
-	    // Note: it seems that all versions of iconv will take
+	    // Use cp1252 instead of iso-8859-1, it's a superset.
-	    // iso-8859. Some won't take iso8859
+	    localecharset = string(cstr_cp1252);
 	    localecharset = string(cstr_iso_8859_1);
 	}
 	LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n",
 		localecharset.c_str()));
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -279,16 +279,8 @@ bool MimeHandlerMail::processAttach()
    // with this but it expects text/plain to be utf-8 already, so we
    // handle the transcoding if needed
    if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
-	string utf8;
+	if (!txtdcode("MimeHandlerMail::processAttach"))
 	if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
 	    LOGERR(("  processAttach: transcode to utf-8 failed for charset "
 		    "[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
 	    // can't transcode at all -> data is garbage just erase it
 	    body.clear();
 	} else {
 	    m_metaData[cstr_dj_keycharset] = cstr_utf8;
 	    body.swap(utf8);
 	}
    }
    // Ipath
@ -320,32 +312,32 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
    // Handle some headers. 
    string& text = m_metaData[cstr_dj_keycontent];
    Binc::HeaderItem hi;
-    string transcoded;
+    string decoded;
    if (doc->h.getFirstHeader("From", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (preview())
 	    text += string("From: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keyauthor] = transcoded;
+	    m_metaData[cstr_dj_keyauthor] = decoded;
 	}
    }
    if (doc->h.getFirstHeader("To", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (preview())
 	    text += string("To: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keyrecipient] = transcoded;
+	    m_metaData[cstr_dj_keyrecipient] = decoded;
 	}
    }
    if (doc->h.getFirstHeader("Cc", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (preview())
 	    text += string("Cc: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keyrecipient] += " " + transcoded;
+	    m_metaData[cstr_dj_keyrecipient] += " " + decoded;
 	}
    }
    if (doc->h.getFirstHeader("Message-Id", hi)) {
@ -355,31 +347,31 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
 	}
    }
    if (doc->h.getFirstHeader("Date", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (depth == 1) {
-	    time_t t = rfc2822DateToUxTime(transcoded);
+	    time_t t = rfc2822DateToUxTime(decoded);
 	    if (t != (time_t)-1) {
 		char ascuxtime[100];
 		sprintf(ascuxtime, "%ld", (long)t);
 		m_metaData[cstr_dj_keymd] = ascuxtime;
 	    } else {
 		// Leave mtime field alone, ftime will be used instead.
-		LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
+		LOGDEB(("rfc2822Date...: failed: [%s]\n", decoded.c_str()));
 	    }
 	}
 	if (preview())
 	    text += string("Date: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
    }
    if (doc->h.getFirstHeader("Subject", hi)) {
-	rfc2047_decode(hi.getValue(), transcoded);
+	rfc2047_decode(hi.getValue(), decoded);
 	if (depth == 1) {
-	    m_metaData[cstr_dj_keytitle] = transcoded;
+	    m_metaData[cstr_dj_keytitle] = decoded;
-	    m_subject = transcoded;
+	    m_subject = decoded;
 	}
 	if (preview())
 	    text += string("Subject: ");
-	text += transcoded + cstr_newline;
+	text += decoded + cstr_newline;
    }
    // Check for the presence of configured additional headers and possibly
@ -597,22 +589,23 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
    }
    // We are dealing with an inline part of text/plain or text/html
-    // type There may be several such parts, which is why we don't
+    // type. We can't just return a text or html subdoc and let the
-    // just return a text or html subdoc and let the filter stack
+    // filter stack work: this would create another subdocument, but
-    // work: we want to concatenate them in place instead
+    // we want instead to decode a body part of this message document.
    LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
 	     doc->getBodyStartOffset(), doc->getBodyLength()));
    string body;
    doc->getBody(body, 0, doc->bodylength);
-
+    {
 	string decoded;
 	const string *bdp;
 	if (!decodeBody(cte, body, decoded, &bdp)) {
 	    LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n"));
 	}
 	if (bdp != &body)
-	body = decoded;
+	    body.swap(decoded);
    }
    // Handle html stripping and transcoding to utf8
    if (!stringlowercmp("text/html", content_type.value)) {
--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@ -15,10 +15,80 @@
 */
 #include "autoconfig.h"
 #include <tr1/unordered_map>
 using std::tr1::unordered_map;
 #include "cstr.h"
 #include "transcode.h"
 #include "mimehandler.h"
 #include "debuglog.h"
 #include "smallut.h"
 // Flattened list of (locale-name key, legacy 8-bit charset) pairs: each even
 // slot holds a two-letter key and the following odd slot holds the charset
 // name to try for it. alternate_decode() loads the pairs into a map, so the
 // order of the entries is not significant.
 // NOTE(review): several keys ("dk", "gb", "se", "cz", "gr", "il", ...) look
 // like country codes rather than language codes -- confirm that this is what
 // the lookup side expects.
 static const char *vcountry_to_code[] = {
    // windows-1252 (superset of iso-8859-1)
    "al", "windows-1252",
    "de", "windows-1252",
    "dk", "windows-1252",
    "en", "windows-1252",
    "es", "windows-1252",
    "fr", "windows-1252",
    "gb", "windows-1252",
    "ie", "windows-1252",
    "is", "windows-1252",
    "it", "windows-1252",
    "lu", "windows-1252",
    "my", "windows-1252",
    "no", "windows-1252",
    "pt", "windows-1252",
    "se", "windows-1252",

    // iso-8859-2 (Latin-2)
    "ba", "iso-8859-2",
    "cz", "iso-8859-2",
    "hr", "iso-8859-2",
    "hu", "iso-8859-2",
    "pl", "iso-8859-2",
    "rs", "iso-8859-2",
    "si", "iso-8859-2",
    "sk", "iso-8859-2",

    // Other single-country iso-8859 parts
    "gr", "iso-8859-7",
    "il", "iso-8859-8",
    "tr", "iso-8859-9",
    "th", "iso-8859-11",

    // iso-8859-13 (Baltic)
    "lt", "iso-8859-13",
    "lv", "iso-8859-13",
 };
 // Called after decoding from utf-8 failed. Handle the common case
 // where this is a good old 8bit-encoded text document left-over when
 // the locale was switched to utf-8. We try to guess a charset
 // according to the locale language and use it. This is a very rough
 // heuristic, but may be better than discarding the data.
 static bool alternate_decode(const string& in, string& out)
 {
    static unordered_map<string, string> country_to_code;
    if (country_to_code.empty()) {
 	for (unsigned int i = 0; 
 	     i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
 	    country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
 	}
    }
    string locale = setlocale(LC_CTYPE, 0);
    LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
    string::size_type under = locale.find_first_of("_");
    if (under == string::npos)
 	return false;
    string country = locale.substr(0, under);
    unordered_map<string,string>::const_iterator it = 
 	country_to_code.find(country);
    if (it == country_to_code.end())
 	return false;
    string code = it->second;
    LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
 	    code.c_str()));
    return transcode(in, out, code, cstr_utf8);
 }
 bool RecollFilter::txtdcode(const string& who)
 {
@ -33,17 +103,24 @@ bool RecollFilter::txtdcode(const string& who)
    LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n", 
 	     who.c_str(), itext.size(), ocs.c_str()));
    int ecnt;
    bool ret;
    string otext;
-    if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) || 
+    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
-	ecnt > int(itext.size() / 4)) {
+    if (!ret || ecnt > int(itext.size() / 100)) {
 	LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
 		"for input charset [%s] ret %d ecnt %d\n", 
 		who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
 	if (samecharset(ocs, cstr_utf8)) {
 	    ret = alternate_decode(itext, otext);
 	}
 	if (!ret) {
 	    itext.erase();
 	    return false;
 	}
    }
    itext.swap(otext);
-    m_metaData[cstr_dj_keycharset] = "UTF-8";
+    m_metaData[cstr_dj_keycharset] = cstr_utf8;
    return true;
 }
--- a/tests/mail/mail.sh
+++ b/tests/mail/mail.sh
@ -12,6 +12,8 @@ initvariables $0
  recollq '"Dear Corporate Administrator"'
  recollq TestTbirdWithoutEmptyLine
  recollq TestTbirdWithEmptyLine
  recollq Utf8attachaccentueaccentue
 ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
--- a/tests/mail/mail.txt
+++ b/tests/mail/mail.txt
@ -11,3 +11,5 @@ message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/badMail.edi
 message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent]	[Pronote: salut les genies.]	568	bytes	
 1 results
 message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent]	[De mieux en mieux]	557	bytes	
 1 results
 message/rfc822	[file:///home/dockes/projets/fulltext/testrecoll/mail/outmail]	[Message avec attachement textplain utf8]	1733	bytes