diff --git a/src/common/cstr.h b/src/common/cstr.h index 0b1c5cf6..b2d18ede 100644 --- a/src/common/cstr.h +++ b/src/common/cstr.h @@ -49,6 +49,7 @@ DEF_CSTR(fileu, "file://"); DEF_CSTR(fmtime, "fmtime"); DEF_CSTR(iso_8859_1, "ISO-8859-1"); DEF_CSTR(utf8, "UTF-8"); +DEF_CSTR(cp1252, "CP1252"); DEF_CSTR(minwilds, "*?["); DEF_CSTR(newline, "\n"); DEF_CSTR(null, ""); diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 1dd62eb5..a0e3eff5 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -344,7 +344,7 @@ list RclConfig::getTopdirs() // If defcharset was set (from the config or a previous call, this // is done in setKeydir), use it. // Else, try to guess it from the locale -// Use iso8859-1 as ultimate default +// Use cp1252 (as a superset of iso8859-1) as ultimate default // // For filenames, same thing except that we do not use the config file value // (only the locale). @@ -372,9 +372,8 @@ const string& RclConfig::getDefCharset(bool filename) ) { localecharset = string(cp); } else { - // Note: it seems that all versions of iconv will take - // iso-8859. Some won't take iso8859 - localecharset = string(cstr_iso_8859_1); + // Use cp1252 instead of iso-8859-1, it's a superset. + localecharset = string(cstr_cp1252); } LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n", localecharset.c_str())); diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 7e177c3a..a98f8732 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -279,16 +279,8 @@ bool MimeHandlerMail::processAttach() // with this but it expects text/plain to be utf-8 already, so we // handle the transcoding if needed if (m_metaData[cstr_dj_keymt] == cstr_textplain) { - string utf8; - if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) { - LOGERR((" processAttach: transcode to utf-8 failed for charset " - "[%s]\n", m_metaData[cstr_dj_keycharset].c_str())); - // can't transcode at all -> data is garbage just erase it - body.clear(); - } else { - m_metaData[cstr_dj_keycharset] = cstr_utf8; - body.swap(utf8); - } + if (!txtdcode("MimeHandlerMail::processAttach")) + body.clear(); } // Ipath @@ -320,32 +312,32 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) // Handle some headers. string& text = m_metaData[cstr_dj_keycontent]; Binc::HeaderItem hi; - string transcoded; + string decoded; if (doc->h.getFirstHeader("From", hi)) { - rfc2047_decode(hi.getValue(), transcoded); + rfc2047_decode(hi.getValue(), decoded); if (preview()) text += string("From: "); - text += transcoded + cstr_newline; + text += decoded + cstr_newline; if (depth == 1) { - m_metaData[cstr_dj_keyauthor] = transcoded; + m_metaData[cstr_dj_keyauthor] = decoded; } } if (doc->h.getFirstHeader("To", hi)) { - rfc2047_decode(hi.getValue(), transcoded); + rfc2047_decode(hi.getValue(), decoded); if (preview()) text += string("To: "); - text += transcoded + cstr_newline; + text += decoded + cstr_newline; if (depth == 1) { - m_metaData[cstr_dj_keyrecipient] = transcoded; + m_metaData[cstr_dj_keyrecipient] = decoded; } } if (doc->h.getFirstHeader("Cc", hi)) { - rfc2047_decode(hi.getValue(), transcoded); + rfc2047_decode(hi.getValue(), decoded); if (preview()) text += string("Cc: "); - text += transcoded + cstr_newline; + text += decoded + cstr_newline; if (depth == 1) { - m_metaData[cstr_dj_keyrecipient] += " " + transcoded; + m_metaData[cstr_dj_keyrecipient] += " " + decoded; } } if (doc->h.getFirstHeader("Message-Id", hi)) { @@ -355,31 +347,31 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) } } if (doc->h.getFirstHeader("Date", hi)) { - rfc2047_decode(hi.getValue(), transcoded); + rfc2047_decode(hi.getValue(), decoded); if (depth == 1) { - time_t t = rfc2822DateToUxTime(transcoded); + time_t t = rfc2822DateToUxTime(decoded); if (t != (time_t)-1) { char ascuxtime[100]; sprintf(ascuxtime, "%ld", (long)t); m_metaData[cstr_dj_keymd] = ascuxtime; } else { // Leave mtime field alone, ftime will be used instead. - LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str())); + LOGDEB(("rfc2822Date...: failed: [%s]\n", decoded.c_str())); } } if (preview()) text += string("Date: "); - text += transcoded + cstr_newline; + text += decoded + cstr_newline; } if (doc->h.getFirstHeader("Subject", hi)) { - rfc2047_decode(hi.getValue(), transcoded); + rfc2047_decode(hi.getValue(), decoded); if (depth == 1) { - m_metaData[cstr_dj_keytitle] = transcoded; - m_subject = transcoded; + m_metaData[cstr_dj_keytitle] = decoded; + m_subject = decoded; } if (preview()) text += string("Subject: "); - text += transcoded + cstr_newline; + text += decoded + cstr_newline; } // Check for the presence of configured additional headers and possibly @@ -597,22 +589,23 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) } // We are dealing with an inline part of text/plain or text/html - // type There may be several such parts, which is why we don't - // just return a text or html subdoc and let the filter stack - // work: we want to concatenate them in place instead + // type. We can't just return a text or html subdoc and let the + // filter stack work: this would create another subdocument, but + // we want instead to decode a body part of this message document. LOGDEB2(("walkmime: final: body start offset %d, length %d\n", doc->getBodyStartOffset(), doc->getBodyLength())); string body; doc->getBody(body, 0, doc->bodylength); - - string decoded; - const string *bdp; - if (!decodeBody(cte, body, decoded, &bdp)) { - LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n")); + { + string decoded; + const string *bdp; + if (!decodeBody(cte, body, decoded, &bdp)) { + LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n")); + } + if (bdp != &body) + body.swap(decoded); } - if (bdp != &body) - body = decoded; // Handle html stripping and transcoding to utf8 if (!stringlowercmp("text/html", content_type.value)) { diff --git a/src/internfile/txtdcode.cpp b/src/internfile/txtdcode.cpp index 0d933958..51a98223 100644 --- a/src/internfile/txtdcode.cpp +++ b/src/internfile/txtdcode.cpp @@ -15,10 +15,80 @@ */ #include "autoconfig.h" +#include +using std::tr1::unordered_map; + #include "cstr.h" #include "transcode.h" #include "mimehandler.h" #include "debuglog.h" +#include "smallut.h" + +static const char *vcountry_to_code[] = { + "fr", "windows-1252", + "al", "windows-1252", + "dk", "windows-1252", + "en", "windows-1252", + "de", "windows-1252", + "is", "windows-1252", + "my", "windows-1252", + "ie", "windows-1252", + "gb", "windows-1252", + "it", "windows-1252", + "lu", "windows-1252", + "no", "windows-1252", + "pt", "windows-1252", + "es", "windows-1252", + "se", "windows-1252", + "ba", "iso-8859-2", + "hr", "iso-8859-2", + "cz", "iso-8859-2", + "hu", "iso-8859-2", + "pl", "iso-8859-2", + "rs", "iso-8859-2", + "sk", "iso-8859-2", + "si", "iso-8859-2", + "gr", "iso-8859-7", + "il", "iso-8859-8", + "tr", "iso-8859-9", + "th", "iso-8859-11", + "lv", "iso-8859-13", + "lt", "iso-8859-13", +}; + + +// Called after decoding from utf-8 failed. Handle the common case +// where this is a good old 8bit-encoded text document left-over when +// the locale was switched to utf-8. We try to guess a charset +// according to the locale language and use it. This is a very rough +// heuristic, but may be better than discarding the data. +static bool alternate_decode(const string& in, string& out) +{ + static unordered_map country_to_code; + if (country_to_code.empty()) { + for (unsigned int i = 0; + i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) { + country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1]; + } + } + + string locale = setlocale(LC_CTYPE, 0); + LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str())); + string::size_type under = locale.find_first_of("_"); + if (under == string::npos) + return false; + string country = locale.substr(0, under); + + unordered_map::const_iterator it = + country_to_code.find(country); + if (it == country_to_code.end()) + return false; + string code = it->second; + + LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n", + code.c_str())); + return transcode(in, out, code, cstr_utf8); +} bool RecollFilter::txtdcode(const string& who) { @@ -33,17 +103,24 @@ bool RecollFilter::txtdcode(const string& who) LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n", who.c_str(), itext.size(), ocs.c_str())); int ecnt; - bool ret; string otext; - if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) || - ecnt > int(itext.size() / 4)) { + bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt); + if (!ret || ecnt > int(itext.size() / 100)) { LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed " "for input charset [%s] ret %d ecnt %d\n", who.c_str(), itext.size(), ocs.c_str(), ret, ecnt)); - itext.erase(); - return false; + + if (samecharset(ocs, cstr_utf8)) { + ret = alternate_decode(itext, otext); + } + if (!ret) { + itext.erase(); + return false; + } } + itext.swap(otext); - m_metaData[cstr_dj_keycharset] = "UTF-8"; + m_metaData[cstr_dj_keycharset] = cstr_utf8; return true; } + diff --git a/tests/mail/mail.sh b/tests/mail/mail.sh index 71058ce3..7588c503 100755 --- a/tests/mail/mail.sh +++ b/tests/mail/mail.sh @@ -12,6 +12,8 @@ initvariables $0 recollq '"Dear Corporate Administrator"' recollq TestTbirdWithoutEmptyLine recollq TestTbirdWithEmptyLine + recollq Utf8attachaccentueaccentue + ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff --git a/tests/mail/mail.txt b/tests/mail/mail.txt index 66b474c8..cdf0cb86 100644 --- a/tests/mail/mail.txt +++ b/tests/mail/mail.txt @@ -11,3 +11,5 @@ message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/badMail.edi message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [Pronote: salut les genies.] 568 bytes 1 results message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [De mieux en mieux] 557 bytes +1 results +message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/outmail] [Message avec attachement textplain utf8] 1733 bytes