For plain text files, try alternate decode from 8bit charset when decode from UTF-8 fails

This commit is contained in:
Jean-Francois Dockes 2012-10-06 15:12:49 +02:00
parent 52804fef6c
commit 822848b31c
6 changed files with 122 additions and 48 deletions

View file

@ -49,6 +49,7 @@ DEF_CSTR(fileu, "file://");
DEF_CSTR(fmtime, "fmtime"); DEF_CSTR(fmtime, "fmtime");
DEF_CSTR(iso_8859_1, "ISO-8859-1"); DEF_CSTR(iso_8859_1, "ISO-8859-1");
DEF_CSTR(utf8, "UTF-8"); DEF_CSTR(utf8, "UTF-8");
DEF_CSTR(cp1252, "CP1252");
DEF_CSTR(minwilds, "*?["); DEF_CSTR(minwilds, "*?[");
DEF_CSTR(newline, "\n"); DEF_CSTR(newline, "\n");
DEF_CSTR(null, ""); DEF_CSTR(null, "");

View file

@ -344,7 +344,7 @@ list<string> RclConfig::getTopdirs()
// If defcharset was set (from the config or a previous call, this // If defcharset was set (from the config or a previous call, this
// is done in setKeydir), use it. // is done in setKeydir), use it.
// Else, try to guess it from the locale // Else, try to guess it from the locale
// Use iso8859-1 as ultimate default // Use cp1252 (as a superset of iso8859-1) as ultimate default
// //
// For filenames, same thing except that we do not use the config file value // For filenames, same thing except that we do not use the config file value
// (only the locale). // (only the locale).
@ -372,9 +372,8 @@ const string& RclConfig::getDefCharset(bool filename)
) { ) {
localecharset = string(cp); localecharset = string(cp);
} else { } else {
// Note: it seems that all versions of iconv will take // Use cp1252 instead of iso-8859-1, it's a superset.
// iso-8859. Some won't take iso8859 localecharset = string(cstr_cp1252);
localecharset = string(cstr_iso_8859_1);
} }
LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n", LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n",
localecharset.c_str())); localecharset.c_str()));

View file

@ -279,16 +279,8 @@ bool MimeHandlerMail::processAttach()
// with this but it expects text/plain to be utf-8 already, so we // with this but it expects text/plain to be utf-8 already, so we
// handle the transcoding if needed // handle the transcoding if needed
if (m_metaData[cstr_dj_keymt] == cstr_textplain) { if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
string utf8; if (!txtdcode("MimeHandlerMail::processAttach"))
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
LOGERR((" processAttach: transcode to utf-8 failed for charset "
"[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
// can't transcode at all -> data is garbage just erase it
body.clear(); body.clear();
} else {
m_metaData[cstr_dj_keycharset] = cstr_utf8;
body.swap(utf8);
}
} }
// Ipath // Ipath
@ -320,32 +312,32 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
// Handle some headers. // Handle some headers.
string& text = m_metaData[cstr_dj_keycontent]; string& text = m_metaData[cstr_dj_keycontent];
Binc::HeaderItem hi; Binc::HeaderItem hi;
string transcoded; string decoded;
if (doc->h.getFirstHeader("From", hi)) { if (doc->h.getFirstHeader("From", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), decoded);
if (preview()) if (preview())
text += string("From: "); text += string("From: ");
text += transcoded + cstr_newline; text += decoded + cstr_newline;
if (depth == 1) { if (depth == 1) {
m_metaData[cstr_dj_keyauthor] = transcoded; m_metaData[cstr_dj_keyauthor] = decoded;
} }
} }
if (doc->h.getFirstHeader("To", hi)) { if (doc->h.getFirstHeader("To", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), decoded);
if (preview()) if (preview())
text += string("To: "); text += string("To: ");
text += transcoded + cstr_newline; text += decoded + cstr_newline;
if (depth == 1) { if (depth == 1) {
m_metaData[cstr_dj_keyrecipient] = transcoded; m_metaData[cstr_dj_keyrecipient] = decoded;
} }
} }
if (doc->h.getFirstHeader("Cc", hi)) { if (doc->h.getFirstHeader("Cc", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), decoded);
if (preview()) if (preview())
text += string("Cc: "); text += string("Cc: ");
text += transcoded + cstr_newline; text += decoded + cstr_newline;
if (depth == 1) { if (depth == 1) {
m_metaData[cstr_dj_keyrecipient] += " " + transcoded; m_metaData[cstr_dj_keyrecipient] += " " + decoded;
} }
} }
if (doc->h.getFirstHeader("Message-Id", hi)) { if (doc->h.getFirstHeader("Message-Id", hi)) {
@ -355,31 +347,31 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
} }
} }
if (doc->h.getFirstHeader("Date", hi)) { if (doc->h.getFirstHeader("Date", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), decoded);
if (depth == 1) { if (depth == 1) {
time_t t = rfc2822DateToUxTime(transcoded); time_t t = rfc2822DateToUxTime(decoded);
if (t != (time_t)-1) { if (t != (time_t)-1) {
char ascuxtime[100]; char ascuxtime[100];
sprintf(ascuxtime, "%ld", (long)t); sprintf(ascuxtime, "%ld", (long)t);
m_metaData[cstr_dj_keymd] = ascuxtime; m_metaData[cstr_dj_keymd] = ascuxtime;
} else { } else {
// Leave mtime field alone, ftime will be used instead. // Leave mtime field alone, ftime will be used instead.
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str())); LOGDEB(("rfc2822Date...: failed: [%s]\n", decoded.c_str()));
} }
} }
if (preview()) if (preview())
text += string("Date: "); text += string("Date: ");
text += transcoded + cstr_newline; text += decoded + cstr_newline;
} }
if (doc->h.getFirstHeader("Subject", hi)) { if (doc->h.getFirstHeader("Subject", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), decoded);
if (depth == 1) { if (depth == 1) {
m_metaData[cstr_dj_keytitle] = transcoded; m_metaData[cstr_dj_keytitle] = decoded;
m_subject = transcoded; m_subject = decoded;
} }
if (preview()) if (preview())
text += string("Subject: "); text += string("Subject: ");
text += transcoded + cstr_newline; text += decoded + cstr_newline;
} }
// Check for the presence of configured additional headers and possibly // Check for the presence of configured additional headers and possibly
@ -597,22 +589,23 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
} }
// We are dealing with an inline part of text/plain or text/html // We are dealing with an inline part of text/plain or text/html
// type There may be several such parts, which is why we don't // type. We can't just return a text or html subdoc and let the
// just return a text or html subdoc and let the filter stack // filter stack work: this would create another subdocument, but
// work: we want to concatenate them in place instead // we want instead to decode a body part of this message document.
LOGDEB2(("walkmime: final: body start offset %d, length %d\n", LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
doc->getBodyStartOffset(), doc->getBodyLength())); doc->getBodyStartOffset(), doc->getBodyLength()));
string body; string body;
doc->getBody(body, 0, doc->bodylength); doc->getBody(body, 0, doc->bodylength);
{
string decoded; string decoded;
const string *bdp; const string *bdp;
if (!decodeBody(cte, body, decoded, &bdp)) { if (!decodeBody(cte, body, decoded, &bdp)) {
LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n")); LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n"));
} }
if (bdp != &body) if (bdp != &body)
body = decoded; body.swap(decoded);
}
// Handle html stripping and transcoding to utf8 // Handle html stripping and transcoding to utf8
if (!stringlowercmp("text/html", content_type.value)) { if (!stringlowercmp("text/html", content_type.value)) {

View file

@ -15,10 +15,80 @@
*/ */
#include "autoconfig.h" #include "autoconfig.h"
#include <tr1/unordered_map>
using std::tr1::unordered_map;
#include "cstr.h" #include "cstr.h"
#include "transcode.h" #include "transcode.h"
#include "mimehandler.h" #include "mimehandler.h"
#include "debuglog.h" #include "debuglog.h"
#include "smallut.h"
// Two-letter locale code -> name of the 8-bit charset to try for it.
// Stored as a flat list of pairs: each even index holds the code and
// the following odd index holds the matching charset name.
static const char *vcountry_to_code[] = {
    // windows-1252
    "fr", "windows-1252", "al", "windows-1252", "dk", "windows-1252",
    "en", "windows-1252", "de", "windows-1252", "is", "windows-1252",
    "my", "windows-1252", "ie", "windows-1252", "gb", "windows-1252",
    "it", "windows-1252", "lu", "windows-1252", "no", "windows-1252",
    "pt", "windows-1252", "es", "windows-1252", "se", "windows-1252",
    // iso-8859-2
    "ba", "iso-8859-2", "hr", "iso-8859-2", "cz", "iso-8859-2",
    "hu", "iso-8859-2", "pl", "iso-8859-2", "rs", "iso-8859-2",
    "sk", "iso-8859-2", "si", "iso-8859-2",
    // One code each
    "gr", "iso-8859-7",
    "il", "iso-8859-8",
    "tr", "iso-8859-9",
    "th", "iso-8859-11",
    // iso-8859-13
    "lv", "iso-8859-13", "lt", "iso-8859-13",
};
// Called after decoding from utf-8 failed. Handle the common case
// where this is a good old 8bit-encoded text document left-over when
// the locale was switched to utf-8. We try to guess a charset
// according to the locale language and use it. This is a very rough
// heuristic, but may be better than discarding the data.
static bool alternate_decode(const string& in, string& out)
{
static unordered_map<string, string> country_to_code;
if (country_to_code.empty()) {
for (unsigned int i = 0;
i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
}
}
string locale = setlocale(LC_CTYPE, 0);
LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
string::size_type under = locale.find_first_of("_");
if (under == string::npos)
return false;
string country = locale.substr(0, under);
unordered_map<string,string>::const_iterator it =
country_to_code.find(country);
if (it == country_to_code.end())
return false;
string code = it->second;
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
code.c_str()));
return transcode(in, out, code, cstr_utf8);
}
bool RecollFilter::txtdcode(const string& who) bool RecollFilter::txtdcode(const string& who)
{ {
@ -33,17 +103,24 @@ bool RecollFilter::txtdcode(const string& who)
LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n", LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
who.c_str(), itext.size(), ocs.c_str())); who.c_str(), itext.size(), ocs.c_str()));
int ecnt; int ecnt;
bool ret;
string otext; string otext;
if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) || bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
ecnt > int(itext.size() / 4)) { if (!ret || ecnt > int(itext.size() / 100)) {
LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed " LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
"for input charset [%s] ret %d ecnt %d\n", "for input charset [%s] ret %d ecnt %d\n",
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt)); who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
if (samecharset(ocs, cstr_utf8)) {
ret = alternate_decode(itext, otext);
}
if (!ret) {
itext.erase(); itext.erase();
return false; return false;
} }
}
itext.swap(otext); itext.swap(otext);
m_metaData[cstr_dj_keycharset] = "UTF-8"; m_metaData[cstr_dj_keycharset] = cstr_utf8;
return true; return true;
} }

View file

@ -12,6 +12,8 @@ initvariables $0
recollq '"Dear Corporate Administrator"' recollq '"Dear Corporate Administrator"'
recollq TestTbirdWithoutEmptyLine recollq TestTbirdWithoutEmptyLine
recollq TestTbirdWithEmptyLine recollq TestTbirdWithEmptyLine
recollq Utf8attachaccentueaccentue
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

View file

@ -11,3 +11,5 @@ message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/badMail.edi
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [Pronote: salut les genies.] 568 bytes message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [Pronote: salut les genies.] 568 bytes
1 results 1 results
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [De mieux en mieux] 557 bytes message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [De mieux en mieux] 557 bytes
1 results
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/outmail] [Message avec attachement textplain utf8] 1733 bytes