For plain text files, try alternate decode from 8bit charset when decode from UTF-8 fails
This commit is contained in:
parent
52804fef6c
commit
822848b31c
6 changed files with 122 additions and 48 deletions
|
@ -49,6 +49,7 @@ DEF_CSTR(fileu, "file://");
|
||||||
DEF_CSTR(fmtime, "fmtime");
|
DEF_CSTR(fmtime, "fmtime");
|
||||||
DEF_CSTR(iso_8859_1, "ISO-8859-1");
|
DEF_CSTR(iso_8859_1, "ISO-8859-1");
|
||||||
DEF_CSTR(utf8, "UTF-8");
|
DEF_CSTR(utf8, "UTF-8");
|
||||||
|
DEF_CSTR(cp1252, "CP1252");
|
||||||
DEF_CSTR(minwilds, "*?[");
|
DEF_CSTR(minwilds, "*?[");
|
||||||
DEF_CSTR(newline, "\n");
|
DEF_CSTR(newline, "\n");
|
||||||
DEF_CSTR(null, "");
|
DEF_CSTR(null, "");
|
||||||
|
|
|
@ -344,7 +344,7 @@ list<string> RclConfig::getTopdirs()
|
||||||
// If defcharset was set (from the config or a previous call, this
|
// If defcharset was set (from the config or a previous call, this
|
||||||
// is done in setKeydir), use it.
|
// is done in setKeydir), use it.
|
||||||
// Else, try to guess it from the locale
|
// Else, try to guess it from the locale
|
||||||
// Use iso8859-1 as ultimate default
|
// Use cp1252 (as a superset of iso8859-1) as ultimate default
|
||||||
//
|
//
|
||||||
// For filenames, same thing except that we do not use the config file value
|
// For filenames, same thing except that we do not use the config file value
|
||||||
// (only the locale).
|
// (only the locale).
|
||||||
|
@ -372,9 +372,8 @@ const string& RclConfig::getDefCharset(bool filename)
|
||||||
) {
|
) {
|
||||||
localecharset = string(cp);
|
localecharset = string(cp);
|
||||||
} else {
|
} else {
|
||||||
// Note: it seems that all versions of iconv will take
|
// Use cp1252 instead of iso-8859-1, it's a superset.
|
||||||
// iso-8859. Some won't take iso8859
|
localecharset = string(cstr_cp1252);
|
||||||
localecharset = string(cstr_iso_8859_1);
|
|
||||||
}
|
}
|
||||||
LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n",
|
LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n",
|
||||||
localecharset.c_str()));
|
localecharset.c_str()));
|
||||||
|
|
|
@ -279,16 +279,8 @@ bool MimeHandlerMail::processAttach()
|
||||||
// with this but it expects text/plain to be utf-8 already, so we
|
// with this but it expects text/plain to be utf-8 already, so we
|
||||||
// handle the transcoding if needed
|
// handle the transcoding if needed
|
||||||
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
||||||
string utf8;
|
if (!txtdcode("MimeHandlerMail::processAttach"))
|
||||||
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
|
body.clear();
|
||||||
LOGERR((" processAttach: transcode to utf-8 failed for charset "
|
|
||||||
"[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
|
|
||||||
// can't transcode at all -> data is garbage just erase it
|
|
||||||
body.clear();
|
|
||||||
} else {
|
|
||||||
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
|
||||||
body.swap(utf8);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ipath
|
// Ipath
|
||||||
|
@ -320,32 +312,32 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
||||||
// Handle some headers.
|
// Handle some headers.
|
||||||
string& text = m_metaData[cstr_dj_keycontent];
|
string& text = m_metaData[cstr_dj_keycontent];
|
||||||
Binc::HeaderItem hi;
|
Binc::HeaderItem hi;
|
||||||
string transcoded;
|
string decoded;
|
||||||
if (doc->h.getFirstHeader("From", hi)) {
|
if (doc->h.getFirstHeader("From", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), decoded);
|
||||||
if (preview())
|
if (preview())
|
||||||
text += string("From: ");
|
text += string("From: ");
|
||||||
text += transcoded + cstr_newline;
|
text += decoded + cstr_newline;
|
||||||
if (depth == 1) {
|
if (depth == 1) {
|
||||||
m_metaData[cstr_dj_keyauthor] = transcoded;
|
m_metaData[cstr_dj_keyauthor] = decoded;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (doc->h.getFirstHeader("To", hi)) {
|
if (doc->h.getFirstHeader("To", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), decoded);
|
||||||
if (preview())
|
if (preview())
|
||||||
text += string("To: ");
|
text += string("To: ");
|
||||||
text += transcoded + cstr_newline;
|
text += decoded + cstr_newline;
|
||||||
if (depth == 1) {
|
if (depth == 1) {
|
||||||
m_metaData[cstr_dj_keyrecipient] = transcoded;
|
m_metaData[cstr_dj_keyrecipient] = decoded;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (doc->h.getFirstHeader("Cc", hi)) {
|
if (doc->h.getFirstHeader("Cc", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), decoded);
|
||||||
if (preview())
|
if (preview())
|
||||||
text += string("Cc: ");
|
text += string("Cc: ");
|
||||||
text += transcoded + cstr_newline;
|
text += decoded + cstr_newline;
|
||||||
if (depth == 1) {
|
if (depth == 1) {
|
||||||
m_metaData[cstr_dj_keyrecipient] += " " + transcoded;
|
m_metaData[cstr_dj_keyrecipient] += " " + decoded;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (doc->h.getFirstHeader("Message-Id", hi)) {
|
if (doc->h.getFirstHeader("Message-Id", hi)) {
|
||||||
|
@ -355,31 +347,31 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (doc->h.getFirstHeader("Date", hi)) {
|
if (doc->h.getFirstHeader("Date", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), decoded);
|
||||||
if (depth == 1) {
|
if (depth == 1) {
|
||||||
time_t t = rfc2822DateToUxTime(transcoded);
|
time_t t = rfc2822DateToUxTime(decoded);
|
||||||
if (t != (time_t)-1) {
|
if (t != (time_t)-1) {
|
||||||
char ascuxtime[100];
|
char ascuxtime[100];
|
||||||
sprintf(ascuxtime, "%ld", (long)t);
|
sprintf(ascuxtime, "%ld", (long)t);
|
||||||
m_metaData[cstr_dj_keymd] = ascuxtime;
|
m_metaData[cstr_dj_keymd] = ascuxtime;
|
||||||
} else {
|
} else {
|
||||||
// Leave mtime field alone, ftime will be used instead.
|
// Leave mtime field alone, ftime will be used instead.
|
||||||
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
|
LOGDEB(("rfc2822Date...: failed: [%s]\n", decoded.c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (preview())
|
if (preview())
|
||||||
text += string("Date: ");
|
text += string("Date: ");
|
||||||
text += transcoded + cstr_newline;
|
text += decoded + cstr_newline;
|
||||||
}
|
}
|
||||||
if (doc->h.getFirstHeader("Subject", hi)) {
|
if (doc->h.getFirstHeader("Subject", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), decoded);
|
||||||
if (depth == 1) {
|
if (depth == 1) {
|
||||||
m_metaData[cstr_dj_keytitle] = transcoded;
|
m_metaData[cstr_dj_keytitle] = decoded;
|
||||||
m_subject = transcoded;
|
m_subject = decoded;
|
||||||
}
|
}
|
||||||
if (preview())
|
if (preview())
|
||||||
text += string("Subject: ");
|
text += string("Subject: ");
|
||||||
text += transcoded + cstr_newline;
|
text += decoded + cstr_newline;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for the presence of configured additional headers and possibly
|
// Check for the presence of configured additional headers and possibly
|
||||||
|
@ -597,22 +589,23 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We are dealing with an inline part of text/plain or text/html
|
// We are dealing with an inline part of text/plain or text/html
|
||||||
// type There may be several such parts, which is why we don't
|
// type. We can't just return a text or html subdoc and let the
|
||||||
// just return a text or html subdoc and let the filter stack
|
// filter stack work: this would create another subdocument, but
|
||||||
// work: we want to concatenate them in place instead
|
// we want instead to decode a body part of this message document.
|
||||||
|
|
||||||
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
|
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
|
||||||
doc->getBodyStartOffset(), doc->getBodyLength()));
|
doc->getBodyStartOffset(), doc->getBodyLength()));
|
||||||
string body;
|
string body;
|
||||||
doc->getBody(body, 0, doc->bodylength);
|
doc->getBody(body, 0, doc->bodylength);
|
||||||
|
{
|
||||||
string decoded;
|
string decoded;
|
||||||
const string *bdp;
|
const string *bdp;
|
||||||
if (!decodeBody(cte, body, decoded, &bdp)) {
|
if (!decodeBody(cte, body, decoded, &bdp)) {
|
||||||
LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n"));
|
LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n"));
|
||||||
|
}
|
||||||
|
if (bdp != &body)
|
||||||
|
body.swap(decoded);
|
||||||
}
|
}
|
||||||
if (bdp != &body)
|
|
||||||
body = decoded;
|
|
||||||
|
|
||||||
// Handle html stripping and transcoding to utf8
|
// Handle html stripping and transcoding to utf8
|
||||||
if (!stringlowercmp("text/html", content_type.value)) {
|
if (!stringlowercmp("text/html", content_type.value)) {
|
||||||
|
|
|
@ -15,10 +15,80 @@
|
||||||
*/
|
*/
|
||||||
#include "autoconfig.h"
|
#include "autoconfig.h"
|
||||||
|
|
||||||
|
#include <tr1/unordered_map>
|
||||||
|
using std::tr1::unordered_map;
|
||||||
|
|
||||||
#include "cstr.h"
|
#include "cstr.h"
|
||||||
#include "transcode.h"
|
#include "transcode.h"
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
|
static const char *vcountry_to_code[] = {
|
||||||
|
"fr", "windows-1252",
|
||||||
|
"al", "windows-1252",
|
||||||
|
"dk", "windows-1252",
|
||||||
|
"en", "windows-1252",
|
||||||
|
"de", "windows-1252",
|
||||||
|
"is", "windows-1252",
|
||||||
|
"my", "windows-1252",
|
||||||
|
"ie", "windows-1252",
|
||||||
|
"gb", "windows-1252",
|
||||||
|
"it", "windows-1252",
|
||||||
|
"lu", "windows-1252",
|
||||||
|
"no", "windows-1252",
|
||||||
|
"pt", "windows-1252",
|
||||||
|
"es", "windows-1252",
|
||||||
|
"se", "windows-1252",
|
||||||
|
"ba", "iso-8859-2",
|
||||||
|
"hr", "iso-8859-2",
|
||||||
|
"cz", "iso-8859-2",
|
||||||
|
"hu", "iso-8859-2",
|
||||||
|
"pl", "iso-8859-2",
|
||||||
|
"rs", "iso-8859-2",
|
||||||
|
"sk", "iso-8859-2",
|
||||||
|
"si", "iso-8859-2",
|
||||||
|
"gr", "iso-8859-7",
|
||||||
|
"il", "iso-8859-8",
|
||||||
|
"tr", "iso-8859-9",
|
||||||
|
"th", "iso-8859-11",
|
||||||
|
"lv", "iso-8859-13",
|
||||||
|
"lt", "iso-8859-13",
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Called after decoding from utf-8 failed. Handle the common case
|
||||||
|
// where this is a good old 8bit-encoded text document left-over when
|
||||||
|
// the locale was switched to utf-8. We try to guess a charset
|
||||||
|
// according to the locale language and use it. This is a very rough
|
||||||
|
// heuristic, but may be better than discarding the data.
|
||||||
|
static bool alternate_decode(const string& in, string& out)
|
||||||
|
{
|
||||||
|
static unordered_map<string, string> country_to_code;
|
||||||
|
if (country_to_code.empty()) {
|
||||||
|
for (unsigned int i = 0;
|
||||||
|
i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
|
||||||
|
country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
string locale = setlocale(LC_CTYPE, 0);
|
||||||
|
LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
|
||||||
|
string::size_type under = locale.find_first_of("_");
|
||||||
|
if (under == string::npos)
|
||||||
|
return false;
|
||||||
|
string country = locale.substr(0, under);
|
||||||
|
|
||||||
|
unordered_map<string,string>::const_iterator it =
|
||||||
|
country_to_code.find(country);
|
||||||
|
if (it == country_to_code.end())
|
||||||
|
return false;
|
||||||
|
string code = it->second;
|
||||||
|
|
||||||
|
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
||||||
|
code.c_str()));
|
||||||
|
return transcode(in, out, code, cstr_utf8);
|
||||||
|
}
|
||||||
|
|
||||||
bool RecollFilter::txtdcode(const string& who)
|
bool RecollFilter::txtdcode(const string& who)
|
||||||
{
|
{
|
||||||
|
@ -33,17 +103,24 @@ bool RecollFilter::txtdcode(const string& who)
|
||||||
LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
|
LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
|
||||||
who.c_str(), itext.size(), ocs.c_str()));
|
who.c_str(), itext.size(), ocs.c_str()));
|
||||||
int ecnt;
|
int ecnt;
|
||||||
bool ret;
|
|
||||||
string otext;
|
string otext;
|
||||||
if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) ||
|
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
|
||||||
ecnt > int(itext.size() / 4)) {
|
if (!ret || ecnt > int(itext.size() / 100)) {
|
||||||
LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
|
LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
|
||||||
"for input charset [%s] ret %d ecnt %d\n",
|
"for input charset [%s] ret %d ecnt %d\n",
|
||||||
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
|
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
|
||||||
itext.erase();
|
|
||||||
return false;
|
if (samecharset(ocs, cstr_utf8)) {
|
||||||
|
ret = alternate_decode(itext, otext);
|
||||||
|
}
|
||||||
|
if (!ret) {
|
||||||
|
itext.erase();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
itext.swap(otext);
|
itext.swap(otext);
|
||||||
m_metaData[cstr_dj_keycharset] = "UTF-8";
|
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,8 @@ initvariables $0
|
||||||
recollq '"Dear Corporate Administrator"'
|
recollq '"Dear Corporate Administrator"'
|
||||||
recollq TestTbirdWithoutEmptyLine
|
recollq TestTbirdWithoutEmptyLine
|
||||||
recollq TestTbirdWithEmptyLine
|
recollq TestTbirdWithEmptyLine
|
||||||
|
recollq Utf8attachaccentueaccentue
|
||||||
|
|
||||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|
|
@ -11,3 +11,5 @@ message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/badMail.edi
|
||||||
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [Pronote: salut les genies.] 568 bytes
|
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [Pronote: salut les genies.] 568 bytes
|
||||||
1 results
|
1 results
|
||||||
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [De mieux en mieux] 557 bytes
|
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [De mieux en mieux] 557 bytes
|
||||||
|
1 results
|
||||||
|
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/outmail] [Message avec attachement textplain utf8] 1733 bytes
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue