For plain text files, try alternate decode from 8bit charset when decode from UTF-8 fails
This commit is contained in:
parent
52804fef6c
commit
822848b31c
6 changed files with 122 additions and 48 deletions
|
@ -49,6 +49,7 @@ DEF_CSTR(fileu, "file://");
|
|||
DEF_CSTR(fmtime, "fmtime");
|
||||
DEF_CSTR(iso_8859_1, "ISO-8859-1");
|
||||
DEF_CSTR(utf8, "UTF-8");
|
||||
DEF_CSTR(cp1252, "CP1252");
|
||||
DEF_CSTR(minwilds, "*?[");
|
||||
DEF_CSTR(newline, "\n");
|
||||
DEF_CSTR(null, "");
|
||||
|
|
|
@ -344,7 +344,7 @@ list<string> RclConfig::getTopdirs()
|
|||
// If defcharset was set (from the config or a previous call, this
|
||||
// is done in setKeydir), use it.
|
||||
// Else, try to guess it from the locale
|
||||
// Use iso8859-1 as ultimate default
|
||||
// Use cp1252 (as a superset of iso8859-1) as ultimate default
|
||||
//
|
||||
// For filenames, same thing except that we do not use the config file value
|
||||
// (only the locale).
|
||||
|
@ -372,9 +372,8 @@ const string& RclConfig::getDefCharset(bool filename)
|
|||
) {
|
||||
localecharset = string(cp);
|
||||
} else {
|
||||
// Note: it seems that all versions of iconv will take
|
||||
// iso-8859. Some won't take iso8859
|
||||
localecharset = string(cstr_iso_8859_1);
|
||||
// Use cp1252 instead of iso-8859-1, it's a superset.
|
||||
localecharset = string(cstr_cp1252);
|
||||
}
|
||||
LOGDEB1(("RclConfig::getDefCharset: localecharset [%s]\n",
|
||||
localecharset.c_str()));
|
||||
|
|
|
@ -279,16 +279,8 @@ bool MimeHandlerMail::processAttach()
|
|||
// with this but it expects text/plain to be utf-8 already, so we
|
||||
// handle the transcoding if needed
|
||||
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
||||
string utf8;
|
||||
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
|
||||
LOGERR((" processAttach: transcode to utf-8 failed for charset "
|
||||
"[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
|
||||
// can't transcode at all -> data is garbage just erase it
|
||||
if (!txtdcode("MimeHandlerMail::processAttach"))
|
||||
body.clear();
|
||||
} else {
|
||||
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||
body.swap(utf8);
|
||||
}
|
||||
}
|
||||
|
||||
// Ipath
|
||||
|
@ -320,32 +312,32 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
|||
// Handle some headers.
|
||||
string& text = m_metaData[cstr_dj_keycontent];
|
||||
Binc::HeaderItem hi;
|
||||
string transcoded;
|
||||
string decoded;
|
||||
if (doc->h.getFirstHeader("From", hi)) {
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
rfc2047_decode(hi.getValue(), decoded);
|
||||
if (preview())
|
||||
text += string("From: ");
|
||||
text += transcoded + cstr_newline;
|
||||
text += decoded + cstr_newline;
|
||||
if (depth == 1) {
|
||||
m_metaData[cstr_dj_keyauthor] = transcoded;
|
||||
m_metaData[cstr_dj_keyauthor] = decoded;
|
||||
}
|
||||
}
|
||||
if (doc->h.getFirstHeader("To", hi)) {
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
rfc2047_decode(hi.getValue(), decoded);
|
||||
if (preview())
|
||||
text += string("To: ");
|
||||
text += transcoded + cstr_newline;
|
||||
text += decoded + cstr_newline;
|
||||
if (depth == 1) {
|
||||
m_metaData[cstr_dj_keyrecipient] = transcoded;
|
||||
m_metaData[cstr_dj_keyrecipient] = decoded;
|
||||
}
|
||||
}
|
||||
if (doc->h.getFirstHeader("Cc", hi)) {
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
rfc2047_decode(hi.getValue(), decoded);
|
||||
if (preview())
|
||||
text += string("Cc: ");
|
||||
text += transcoded + cstr_newline;
|
||||
text += decoded + cstr_newline;
|
||||
if (depth == 1) {
|
||||
m_metaData[cstr_dj_keyrecipient] += " " + transcoded;
|
||||
m_metaData[cstr_dj_keyrecipient] += " " + decoded;
|
||||
}
|
||||
}
|
||||
if (doc->h.getFirstHeader("Message-Id", hi)) {
|
||||
|
@ -355,31 +347,31 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
|||
}
|
||||
}
|
||||
if (doc->h.getFirstHeader("Date", hi)) {
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
rfc2047_decode(hi.getValue(), decoded);
|
||||
if (depth == 1) {
|
||||
time_t t = rfc2822DateToUxTime(transcoded);
|
||||
time_t t = rfc2822DateToUxTime(decoded);
|
||||
if (t != (time_t)-1) {
|
||||
char ascuxtime[100];
|
||||
sprintf(ascuxtime, "%ld", (long)t);
|
||||
m_metaData[cstr_dj_keymd] = ascuxtime;
|
||||
} else {
|
||||
// Leave mtime field alone, ftime will be used instead.
|
||||
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
|
||||
LOGDEB(("rfc2822Date...: failed: [%s]\n", decoded.c_str()));
|
||||
}
|
||||
}
|
||||
if (preview())
|
||||
text += string("Date: ");
|
||||
text += transcoded + cstr_newline;
|
||||
text += decoded + cstr_newline;
|
||||
}
|
||||
if (doc->h.getFirstHeader("Subject", hi)) {
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
rfc2047_decode(hi.getValue(), decoded);
|
||||
if (depth == 1) {
|
||||
m_metaData[cstr_dj_keytitle] = transcoded;
|
||||
m_subject = transcoded;
|
||||
m_metaData[cstr_dj_keytitle] = decoded;
|
||||
m_subject = decoded;
|
||||
}
|
||||
if (preview())
|
||||
text += string("Subject: ");
|
||||
text += transcoded + cstr_newline;
|
||||
text += decoded + cstr_newline;
|
||||
}
|
||||
|
||||
// Check for the presence of configured additional headers and possibly
|
||||
|
@ -597,22 +589,23 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
|||
}
|
||||
|
||||
// We are dealing with an inline part of text/plain or text/html
|
||||
// type There may be several such parts, which is why we don't
|
||||
// just return a text or html subdoc and let the filter stack
|
||||
// work: we want to concatenate them in place instead
|
||||
// type. We can't just return a text or html subdoc and let the
|
||||
// filter stack work: this would create another subdocument, but
|
||||
// we want instead to decode a body part of this message document.
|
||||
|
||||
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
|
||||
doc->getBodyStartOffset(), doc->getBodyLength()));
|
||||
string body;
|
||||
doc->getBody(body, 0, doc->bodylength);
|
||||
|
||||
{
|
||||
string decoded;
|
||||
const string *bdp;
|
||||
if (!decodeBody(cte, body, decoded, &bdp)) {
|
||||
LOGERR(("MimeHandlerMail::walkmime: failed decoding body\n"));
|
||||
}
|
||||
if (bdp != &body)
|
||||
body = decoded;
|
||||
body.swap(decoded);
|
||||
}
|
||||
|
||||
// Handle html stripping and transcoding to utf8
|
||||
if (!stringlowercmp("text/html", content_type.value)) {
|
||||
|
|
|
@ -15,10 +15,80 @@
|
|||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <tr1/unordered_map>
|
||||
using std::tr1::unordered_map;
|
||||
|
||||
#include "cstr.h"
|
||||
#include "transcode.h"
|
||||
#include "mimehandler.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
|
||||
// Flat list of string pairs: a two-letter key followed by the name of
// the 8bit charset to try for that key. alternate_decode() loads the
// pairs into a hash map, looks up the part of the LC_CTYPE locale name
// that precedes the first '_', and passes the matching charset name to
// transcode() as the input encoding.
// The array must keep an even number of entries (key, value, key, value...).
// NOTE(review): the lookup key is the locale prefix before '_', which
// in POSIX locale names is a language code (e.g. "fr" in fr_FR), while
// several keys below ("gb", "dk", "se", "cz", ...) look like country
// codes -- confirm which one is intended.
static const char *vcountry_to_code[] = {
|
||||
"fr", "windows-1252",
|
||||
"al", "windows-1252",
|
||||
"dk", "windows-1252",
|
||||
"en", "windows-1252",
|
||||
"de", "windows-1252",
|
||||
"is", "windows-1252",
|
||||
"my", "windows-1252",
|
||||
"ie", "windows-1252",
|
||||
"gb", "windows-1252",
|
||||
"it", "windows-1252",
|
||||
"lu", "windows-1252",
|
||||
"no", "windows-1252",
|
||||
"pt", "windows-1252",
|
||||
"es", "windows-1252",
|
||||
"se", "windows-1252",
|
||||
"ba", "iso-8859-2",
|
||||
"hr", "iso-8859-2",
|
||||
"cz", "iso-8859-2",
|
||||
"hu", "iso-8859-2",
|
||||
"pl", "iso-8859-2",
|
||||
"rs", "iso-8859-2",
|
||||
"sk", "iso-8859-2",
|
||||
"si", "iso-8859-2",
|
||||
"gr", "iso-8859-7",
|
||||
"il", "iso-8859-8",
|
||||
"tr", "iso-8859-9",
|
||||
"th", "iso-8859-11",
|
||||
"lv", "iso-8859-13",
|
||||
"lt", "iso-8859-13",
|
||||
};
|
||||
|
||||
|
||||
// Called after decoding from utf-8 failed. Handle the common case
|
||||
// where this is a good old 8bit-encoded text document left-over when
|
||||
// the locale was switched to utf-8. We try to guess a charset
|
||||
// according to the locale language and use it. This is a very rough
|
||||
// heuristic, but may be better than discarding the data.
|
||||
static bool alternate_decode(const string& in, string& out)
|
||||
{
|
||||
static unordered_map<string, string> country_to_code;
|
||||
if (country_to_code.empty()) {
|
||||
for (unsigned int i = 0;
|
||||
i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
|
||||
country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
|
||||
}
|
||||
}
|
||||
|
||||
string locale = setlocale(LC_CTYPE, 0);
|
||||
LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
|
||||
string::size_type under = locale.find_first_of("_");
|
||||
if (under == string::npos)
|
||||
return false;
|
||||
string country = locale.substr(0, under);
|
||||
|
||||
unordered_map<string,string>::const_iterator it =
|
||||
country_to_code.find(country);
|
||||
if (it == country_to_code.end())
|
||||
return false;
|
||||
string code = it->second;
|
||||
|
||||
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
||||
code.c_str()));
|
||||
return transcode(in, out, code, cstr_utf8);
|
||||
}
|
||||
|
||||
bool RecollFilter::txtdcode(const string& who)
|
||||
{
|
||||
|
@ -33,17 +103,24 @@ bool RecollFilter::txtdcode(const string& who)
|
|||
LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
|
||||
who.c_str(), itext.size(), ocs.c_str()));
|
||||
int ecnt;
|
||||
bool ret;
|
||||
string otext;
|
||||
if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) ||
|
||||
ecnt > int(itext.size() / 4)) {
|
||||
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
|
||||
if (!ret || ecnt > int(itext.size() / 100)) {
|
||||
LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
|
||||
"for input charset [%s] ret %d ecnt %d\n",
|
||||
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
|
||||
|
||||
if (samecharset(ocs, cstr_utf8)) {
|
||||
ret = alternate_decode(itext, otext);
|
||||
}
|
||||
if (!ret) {
|
||||
itext.erase();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
itext.swap(otext);
|
||||
m_metaData[cstr_dj_keycharset] = "UTF-8";
|
||||
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -12,6 +12,8 @@ initvariables $0
|
|||
recollq '"Dear Corporate Administrator"'
|
||||
recollq TestTbirdWithoutEmptyLine
|
||||
recollq TestTbirdWithEmptyLine
|
||||
recollq Utf8attachaccentueaccentue
|
||||
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
|
|
|
@ -11,3 +11,5 @@ message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/badMail.edi
|
|||
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [Pronote: salut les genies.] 568 bytes
|
||||
1 results
|
||||
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/thunderbird/Sent] [De mieux en mieux] 557 bytes
|
||||
1 results
|
||||
message/rfc822 [file:///home/dockes/projets/fulltext/testrecoll/mail/outmail] [Message avec attachement textplain utf8] 1733 bytes
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue