detect unicode BOM in text files

This commit is contained in:
Jean-Francois Dockes 2016-11-15 18:31:34 +01:00
parent 68d4d36f83
commit 1d303468a5

View file

@ -15,12 +15,14 @@
*/ */
#include "autoconfig.h" #include "autoconfig.h"
#include <sstream>
#include "cstr.h" #include "cstr.h"
#include "transcode.h" #include "transcode.h"
#include "mimehandler.h" #include "mimehandler.h"
#include "log.h" #include "log.h"
#include "smallut.h" #include "smallut.h"
#include "listmem.h"
// Called after decoding from utf-8 failed. Handle the common case // Called after decoding from utf-8 failed. Handle the common case
// where this is a good old 8bit-encoded text document left-over when // where this is a good old 8bit-encoded text document left-over when
@ -29,37 +31,94 @@
// heuristic, but may be better than discarding the data. // heuristic, but may be better than discarding the data.
// If we still get a significant number of decode errors, the doc is // If we still get a significant number of decode errors, the doc is
// quite probably binary, so just fail. // quite probably binary, so just fail.
static bool alternate_decode(const string& in, string& out) // Note that we could very well get a wrong transcoding (e.g. between
// iso-8859 variations), there is no way to detect it.
static bool alternate_decode(const string& in, string& out, const string& ocs)
{ {
string lang = localelang();
string code = langtocode(lang);
LOGDEB("RecollFilter::txtdcode: trying alternate decode from " << (code) << "\n" );
int ecnt; int ecnt;
bool ret = transcode(in, out, code, cstr_utf8, &ecnt); if (samecharset(ocs, cstr_utf8)) {
return ecnt > 5 ? false : ret; string lang = localelang();
string code = langtocode(lang);
LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
code << "\n");
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
return ecnt > 5 ? false : ret;
} else {
// Give a try to utf-8 anyway, as this is self-detecting. This
// handles UTF-8 docs in a non-utf-8 environment. Note that
// this will almost never be called, as most encodings are
// unable to detect errors so that the first try at
// transcoding will have succeeded and alternate_decode() will
// not be called at all.
//
// To avoid this, we would have to attempt an utf-8 decode
// first, but this is a costly proposition as we don't know
// how much data to test, so need to test all (the beginning
// of the text could be ascii even if there are 8-bit chars
// later).
bool ret = transcode(in, out, cstr_utf8, cstr_utf8, &ecnt);
return ecnt > 5 ? false : ret;
}
}
// Look for a Unicode byte order mark at the very beginning of the input.
// @param itext the raw document bytes.
// @return the name of the encoding announced by the BOM ("UTF-8",
//     "UTF-32BE", "UTF-32LE", "UTF-16BE" or "UTF-16LE"), or an empty
//     string if the data does not begin with a BOM.
static string bomtocode(const string& itext)
{
#if 0
    std::ostringstream strm;
    listmem(strm, itext.c_str(), MIN(itext.size(), 8));
    LOGDEB("txtdcode:bomtocode: input " << strm.str() << "\n");
#endif
    const unsigned char *utxt =
        reinterpret_cast<const unsigned char *>(itext.data());
    const string::size_type len = itext.size();

    if (len >= 3 && utxt[0] == 0xEF && utxt[1] == 0xBB && utxt[2] == 0xBF) {
        LOGDEB("txtdcode:bomtocode: UTF-8\n");
        return "UTF-8";
    }
    // The UTF-32 signatures must be tested before the UTF-16 ones: the
    // UTF-32LE BOM (FF FE 00 00) begins with the UTF-16LE BOM (FF FE),
    // so the opposite order would never report UTF-32LE. The price is
    // that UTF-16LE data whose first character after the BOM is U+0000
    // gets reported as UTF-32LE, which is not expected in text documents.
    if (len >= 4 && utxt[0] == 0 && utxt[1] == 0 &&
        utxt[2] == 0xFE && utxt[3] == 0xFF) {
        return "UTF-32BE";
    }
    if (len >= 4 && utxt[0] == 0xFF && utxt[1] == 0xFE &&
        utxt[2] == 0 && utxt[3] == 0) {
        return "UTF-32LE";
    }
    if (len >= 2 && utxt[0] == 0xFE && utxt[1] == 0xFF) {
        return "UTF-16BE";
    }
    if (len >= 2 && utxt[0] == 0xFF && utxt[1] == 0xFE) {
        return "UTF-16LE";
    }
    return string();
}
bool RecollFilter::txtdcode(const string& who) bool RecollFilter::txtdcode(const string& who)
{ {
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) { if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
LOGERR("" << (who) << "::txtdcode: called on non txt/plain: " << (m_metaData[cstr_dj_keymt]) << "\n" ); LOGERR(who << "::txtdcode: called on non txt/plain: " <<
m_metaData[cstr_dj_keymt] << "\n");
return false; return false;
} }
string& ocs = m_metaData[cstr_dj_keyorigcharset]; string& ocs = m_metaData[cstr_dj_keyorigcharset];
string& itext = m_metaData[cstr_dj_keycontent]; string& itext = m_metaData[cstr_dj_keycontent];
LOGDEB1("" << (who) << "::txtdcode: " << (itext.size()) << " bytes from [" << (ocs) << "] to UTF-8\n" ); LOGDEB(who << "::txtdcode: " << itext.size() << " bytes from [" <<
ocs << "] to UTF-8\n");
int ecnt; int ecnt;
string otext; string otext;
string bomfromcode = bomtocode(itext);
if (!bomfromcode.empty()) {
LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
ocs << " to " << bomfromcode << " from BOM detection\n");
ocs = bomfromcode;
}
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt); bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
if (!ret || ecnt > int(itext.size() / 100)) { if (!ret || ecnt > int(itext.size() / 100)) {
LOGERR("" << (who) << "::txtdcode: transcode " << (itext.size()) << " bytes to UTF-8 failed for input charset [" << (ocs) << "] ret " << (ret) << " ecnt " << (ecnt) << "\n" ); LOGERR(who << "::txtdcode: transcode " << itext.size() <<
" bytes to UTF-8 failed for input charset [" << ocs <<
"] ret " << ret << " ecnt " << ecnt << "\n");
ret = alternate_decode(itext, otext, ocs);
if (samecharset(ocs, cstr_utf8)) {
ret = alternate_decode(itext, otext);
} else {
ret = false;
}
if (!ret) { if (!ret) {
LOGDEB("txtdcode: failed. Doc is not text?\n" ); LOGDEB("txtdcode: failed. Doc is not text?\n" );
itext.erase(); itext.erase();
@ -71,5 +130,3 @@ bool RecollFilter::txtdcode(const string& who)
m_metaData[cstr_dj_keycharset] = cstr_utf8; m_metaData[cstr_dj_keycharset] = cstr_utf8;
return true; return true;
} }