detect unicode BOM in text files

2016-11-15 18:31:34 +01:00 · 2016-11-15 18:31:34 +01:00 · 1d303468a5
commit 1d303468a5
parent 68d4d36f83
1 changed files with 74 additions and 17 deletions
--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@ -15,12 +15,14 @@
 */
 #include "autoconfig.h"

+#include <sstream>
+
 #include "cstr.h"
 #include "transcode.h"
 #include "mimehandler.h"
 #include "log.h"
 #include "smallut.h"
-
+#include "listmem.h"

 // Called after decoding from utf-8 failed. Handle the common case
 // where this is a good old 8bit-encoded text document left-over when
@ -29,37 +31,94 @@
 // heuristic, but may be better than discarding the data. 
 // If we still get a significant number of decode errors, the doc is
 // quite probably binary, so just fail.
-static bool alternate_decode(const string& in, string& out)
+// Note that we could very well get a wrong transcoding (e.g. between
+// iso-8859 variations), there is no way to detect it.
+static bool alternate_decode(const string& in, string& out, const string& ocs)
 {
-    string lang = localelang();
-    string code = langtocode(lang);
-    LOGDEB("RecollFilter::txtdcode: trying alternate decode from "  << (code) << "\n" );
    int ecnt;
-    bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
-    return ecnt > 5 ? false : ret;
+    if (samecharset(ocs, cstr_utf8)) {
+        string lang = localelang();
+        string code = langtocode(lang);
+        LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
+               code << "\n");
+        bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
+        return ecnt > 5 ? false : ret;
+    } else {
+        // Give a try to utf-8 anyway, as this is self-detecting. This
+        // handles UTF-8 docs in a non-utf-8 environment. Note that
+        // this will almost never be called, as most encodings are
+        // unable to detect errors so that the first try at
+        // transcoding will have succeeded and alternate_decode() will
+        // not be called at all.
+        // 
+        // To avoid this, we would have to attempt an utf-8 decode
+        // first, but this is a costly proposition as we don't know
+        // how much data to test, so need to test all (the beginning
+        // of the text could be ascii even if there are 8-bit chars
+        // later).
+        bool ret = transcode(in, out, cstr_utf8, cstr_utf8, &ecnt);
+        return ecnt > 5 ? false : ret;
+    }
+}
+
+static string bomtocode(const string& itext)
+{
+#if 0
+    std::ostringstream strm;
+    listmem(strm, itext.c_str(), MIN(itext.size(), 8));
+    LOGDEB("txtdcode:bomtocode: input " << strm.str() << "\n");
+#endif
+
+    const unsigned char *utxt = (const unsigned char *)itext.c_str();
+    if (itext.size() >= 3 && utxt[0] == 0xEF && utxt[1] == 0xBB &&
+        utxt[2] == 0xBF) {
+        LOGDEB("txtdcode:bomtocode: UTF-8\n");
+        return "UTF-8";
+    } else if (itext.size() >= 2 && utxt[0] == 0xFE && utxt[1] == 0xFF) {
+        return "UTF-16BE";
+    } else if (itext.size() >= 2 && utxt[0] == 0xFF && utxt[1] == 0xFE) {
+        return "UTF-16LE";
+    } else if (itext.size() >= 4 && utxt[0] == 0 && utxt[1] == 0 &&
+               utxt[2] == 0xFE && utxt[3] == 0xFF) {
+        return "UTF-32BE";
+    } else if (itext.size() >= 4 && utxt[3] == 0 && utxt[2] == 0 &&
+               utxt[1] == 0xFE && utxt[0] == 0xFF) {
+        return "UTF-32LE";
+    } else {
+        return string();
+    }
 }

 bool RecollFilter::txtdcode(const string& who)
 {
    if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
-	LOGERR(""  << (who) << "::txtdcode: called on non txt/plain: "  << (m_metaData[cstr_dj_keymt]) << "\n" );
+	LOGERR(who << "::txtdcode: called on non txt/plain: " <<
+               m_metaData[cstr_dj_keymt] << "\n");
 	return false;
    }

    string& ocs = m_metaData[cstr_dj_keyorigcharset];
    string& itext = m_metaData[cstr_dj_keycontent];
-    LOGDEB1(""  << (who) << "::txtdcode: "  << (itext.size()) << " bytes from ["  << (ocs) << "] to UTF-8\n" );
+    LOGDEB(who << "::txtdcode: "  << itext.size() << " bytes from ["  <<
+           ocs << "] to UTF-8\n");
    int ecnt;
    string otext;
+
+    string bomfromcode = bomtocode(itext);
+    if (!bomfromcode.empty()) {
+        LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
+               ocs << " to " << bomfromcode << " from BOM detection\n");
+        ocs = bomfromcode;
+    }
+    
    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
    if (!ret || ecnt > int(itext.size() / 100)) {
-	LOGERR(""  << (who) << "::txtdcode: transcode "  << (itext.size()) << " bytes to UTF-8 failed for input charset ["  << (ocs) << "] ret "  << (ret) << " ecnt "  << (ecnt) << "\n" );
+	LOGERR(who << "::txtdcode: transcode " << itext.size() <<
+               " bytes to UTF-8 failed for input charset [" << ocs <<
+               "] ret " << ret << " ecnt "  << ecnt << "\n");
+
+        ret = alternate_decode(itext, otext, ocs);

-	if (samecharset(ocs, cstr_utf8)) {
-	    ret = alternate_decode(itext, otext);
-	} else {
-	    ret = false;
-	}
 	if (!ret) {
 	    LOGDEB("txtdcode: failed. Doc is not text?\n" );
 	    itext.erase();
@ -71,5 +130,3 @@ bool RecollFilter::txtdcode(const string& who)
    m_metaData[cstr_dj_keycharset] = cstr_utf8;
    return true;
 }
-
-