From the 1.18 branch: when creating the initial config directory (first execution), initialize a language-specific unac_except_trans for some languages (de, se/no/dk/fi), and fix a mixup of language and country codes

2012-11-01 11:27:50 +01:00 · 2012-11-01 11:27:50 +01:00 · 3da5158e9f
commit 3da5158e9f
parent a11c696554
5 changed files with 87 additions and 56 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -1101,6 +1101,12 @@ static const char blurb0[] =
 "# values is identical.\n"
    ;
 // Default unac_except_trans lines written to a freshly created recoll.conf
 // by initUserConfig(). The values are UTF-8 text spelled as octal escapes;
 // use uni2ascii -a K to generate these from the utf-8 strings.
 // NOTE(review): each space-separated entry presumably reads as "character
 // followed by the text it should become" (e.g. \303\237ss is sharp-s
 // followed by "ss") -- confirm against the unac_except_trans documentation.
 //
 // Swedish and Danish: selected when the locale language is "se", "dk",
 // "no" or "fi". Same entries as german_ex, plus a-ring in lower and upper
 // case (\303\245, \303\205).
 // NOTE(review): o-slash (\303\270 / \303\230) is not listed although this
 // value is also used for "dk" and "no" -- confirm this is intended.
 static const char swedish_ex[] = "unac_except_trans = \303\244\303\244 \303\204\303\244 \303\266\303\266 \303\226\303\266 \303\274\303\274 \303\234\303\274 \303\237ss \305\223oe \305\222oe \303\246ae \303\206ae \357\254\201fi \357\254\202fl \303\245\303\245 \303\205\303\245";
 // German: selected when the locale language is "de". Covers a/o/u umlauts
 // (both cases), sharp s, the oe and ae ligatures (both cases) and the
 // fi / fl typographic ligatures.
 static const char german_ex[] = "unac_except_trans = \303\244\303\244 \303\204\303\244 \303\266\303\266 \303\226\303\266 \303\274\303\274 \303\234\303\274 \303\237ss \305\223oe \305\222oe \303\246ae \303\206ae \357\254\201fi \357\254\202fl";
 // Create initial user config by creating commented empty files.
 // These are the file names initUserConfig() looks for in the user
 // configuration directory; each one that does not exist yet is created
 // holding only the explanatory blurb (recoll.conf may additionally get a
 // language-specific unac_except_trans line).
 static const char *configfiles[] = {"recoll.conf", "mimemap", "mimeconf", 
 				    "mimeview"};
@ -1121,12 +1127,22 @@ bool RclConfig::initUserConfig()
 	    strerror(errno);
 	return false;
    }
    string lang = localelang();
    for (int i = 0; i < ncffiles; i++) {
 	string dst = path_cat(m_confdir, string(configfiles[i])); 
 	if (access(dst.c_str(), 0) < 0) {
 	    FILE *fp = fopen(dst.c_str(), "w");
 	    if (fp) {
 		fprintf(fp, "%s\n", blurb);
 		if (!strcmp(configfiles[i], "recoll.conf")) {
 		    // Add improved unac_except_trans for some languages
 		    if (lang == "se" || lang == "dk" || lang == "no" || 
 			lang == "fi") {
 			fprintf(fp, "%s\n", swedish_ex);
 		    } else if (lang == "de") {
 			fprintf(fp, "%s\n", german_ex);
 		    }
 		}
 		fclose(fp);
 	    } else {
 		m_reason += string("fopen ") + dst + ": " + strerror(errno);
--- a/src/common/rclinit.cpp
+++ b/src/common/rclinit.cpp
@ -30,6 +30,7 @@
 #include "rclinit.h"
 #include "pathut.h"
 #include "unac.h"
 #include "smallut.h"
 // Signal numbers this file cares about.
 // NOTE(review): the code using this table is outside this excerpt;
 // presumably these are the signals for which recollinit() installs the
 // caller-supplied handler -- confirm.
 static const int catchedSigs[] = {SIGHUP, SIGINT, SIGQUIT, SIGTERM, 
     SIGUSR1, SIGUSR2};
@ -119,6 +120,9 @@ RclConfig *recollinit(RclInitFlags flags,
    if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty()) 
 	unac_set_except_translations(unacex.c_str());
    // Init langtocode() static table
    langtocode("");
    int flushmb;
    if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
 	LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",
--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@ -15,47 +15,12 @@
 */
 #include "autoconfig.h"
 #include <tr1/unordered_map>
 using std::tr1::unordered_map;
 #include "cstr.h"
 #include "transcode.h"
 #include "mimehandler.h"
 #include "debuglog.h"
 #include "smallut.h"
 // Flat list of (key, 8-bit charset name) pairs, read two entries at a time
 // by alternate_decode() to fill its lookup map.
 // The key is compared with the part of the LC_CTYPE locale name that
 // precedes the '_', i.e. the language part of names like "fr_FR", even
 // though several keys here ("gb", "cz", "gr", "dk", "se"...) are country
 // codes: those entries can never match a locale such as "en_GB" or "cs_CZ".
 static const char *vcountry_to_code[] = {
    "fr", "windows-1252",
    "al", "windows-1252", 
    "dk", "windows-1252",
    "en", "windows-1252",
    "de", "windows-1252",
    "is", "windows-1252",
    "my", "windows-1252",
    "ie", "windows-1252",
    "gb", "windows-1252",
    "it", "windows-1252",
    "lu", "windows-1252",
    "no", "windows-1252",
    "pt", "windows-1252",
    "es", "windows-1252",
    "se", "windows-1252",
    "ba", "iso-8859-2",
    "hr", "iso-8859-2",
    "cz", "iso-8859-2",
    "hu", "iso-8859-2",
    "pl", "iso-8859-2",
    "rs", "iso-8859-2",
    "sk", "iso-8859-2",
    "si", "iso-8859-2",
    "gr", "iso-8859-7",
    "il", "iso-8859-8",
    "tr", "iso-8859-9",
    "th", "iso-8859-11",
    "lv", "iso-8859-13",
    "lt", "iso-8859-13",
 };
 // Called after decoding from utf-8 failed. Handle the common case
 // where this is a good old 8bit-encoded text document left-over when
@ -64,27 +29,8 @@ static const char *vcountry_to_code[] = {
 // heuristic, but may be better than discarding the data.
 static bool alternate_decode(const string& in, string& out)
 {
-    static unordered_map<string, string> country_to_code;
+    string lang = localelang();
-    if (country_to_code.empty()) {
+    string code = langtocode(lang);
 	for (unsigned int i = 0; 
 	     i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
 	    country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
 	}
    }
    string locale = setlocale(LC_CTYPE, 0);
    LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
    string::size_type under = locale.find_first_of("_");
    if (under == string::npos)
 	return false;
    string country = locale.substr(0, under);
    unordered_map<string,string>::const_iterator it = 
 	country_to_code.find(country);
    if (it == country_to_code.end())
 	return false;
    string code = it->second;
    LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
 	    code.c_str()));
    return transcode(in, out, code, cstr_utf8);
--- a/src/utils/smallut.cpp
+++ b/src/utils/smallut.cpp
@ -33,11 +33,14 @@
 #include <string>
 #include <iostream>
 #include <list>
 #include <tr1/unordered_map>
 using std::tr1::unordered_map;
 using namespace std;
 #include "smallut.h"
 #include "utf8iter.h"
 #include "hldata.h"
 #include "cstr.h"
 int stringicmp(const string & s1, const string& s2) 
 {
@ -1097,6 +1100,63 @@ void HighlightData::append(const HighlightData& hl)
    }
 }
 // Flat list of (language code, legacy charset name) pairs: even indexes
 // hold the language, the following odd index holds the charset.
 // langtocode() loads the pairs into a map on its first call; languages
 // which are not listed here get the cp1252 default there, which is why no
 // western european language appears in the table.
 // NOTE(review): "rs" looks like the country code for Serbia rather than a
 // language code (the language "sr" is listed further down) -- confirm
 // whether the entry is deliberate.
 static const char *vlang_to_code[] = {
    "be", "cp1251",
    "bg", "cp1251",
    "cs", "iso-8859-2",
    "el", "iso-8859-7",
    "he", "iso-8859-8",
    "hr", "iso-8859-2",
    "hu", "iso-8859-2",
    "ja", "eucjp",
    "kk", "pt154",
    "ko", "euckr",
    "lt", "iso-8859-13",
    "lv", "iso-8859-13",
    "pl", "iso-8859-2",
    "rs", "iso-8859-2",
    "ro", "iso-8859-2",
    "ru", "koi8-r",
    "sk", "iso-8859-2",
    "sl", "iso-8859-2",
    "sr", "iso-8859-2",
    "th", "iso-8859-11",
    "tr", "iso-8859-9",
    "uk", "koi8-u",
 };
 // Return the name of the legacy character set most likely used for
 // documents written in the given language.
 //
 // @param lang language code as found at the start of a locale name
 //        (e.g. "ru", "pl"). Any value is accepted, including "".
 // @return the charset paired with lang in vlang_to_code, or cstr_cp1252
 //        when the language is not listed.
 //
 // The lookup map is a function-local static filled on the first call.
 string langtocode(const string& lang)
 {
     static unordered_map<string, string> lang_to_code;

     if (lang_to_code.empty()) {
         // vlang_to_code alternates language and charset strings: walk it
         // one pair at a time.
         const unsigned int nstrings = sizeof(vlang_to_code) / sizeof(char *);
         unsigned int pos = 0;
         while (pos < nstrings) {
             const char *language = vlang_to_code[pos];
             const char *charset = vlang_to_code[pos + 1];
             lang_to_code[language] = charset;
             pos += 2;
         }
     }

     unordered_map<string,string>::const_iterator found =
         lang_to_code.find(lang);
     if (found != lang_to_code.end())
         return found->second;

     // Use cp1252 by default...
     return cstr_cp1252;
 }
 // Guess the user's language from the LANG environment variable.
 //
 // A locale name has the general form language[_territory][.codeset][@modifier]
 // (e.g. "de_DE.UTF-8", "sr@latin", "C.UTF-8"). Only the language part is
 // returned, so the name is cut at the first '_', '.' or '@', not just at
 // '_': values without a territory such as "fr.UTF-8" or "C.UTF-8" would
 // otherwise be returned whole and never match a language code.
 //
 // @return the language part of LANG, or "en" when LANG is unset or empty,
 //         or when the language part is empty, "C" or "POSIX".
 string localelang()
 {
     const char *lang = getenv("LANG");
     if (lang == 0 || *lang == 0)
         return "en";

     string locale(lang);
     // Drop the territory, codeset and modifier, whichever comes first
     string::size_type cut = locale.find_first_of("_.@");
     if (cut != string::npos)
         locale.erase(cut);

     // The portable locales carry no language information: assume english
     if (locale.empty() || locale == "C" || locale == "POSIX")
         return "en";
     return locale;
 }
 #else // TEST_SMALLUT
 #include <string>
--- a/src/utils/smallut.h
+++ b/src/utils/smallut.h
@ -53,6 +53,11 @@ extern string stringtolower(const string& io);
 // Is one string the end part of the other ?
 extern int stringisuffcmp(const string& s1, const string& s2);
 // Divine language from locale: returns the language part of the LANG
 // environment variable (the text before the first '_'), or "en" when
 // LANG is unset, empty, or names the C/POSIX locale.
 extern std::string localelang();
 // Divine 8bit charset from language: returns the legacy charset name
 // associated with the language code, falling back to cp1252 for
 // languages which have no specific entry.
 extern std::string langtocode(const string& lang);
 // Compare charset names, removing the more common spelling variations
 extern bool samecharset(const string &cs1, const string &cs2);