diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index f4cfe133..5e4f6d65 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1101,6 +1101,12 @@ static const char blurb0[] = "# values is identical.\n" ; +// Use uni2ascii -a K to generate these from the utf-8 strings +// Swedish and Danish. +static const char swedish_ex[] = "unac_except_trans = \303\244\303\244 \303\204\303\244 \303\266\303\266 \303\226\303\266 \303\274\303\274 \303\234\303\274 \303\237ss \305\223oe \305\222oe \303\246ae \303\206ae \357\254\201fi \357\254\202fl \303\245\303\245 \303\205\303\245"; +// German: +static const char german_ex[] = "unac_except_trans = \303\244\303\244 \303\204\303\244 \303\266\303\266 \303\226\303\266 \303\274\303\274 \303\234\303\274 \303\237ss \305\223oe \305\222oe \303\246ae \303\206ae \357\254\201fi \357\254\202fl"; + // Create initial user config by creating commented empty files static const char *configfiles[] = {"recoll.conf", "mimemap", "mimeconf", "mimeview"}; @@ -1121,12 +1127,22 @@ bool RclConfig::initUserConfig() strerror(errno); return false; } + string lang = localelang(); for (int i = 0; i < ncffiles; i++) { string dst = path_cat(m_confdir, string(configfiles[i])); if (access(dst.c_str(), 0) < 0) { FILE *fp = fopen(dst.c_str(), "w"); if (fp) { fprintf(fp, "%s\n", blurb); + if (!strcmp(configfiles[i], "recoll.conf")) { + // Add improved unac_except_trans for some languages + if (lang == "se" || lang == "dk" || lang == "no" || + lang == "fi") { + fprintf(fp, "%s\n", swedish_ex); + } else if (lang == "de") { + fprintf(fp, "%s\n", german_ex); + } + } fclose(fp); } else { m_reason += string("fopen ") + dst + ": " + strerror(errno); diff --git a/src/common/rclinit.cpp b/src/common/rclinit.cpp index 3e798ec2..313f4ef6 100644 --- a/src/common/rclinit.cpp +++ b/src/common/rclinit.cpp @@ -30,6 +30,7 @@ #include "rclinit.h" #include "pathut.h" #include "unac.h" +#include "smallut.h" static const int catchedSigs[] = {SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGUSR1, SIGUSR2}; @@ -119,6 +120,9 @@ RclConfig *recollinit(RclInitFlags flags, if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty()) unac_set_except_translations(unacex.c_str()); + // Init langtocode() static table + langtocode(""); + int flushmb; if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) { LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n", diff --git a/src/internfile/txtdcode.cpp b/src/internfile/txtdcode.cpp index c886dcec..307b5d34 100644 --- a/src/internfile/txtdcode.cpp +++ b/src/internfile/txtdcode.cpp @@ -15,47 +15,12 @@ */ #include "autoconfig.h" -#include -using std::tr1::unordered_map; - #include "cstr.h" #include "transcode.h" #include "mimehandler.h" #include "debuglog.h" #include "smallut.h" -static const char *vcountry_to_code[] = { - "fr", "windows-1252", - "al", "windows-1252", - "dk", "windows-1252", - "en", "windows-1252", - "de", "windows-1252", - "is", "windows-1252", - "my", "windows-1252", - "ie", "windows-1252", - "gb", "windows-1252", - "it", "windows-1252", - "lu", "windows-1252", - "no", "windows-1252", - "pt", "windows-1252", - "es", "windows-1252", - "se", "windows-1252", - "ba", "iso-8859-2", - "hr", "iso-8859-2", - "cz", "iso-8859-2", - "hu", "iso-8859-2", - "pl", "iso-8859-2", - "rs", "iso-8859-2", - "sk", "iso-8859-2", - "si", "iso-8859-2", - "gr", "iso-8859-7", - "il", "iso-8859-8", - "tr", "iso-8859-9", - "th", "iso-8859-11", - "lv", "iso-8859-13", - "lt", "iso-8859-13", -}; - // Called after decoding from utf-8 failed. Handle the common case // where this is a good old 8bit-encoded text document left-over when @@ -64,27 +29,8 @@ static const char *vcountry_to_code[] = { // heuristic, but may be better than discarding the data. static bool alternate_decode(const string& in, string& out) { - static unordered_map country_to_code; - if (country_to_code.empty()) { - for (unsigned int i = 0; - i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) { - country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1]; - } - } - - string locale = setlocale(LC_CTYPE, 0); - LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str())); - string::size_type under = locale.find_first_of("_"); - if (under == string::npos) - return false; - string country = locale.substr(0, under); - - unordered_map::const_iterator it = - country_to_code.find(country); - if (it == country_to_code.end()) - return false; - string code = it->second; - + string lang = localelang(); + string code = langtocode(lang); LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n", code.c_str())); return transcode(in, out, code, cstr_utf8); diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index 72e4ee09..cc469e25 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -33,11 +33,14 @@ #include #include #include +#include +using std::tr1::unordered_map; using namespace std; #include "smallut.h" #include "utf8iter.h" #include "hldata.h" +#include "cstr.h" int stringicmp(const string & s1, const string& s2) { @@ -1097,6 +1100,63 @@ void HighlightData::append(const HighlightData& hl) } } +static const char *vlang_to_code[] = { + "be", "cp1251", + "bg", "cp1251", + "cs", "iso-8859-2", + "el", "iso-8859-7", + "he", "iso-8859-8", + "hr", "iso-8859-2", + "hu", "iso-8859-2", + "ja", "eucjp", + "kk", "pt154", + "ko", "euckr", + "lt", "iso-8859-13", + "lv", "iso-8859-13", + "pl", "iso-8859-2", + "rs", "iso-8859-2", + "ro", "iso-8859-2", + "ru", "koi8-r", + "sk", "iso-8859-2", + "sl", "iso-8859-2", + "sr", "iso-8859-2", + "th", "iso-8859-11", + "tr", "iso-8859-9", + "uk", "koi8-u", +}; + +string langtocode(const string& lang) +{ + static unordered_map lang_to_code; + if (lang_to_code.empty()) { + for (unsigned int i = 0; + i < sizeof(vlang_to_code) / sizeof(char *); i += 2) { + lang_to_code[vlang_to_code[i]] = vlang_to_code[i+1]; + } + } + unordered_map::const_iterator it = + lang_to_code.find(lang); + + // Use cp1252 by default... + if (it == lang_to_code.end()) + return cstr_cp1252; + + return it->second; +} + +string localelang() +{ + const char *lang = getenv("LANG"); + + if (lang == 0 || *lang == 0 || !strcmp(lang, "C") || !strcmp(lang, "POSIX")) + return "en"; + string locale(lang); + string::size_type under = locale.find_first_of("_"); + if (under == string::npos) + return locale; + return locale.substr(0, under); +} + #else // TEST_SMALLUT #include diff --git a/src/utils/smallut.h b/src/utils/smallut.h index c0f698d3..ff6c0a5b 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -53,6 +53,11 @@ extern string stringtolower(const string& io); // Is one string the end part of the other ? extern int stringisuffcmp(const string& s1, const string& s2); +// Divine language from locale +extern std::string localelang(); +// Divine 8bit charset from language +extern std::string langtocode(const string& lang); + // Compare charset names, removing the more common spelling variations extern bool samecharset(const string &cs1, const string &cs2);