from 1.18 branch: When creating initial config directory (1st exec), initialize specific unac_except_trans for some languages: de, se/no/dk/fi + fix mixup of language and country codes

This commit is contained in:
Jean-Francois Dockes 2012-11-01 11:27:50 +01:00
parent a11c696554
commit 3da5158e9f
5 changed files with 87 additions and 56 deletions

View file

@ -1101,6 +1101,12 @@ static const char blurb0[] =
"# values is identical.\n"
;
// Language-specific "unac_except_trans" lines which initUserConfig() appends
// to a freshly created recoll.conf.
// The non-ASCII characters are written as octal-escaped UTF-8 bytes so that
// this source file stays pure ASCII.
// Use uni2ascii -a K to generate these from the utf-8 strings
// Each space-separated entry is one character immediately followed by the
// text it is paired with: the a/o/u umlauts (both cases) are paired with
// their lowercase form, sharp s with "ss", the oe and ae ligatures with
// "oe"/"ae", and the fi/fl ligatures with "fi"/"fl".
// NOTE(review): presumably the first character of an entry is the one
// exempted from the default accent stripping and the rest is its
// replacement -- confirm against the unac_except_trans documentation.
//
// Swedish and Danish. initUserConfig() also writes this one when the
// language is "no" or "fi". Same entries as german_ex, plus a-ring in both
// cases (kept as lowercase a-ring).
static const char swedish_ex[] = "unac_except_trans = \303\244\303\244 \303\204\303\244 \303\266\303\266 \303\226\303\266 \303\274\303\274 \303\234\303\274 \303\237ss \305\223oe \305\222oe \303\246ae \303\206ae \357\254\201fi \357\254\202fl \303\245\303\245 \303\205\303\245";
// German: written when the language is "de".
static const char german_ex[] = "unac_except_trans = \303\244\303\244 \303\204\303\244 \303\266\303\266 \303\226\303\266 \303\274\303\274 \303\234\303\274 \303\237ss \305\223oe \305\222oe \303\246ae \303\206ae \357\254\201fi \357\254\202fl";
// Create initial user config by creating commented empty files
static const char *configfiles[] = {"recoll.conf", "mimemap", "mimeconf",
"mimeview"};
@ -1121,12 +1127,22 @@ bool RclConfig::initUserConfig()
strerror(errno);
return false;
}
string lang = localelang();
for (int i = 0; i < ncffiles; i++) {
string dst = path_cat(m_confdir, string(configfiles[i]));
if (access(dst.c_str(), 0) < 0) {
FILE *fp = fopen(dst.c_str(), "w");
if (fp) {
fprintf(fp, "%s\n", blurb);
if (!strcmp(configfiles[i], "recoll.conf")) {
// Add improved unac_except_trans for some languages
if (lang == "se" || lang == "dk" || lang == "no" ||
lang == "fi") {
fprintf(fp, "%s\n", swedish_ex);
} else if (lang == "de") {
fprintf(fp, "%s\n", german_ex);
}
}
fclose(fp);
} else {
m_reason += string("fopen ") + dst + ": " + strerror(errno);

View file

@ -30,6 +30,7 @@
#include "rclinit.h"
#include "pathut.h"
#include "unac.h"
#include "smallut.h"
static const int catchedSigs[] = {SIGHUP, SIGINT, SIGQUIT, SIGTERM,
SIGUSR1, SIGUSR2};
@ -119,6 +120,9 @@ RclConfig *recollinit(RclInitFlags flags,
if (config->getConfParam("unac_except_trans", unacex) && !unacex.empty())
unac_set_except_translations(unacex.c_str());
// Init langtocode() static table
langtocode("");
int flushmb;
if (config->getConfParam("idxflushmb", &flushmb) && flushmb > 0) {
LOGDEB1(("rclinit: idxflushmb=%d, set XAPIAN_FLUSH_THRESHOLD to 10E6\n",

View file

@ -15,47 +15,12 @@
*/
#include "autoconfig.h"
#include <tr1/unordered_map>
using std::tr1::unordered_map;
#include "cstr.h"
#include "transcode.h"
#include "mimehandler.h"
#include "debuglog.h"
#include "smallut.h"
// Flat list of (key, 8-bit character set name) pairs. alternate_decode()
// walks it two entries at a time to fill its lookup map, so the array must
// always hold an even number of strings: key first, charset second.
// NOTE(review): despite the "country" name, alternate_decode() looks the
// keys up with the part of the LC_CTYPE locale name which precedes the
// first '_' (the language part in names like "fr_FR"), while several keys
// here ("dk", "se", "cz", "gr", ...) look like country codes -- confirm
// which of the two is intended.
static const char *vcountry_to_code[] = {
"fr", "windows-1252",
"al", "windows-1252",
"dk", "windows-1252",
"en", "windows-1252",
"de", "windows-1252",
"is", "windows-1252",
"my", "windows-1252",
"ie", "windows-1252",
"gb", "windows-1252",
"it", "windows-1252",
"lu", "windows-1252",
"no", "windows-1252",
"pt", "windows-1252",
"es", "windows-1252",
"se", "windows-1252",
"ba", "iso-8859-2",
"hr", "iso-8859-2",
"cz", "iso-8859-2",
"hu", "iso-8859-2",
"pl", "iso-8859-2",
"rs", "iso-8859-2",
"sk", "iso-8859-2",
"si", "iso-8859-2",
"gr", "iso-8859-7",
"il", "iso-8859-8",
"tr", "iso-8859-9",
"th", "iso-8859-11",
"lv", "iso-8859-13",
"lt", "iso-8859-13",
};
// Called after decoding from utf-8 failed. Handle the common case
// where this is a good old 8bit-encoded text document left-over when
@ -64,27 +29,8 @@ static const char *vcountry_to_code[] = {
// heuristic, but may be better than discarding the data.
static bool alternate_decode(const string& in, string& out)
{
static unordered_map<string, string> country_to_code;
if (country_to_code.empty()) {
for (unsigned int i = 0;
i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
}
}
string locale = setlocale(LC_CTYPE, 0);
LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
string::size_type under = locale.find_first_of("_");
if (under == string::npos)
return false;
string country = locale.substr(0, under);
unordered_map<string,string>::const_iterator it =
country_to_code.find(country);
if (it == country_to_code.end())
return false;
string code = it->second;
string lang = localelang();
string code = langtocode(lang);
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
code.c_str()));
return transcode(in, out, code, cstr_utf8);

View file

@ -33,11 +33,14 @@
#include <string>
#include <iostream>
#include <list>
#include <tr1/unordered_map>
using std::tr1::unordered_map;
using namespace std;
#include "smallut.h"
#include "utf8iter.h"
#include "hldata.h"
#include "cstr.h"
int stringicmp(const string & s1, const string& s2)
{
@ -1097,6 +1100,63 @@ void HighlightData::append(const HighlightData& hl)
}
}
// Flat list of (language code, 8-bit character set name) pairs used by
// langtocode(). Entries are consumed two at a time, so the array must always
// hold an even number of strings: language first, charset second.
// Only languages which need something else than the langtocode() default
// have to be listed here.
// NOTE(review): "rs" looks like the country code for Serbia rather than a
// language code (the "sr" entry below covers the language) -- harmless, but
// confirm whether it is intentional.
static const char *vlang_to_code[] = {
"be", "cp1251",
"bg", "cp1251",
"cs", "iso-8859-2",
"el", "iso-8859-7",
"he", "iso-8859-8",
"hr", "iso-8859-2",
"hu", "iso-8859-2",
"ja", "eucjp",
"kk", "pt154",
"ko", "euckr",
"lt", "iso-8859-13",
"lv", "iso-8859-13",
"pl", "iso-8859-2",
"rs", "iso-8859-2",
"ro", "iso-8859-2",
"ru", "koi8-r",
"sk", "iso-8859-2",
"sl", "iso-8859-2",
"sr", "iso-8859-2",
"th", "iso-8859-11",
"tr", "iso-8859-9",
"uk", "koi8-u",
};
string langtocode(const string& lang)
{
static unordered_map<string, string> lang_to_code;
if (lang_to_code.empty()) {
for (unsigned int i = 0;
i < sizeof(vlang_to_code) / sizeof(char *); i += 2) {
lang_to_code[vlang_to_code[i]] = vlang_to_code[i+1];
}
}
unordered_map<string,string>::const_iterator it =
lang_to_code.find(lang);
// Use cp1252 by default...
if (it == lang_to_code.end())
return cstr_cp1252;
return it->second;
}
// Divine the user's language from the LANG environment variable.
//
// Locale names have the general shape
//     language[_territory][.codeset][@modifier]
// and every part after the language is optional, so the language ends at the
// first '_', '.' or '@', whichever comes first. Cutting at '_' only would
// return values like "fr.UTF-8" or "ca@valencia" as the language, and would
// not recognize "C.UTF-8" as the C locale.
//
// @return the language part of LANG (e.g. "de" for "de_DE.UTF-8"), or "en"
//         when LANG is unset or empty, names the "C"/"POSIX" locale (with or
//         without a codeset/modifier suffix), or has an empty language part.
std::string localelang()
{
    const char *lang = getenv("LANG");
    if (lang == 0 || *lang == 0)
        return "en";

    std::string language(lang);
    const std::string::size_type end = language.find_first_of("_.@");
    if (end != std::string::npos)
        language.erase(end);

    // Checked after stripping so that "C.UTF-8" and friends are caught too.
    if (language.empty() || language == "C" || language == "POSIX")
        return "en";
    return language;
}
#else // TEST_SMALLUT
#include <string>

View file

@ -53,6 +53,11 @@ extern string stringtolower(const string& io);
// Is one string the end part of the other ?
extern int stringisuffcmp(const string& s1, const string& s2);
// Divine language from locale: returns the language code found at the start
// of the LANG environment variable (the text before the '_' territory
// separator, e.g. "de" for "de_DE.UTF-8"). Returns "en" when LANG is unset,
// empty, "C" or "POSIX".
extern std::string localelang();
// Divine 8bit charset from language: returns the name of the character set
// associated with the language code in an internal table, or the cp1252
// default when the language is not listed.
extern std::string langtocode(const string& lang);
// Compare charset names, removing the more common spelling variations
extern bool samecharset(const string &cs1, const string &cs2);