diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 6f6ec642..519bc387 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -22,6 +22,8 @@ #ifndef _WIN32 #include #include +#else +#include "wincodepages.h" #endif #include #include "safesysstat.h" @@ -208,11 +210,10 @@ RclConfig::RclConfig(const string *argcnf) o_localecharset = string(cstr_cp1252); } #else - // This is quite incorrect, when using the non-unicode (utf16) - // interface, Windows will never use utf-8 - o_localecharset = "UTF-8"; + o_localecharset = winACPName(); #endif - LOGDEB1("RclConfig::getDefCharset: localecharset [" << (o_localecharset) << "]\n" ); + LOGDEB1("RclConfig::getDefCharset: localecharset [" << + o_localecharset << "]\n"); } const char *cp; diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 876a2fc1..3f43c2d8 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -46,6 +46,7 @@ #include "cancelcheck.h" #include "rclinit.h" #include "extrameta.h" +#include "utf8fn.h" using namespace std; @@ -592,24 +593,6 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, return processonefile(m_config, fn, stp, m_localfields); } -// File name transcoded to utf8 for indexing. If this fails, the file -// name won't be indexed, no big deal Note that we used to do the full -// path here, but I ended up believing that it made more sense to use -// only the file name The charset is used is the one from the locale. 
-static string compute_utf8fn(RclConfig *config, const string& fn) -{ - string charset = config->getDefCharset(true); - string utf8fn; - int ercnt; - if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) { - LOGERR("processone: fn transcode failure from [" << (charset) << "] to UTF-8: " << (path_getsimple(fn)) << "\n" ); - } else if (ercnt) { - LOGDEB("processone: fn transcode " << (ercnt) << " errors from [" << (charset) << "] to UTF-8: " << (path_getsimple(fn)) << "\n" ); - } - LOGDEB2("processone: fn transcoded from [" << (path_getsimple(fn)) << "] to [" << (utf8fn) << "] (" << (charset) << "->" << ("UTF-8") << ")\n" ); - return utf8fn; -} - FsTreeWalker::Status FsIndexer::processonefile(RclConfig *config, const std::string &fn, const struct stat *stp, @@ -680,9 +663,12 @@ FsIndexer::processonefile(RclConfig *config, return FsTreeWalker::FtwOk; } - LOGDEB0("processone: processing: [" << (displayableBytes(off_t(stp->st_size))) << "] " << (fn) << "\n" ); + LOGDEB0("processone: processing: [" << + displayableBytes(off_t(stp->st_size)) << "] " << fn << "\n"); - string utf8fn = compute_utf8fn(config, fn); + // Note that we used to do the full path here, but I ended up + // believing that it made more sense to use only the file name + string utf8fn = compute_utf8fn(config, fn, true); // parent_udi is initially the same as udi, it will be used if there // are subdocs. diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index bc571e3e..4cd496e8 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -56,6 +56,7 @@ using namespace std; #include "expansiondbs.h" #include "rclinit.h" #include "internfile.h" +#include "utf8fn.h" // Recoll index format version is stored in user metadata. When this change, // we can't open the db and will have to reindex. 
@@ -159,7 +160,9 @@ Db::Native::~Native() #ifdef IDX_THREADS if (m_havewriteq) { void *status = m_wqueue.setTerminateAndWait(); - LOGDEB2("Native::~Native: worker status " << (long(status)) << "\n" ); + if (status) { + LOGDEB1("Native::~Native: worker status " << status << "\n"); + } } #endif // IDX_THREADS } @@ -1060,7 +1063,6 @@ class TextSplitDb : public TextSplitP { // Reimplement text_to_words to insert the begin and end anchor terms. virtual bool text_to_words(const string &in) { - bool ret = false; string ermsg; try { @@ -1089,8 +1091,6 @@ class TextSplitDb : public TextSplitP { goto out; } - ret = true; - out: basepos += curpos + 100; return true; @@ -1296,6 +1296,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) // Split and index the path from the url for path-based filtering { string path = url_gpathS(doc.url); + +#ifdef _WIN32 + // Windows file names are case-insensitive, so we + // translate to UTF-8 and lowercase + string upath = compute_utf8fn(m_config, path, false); + unacmaybefold(upath, path, "UTF-8", UNACOP_FOLD); +#endif + vector vpath; stringToTokens(path, vpath, "/"); // If vpath is not /, the last elt is the file/dir name, not a diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp index 1da37cbb..40cfef2c 100644 --- a/src/rcldb/searchdatatox.cpp +++ b/src/rcldb/searchdatatox.cpp @@ -937,11 +937,20 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p) // Translate a dir: path filtering clause. 
See comments in .h bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) { - LOGDEB("SearchDataClausePath::toNativeQuery: [" << (m_text) << "]\n" ); + LOGDEB("SearchDataClausePath::toNativeQuery: [" << m_text << "]\n"); Xapian::Query *qp = (Xapian::Query *)p; *qp = Xapian::Query(); - if (m_text.empty()) { + string ltext; +#ifdef _WIN32 + // Windows file names are case-insensitive, so we lowercase (same + // as when indexing) + unacmaybefold(m_text, ltext, "UTF-8", UNACOP_FOLD); +#else + ltext = m_text; +#endif + + if (ltext.empty()) { LOGERR("SearchDataClausePath: empty path??\n" ); m_reason = "Empty path ?"; return false; @@ -949,13 +958,13 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) vector orqueries; - if (path_isabsolute(m_text)) + if (path_isabsolute(ltext)) orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix))); else - m_text = path_tildexpand(m_text); + ltext = path_tildexpand(ltext); vector vpath; - stringToTokens(m_text, vpath, "/"); + stringToTokens(ltext, vpath, "/"); for (vector::const_iterator pit = vpath.begin(); pit != vpath.end(); pit++){ @@ -967,7 +976,8 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) *pit, exp, sterm, wrap_prefix(pathelt_prefix))) { return false; } - LOGDEB0("SDataPath::toNative: exp size " << (exp.size()) << ". Exp: " << (stringsToString(exp)) << "\n" ); + LOGDEB0("SDataPath::toNative: exp size " << exp.size() << ". 
Exp: " << + stringsToString(exp) << "\n"); if (exp.size() == 1) orqueries.push_back(Xapian::Query(exp[0])); else
diff --git a/src/windows/qmkrecoll/librecoll.pro b/src/windows/qmkrecoll/librecoll.pro
index e5f3f003..a1bb7153 100644
--- a/src/windows/qmkrecoll/librecoll.pro
+++ b/src/windows/qmkrecoll/librecoll.pro
@@ -29,6 +29,7 @@ SOURCES += \
 ../../common/syngroups.cpp \
 ../../common/textsplit.cpp \
 ../../common/unacpp.cpp \
+../../common/utf8fn.cpp \
 ../../index/beaglequeue.cpp \
 ../../index/bglfetcher.cpp \
 ../../index/checkretryfailed.cpp \
@@ -89,6 +90,7 @@ SOURCES += \
 ../../utils/ecrontab.cpp \
 ../../windows/execmd_w.cpp \
 ../../windows/fnmatch.c \
+../../windows/wincodepages.cpp \
 ../../utils/fileudi.cpp \
 ../../utils/fstreewalk.cpp \
 ../../utils/hldata.cpp \
diff --git a/src/windows/wincodepages.cpp b/src/windows/wincodepages.cpp
new file mode 100644
index 00000000..bc2cc355
--- /dev/null
+++ b/src/windows/wincodepages.cpp
@@ -0,0 +1,188 @@
+#include <string>
+#include <unordered_map>
+
+#include "safewindows.h"
+#include "wincodepages.h"
+
+using namespace std;
+
+// One Windows code page: its name (empty when the table has none)
+// and a descriptive comment.
+struct WinCpDef {
+    string cpname;
+    string cpcomment;
+};
+
+// Windows code page identifier -> definition. Identifiers must be
+// written in decimal: a leading zero makes the literal octal
+// (037 == 31), which would file the entry under the wrong key.
+static const unordered_map<unsigned int, WinCpDef> cpdefs {
+    {37, {"IBM037", "IBM EBCDIC US-Canada"}},
+    {437, {"IBM437", "OEM United States"}},
+    {500, {"IBM500", "IBM EBCDIC International"}},
+    {708, {"ASMO-708", "Arabic (ASMO 708)"}},
+    {709, {"", "Arabic (ASMO-449+, BCON V4)"}},
+    {710, {"", "Arabic - Transparent Arabic"}},
+    {720, {"DOS-720", "Arabic (Transparent ASMO); Arabic (DOS)"}},
+    {737, {"ibm737", "OEM Greek (formerly 437G); Greek (DOS)"}},
+    {775, {"ibm775", "OEM Baltic; Baltic (DOS)"}},
+    {850, {"ibm850", "OEM Multilingual Latin 1; Western European (DOS)"}},
+    {852, {"ibm852", "OEM Latin 2; Central European (DOS)"}},
+    {855, {"IBM855", "OEM Cyrillic (primarily Russian)"}},
+    {857, {"ibm857", "OEM Turkish; Turkish (DOS)"}},
+    {858, {"IBM00858", "OEM Multilingual Latin 1 + Euro symbol"}},
+    {860, {"IBM860", "OEM Portuguese; Portuguese (DOS)"}},
+    {861, {"ibm861", "OEM Icelandic; Icelandic (DOS)"}},
+    {862, {"DOS-862", "OEM Hebrew; Hebrew (DOS)"}},
+    {863, {"IBM863", "OEM French Canadian; French Canadian (DOS)"}},
+    {864, {"IBM864", "OEM Arabic; Arabic (864)"}},
+    {865, {"IBM865", "OEM Nordic; Nordic (DOS)"}},
+    {866, {"cp866", "OEM Russian; Cyrillic (DOS)"}},
+    {869, {"ibm869", "OEM Modern Greek; Greek, Modern (DOS)"}},
+    {870, {"IBM870", "IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2"}},
+    {874, {"windows-874", "ANSI/OEM Thai (ISO 8859-11); Thai (Windows)"}},
+    {875, {"cp875", "IBM EBCDIC Greek Modern"}},
+    {932, {"shift_jis", "ANSI/OEM Japanese; Japanese (Shift-JIS)"}},
+    {936, {"gb2312", "ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)"}},
+    {949, {"ks_c_5601-1987", "ANSI/OEM Korean (Unified Hangul Code)"}},
+    {950, {"big5", "ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)"}},
+    {1026, {"IBM1026", "IBM EBCDIC Turkish (Latin 5)"}},
+    {1047, {"IBM01047", "IBM EBCDIC Latin 1/Open System"}},
+    {1140, {"IBM01140", "IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)"}},
+    {1141, {"IBM01141", "IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)"}},
+    {1142, {"IBM01142", "IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)"}},
+    {1143, {"IBM01143", "IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)"}},
+    {1144, {"IBM01144", "IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)"}},
+    {1145, {"IBM01145", "IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)"}},
+    {1146, {"IBM01146", "IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)"}},
+    {1147, {"IBM01147", "IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)"}},
+    {1148, {"IBM01148", "IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)"}},
+    {1149, {"IBM01149", "IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)"}},
+    {1200, {"utf-16", "Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications"}},
+    {1201, {"unicodeFFFE", "Unicode UTF-16, big endian byte order; available only to managed applications"}},
+    {1250, {"windows-1250", "ANSI Central European; Central European (Windows)"}},
+    {1251, {"windows-1251", "ANSI Cyrillic; Cyrillic (Windows)"}},
+    {1252, {"windows-1252", "ANSI Latin 1; Western European (Windows)"}},
+    {1253, {"windows-1253", "ANSI Greek; Greek (Windows)"}},
+    {1254, {"windows-1254", "ANSI Turkish; Turkish (Windows)"}},
+    {1255, {"windows-1255", "ANSI Hebrew; Hebrew (Windows)"}},
+    {1256, {"windows-1256", "ANSI Arabic; Arabic (Windows)"}},
+    {1257, {"windows-1257", "ANSI Baltic; Baltic (Windows)"}},
+    {1258, {"windows-1258", "ANSI/OEM Vietnamese; Vietnamese (Windows)"}},
+    {1361, {"Johab", "Korean (Johab)"}},
+    {10000, {"macintosh", "MAC Roman; Western European (Mac)"}},
+    {10001, {"x-mac-japanese", "Japanese (Mac)"}},
+    {10002, {"x-mac-chinesetrad", "MAC Traditional Chinese (Big5); Chinese Traditional (Mac)"}},
+    {10003, {"x-mac-korean", "Korean (Mac)"}},
+    {10004, {"x-mac-arabic", "Arabic (Mac)"}},
+    {10005, {"x-mac-hebrew", "Hebrew (Mac)"}},
+    {10006, {"x-mac-greek", "Greek (Mac)"}},
+    {10007, {"x-mac-cyrillic", "Cyrillic (Mac)"}},
+    {10008, {"x-mac-chinesesimp", "MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)"}},
+    {10010, {"x-mac-romanian", "Romanian (Mac)"}},
+    {10017, {"x-mac-ukrainian", "Ukrainian (Mac)"}},
+    {10021, {"x-mac-thai", "Thai (Mac)"}},
+    {10029, {"x-mac-ce", "MAC Latin 2; Central European (Mac)"}},
+    {10079, {"x-mac-icelandic", "Icelandic (Mac)"}},
+    {10081, {"x-mac-turkish", "Turkish (Mac)"}},
+    {10082, {"x-mac-croatian", "Croatian (Mac)"}},
+    {12000, {"utf-32", "Unicode UTF-32, little endian byte order; available only to managed applications"}},
+    {12001, {"utf-32BE", "Unicode UTF-32, big endian byte order; available only to managed applications"}},
+    {20000, {"x-Chinese_CNS", "CNS Taiwan; Chinese Traditional (CNS)"}},
+    {20001, {"x-cp20001", "TCA Taiwan"}},
+    {20002, {"x_Chinese-Eten", "Eten Taiwan; Chinese Traditional (Eten)"}},
+    {20003, {"x-cp20003", "IBM5550 Taiwan"}},
+    {20004, {"x-cp20004", "TeleText Taiwan"}},
+    {20005, {"x-cp20005", "Wang Taiwan"}},
+    {20105, {"x-IA5", "IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)"}},
+    {20106, {"x-IA5-German", "IA5 German (7-bit)"}},
+    {20107, {"x-IA5-Swedish", "IA5 Swedish (7-bit)"}},
+    {20108, {"x-IA5-Norwegian", "IA5 Norwegian (7-bit)"}},
+    {20127, {"us-ascii", "US-ASCII (7-bit)"}},
+    {20261, {"x-cp20261", "T.61"}},
+    {20269, {"x-cp20269", "ISO 6937 Non-Spacing Accent"}},
+    {20273, {"IBM273", "IBM EBCDIC Germany"}},
+    {20277, {"IBM277", "IBM EBCDIC Denmark-Norway"}},
+    {20278, {"IBM278", "IBM EBCDIC Finland-Sweden"}},
+    {20280, {"IBM280", "IBM EBCDIC Italy"}},
+    {20284, {"IBM284", "IBM EBCDIC Latin America-Spain"}},
+    {20285, {"IBM285", "IBM EBCDIC United Kingdom"}},
+    {20290, {"IBM290", "IBM EBCDIC Japanese Katakana Extended"}},
+    {20297, {"IBM297", "IBM EBCDIC France"}},
+    {20420, {"IBM420", "IBM EBCDIC Arabic"}},
+    {20423, {"IBM423", "IBM EBCDIC Greek"}},
+    {20424, {"IBM424", "IBM EBCDIC Hebrew"}},
+    {20833, {"x-EBCDIC-KoreanExtended", "IBM EBCDIC Korean Extended"}},
+    {20838, {"IBM-Thai", "IBM EBCDIC Thai"}},
+    {20866, {"koi8-r", "Russian (KOI8-R); Cyrillic (KOI8-R)"}},
+    {20871, {"IBM871", "IBM EBCDIC Icelandic"}},
+    {20880, {"IBM880", "IBM EBCDIC Cyrillic Russian"}},
+    {20905, {"IBM905", "IBM EBCDIC Turkish"}},
+    {20924, {"IBM00924", "IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)"}},
+    {20932, {"EUC-JP", "Japanese (JIS 0208-1990 and 0212-1990)"}},
+    {20936, {"x-cp20936", "Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)"}},
+    {20949, {"x-cp20949", "Korean Wansung"}},
+    {21025, {"cp1025", "IBM EBCDIC Cyrillic Serbian-Bulgarian"}},
+    {21027, {"", "(deprecated)"}},
+    {21866, {"koi8-u", "Ukrainian (KOI8-U); Cyrillic (KOI8-U)"}},
+    {28591, {"iso-8859-1", "ISO 8859-1 Latin 1; Western European (ISO)"}},
+    {28592, {"iso-8859-2", "ISO 8859-2 Central European; Central European (ISO)"}},
+    {28593, {"iso-8859-3", "ISO 8859-3 Latin 3"}},
+    {28594, {"iso-8859-4", "ISO 8859-4 Baltic"}},
+    {28595, {"iso-8859-5", "ISO 8859-5 Cyrillic"}},
+    {28596, {"iso-8859-6", "ISO 8859-6 Arabic"}},
+    {28597, {"iso-8859-7", "ISO 8859-7 Greek"}},
+    {28598, {"iso-8859-8", "ISO 8859-8 Hebrew; Hebrew (ISO-Visual)"}},
+    {28599, {"iso-8859-9", "ISO 8859-9 Turkish"}},
+    {28603, {"iso-8859-13", "ISO 8859-13 Estonian"}},
+    {28605, {"iso-8859-15", "ISO 8859-15 Latin 9"}},
+    {29001, {"x-Europa", "Europa 3"}},
+    {38598, {"iso-8859-8-i", "ISO 8859-8 Hebrew; Hebrew (ISO-Logical)"}},
+    {50220, {"iso-2022-jp", "ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)"}},
+    {50221, {"csISO2022JP", "ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)"}},
+    {50222, {"iso-2022-jp", "ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)"}},
+    {50225, {"iso-2022-kr", "ISO 2022 Korean"}},
+    {50227, {"x-cp50227", "ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)"}},
+    {50229, {"", "ISO 2022 Traditional Chinese"}},
+    {50930, {"", "EBCDIC Japanese (Katakana) Extended"}},
+    {50931, {"", "EBCDIC US-Canada and Japanese"}},
+    {50933, {"", "EBCDIC Korean Extended and Korean"}},
+    {50935, {"", "EBCDIC Simplified Chinese Extended and Simplified Chinese"}},
+    {50936, {"", "EBCDIC Simplified Chinese"}},
+    {50937, {"", "EBCDIC US-Canada and Traditional Chinese"}},
+    {50939, {"", "EBCDIC Japanese (Latin) Extended and Japanese"}},
+    {51932, {"euc-jp", "EUC Japanese"}},
+    {51936, {"EUC-CN", "EUC Simplified Chinese; Chinese Simplified (EUC)"}},
+    {51949, {"euc-kr", "EUC Korean"}},
+    {51950, {"", "EUC Traditional Chinese"}},
+    {52936, {"hz-gb-2312", "HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)"}},
+    {54936, {"GB18030", "Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)"}},
+    {57002, {"x-iscii-de", "ISCII Devanagari"}},
+    {57003, {"x-iscii-be", "ISCII Bangla"}},
+    {57004, {"x-iscii-ta", "ISCII Tamil"}},
+    {57005, {"x-iscii-te", "ISCII Telugu"}},
+    {57006, {"x-iscii-as", "ISCII Assamese"}},
+    {57007, {"x-iscii-or", "ISCII Odia"}},
+    {57008, {"x-iscii-ka", "ISCII Kannada"}},
+    {57009, {"x-iscii-ma", "ISCII Malayalam"}},
+    {57010, {"x-iscii-gu", "ISCII Gujarati"}},
+    {57011, {"x-iscii-pa", "ISCII Punjabi"}},
+    {65000, {"utf-7", "Unicode (UTF-7)"}},
+    {65001, {"utf-8", "Unicode (UTF-8)"}},
+    };
+
+// Name returned when the active code page is unknown or unnamed.
+static const string cp1252("CP1252");
+
+// Return the name of the current Windows ANSI code page (GetACP()).
+// Falls back to CP1252 when the code page is missing from the table
+// or listed there without a name, so the result is never empty.
+const string& winACPName()
+{
+    const unsigned int acp = GetACP();
+    const auto it = cpdefs.find(acp);
+    if (it == cpdefs.end() || it->second.cpname.empty()) {
+        return cp1252;
+    }
+    return it->second.cpname;
+}
diff --git a/src/windows/wincodepages.h b/src/windows/wincodepages.h
new file mode 100644
index 00000000..bb244832
--- /dev/null
+++ b/src/windows/wincodepages.h
@@ -0,0 +1,10 @@
+#ifndef WINCODEPAGES_H_
+#define WINCODEPAGES_H_
+
+#include <string>
+
+// Return the name of the current Windows ANSI code page, or "CP1252"
+// when the code page is unknown or has no name in the table.
+extern const std::string& winACPName();
+
+#endif // WINCODEPAGES_H_