diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index a4770d57..b9f2f154 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -321,8 +321,9 @@ bool RclConfig::updateMainConfig() m_mdrstate.init(m_conf); setKeyDir(cstr_null); - bool nocjk = false; - if (getConfParam("nocjk", &nocjk) && nocjk == true) { + + bool bvalue = false; + if (getConfParam("nocjk", &bvalue) && bvalue == true) { TextSplit::cjkProcessing(false); } else { int ngramlen; @@ -333,14 +334,18 @@ bool RclConfig::updateMainConfig() } } - bool nonum = false; - if (getConfParam("nonumbers", &nonum) && nonum == true) { + bvalue = false; + if (getConfParam("nonumbers", &bvalue) && bvalue == true) { TextSplit::noNumbers(); } - bool fnmpathname = true; - if (getConfParam("skippedPathsFnmPathname", &fnmpathname) - && fnmpathname == false) { + bvalue = true; // documented default: dehyphenation is on unless configured off + if (getConfParam("dehyphenate", &bvalue)) { + TextSplit::deHyphenate(bvalue); + } + + bvalue = true; + if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) { FsTreeWalker::setNoFnmPathname(); } diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 728edb90..6807a7a2 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -212,6 +212,7 @@ bool TextSplit::isCJK(int c) bool TextSplit::o_processCJK = true; unsigned int TextSplit::o_CJKNgramLen = 2; bool TextSplit::o_noNumbers = false; +bool TextSplit::o_deHyphenate = true; // on by default (matches the "dehyphenate" documentation); rclconfig may override // Final term checkpoint: do some checking (the kind which is simpler // to do here than in the main loop), then send term to our client. 
@@ -309,7 +310,8 @@ bool TextSplit::words_from_span(int bp) // Byte position of the span start int spboffs = bp - m_span.size(); - if (spanwords == 2 && m_span[m_words_in_span[0].second] == '-') { + if (o_deHyphenate && spanwords == 2 && + m_span[m_words_in_span[0].second] == '-') { unsigned int s0 = m_words_in_span[0].first; unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first; unsigned int s1 = m_words_in_span[1].first; diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 89de6b24..ff0ab31e 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -52,6 +52,13 @@ public: o_noNumbers = true; } + // Given [co-worker] as input, do we also generate [coworker] ? + // Set by rclconfig + static bool o_deHyphenate; + static void deHyphenate(bool on) { + o_deHyphenate = on; + } + enum Flags { // Default: will return spans and words (a_b, a, b) TXTS_NONE = 0, diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index cbe8c549..51e9c350 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -5591,6 +5591,15 @@ skippedPaths = ~/somedir/*.txt + dehyphenate + Determines if, given an input of + co-worker, we add a term for + coworker. This possibility is new in version + 1.22, and on by default. Setting the variable to off allows + restoring the previous behaviour. + + + nocjk If this set to true, specific east asian (Chinese Korean Japanese) characters/word splitting is diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index bf0d9bcc..327b2366 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -134,6 +134,13 @@ indexstemminglanguages = english # are not performed by unac, but I cant imagine someone typing the composed # forms in a search. 
unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl +# Turn off the indexing of numbers: may reduce the index size if you have +# no use for them +# nonumbers = 0 + +# Index "coworker" for an input of "co-worker" (in addition to co, worker, +# "co worker"). Default is on as of version 1.22; set to 0 to turn it off. +# dehyphenate = 1 # Maximum expansion count for a single term (ie: when using wildcards). # We used to not limit this at all (except for filenames where the limit