diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp
index a4770d57..b9f2f154 100644
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -321,8 +321,9 @@ bool RclConfig::updateMainConfig()
m_mdrstate.init(m_conf);
setKeyDir(cstr_null);
- bool nocjk = false;
- if (getConfParam("nocjk", &nocjk) && nocjk == true) {
+
+ bool bvalue = false;
+ if (getConfParam("nocjk", &bvalue) && bvalue == true) {
TextSplit::cjkProcessing(false);
} else {
int ngramlen;
@@ -333,14 +334,18 @@ bool RclConfig::updateMainConfig()
}
}
- bool nonum = false;
- if (getConfParam("nonumbers", &nonum) && nonum == true) {
+ bvalue = false;
+ if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
TextSplit::noNumbers();
}
- bool fnmpathname = true;
- if (getConfParam("skippedPathsFnmPathname", &fnmpathname)
- && fnmpathname == false) {
+ bvalue = false;
+ if (getConfParam("dehyphenate", &bvalue)) {
+ TextSplit::deHyphenate(bvalue);
+ }
+
+ bvalue = true;
+ if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
FsTreeWalker::setNoFnmPathname();
}
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 728edb90..6807a7a2 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -212,6 +212,7 @@ bool TextSplit::isCJK(int c)
bool TextSplit::o_processCJK = true;
unsigned int TextSplit::o_CJKNgramLen = 2;
bool TextSplit::o_noNumbers = false;
+bool TextSplit::o_deHyphenate = true; // On by default (as documented); rclconfig overrides it from "dehyphenate"
// Final term checkpoint: do some checking (the kind which is simpler
// to do here than in the main loop), then send term to our client.
@@ -309,7 +310,8 @@ bool TextSplit::words_from_span(int bp)
// Byte position of the span start
int spboffs = bp - m_span.size();
- if (spanwords == 2 && m_span[m_words_in_span[0].second] == '-') {
+ if (o_deHyphenate && spanwords == 2 &&
+ m_span[m_words_in_span[0].second] == '-') {
unsigned int s0 = m_words_in_span[0].first;
unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first;
unsigned int s1 = m_words_in_span[1].first;
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index 89de6b24..ff0ab31e 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -52,6 +52,13 @@ public:
o_noNumbers = true;
}
+ // Flag: given a two-word hyphenated span like [co-worker] as input, do we
+ // also generate [coworker]? Set by rclconfig from the "dehyphenate" variable.
+ static bool o_deHyphenate;
+ static void deHyphenate(bool on) {
+ o_deHyphenate = on;
+ }
+
enum Flags {
// Default: will return spans and words (a_b, a, b)
TXTS_NONE = 0,
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index cbe8c549..51e9c350 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -5591,6 +5591,15 @@ skippedPaths = ~/somedir/*.txt
+ dehyphenate
+ Determines whether, given an input of
+ co-worker, we also index the term
+ coworker. This feature is new in version
+ 1.22, and on by default. Setting the variable to off
+ restores the previous behaviour.
+
+
+
nocjk
If this set to true, specific east asian
(Chinese Korean Japanese) characters/word splitting is
diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in
index bf0d9bcc..327b2366 100644
--- a/src/sampleconf/recoll.conf.in
+++ b/src/sampleconf/recoll.conf.in
@@ -134,6 +134,13 @@ indexstemminglanguages = english
# are not performed by unac, but I cant imagine someone typing the composed
# forms in a search.
unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl
+# Set to 1 to turn off the indexing of numbers: this may reduce the index
+# size if you have no use for them.
+# nonumbers = 0
+
+# Index "coworker" for an input of "co-worker" (in addition to co, worker
+# and "co worker"). On by default as of version 1.22; set to 0 to turn off.
+# dehyphenate = 1
# Maximum expansion count for a single term (ie: when using wildcards).
# We used to not limit this at all (except for filenames where the limit