Make dehyphenate (co-worker->coworker) optional

2015-08-19 11:34:26 +02:00 · 2015-08-19 11:34:26 +02:00 · bd3e930533
commit bd3e930533
parent 5c001aec83
5 changed files with 38 additions and 8 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -321,8 +321,9 @@ bool RclConfig::updateMainConfig()
    m_mdrstate.init(m_conf);
    setKeyDir(cstr_null);
-    bool nocjk = false;
+
-    if (getConfParam("nocjk", &nocjk) && nocjk == true) {
+    bool bvalue = false;
    if (getConfParam("nocjk", &bvalue) && bvalue == true) {
 	TextSplit::cjkProcessing(false);
    } else {
 	int ngramlen;
@ -333,14 +334,18 @@ bool RclConfig::updateMainConfig()
 	}
    }
-    bool nonum = false;
+    bvalue = false;
-    if (getConfParam("nonumbers", &nonum) && nonum == true) {
+    if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
 	TextSplit::noNumbers();
    }
-    bool fnmpathname = true;
+    bvalue = false;
-    if (getConfParam("skippedPathsFnmPathname", &fnmpathname)
+    if (getConfParam("dehyphenate", &bvalue)) {
-	&& fnmpathname == false) {
+	TextSplit::deHyphenate(bvalue);
    }
    bvalue = true;
    if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
 	FsTreeWalker::setNoFnmPathname();
    }
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -212,6 +212,7 @@ bool TextSplit::isCJK(int c)
 // Definitions of the class-wide (static) splitter options, with their
 // initial values.
 bool          TextSplit::o_processCJK = true;
 unsigned int  TextSplit::o_CJKNgramLen = 2;
 bool          TextSplit::o_noNumbers = false;
 // NOTE(review): initialized to false, whereas the user manual and the
 // sample configuration describe dehyphenation as on by default --
 // confirm which default is intended.
 bool          TextSplit::o_deHyphenate = false;
 // Final term checkpoint: do some checking (the kind which is simpler
 // to do here than in the main loop), then send term to our client.
@ -309,7 +310,8 @@ bool TextSplit::words_from_span(int bp)
    // Byte position of the span start
    int spboffs = bp - m_span.size();
-    if (spanwords == 2 && m_span[m_words_in_span[0].second] == '-') {
+    if (o_deHyphenate && spanwords == 2 && 
 	m_span[m_words_in_span[0].second] == '-') {
 	unsigned int s0 = m_words_in_span[0].first;
 	unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first;
 	unsigned int s1 = m_words_in_span[1].first;
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -52,6 +52,13 @@ public:
 	o_noNumbers = true;
    }
    // Should an input of [co-worker] produce the joined term [coworker]
    // as well? The flag is changed through deHyphenate(), which rclconfig
    // calls with the configured value.
    static bool o_deHyphenate;
    static void deHyphenate(bool on) { o_deHyphenate = on; }
    enum Flags {
        // Default: will return spans and words (a_b, a, b)
        TXTS_NONE = 0, 
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@ -5591,6 +5591,15 @@ skippedPaths = ~/somedir/*.txt
            </listitem>
          </varlistentry>
          <varlistentry><term><varname>dehyphenate</varname></term>
            <listitem><para>Determines if, given an input of
            <literal>co-worker</literal>, we add a term for
            <literal>coworker</literal>. This possibility is new in version
            1.22, and on by default. Setting the variable to off allows
            restoring the previous behaviour.</para>
            </listitem>
          </varlistentry>
          <varlistentry><term><varname>nocjk</varname></term>
            <listitem><para>If this is set to true, specific East Asian
            (Chinese Korean Japanese) characters/word splitting is
--- a/src/sampleconf/recoll.conf.in
+++ b/src/sampleconf/recoll.conf.in
@ -134,6 +134,13 @@ indexstemminglanguages = english
 # are not performed by unac, but I can't imagine someone typing the composed
 # forms in a search.
 unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl
 # Turn off the indexing of numbers: may reduce the index size if you have
 # no use for them 
 # nonumbers = 0
 # Turn off indexing "coworker" for an input of "co-worker" (in addition to
 # co, worker, "co worker"). Default is on as of version 1.22.
 # dehyphenate = 1
 # Maximum expansion count for a single term (ie: when using wildcards).
 # We used to not limit this at all (except for filenames where the limit