Make dehyphenate (co-worker->coworker) optional

This commit is contained in:
Jean-Francois Dockes 2015-08-19 11:34:26 +02:00
parent 5c001aec83
commit bd3e930533
5 changed files with 38 additions and 8 deletions

View file

@ -321,8 +321,9 @@ bool RclConfig::updateMainConfig()
m_mdrstate.init(m_conf);
setKeyDir(cstr_null);
bool nocjk = false;
if (getConfParam("nocjk", &nocjk) && nocjk == true) {
bool bvalue = false;
if (getConfParam("nocjk", &bvalue) && bvalue == true) {
TextSplit::cjkProcessing(false);
} else {
int ngramlen;
@ -333,14 +334,18 @@ bool RclConfig::updateMainConfig()
}
}
bool nonum = false;
if (getConfParam("nonumbers", &nonum) && nonum == true) {
bvalue = false;
if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
TextSplit::noNumbers();
}
bool fnmpathname = true;
if (getConfParam("skippedPathsFnmPathname", &fnmpathname)
&& fnmpathname == false) {
bvalue = false;
if (getConfParam("dehyphenate", &bvalue)) {
TextSplit::deHyphenate(bvalue);
}
bvalue = true;
if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
FsTreeWalker::setNoFnmPathname();
}

View file

@ -212,6 +212,7 @@ bool TextSplit::isCJK(int c)
// Class-wide (static) splitter settings and their built-in initial values.
bool TextSplit::o_processCJK = true;
unsigned int TextSplit::o_CJKNgramLen = 2;
bool TextSplit::o_noNumbers = false;
// Dehyphenation (also generating [coworker] for [co-worker]) stays off
// unless it is enabled through TextSplit::deHyphenate().
bool TextSplit::o_deHyphenate = false;
// Final term checkpoint: do some checking (the kind which is simpler
// to do here than in the main loop), then send term to our client.
@ -309,7 +310,8 @@ bool TextSplit::words_from_span(int bp)
// Byte position of the span start
int spboffs = bp - m_span.size();
if (spanwords == 2 && m_span[m_words_in_span[0].second] == '-') {
if (o_deHyphenate && spanwords == 2 &&
m_span[m_words_in_span[0].second] == '-') {
unsigned int s0 = m_words_in_span[0].first;
unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first;
unsigned int s1 = m_words_in_span[1].first;

View file

@ -52,6 +52,13 @@ public:
o_noNumbers = true;
}
// Dehyphenation switch: given [co-worker] as input, do we also generate
// [coworker] ? Checked by words_from_span() before it handles a two-word
// span whose words are joined by '-'.
// Set by rclconfig from the "dehyphenate" configuration parameter.
static bool o_deHyphenate;
// Turn dehyphenation on (true) or off (false). The flag is static, so the
// setting is shared class-wide rather than held per TextSplit object.
static void deHyphenate(bool on) {
o_deHyphenate = on;
}
enum Flags {
// Default: will return spans and words (a_b, a, b)
TXTS_NONE = 0,

View file

@ -5591,6 +5591,15 @@ skippedPaths = ~/somedir/*.txt
</listitem>
</varlistentry>
<varlistentry><term><varname>dehyphenate</varname></term>
<listitem><para>Determines whether, given an input of
<literal>co-worker</literal>, we also add a term for
<literal>coworker</literal>. This feature is new in version
1.22, and on by default. Setting the variable to off
restores the previous behaviour.</para>
</listitem>
</varlistentry>
<varlistentry><term><varname>nocjk</varname></term>
<listitem><para>If this is set to true, specific East Asian
(Chinese, Korean, Japanese) character/word splitting is

View file

@ -134,6 +134,13 @@ indexstemminglanguages = english
# are not performed by unac, but I can't imagine someone typing the composed
# forms in a search.
unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl
# Turn off the indexing of numbers: may reduce the index size if you have
# no use for them
# nonumbers = 0
# Index "coworker" for an input of "co-worker" (in addition to co, worker,
# and "co worker"). Default is on as of version 1.22; set to 0 to turn off.
# dehyphenate = 1
# Maximum expansion count for a single term (ie: when using wildcards).
# We used to not limit this at all (except for filenames where the limit