From 3dfa44c9a38f036f5ad9543bd9e044a966d89b98 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Wed, 3 Oct 2012 09:07:59 +0200
Subject: [PATCH] add a class for skipped characters

---
 src/common/textsplit.cpp | 32 ++++++++++++++++++++------------
 src/common/uproplist.h   | 23 +++++++++++++++++++----
 2 files changed, 39 insertions(+), 16 deletions(-)
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index d0431d46..ae3a5d98 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -53,16 +53,17 @@ using namespace std;
 // because it makes some tests in the code simpler.
 const unsigned int charclasses_size = 256;
 enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259, 
-                A_ULETTER=260, A_LLETTER=261};
+                A_ULETTER=260, A_LLETTER=261, SKIP=262};
 static int charclasses[charclasses_size];
 
 // Real UTF-8 characters are handled with sets holding all characters
 // with interesting properties. This is far from full-blown management
 // of Unicode properties, but seems to do the job well enough in most
 // common cases
-static unordered_set<unsigned int> unicign;
+static vector<unsigned int> vpuncblocks;
+static unordered_set<unsigned int> spunc;
 static unordered_set<unsigned int> visiblewhite;
-static vector<unsigned int> vignblocks;
+static unordered_set<unsigned int> sskip;
 
 class CharClassInit {
 public:
@@ -94,19 +95,22 @@ public:
 	for (i = 0; i  < strlen(special); i++)
 	    charclasses[int(special[i])] = special[i];
 
-	for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
-	    unicign.insert(uniign[i]);
+	for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) {
+	    spunc.insert(unipunc[i]);
 	}
-	unicign.insert((unsigned int)-1);
+	spunc.insert((unsigned int)-1);
 
-	for (i = 0; i < sizeof(uniignblocks) / sizeof(int); i++) {
-	    vignblocks.push_back(uniignblocks[i]);
+	for (i = 0; i < sizeof(unipuncblocks) / sizeof(int); i++) {
+	    vpuncblocks.push_back(unipuncblocks[i]);
 	}
-	assert((vignblocks.size() % 2) == 0);
+	assert((vpuncblocks.size() % 2) == 0);
 
 	for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
 	    visiblewhite.insert(avsbwht[i]);
 	}
+	for (i = 0; i < sizeof(uniskip) / sizeof(int); i++) {
+	    sskip.insert(uniskip[i]);
+	}
     }
 };
 static const CharClassInit charClassInitInstance;
@@ -116,14 +120,16 @@ static inline int whatcc(unsigned int c)
     if (c <= 127) {
 	return charclasses[c]; 
     } else {
-	if (unicign.find(c) != unicign.end()) {
+	if (sskip.find(c) != sskip.end()) {
+	    return SKIP;
+	} else if (spunc.find(c) != spunc.end()) {
 	    return SPACE;
 	} else {
 	    vector<unsigned int>::iterator it = 
-		lower_bound(vignblocks.begin(), vignblocks.end(), c);
+		lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c);
 	    if (c == *it)
 		return SPACE;
-	    if ((it - vignblocks.begin()) % 2 == 1) {
+	    if ((it - vpuncblocks.begin()) % 2 == 1) {
 		return SPACE;
 	    } else {
 		return LETTER;
@@ -385,6 +391,8 @@ bool TextSplit::text_to_words(const string &in)
 
 	int cc = whatcc(c);
 	switch (cc) {
+	case SKIP:
+	    continue;
 	case DIGIT:
 	    if (m_wordLen == 0)
 		m_inNumber = true;
diff --git a/src/common/uproplist.h b/src/common/uproplist.h
index 016bf922..84267e26 100644
--- a/src/common/uproplist.h
+++ b/src/common/uproplist.h
@@ -27,9 +27,9 @@
  * says. 
 */
 
-// Blocks array. Each block is defined by a starting and ending code
-// point (both included). MUST BE SORTED.
-static const unsigned uniignblocks[] = {
+// Punctuation character blocks array.  Each block is defined by a
+// starting and ending code point (both included). MUST BE SORTED.
+static const unsigned unipuncblocks[] = {
     // Start of latin-1 supplement block, up to capital A grave
     0x0080, 0x00BF,
     // General punctuation
@@ -78,7 +78,9 @@ static const unsigned uniignblocks[] = {
     0x2B00, 0x2BFF,
 };
 
-static const unsigned int uniign[] = {
+// Other punctuation characters list. Not all punctuation is in a
+// separate block; some is found in the middle of alphanumeric codes.
+static const unsigned int unipunc[] = {
     0x00D7, /* MULTIPLICATION SIGN */
     0x00F7, /* DIVISION SIGN */
     0x037E, /* GREEK QUESTION MARK */
@@ -156,6 +158,19 @@ static const unsigned int uniign[] = {
     0xFF65, /* HALFWIDTH KATAKANA MIDDLE DOT*/
 };
 
+// Characters that should just be discarded. Some of these are in the
+// above blocks, but this array is tested first, so it's not worth
+// breaking the blocks.
+static const unsigned int uniskip[] = {
+    0x00AD, /* SOFT HYPHEN */
+    0x034F, /* COMBINING GRAPHEME JOINER */
+    0x2027, /* HYPHENATION POINT */
+    0x200C, /* ZERO WIDTH NON-JOINER */
+    0x200D, /* ZERO WIDTH JOINER */
+    0x2060, /* WORD JOINER . Actually this should not be ignored but used to 
+	     * prevent a word break... */
+};
+
 /* Things that would visibly break a block of text, rendering obvious the need
  * of quotation if a phrase search is wanted */
 static const unsigned int avsbwht[] = {