From 3dfa44c9a38f036f5ad9543bd9e044a966d89b98 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 3 Oct 2012 09:07:59 +0200 Subject: [PATCH] add a class for skipped characters --- src/common/textsplit.cpp | 32 ++++++++++++++++++++------------ src/common/uproplist.h | 23 +++++++++++++++++++---- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index d0431d46..ae3a5d98 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -53,16 +53,17 @@ using namespace std; // because it makes some tests in the code simpler. const unsigned int charclasses_size = 256; enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259, - A_ULETTER=260, A_LLETTER=261}; + A_ULETTER=260, A_LLETTER=261, SKIP=262}; static int charclasses[charclasses_size]; // Real UTF-8 characters are handled with sets holding all characters // with interesting properties. This is far from full-blown management // of Unicode properties, but seems to do the job well enough in most // common cases -static unordered_set unicign; +static vector vpuncblocks; +static unordered_set spunc; static unordered_set visiblewhite; -static vector vignblocks; +static unordered_set sskip; class CharClassInit { public: @@ -94,19 +95,22 @@ public: for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; - for (i = 0; i < sizeof(uniign) / sizeof(int); i++) { - unicign.insert(uniign[i]); + for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) { + spunc.insert(unipunc[i]); } - unicign.insert((unsigned int)-1); + spunc.insert((unsigned int)-1); - for (i = 0; i < sizeof(uniignblocks) / sizeof(int); i++) { - vignblocks.push_back(uniignblocks[i]); + for (i = 0; i < sizeof(unipuncblocks) / sizeof(int); i++) { + vpuncblocks.push_back(unipuncblocks[i]); } - assert((vignblocks.size() % 2) == 0); + assert((vpuncblocks.size() % 2) == 0); for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) { visiblewhite.insert(avsbwht[i]); } + for 
(i = 0; i < sizeof(uniskip) / sizeof(int); i++) { + sskip.insert(uniskip[i]); + } } }; static const CharClassInit charClassInitInstance; @@ -116,14 +120,16 @@ static inline int whatcc(unsigned int c) if (c <= 127) { return charclasses[c]; } else { - if (unicign.find(c) != unicign.end()) { + if (sskip.find(c) != sskip.end()) { + return SKIP; + } else if (spunc.find(c) != spunc.end()) { return SPACE; } else { vector::iterator it = - lower_bound(vignblocks.begin(), vignblocks.end(), c); + lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c); if (c == *it) return SPACE; - if ((it - vignblocks.begin()) % 2 == 1) { + if ((it - vpuncblocks.begin()) % 2 == 1) { return SPACE; } else { return LETTER; @@ -385,6 +391,8 @@ bool TextSplit::text_to_words(const string &in) int cc = whatcc(c); switch (cc) { + case SKIP: + continue; case DIGIT: if (m_wordLen == 0) m_inNumber = true; diff --git a/src/common/uproplist.h b/src/common/uproplist.h index 016bf922..84267e26 100644 --- a/src/common/uproplist.h +++ b/src/common/uproplist.h @@ -27,9 +27,9 @@ * says. */ -// Blocks array. Each block is defined by a starting and ending code -// point (both included). MUST BE SORTED. -static const unsigned uniignblocks[] = { +// Punctuation character blocks array. Each block is defined by a +// starting and ending code point (both included). MUST BE SORTED. +static const unsigned unipuncblocks[] = { // Start of latin-1 supplement block, up to capital A grave 0x0080, 0x00BF, // General punctuation @@ -78,7 +78,9 @@ static const unsigned uniignblocks[] = { 0x2B00, 0x2BFF, }; -static const unsigned int uniign[] = { +// Other punctuation characters list. Not all punctuation is in a +// separate block; some is found in the middle of alphanumeric codes. 
+static const unsigned int unipunc[] = { 0x00D7, /* MULTIPLICATION SIGN */ 0x00F7, /* DIVISION SIGN */ 0x037E, /* GREEK QUESTION MARK */ @@ -156,6 +158,19 @@ static const unsigned int uniign[] = { 0xFF65, /* HALFWIDTH KATAKANA MIDDLE DOT*/ }; +// Characters that should just be discarded. Some of these are in the +// above blocks, but this array is tested first, so it's not worth +// breaking the blocks. +static const unsigned int uniskip[] = { + 0x00AD, /* SOFT HYPHEN */ + 0x034F, /* COMBINING GRAPHEME JOINER */ + 0x2027, /* HYPHENATION POINT */ + 0x200C, /* ZERO WIDTH NON-JOINER */ + 0x200D, /* ZERO WIDTH JOINER */ + 0x2060, /* WORD JOINER. Actually this should not be ignored but used to + * prevent a word break... */ +}; + /* Things that would visibly break a block of text, rendering obvious the need * of quotation if a phrase search is wanted */ static const unsigned int avsbwht[] = {