add a class for skipped characters
This commit is contained in:
parent
343e4f4f17
commit
3dfa44c9a3
2 changed files with 39 additions and 16 deletions
|
@ -53,16 +53,17 @@ using namespace std;
|
||||||
// because it makes some tests in the code simpler.
|
// because it makes some tests in the code simpler.
|
||||||
const unsigned int charclasses_size = 256;
|
const unsigned int charclasses_size = 256;
|
||||||
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
|
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
|
||||||
A_ULETTER=260, A_LLETTER=261};
|
A_ULETTER=260, A_LLETTER=261, SKIP=262};
|
||||||
static int charclasses[charclasses_size];
|
static int charclasses[charclasses_size];
|
||||||
|
|
||||||
// Real UTF-8 characters are handled with sets holding all characters
|
// Real UTF-8 characters are handled with sets holding all characters
|
||||||
// with interesting properties. This is far from full-blown management
|
// with interesting properties. This is far from full-blown management
|
||||||
// of Unicode properties, but seems to do the job well enough in most
|
// of Unicode properties, but seems to do the job well enough in most
|
||||||
// common cases
|
// common cases
|
||||||
static unordered_set<unsigned int> unicign;
|
static vector<unsigned int> vpuncblocks;
|
||||||
|
static unordered_set<unsigned int> spunc;
|
||||||
static unordered_set<unsigned int> visiblewhite;
|
static unordered_set<unsigned int> visiblewhite;
|
||||||
static vector<unsigned int> vignblocks;
|
static unordered_set<unsigned int> sskip;
|
||||||
|
|
||||||
class CharClassInit {
|
class CharClassInit {
|
||||||
public:
|
public:
|
||||||
|
@ -94,19 +95,22 @@ public:
|
||||||
for (i = 0; i < strlen(special); i++)
|
for (i = 0; i < strlen(special); i++)
|
||||||
charclasses[int(special[i])] = special[i];
|
charclasses[int(special[i])] = special[i];
|
||||||
|
|
||||||
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) {
|
||||||
unicign.insert(uniign[i]);
|
spunc.insert(unipunc[i]);
|
||||||
}
|
}
|
||||||
unicign.insert((unsigned int)-1);
|
spunc.insert((unsigned int)-1);
|
||||||
|
|
||||||
for (i = 0; i < sizeof(uniignblocks) / sizeof(int); i++) {
|
for (i = 0; i < sizeof(unipuncblocks) / sizeof(int); i++) {
|
||||||
vignblocks.push_back(uniignblocks[i]);
|
vpuncblocks.push_back(unipuncblocks[i]);
|
||||||
}
|
}
|
||||||
assert((vignblocks.size() % 2) == 0);
|
assert((vpuncblocks.size() % 2) == 0);
|
||||||
|
|
||||||
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
||||||
visiblewhite.insert(avsbwht[i]);
|
visiblewhite.insert(avsbwht[i]);
|
||||||
}
|
}
|
||||||
|
for (i = 0; i < sizeof(uniskip) / sizeof(int); i++) {
|
||||||
|
sskip.insert(uniskip[i]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
static const CharClassInit charClassInitInstance;
|
static const CharClassInit charClassInitInstance;
|
||||||
|
@ -116,14 +120,16 @@ static inline int whatcc(unsigned int c)
|
||||||
if (c <= 127) {
|
if (c <= 127) {
|
||||||
return charclasses[c];
|
return charclasses[c];
|
||||||
} else {
|
} else {
|
||||||
if (unicign.find(c) != unicign.end()) {
|
if (sskip.find(c) != sskip.end()) {
|
||||||
|
return SKIP;
|
||||||
|
} else if (spunc.find(c) != spunc.end()) {
|
||||||
return SPACE;
|
return SPACE;
|
||||||
} else {
|
} else {
|
||||||
vector<unsigned int>::iterator it =
|
vector<unsigned int>::iterator it =
|
||||||
lower_bound(vignblocks.begin(), vignblocks.end(), c);
|
lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c);
|
||||||
if (c == *it)
|
if (c == *it)
|
||||||
return SPACE;
|
return SPACE;
|
||||||
if ((it - vignblocks.begin()) % 2 == 1) {
|
if ((it - vpuncblocks.begin()) % 2 == 1) {
|
||||||
return SPACE;
|
return SPACE;
|
||||||
} else {
|
} else {
|
||||||
return LETTER;
|
return LETTER;
|
||||||
|
@ -385,6 +391,8 @@ bool TextSplit::text_to_words(const string &in)
|
||||||
|
|
||||||
int cc = whatcc(c);
|
int cc = whatcc(c);
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
|
case SKIP:
|
||||||
|
continue;
|
||||||
case DIGIT:
|
case DIGIT:
|
||||||
if (m_wordLen == 0)
|
if (m_wordLen == 0)
|
||||||
m_inNumber = true;
|
m_inNumber = true;
|
||||||
|
|
|
@ -27,9 +27,9 @@
|
||||||
* says.
|
* says.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Blocks array. Each block is defined by a starting and ending code
|
// Punctuation characters blocks array. Each block is defined by a
|
||||||
// point (both included). MUST BE SORTED.
|
// starting and ending code point (both included). MUST BE SORTED.
|
||||||
static const unsigned uniignblocks[] = {
|
static const unsigned unipuncblocks[] = {
|
||||||
// Start of latin-1 supplement block, up to capital A grave
|
// Start of latin-1 supplement block, up to capital A grave
|
||||||
0x0080, 0x00BF,
|
0x0080, 0x00BF,
|
||||||
// General punctuation
|
// General punctuation
|
||||||
|
@ -78,7 +78,9 @@ static const unsigned uniignblocks[] = {
|
||||||
0x2B00, 0x2BFF,
|
0x2B00, 0x2BFF,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const unsigned int uniign[] = {
|
// Other punctuation characters list. Not all punctuation is in a
|
||||||
|
// separate block; some is found in the middle of alphanumeric codes.
|
||||||
|
static const unsigned int unipunc[] = {
|
||||||
0x00D7, /* MULTIPLICATION SIGN */
|
0x00D7, /* MULTIPLICATION SIGN */
|
||||||
0x00F7, /* DIVISION SIGN */
|
0x00F7, /* DIVISION SIGN */
|
||||||
0x037E, /* GREEK QUESTION MARK */
|
0x037E, /* GREEK QUESTION MARK */
|
||||||
|
@ -156,6 +158,19 @@ static const unsigned int uniign[] = {
|
||||||
0xFF65, /* HALFWIDTH KATAKANA MIDDLE DOT*/
|
0xFF65, /* HALFWIDTH KATAKANA MIDDLE DOT*/
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Characters that should just be discarded. Some of these are in the
|
||||||
|
// above blocks, but this array is tested first, so it's not worth
|
||||||
|
// breaking the blocks
|
||||||
|
static const unsigned int uniskip[] = {
|
||||||
|
0x00AD, /* SOFT HYPHEN */
|
||||||
|
0x034F, /* COMBINING GRAPHEME JOINER */
|
||||||
|
0x2027, /* HYPHENATION POINT */
|
||||||
|
0x200C, /* ZERO WIDTH NON-JOINER */
|
||||||
|
0x200D, /* ZERO WIDTH JOINER */
|
||||||
|
0x2060, /* WORD JOINER . Actually this should not be ignored but used to
|
||||||
|
* prevent a word break... */
|
||||||
|
};
|
||||||
|
|
||||||
/* Things that would visibly break a block of text, rendering obvious the need
|
/* Things that would visibly break a block of text, rendering obvious the need
|
||||||
* of quotation if a phrase search is wanted */
|
* of quotation if a phrase search is wanted */
|
||||||
static const unsigned int avsbwht[] = {
|
static const unsigned int avsbwht[] = {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue