987 lines
27 KiB
C++
987 lines
27 KiB
C++
/* Copyright (C) 2004 J.F.Dockes
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
#ifndef TEST_TEXTSPLIT
|
|
#include "autoconfig.h"
|
|
|
|
#include <assert.h>
|
|
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <set>
|
|
#include <cstring>
|
|
|
|
#include "textsplit.h"
|
|
#include "debuglog.h"
|
|
//#define UTF8ITER_CHECK
|
|
#include "utf8iter.h"
|
|
#include "uproplist.h"
|
|
|
|
#ifndef NO_NAMESPACES
|
|
using namespace std;
|
|
#endif /* NO_NAMESPACES */
|
|
|
|
/**
|
|
* Splitting a text into words. The code in this file works with utf-8
|
|
* in a semi-clean way (see uproplist.h). Ascii still gets special treatment.
|
|
*/
|
|
|
|
// Character classes: we have three main groups, and then some chars
|
|
// are their own class because they want special handling.
|
|
//
|
|
// We have an array with 256 slots where we keep the character types.
|
|
// The array could be fully static, but we use a small function to fill it
|
|
// once.
|
|
// The array is actually a remnant of the original version which did no utf8.
|
|
// Only the lower 127 slots are now used, but keep it at 256
|
|
// because it makes some tests in the code simpler.
|
|
const unsigned int charclasses_size = 256;
|
|
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
|
|
A_ULETTER=260, A_LLETTER=261};
|
|
static int charclasses[charclasses_size];
|
|
|
|
// Real UTF-8 characters are handled with sets holding all characters
|
|
// with interesting properties. This is far from full-blown management
|
|
// of Unicode properties, but seems to do the job well enough in most
|
|
// common cases
|
|
static set<unsigned int> unicign;
|
|
static set<unsigned int> visiblewhite;
|
|
|
|
class CharClassInit {
|
|
public:
|
|
CharClassInit()
|
|
{
|
|
unsigned int i;
|
|
|
|
// Set default value for all: SPACE
|
|
for (i = 0 ; i < 256 ; i ++)
|
|
charclasses[i] = SPACE;
|
|
|
|
char digits[] = "0123456789";
|
|
for (i = 0; i < strlen(digits); i++)
|
|
charclasses[int(digits[i])] = DIGIT;
|
|
|
|
char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
|
for (i = 0; i < strlen(upper); i++)
|
|
charclasses[int(upper[i])] = A_ULETTER;
|
|
|
|
char lower[] = "abcdefghijklmnopqrstuvwxyz";
|
|
for (i = 0; i < strlen(lower); i++)
|
|
charclasses[int(lower[i])] = A_LLETTER;
|
|
|
|
char wild[] = "*?[]";
|
|
for (i = 0; i < strlen(wild); i++)
|
|
charclasses[int(wild[i])] = WILD;
|
|
|
|
char special[] = ".@+-,#'_\n\r";
|
|
for (i = 0; i < strlen(special); i++)
|
|
charclasses[int(special[i])] = special[i];
|
|
|
|
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
|
unicign.insert(uniign[i]);
|
|
}
|
|
unicign.insert((unsigned int)-1);
|
|
|
|
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
|
visiblewhite.insert(avsbwht[i]);
|
|
}
|
|
}
|
|
};
|
|
static const CharClassInit charClassInitInstance;
|
|
|
|
static inline int whatcc(unsigned int c)
|
|
{
|
|
if (c <= 127) {
|
|
return charclasses[c];
|
|
} else {
|
|
if (unicign.find(c) != unicign.end())
|
|
return SPACE;
|
|
else
|
|
return LETTER;
|
|
}
|
|
}
|
|
|
|
|
|
// CJK Unicode character detection:
|
|
//
|
|
// 2E80..2EFF; CJK Radicals Supplement
|
|
// 3000..303F; CJK Symbols and Punctuation
|
|
// 3040..309F; Hiragana
|
|
// 30A0..30FF; Katakana
|
|
// 3100..312F; Bopomofo
|
|
// 3130..318F; Hangul Compatibility Jamo
|
|
// 3190..319F; Kanbun
|
|
// 31A0..31BF; Bopomofo Extended
|
|
// 31C0..31EF; CJK Strokes
|
|
// 31F0..31FF; Katakana Phonetic Extensions
|
|
// 3200..32FF; Enclosed CJK Letters and Months
|
|
// 3300..33FF; CJK Compatibility
|
|
// 3400..4DBF; CJK Unified Ideographs Extension A
|
|
// 4DC0..4DFF; Yijing Hexagram Symbols
|
|
// 4E00..9FFF; CJK Unified Ideographs
|
|
// A700..A71F; Modifier Tone Letters
|
|
// AC00..D7AF; Hangul Syllables
|
|
// F900..FAFF; CJK Compatibility Ideographs
|
|
// FE30..FE4F; CJK Compatibility Forms
|
|
// FF00..FFEF; Halfwidth and Fullwidth Forms
|
|
// 20000..2A6DF; CJK Unified Ideographs Extension B
|
|
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
|
// Note: the p > 127 test is not necessary, but optimizes away the ascii case
|
|
#define UNICODE_IS_CJK(p) \
|
|
((p) > 127 && \
|
|
(((p) >= 0x2E80 && (p) <= 0x2EFF) || \
|
|
((p) >= 0x3000 && (p) <= 0x9FFF) || \
|
|
((p) >= 0xA700 && (p) <= 0xA71F) || \
|
|
((p) >= 0xAC00 && (p) <= 0xD7AF) || \
|
|
((p) >= 0xF900 && (p) <= 0xFAFF) || \
|
|
((p) >= 0xFE30 && (p) <= 0xFE4F) || \
|
|
((p) >= 0xFF00 && (p) <= 0xFFEF) || \
|
|
((p) >= 0x20000 && (p) <= 0x2A6DF) || \
|
|
((p) >= 0x2F800 && (p) <= 0x2FA1F)))
|
|
|
|
bool TextSplit::isCJK(int c)
|
|
{
|
|
return UNICODE_IS_CJK(c);
|
|
}
|
|
|
|
bool TextSplit::o_processCJK = true;
|
|
unsigned int TextSplit::o_CJKNgramLen = 2;
|
|
bool TextSplit::o_noNumbers = false;
|
|
|
|
// Do some checking (the kind which is simpler to do here than in the
|
|
// main loop), then send term to our client.
|
|
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|
int btstart, int btend)
|
|
{
|
|
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
|
|
|
unsigned int l = w.length();
|
|
if (l > 0 && l < (unsigned)m_maxWordLength) {
|
|
// 1 byte word: we index single ascii letters and digits, but
|
|
// nothing else. We might want to turn this into a test for a
|
|
// single utf8 character instead ?
|
|
if (l == 1) {
|
|
int c = (int)w[0];
|
|
if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER &&
|
|
charclasses[c] != DIGIT) {
|
|
//cerr << "ERASING single letter term " << c << endl;
|
|
return true;
|
|
}
|
|
}
|
|
if (pos != m_prevpos || l != m_prevlen) {
|
|
bool ret = takeword(w, pos, btstart, btend);
|
|
m_prevpos = pos;
|
|
m_prevlen = w.length();
|
|
return ret;
|
|
}
|
|
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* A routine called from different places in text_to_words(), to
|
|
* adjust the current state of the parser, and call the word
|
|
* handler/emitter. Emit and reset the current word, possibly emit the current
|
|
* span (if different). In query mode, words are not emitted, only final spans
|
|
*
|
|
* This is purely for factoring common code from different places in
|
|
* text_to_words().
|
|
*
|
|
* @return true if ok, false for error. Splitting should stop in this case.
|
|
* @param spanerase Set if the current span is at its end. Reset it.
|
|
* @param bp The current BYTE position in the stream
|
|
* @param spanemit This is set for intermediate spans: glue char changed.
|
|
*/
|
|
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|
{
|
|
LOGDEB2(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
|
"inn %d span [%s]\n",
|
|
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
|
m_inNumber, m_span.c_str()));
|
|
|
|
// Emit span? When splitting for query, we only emit final spans
|
|
// (spanerase)
|
|
bool spanemitted = false;
|
|
if (!(m_flags & TXTS_NOSPANS) &&
|
|
!((m_wordLen == m_span.length()) &&
|
|
(o_noNumbers) && m_inNumber) &&
|
|
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
|
|
|
// Check for an acronym/abbreviation ie I.B.M.
|
|
if (spanerase && m_wordLen != m_span.length() && m_span.length() > 2
|
|
&& m_span.length() <= 20) {
|
|
bool acron = true;
|
|
for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
|
|
if (m_span[i] != '.') {
|
|
acron = false;
|
|
break;
|
|
}
|
|
}
|
|
if (acron) {
|
|
string acronym;
|
|
for (unsigned int i = 0; i < m_span.length(); i += 2) {
|
|
acronym += m_span[i];
|
|
}
|
|
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(),
|
|
bp))
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Maybe trim at end. These are chars that we would keep inside
|
|
// a span, but not at the end
|
|
while (m_span.length() > 0) {
|
|
switch (m_span[m_span.length()-1]) {
|
|
case '.':
|
|
case '-':
|
|
case ',':
|
|
case '@':
|
|
case '_':
|
|
case '\'':
|
|
m_span.resize(m_span.length()-1);
|
|
if (--bp < 0)
|
|
bp = 0;
|
|
break;
|
|
default:
|
|
goto breakloop1;
|
|
}
|
|
}
|
|
breakloop1:
|
|
spanemitted = true;
|
|
if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
|
|
return false;
|
|
}
|
|
|
|
// Emit word if different from span and not 'no words' mode
|
|
if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
|
|
!(o_noNumbers && m_inNumber) &&
|
|
(!spanemitted || m_wordLen != m_span.length())) {
|
|
string s(m_span.substr(m_wordStart, m_wordLen));
|
|
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
|
|
return false;
|
|
}
|
|
|
|
// Adjust state
|
|
if (m_wordLen) {
|
|
m_wordpos++;
|
|
m_wordLen = 0;
|
|
}
|
|
if (spanerase) {
|
|
discardspan();
|
|
} else {
|
|
m_wordStart = m_span.length();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void TextSplit::discardspan()
|
|
{
|
|
m_span.erase();
|
|
m_spanpos = m_wordpos;
|
|
m_wordStart = 0;
|
|
m_wordLen = 0;
|
|
}
|
|
|
|
/**
|
|
* Splitting a text into terms to be indexed.
|
|
* We basically emit a word every time we see a separator, but some chars are
|
|
* handled specially so that special cases, ie, c++ and jfd@recoll.com etc,
|
|
* are handled properly,
|
|
*/
|
|
bool TextSplit::text_to_words(const string &in)
|
|
{
|
|
LOGDEB1(("TextSplit::text_to_words: docjk %d (%d) %s%s%s [%s]\n",
|
|
o_processCJK, o_CJKNgramLen,
|
|
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
|
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
|
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
|
in.substr(0,50).c_str()));
|
|
|
|
m_span.erase();
|
|
m_inNumber = false;
|
|
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
|
int curspanglue = 0;
|
|
|
|
// Running count of non-alphanum chars. Reset when we see one;
|
|
int nonalnumcnt = 0;
|
|
|
|
Utf8Iter it(in);
|
|
|
|
for (; !it.eof(); it++) {
|
|
unsigned int c = *it;
|
|
nonalnumcnt++;
|
|
|
|
if (c == (unsigned int)-1) {
|
|
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
|
return false;
|
|
}
|
|
|
|
if (o_processCJK && UNICODE_IS_CJK(c)) {
|
|
// CJK character hit.
|
|
// Do like at EOF with the current non-cjk data.
|
|
if (m_wordLen || m_span.length()) {
|
|
if (!doemit(true, it.getBpos()))
|
|
return false;
|
|
}
|
|
|
|
// Hand off situation to the cjk routine.
|
|
if (!cjk_to_words(&it, &c)) {
|
|
LOGERR(("Textsplit: scan error in cjk handler\n"));
|
|
return false;
|
|
}
|
|
|
|
// Check for eof, else c contains the first non-cjk
|
|
// character after the cjk sequence, just go on.
|
|
if (it.eof())
|
|
break;
|
|
}
|
|
|
|
int cc = whatcc(c);
|
|
switch (cc) {
|
|
case DIGIT:
|
|
if (m_wordLen == 0)
|
|
m_inNumber = true;
|
|
m_wordLen += it.appendchartostring(m_span);
|
|
nonalnumcnt = 0;
|
|
break;
|
|
|
|
case SPACE:
|
|
SPACE:
|
|
curspanglue = 0;
|
|
nonalnumcnt = 0;
|
|
if (m_wordLen || m_span.length()) {
|
|
if (!doemit(true, it.getBpos()))
|
|
return false;
|
|
m_inNumber = false;
|
|
}
|
|
break;
|
|
|
|
case WILD:
|
|
if (m_flags & TXTS_KEEPWILD)
|
|
goto NORMALCHAR;
|
|
else
|
|
goto SPACE;
|
|
break;
|
|
|
|
case '-':
|
|
case '+':
|
|
curspanglue = cc;
|
|
if (m_wordLen == 0) {
|
|
// + or - don't start a term except if this looks like
|
|
// it's going to be to be a number
|
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
|
// -10
|
|
m_inNumber = true;
|
|
m_wordLen += it.appendchartostring(m_span);
|
|
} else {
|
|
goto SPACE;
|
|
}
|
|
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
|
m_span[m_span.length() - 1] == 'E')) {
|
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
|
m_wordLen += it.appendchartostring(m_span);
|
|
} else {
|
|
goto SPACE;
|
|
}
|
|
} else {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
m_inNumber = false;
|
|
m_wordStart += it.appendchartostring(m_span);
|
|
}
|
|
break;
|
|
|
|
case '.':
|
|
case ',':
|
|
{
|
|
// Need a little lookahead here. At worse this gets the end null
|
|
int nextc = it[it.getCpos()+1];
|
|
int nextwhat = whatcc(nextc);
|
|
if (m_inNumber) {
|
|
// we're eliminating 132.jpg here. Good idea ?
|
|
if (nextwhat != DIGIT && nextc != 'e' && nextc != 'E')
|
|
goto SPACE;
|
|
m_wordLen += it.appendchartostring(m_span);
|
|
curspanglue = cc;
|
|
break;
|
|
} else {
|
|
// If . inside a word, it's spanglue, else, it's whitespace.
|
|
// We also keep an initial '.' for catching .net, but this adds
|
|
// quite a few spurious terms !
|
|
// Another problem is that something like .x-errs
|
|
// will be split as .x-errs, x, errs but not x-errs
|
|
// A final comma in a word will be removed by doemit
|
|
|
|
// Only letters and digits make sense after
|
|
if (nextwhat != A_LLETTER && nextwhat != A_ULETTER &&
|
|
nextwhat != DIGIT && nextwhat != LETTER)
|
|
goto SPACE;
|
|
|
|
if (cc == '.') {
|
|
// Check for number like .1
|
|
if (m_span.length() == 0 && nextwhat == DIGIT) {
|
|
m_inNumber = true;
|
|
m_wordLen += it.appendchartostring(m_span);
|
|
curspanglue = cc;
|
|
break;
|
|
}
|
|
|
|
if (m_wordLen) {
|
|
// Disputable special case: set spanemit to
|
|
// true when encountering a '.' while spanglue
|
|
// is '_'. Think of a_b.c Done to
|
|
// avoid breaking stuff after changing '_'
|
|
// from wordchar to spanglue
|
|
if (!doemit(false, it.getBpos(), curspanglue == '_'))
|
|
return false;
|
|
curspanglue = cc;
|
|
// span length could have been adjusted by trimming
|
|
// inside doemit
|
|
if (m_span.length())
|
|
m_wordStart += it.appendchartostring(m_span);
|
|
break;
|
|
} else {
|
|
m_wordStart += it.appendchartostring(m_span);
|
|
curspanglue = cc;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
goto SPACE;
|
|
}
|
|
break;
|
|
|
|
case '@':
|
|
if (m_wordLen) {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
curspanglue = cc;
|
|
m_inNumber = false;
|
|
}
|
|
m_wordStart += it.appendchartostring(m_span);
|
|
break;
|
|
case '_':
|
|
if (m_wordLen) {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
curspanglue = cc;
|
|
m_inNumber = false;
|
|
}
|
|
m_wordStart += it.appendchartostring(m_span);
|
|
break;
|
|
case '\'':
|
|
// If in word, potential span: o'brien, else, this is more
|
|
// whitespace
|
|
if (m_wordLen) {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
curspanglue = cc;
|
|
m_inNumber = false;
|
|
m_wordStart += it.appendchartostring(m_span);
|
|
}
|
|
break;
|
|
case '#':
|
|
// Keep it only at end of word ... Special case for c# you see...
|
|
if (m_wordLen > 0) {
|
|
int w = whatcc(it[it.getCpos()+1]);
|
|
if (w == SPACE || w == '\n' || w == '\r') {
|
|
m_wordLen += it.appendchartostring(m_span);
|
|
break;
|
|
}
|
|
}
|
|
goto SPACE;
|
|
break;
|
|
case '\n':
|
|
case '\r':
|
|
if (m_span.length() && m_span[m_span.length() - 1] == '-') {
|
|
// if '-' is the last char before end of line, just
|
|
// ignore the line change. This is the right thing to
|
|
// do almost always. We'd then need a way to check if
|
|
// the - was added as part of the word hyphenation, or was
|
|
// there in the first place, but this would need a dictionary.
|
|
// Also we'd need to check for a soft-hyphen and remove it,
|
|
// but this would require more utf-8 magic
|
|
} else {
|
|
// Handle like a normal separator
|
|
goto SPACE;
|
|
}
|
|
break;
|
|
|
|
#ifdef RCL_SPLIT_CAMELCASE
|
|
// Camelcase handling.
|
|
// If we get uppercase ascii after lowercase ascii, emit word.
|
|
// This emits "camel" when hitting the 'C' of camelCase
|
|
// Not enabled by defaults as this makes phrase searches quite
|
|
// confusing.
|
|
// ie "MySQL manual" is matched by "MySQL manual" and
|
|
// "my sql manual" but not "mysql manual"
|
|
case A_ULETTER:
|
|
if (m_span.length() &&
|
|
charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
|
|
A_LLETTER) {
|
|
if (m_wordLen) {
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
}
|
|
}
|
|
goto NORMALCHAR;
|
|
|
|
// CamelCase handling.
|
|
// If we get lowercase after uppercase and the current
|
|
// word length is bigger than one, it means we had a
|
|
// string of several upper-case letters: an
|
|
// acronym (readHTML) or a single letter article (ALittleHelp).
|
|
// Emit the uppercase word before proceeding
|
|
case A_LLETTER:
|
|
if (m_span.length() &&
|
|
charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
|
|
A_ULETTER && m_wordLen > 1) {
|
|
// Multiple upper-case letters. Single letter word
|
|
// or acronym which we want to emit now
|
|
m_wordLen--;
|
|
if (!doemit(false, it.getBpos()))
|
|
return false;
|
|
m_wordStart--;
|
|
m_wordLen++;
|
|
}
|
|
goto NORMALCHAR;
|
|
#endif /* CAMELCASE */
|
|
|
|
|
|
default:
|
|
NORMALCHAR:
|
|
if (m_inNumber && c != 'e' && c != 'E') {
|
|
m_inNumber = false;
|
|
}
|
|
m_wordLen += it.appendchartostring(m_span);
|
|
nonalnumcnt = 0;
|
|
break;
|
|
}
|
|
}
|
|
if (m_wordLen || m_span.length()) {
|
|
if (!doemit(true, it.getBpos()))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Using an utf8iter pointer just to avoid needing its definition in
|
|
// textsplit.h
|
|
//
|
|
// We output ngrams for exemple for char input a b c and ngramlen== 2,
|
|
// we generate: a ab b bc c as words
|
|
//
|
|
// This is very different from the normal behaviour, so we don't use
|
|
// the doemit() and emitterm() routines
|
|
//
|
|
// The routine is sort of a mess and goes to show that we'd probably
|
|
// be better off converting the whole buffer to utf32 on entry...
|
|
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|
{
|
|
LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
|
|
Utf8Iter &it = *itp;
|
|
|
|
// We use an offset buffer to remember the starts of the utf-8
|
|
// characters which we still need to use.
|
|
assert(o_CJKNgramLen < o_CJKMaxNgramLen);
|
|
unsigned int boffs[o_CJKMaxNgramLen+1];
|
|
|
|
// Current number of valid offsets;
|
|
unsigned int nchars = 0;
|
|
unsigned int c = 0;
|
|
for (; !it.eof(); it++) {
|
|
c = *it;
|
|
if (!UNICODE_IS_CJK(c)) {
|
|
// Return to normal handler
|
|
break;
|
|
}
|
|
if (whatcc(c) == SPACE) {
|
|
// Flush the ngram buffer and go on
|
|
nchars = 0;
|
|
continue;
|
|
}
|
|
if (nchars == o_CJKNgramLen) {
|
|
// Offset buffer full, shift it. Might be more efficient
|
|
// to have a circular one, but things are complicated
|
|
// enough already...
|
|
for (unsigned int i = 0; i < nchars-1; i++) {
|
|
boffs[i] = boffs[i+1];
|
|
}
|
|
} else {
|
|
nchars++;
|
|
}
|
|
|
|
// Take note of byte offset for this character.
|
|
boffs[nchars-1] = it.getBpos();
|
|
|
|
// Output all new ngrams: they begin at each existing position
|
|
// and end after the new character. onlyspans->only output
|
|
// maximum words, nospans=> single chars
|
|
if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
|
|
unsigned int btend = it.getBpos() + it.getBlen();
|
|
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
|
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
|
for (unsigned int i = loopbeg; i < loopend; i++) {
|
|
if (!takeword(it.buffer().substr(boffs[i],
|
|
btend-boffs[i]),
|
|
m_wordpos - (nchars-i-1), boffs[i], btend)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if ((m_flags & TXTS_ONLYSPANS)) {
|
|
// Only spans: don't overlap: flush buffer
|
|
nchars = 0;
|
|
}
|
|
}
|
|
// Increase word position by one, other words are at an
|
|
// existing position. This could be subject to discussion...
|
|
m_wordpos++;
|
|
}
|
|
|
|
// If onlyspans is set, there may be things to flush in the buffer
|
|
// first
|
|
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
|
unsigned int btend = it.getBpos(); // Current char is out
|
|
if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]),
|
|
m_wordpos - nchars,
|
|
boffs[0], btend)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
m_span.erase();
|
|
m_inNumber = false;
|
|
m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
|
|
m_spanpos = m_wordpos;
|
|
*cp = c;
|
|
return true;
|
|
}
|
|
|
|
// Specialization for countWords
|
|
class TextSplitCW : public TextSplit {
|
|
public:
|
|
int wcnt;
|
|
TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
|
|
bool takeword(const string &, int, int, int) {
|
|
wcnt++;
|
|
return true;
|
|
}
|
|
};
|
|
|
|
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
|
{
|
|
TextSplitCW splitter(flgs);
|
|
splitter.text_to_words(s);
|
|
return splitter.wcnt;
|
|
}
|
|
|
|
bool TextSplit::hasVisibleWhite(const string &in)
|
|
{
|
|
Utf8Iter it(in);
|
|
for (; !it.eof(); it++) {
|
|
unsigned int c = (unsigned char)*it;
|
|
LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
|
|
if (c == (unsigned int)-1) {
|
|
LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
|
|
return false;
|
|
}
|
|
if (visiblewhite.find(c) != visiblewhite.end())
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template <class T> bool u8stringToStrings(const string &s, T &tokens)
|
|
{
|
|
Utf8Iter it(s);
|
|
|
|
string current;
|
|
tokens.clear();
|
|
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|
|
states state = SPACE;
|
|
for (; !it.eof(); it++) {
|
|
unsigned int c = *it;
|
|
if (visiblewhite.find(c) != visiblewhite.end())
|
|
c = ' ';
|
|
LOGDEB3(("TextSplit::stringToStrings: 0x%04x\n", c));
|
|
if (c == (unsigned int)-1) {
|
|
LOGERR(("TextSplit::stringToStrings: error while "
|
|
"scanning UTF-8 string\n"));
|
|
return false;
|
|
}
|
|
|
|
switch (c) {
|
|
case '"':
|
|
switch(state) {
|
|
case SPACE: state = INQUOTE; continue;
|
|
case TOKEN: goto push_char;
|
|
case ESCAPE: state = INQUOTE; goto push_char;
|
|
case INQUOTE: tokens.push_back(current);current.clear();
|
|
state = SPACE; continue;
|
|
}
|
|
break;
|
|
case '\\':
|
|
switch(state) {
|
|
case SPACE:
|
|
case TOKEN: state=TOKEN; goto push_char;
|
|
case INQUOTE: state = ESCAPE; continue;
|
|
case ESCAPE: state = INQUOTE; goto push_char;
|
|
}
|
|
break;
|
|
|
|
case ' ':
|
|
case '\t':
|
|
case '\n':
|
|
case '\r':
|
|
switch(state) {
|
|
case SPACE: continue;
|
|
case TOKEN: tokens.push_back(current); current.clear();
|
|
state = SPACE; continue;
|
|
case INQUOTE:
|
|
case ESCAPE: goto push_char;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
switch(state) {
|
|
case ESCAPE: state = INQUOTE; break;
|
|
case SPACE: state = TOKEN; break;
|
|
case TOKEN:
|
|
case INQUOTE: break;
|
|
}
|
|
push_char:
|
|
it.appendchartostring(current);
|
|
}
|
|
}
|
|
|
|
// End of string. Process residue, and possible error (unfinished quote)
|
|
switch(state) {
|
|
case SPACE: break;
|
|
case TOKEN: tokens.push_back(current); break;
|
|
case INQUOTE:
|
|
case ESCAPE: return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
|
|
{
|
|
return u8stringToStrings<list<string> >(s, tokens);
|
|
}
|
|
|
|
#else // TEST driver ->
|
|
|
|
#include <unistd.h>
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <stdio.h>
|
|
|
|
#include <iostream>
|
|
|
|
#include "textsplit.h"
|
|
#include "readfile.h"
|
|
#include "debuglog.h"
|
|
#include "transcode.h"
|
|
#include "unacpp.h"
|
|
#include "termproc.h"
|
|
|
|
using namespace std;
|
|
|
|
class myTermProc : public Rcl::TermProc {
|
|
int first;
|
|
bool nooutput;
|
|
public:
|
|
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
|
void setNoOut(bool val) {nooutput = val;}
|
|
virtual bool takeword(const string &term, int pos, int bs, int be)
|
|
{
|
|
if (nooutput)
|
|
return true;
|
|
FILE *fp = stdout;
|
|
if (first) {
|
|
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
|
first = 0;
|
|
}
|
|
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
|
return true;
|
|
}
|
|
};
|
|
|
|
static string teststring =
|
|
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n"
|
|
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n"
|
|
"n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n"
|
|
"data123\n"
|
|
"134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n"
|
|
"@^#$(#$(*)\n"
|
|
"192.168.4.1 one\n\rtwo\r"
|
|
"Debut-\ncontinue\n"
|
|
"[olala][ululu] (valeur) (23)\n"
|
|
"utf-8 ucs-4© \\nodef\n"
|
|
"A b C 2 . +"
|
|
"','this\n"
|
|
" ,able,test-domain "
|
|
" -wl,--export-dynamic "
|
|
" ~/.xsession-errors "
|
|
;
|
|
|
|
static string teststring1 = " nouvel-an ";
|
|
|
|
static string thisprog;
|
|
|
|
static string usage =
|
|
" textsplit [opts] [filename]\n"
|
|
" -q : no output\n"
|
|
" -s : only spans\n"
|
|
" -w : only words\n"
|
|
" -n : no numbers\n"
|
|
" -k : preserve wildcards (?*)\n"
|
|
" -c : just count words\n"
|
|
" -u : use unac\n"
|
|
" -C [charset] : input charset\n"
|
|
" -S [stopfile] : stopfile to use for commongrams\n"
|
|
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
|
" \n\n"
|
|
;
|
|
|
|
static void
|
|
Usage(void)
|
|
{
|
|
cerr << thisprog << ": usage:\n" << usage;
|
|
exit(1);
|
|
}
|
|
|
|
static int op_flags;
|
|
#define OPT_s 0x1
|
|
#define OPT_w 0x2
|
|
#define OPT_q 0x4
|
|
#define OPT_c 0x8
|
|
#define OPT_k 0x10
|
|
#define OPT_C 0x20
|
|
#define OPT_n 0x40
|
|
#define OPT_S 0x80
|
|
#define OPT_u 0x100
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
string charset, stopfile;
|
|
|
|
thisprog = argv[0];
|
|
argc--; argv++;
|
|
|
|
while (argc > 0 && **argv == '-') {
|
|
(*argv)++;
|
|
if (!(**argv))
|
|
/* Cas du "adb - core" */
|
|
Usage();
|
|
while (**argv)
|
|
switch (*(*argv)++) {
|
|
case 'c': op_flags |= OPT_c; break;
|
|
case 'C': op_flags |= OPT_C; if (argc < 2) Usage();
|
|
charset = *(++argv); argc--;
|
|
goto b1;
|
|
case 'k': op_flags |= OPT_k; break;
|
|
case 'n': op_flags |= OPT_n; break;
|
|
case 'q': op_flags |= OPT_q; break;
|
|
case 's': op_flags |= OPT_s; break;
|
|
case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
|
|
stopfile = *(++argv); argc--;
|
|
goto b1;
|
|
case 'u': op_flags |= OPT_u; break;
|
|
case 'w': op_flags |= OPT_w; break;
|
|
default: Usage(); break;
|
|
}
|
|
b1: argc--; argv++;
|
|
}
|
|
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
|
DebugLog::setfilename("stderr");
|
|
|
|
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
|
|
|
if (op_flags&OPT_s)
|
|
flags = TextSplit::TXTS_ONLYSPANS;
|
|
else if (op_flags&OPT_w)
|
|
flags = TextSplit::TXTS_NOSPANS;
|
|
if (op_flags & OPT_k)
|
|
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
|
if (op_flags & OPT_n)
|
|
TextSplit::noNumbers();
|
|
|
|
Rcl::StopList stoplist;
|
|
if (op_flags & OPT_S) {
|
|
if (!stoplist.setFile(stopfile)) {
|
|
cerr << "Can't read stopfile: " << stopfile << endl;
|
|
exit(1);
|
|
}
|
|
}
|
|
string odata, reason;
|
|
if (argc == 1) {
|
|
const char *filename = *argv++; argc--;
|
|
if (!strcmp(filename, "stdin")) {
|
|
char buf[1024];
|
|
int nread;
|
|
while ((nread = read(0, buf, 1024)) > 0) {
|
|
odata.append(buf, nread);
|
|
}
|
|
} else if (!file_to_string(filename, odata, &reason)) {
|
|
cerr << "Failed: file_to_string(" << filename << ") failed: "
|
|
<< reason << endl;
|
|
exit(1);
|
|
}
|
|
} else {
|
|
cout << endl << teststring << endl << endl;
|
|
odata = teststring;
|
|
}
|
|
string& data = odata;
|
|
string ndata;
|
|
if ((op_flags & OPT_C)) {
|
|
if (!transcode(odata, ndata, charset, "UTF-8")) {
|
|
cerr << "Failed: transcode error" << endl;
|
|
exit(1);
|
|
} else {
|
|
data = ndata;
|
|
}
|
|
}
|
|
|
|
if (op_flags & OPT_c) {
|
|
int n = TextSplit::countWords(data, flags);
|
|
cout << n << " words" << endl;
|
|
} else {
|
|
myTermProc printproc;
|
|
|
|
Rcl::TermProc *nxt = &printproc;
|
|
|
|
Rcl::TermProcCommongrams commonproc(nxt, stoplist);
|
|
if (op_flags & OPT_S)
|
|
nxt = &commonproc;
|
|
|
|
Rcl::TermProcPrep preproc(nxt);
|
|
if (op_flags & OPT_u)
|
|
nxt = &preproc;
|
|
|
|
Rcl::TextSplitP splitter(nxt, flags);
|
|
|
|
if (op_flags & OPT_q)
|
|
printproc.setNoOut(true);
|
|
|
|
splitter.text_to_words(data);
|
|
|
|
}
|
|
}
|
|
#endif // TEST
|