recoll/src/common/textsplit.h

/* Copyright (C) 2004 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_

#include <string>
#include <vector>

using std::string;
using std::vector;

class Utf8Iter;

/**
 * Split text into words.
 * See comments at top of .cpp for more explanations.
 * This uses a callback function. It could be done with an iterator instead,
 * but 'ts much simpler this way...
 */
class TextSplit {
public:
    // Should we activate special processing of Chinese characters ? This
    // needs a little more cpu, so it can be turned off globally. This is set
    // by rclconfig, changing it means reindexing
    static bool o_processCJK;
    static unsigned int  o_CJKNgramLen;
    static const unsigned int o_CJKMaxNgramLen =  5;
    static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
    {
	o_processCJK = onoff;
	o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ?
	    ngramlen : o_CJKMaxNgramLen;
    }

    // Are we indexing numbers ? Set by rclconfig. Change needs reindex
    static bool o_noNumbers;
    static void noNumbers()
    {
	o_noNumbers = true;
    }

    enum Flags {TXTS_NONE = 0,
		TXTS_ONLYSPANS = 1,  // Only return maximum spans (a@b.com)
		TXTS_NOSPANS = 2,  // Only return atomic words (a, b, com)
		TXTS_KEEPWILD = 4 // Handle wildcards as letters
    };


    TextSplit(Flags flags = Flags(TXTS_NONE))
	: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
    {
    }
    virtual ~TextSplit() {}

    /** Split text, emit words and positions. */
    virtual bool text_to_words(const string &in);

    /** Process one output word: to be implemented by the actual user class */
    virtual bool takeword(const string& term,
			  int pos,  // term pos
			  int bts,  // byte offset of first char in term
			  int bte   // byte offset of first char after term
			  ) = 0;

    /** Called when we encounter formfeed \f 0x0c. Override to use the event.
     * Mostly or exclusively used with pdftoxx output. Other filters mostly
     * just don't know about pages. */
    virtual void newpage(int /*pos*/)
    {
    }

    // Static utility functions:

    /** Count words in string, as the splitter would generate them */
    static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);

    /** Check if this is visibly not a single block of text */
    static bool hasVisibleWhite(const string &in);

    /** Split text span into strings, at white space, allowing for substrings
     * quoted with " . Escaping with \ works as usual inside the quoted areas.
     * This has to be kept separate from smallut.cpp's stringsToStrings, which
     * basically works only if whitespace is ascii, and which processes
     * non-utf-8 input (iso-8859 config files work ok). This hopefully
     * handles all Unicode whitespace, but needs correct utf-8 input
     */
    static bool stringToStrings(const string &s, vector<string> &tokens);

    /** Is char CJK ? */
    static bool isCJK(int c);

private:
    Flags         m_flags;
    int           m_maxWordLength;

    // Current span. Might be jf.dockes@wanadoo.f
    string        m_span;

    // Current word: no punctuation at all in there. Byte offset
    // relative to the current span and byte length
    int           m_wordStart;
    unsigned int  m_wordLen;

    // Currently inside number
    bool          m_inNumber;

    // Term position of current word and span
    int           m_wordpos;
    int           m_spanpos;

    // It may happen that our cleanup would result in emitting the
    // same term twice. We try to avoid this
    int           m_prevpos;
    unsigned int  m_prevlen;

    // This processes cjk text:
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);

    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
    bool doemit(bool spanerase, int bp, bool spanemit=false);
    void discardspan();
};

#endif /* _TEXTSPLIT_H_INCLUDED_ */