recoll/src/rcldb/searchdatatox.cpp

/* Copyright (C) 2006 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

// Handle translation from rcl's SearchData structures to Xapian Queries

#include "autoconfig.h"

#include <stdio.h>

#include <string>
#include <vector>
#include <algorithm>
#include <sstream>
#include <iostream>
using namespace std;

#include "xapian.h"

#include "cstr.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "searchdata.h"
#include "log.h"
#include "smallut.h"
#include "textsplit.h"
#include "unacpp.h"
#include "utf8iter.h"
#include "stoplist.h"
#include "rclconfig.h"
#include "termproc.h"
#include "synfamily.h"
#include "stemdb.h"
#include "expansiondbs.h"
#include "base64.h"
#include "daterange.h"

namespace Rcl {

typedef  vector<SearchDataClause *>::iterator qlist_it_t;

static const int original_term_wqf_booster = 10;

// Expand doc categories and mime type wild card expressions
//
// Categories are expanded against the configuration, mimetypes
// against the index.
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
{
    const RclConfig *cfg = db.getConf();
    if (!cfg) {
	LOGFATAL("Db::expandFileTypes: null configuration!!\n" );
	return false;
    }
    vector<string> exptps;

    for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
	if (cfg->isMimeCategory(*it)) {
	    vector<string>tps;
	    cfg->getMimeCatTypes(*it, tps);
	    exptps.insert(exptps.end(), tps.begin(), tps.end());
	} else {
	    TermMatchResult res;
	    string mt = stringtolower((const string&)*it);
	    // We set casesens|diacsens to get an equivalent of ixTermMatch()
	    db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
			 mt, res, -1, "mtype");
	    if (res.entries.empty()) {
		exptps.push_back(it->c_str());
	    } else {
		for (vector<TermMatchEntry>::const_iterator rit =
			 res.entries.begin(); rit != res.entries.end(); rit++) {
		    exptps.push_back(strip_prefix(rit->term));
		}
	    }
	}
    }
    sort(exptps.begin(), exptps.end());
    exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());

    tps = exptps;
    return true;
}

static const char *maxXapClauseMsg =
    "Maximum Xapian query size exceeded. Increase maxXapianClauses "
    "in the configuration. ";
static const char *maxXapClauseCaseDiacMsg =
    "Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less "
    "wildcards ?"
    ;


// Walk the clauses list, translate each and add to top Xapian Query
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
				vector<SearchDataClause*>& query,
				string& reason, void *d)
{
    Xapian::Query xq;
    for (qlist_it_t it = query.begin(); it != query.end(); it++) {
	Xapian::Query nq;
	if (!(*it)->toNativeQuery(db, &nq)) {
	    LOGERR("SearchData::clausesToQuery: toNativeQuery failed: "  << ((*it)->getReason()) << "\n" );
	    reason += (*it)->getReason() + " ";
	    return false;
	}
        if (nq.empty()) {
            LOGDEB("SearchData::clausesToQuery: skipping empty clause\n" );
            continue;
        }
	// If this structure is an AND list, must use AND_NOT for excl clauses.
	// Else this is an OR list, and there can't be excl clauses (checked by
	// addClause())
	Xapian::Query::op op;
	if (tp == SCLT_AND) {
            if ((*it)->getexclude()) {
                op =  Xapian::Query::OP_AND_NOT;
            } else {
                op =  Xapian::Query::OP_AND;
            }
	} else {
	    op = Xapian::Query::OP_OR;
	}
        if (xq.empty()) {
            if (op == Xapian::Query::OP_AND_NOT)
                xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
            else
                xq = nq;
        } else {
            xq = Xapian::Query(op, xq, nq);
        }
	if (int(xq.get_length()) >= getMaxCl()) {
	    LOGERR(""  << (maxXapClauseMsg) << "\n" );
	    m_reason += maxXapClauseMsg;
	    if (!o_index_stripchars)
		m_reason += maxXapClauseCaseDiacMsg;
	    return false;
	}
    }

    LOGDEB0("SearchData::clausesToQuery: got "  << (xq.get_length()) << " clauses\n" );

    if (xq.empty())
	xq = Xapian::Query::MatchAll;

   *((Xapian::Query *)d) = xq;
    return true;
}

bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
    LOGDEB("SearchData::toNativeQuery: stemlang ["  << (m_stemlang) << "]\n" );
    m_reason.erase();

    db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
    db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);
    db.getConf()->getConfParam("autocasesens", &m_autocasesens);
    db.getConf()->getConfParam("autodiacsens", &m_autodiacsens);

    // Walk the clause list translating each in turn and building the
    // Xapian query tree
    Xapian::Query xq;
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
	LOGERR("SearchData::toNativeQuery: clausesToQuery failed. reason: "  << (m_reason) << "\n" );
	return false;
    }

    if (m_haveDates) {
        // If one of the extremities is unset, compute db extremas
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
            int minyear = 1970, maxyear = 2100;
            if (!db.maxYearSpan(&minyear, &maxyear)) {
                LOGERR("Can't retrieve index min/max dates\n" );
                //whatever, go on.
            }

            if (m_dates.y1 == 0) {
                m_dates.y1 = minyear;
                m_dates.m1 = 1;
                m_dates.d1 = 1;
            }
            if (m_dates.y2 == 0) {
                m_dates.y2 = maxyear;
                m_dates.m2 = 12;
                m_dates.d2 = 31;
            }
        }
        LOGDEB("Db::toNativeQuery: date interval: "  << (m_dates.y1) << "-"  << (m_dates.m1) << "-"  << (m_dates.d1) << "/"  << (m_dates.y2) << "-"  << (m_dates.m2) << "-"  << (m_dates.d2) << "\n" );
        Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
                m_dates.y2, m_dates.m2, m_dates.d2);
        if (dq.empty()) {
            LOGINFO("Db::toNativeQuery: date filter is empty\n" );
        }
        // If no probabilistic query is provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO("Db::toNativeQuery: proba query is empty\n" );
            xq = dq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
        }
    }


    if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
        Xapian::Query sq;
	string min = lltodecstr(m_minSize);
	string max = lltodecstr(m_maxSize);
	if (m_minSize == size_t(-1)) {
	    string value(max);
	    leftzeropad(value, 12);
	    sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
	} else if (m_maxSize == size_t(-1)) {
	    string value(min);
	    leftzeropad(value, 12);
	    sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
	} else {
	    string minvalue(min);
	    leftzeropad(minvalue, 12);
	    string maxvalue(max);
	    leftzeropad(maxvalue, 12);
	    sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
			       minvalue, maxvalue);
	}

        // If no probabilistic query is provided then promote the
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO("Db::toNativeQuery: proba query is empty\n" );
            xq = sq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
        }
    }

    // Add the autophrase if any
    if (m_autophrase) {
	Xapian::Query apq;
	if (m_autophrase->toNativeQuery(db, &apq)) {
	    xq = xq.empty() ? apq :
		Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
	}
    }

    // Add the file type filtering clause if any
    if (!m_filetypes.empty()) {
	expandFileTypes(db, m_filetypes);

	Xapian::Query tq;
	for (vector<string>::iterator it = m_filetypes.begin();
	     it != m_filetypes.end(); it++) {
	    string term = wrap_prefix(mimetype_prefix) + *it;
	    LOGDEB0("Adding file type term: ["  << (term) << "]\n" );
	    tq = tq.empty() ? Xapian::Query(term) :
		Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
	}
	xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
    }

    // Add the neg file type filtering clause if any
    if (!m_nfiletypes.empty()) {
	expandFileTypes(db, m_nfiletypes);

	Xapian::Query tq;
	for (vector<string>::iterator it = m_nfiletypes.begin();
	     it != m_nfiletypes.end(); it++) {
	    string term = wrap_prefix(mimetype_prefix) + *it;
	    LOGDEB0("Adding negative file type term: ["  << (term) << "]\n" );
	    tq = tq.empty() ? Xapian::Query(term) :
		Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
	}
	xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
    }

    *((Xapian::Query *)d) = xq;
    return true;
}

// Splitter for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2). Still, most of
// the time, the result of our splitting will be a single term.
class TextSplitQ : public TextSplitP {
 public:
    TextSplitQ(Flags flags, TermProc *prc)
	: TextSplitP(prc, flags), m_nostemexp(false) {
    }

    bool takeword(const std::string &term, int pos, int bs, int be) {
	// Check if the first letter is a majuscule in which
	// case we do not want to do stem expansion. Need to do this
	// before unac of course...
	m_nostemexp = unaciscapital(term);

	return TextSplitP::takeword(term, pos, bs, be);
    }

    bool nostemexp() const {
        return m_nostemexp;
    }
private:
    bool m_nostemexp;
};

class TermProcQ : public TermProc {
public:
    TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}

    // We need a ref to the splitter (only it knows about orig term
    // capitalization for controlling stemming. The ref can't be set
    // in the constructor because the splitter is not built yet when
    // we are born (chicken and egg).
    void setTSQ(const TextSplitQ *ts) {
        m_ts = ts;
    }

    bool takeword(const std::string &term, int pos, int bs, int be) {
	m_alltermcount++;
	if (m_lastpos < pos)
	    m_lastpos = pos;
	bool noexpand = be ? m_ts->nostemexp() : true;
	LOGDEB1("TermProcQ::takeword: pushing ["  << (term) << "] pos "  << (pos) << " noexp "  << (noexpand) << "\n" );
	if (m_terms[pos].size() < term.size()) {
	    m_terms[pos] = term;
	    m_nste[pos] = noexpand;
	}
	return true;
    }

    bool flush() {
	for (map<int, string>::const_iterator it = m_terms.begin();
	     it != m_terms.end(); it++) {
	    m_vterms.push_back(it->second);
	    m_vnostemexps.push_back(m_nste[it->first]);
	}
	return true;
    }

    int alltermcount() const {
        return m_alltermcount;
    }
    int lastpos() const {
        return m_lastpos;
    }
    const vector<string>& terms() {
        return m_vterms;
    }
    const vector<bool>& nostemexps() {
        return m_vnostemexps;
    }
private:
    // Count of terms including stopwords: this is for adjusting
    // phrase/near slack
    int m_alltermcount;
    int m_lastpos;
    const TextSplitQ *m_ts;
    vector<string> m_vterms;
    vector<bool>   m_vnostemexps;
    map<int, string> m_terms;
    map<int, bool> m_nste;
};


/** Expand term into term list, using appropriate mode: stem, wildcards,
 *  diacritics...
 *
 * @param mods stem expansion, case and diacritics sensitivity control.
 * @param term input single word
 * @param oexp output expansion list
 * @param sterm output original input term if there were no wildcards
 * @param prefix field prefix in index. We could recompute it, but the caller
 *  has it already. Used in the simple case where there is nothing to expand,
 *  and we just return the prefixed term (else Db::termMatch deals with it).
 * @param multiwords it may happen that synonym processing results in multi-word
 *   expansions which should be processed as phrases.
 */
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
					string& ermsg, int mods,
					const string& term,
					vector<string>& oexp, string &sterm,
					const string& prefix,
					vector<string>* multiwords
    )
{
    LOGDEB0("expandTerm: mods 0x"  << (mods) << " fld ["  << (m_field) << "] trm ["  << (term) << "] lang ["  << (getStemLang()) << "]\n" );
    sterm.clear();
    oexp.clear();
    if (term.empty())
	return true;

    bool maxexpissoft = false;
    int maxexpand = getSoftMaxExp();
    if (maxexpand != -1) {
	maxexpissoft = true;
    } else {
	maxexpand = getMaxExp();
    }

    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;

    // If there are no wildcards, add term to the list of user-entered terms
    if (!haswild) {
	m_hldata.uterms.insert(term);
        sterm = term;
    }
    // No stem expansion if there are wildcards or if prevented by caller
    bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
    if (haswild || getStemLang().empty()) {
	LOGDEB2("expandTerm: found wildcards or stemlang empty: no exp\n" );
	nostemexp = true;
    }

    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;
    bool synonyms = (mods & SDCM_NOSYNS) == 0;

    // noexpansion can be modified further down by possible case/diac expansion
    bool noexpansion = nostemexp && !haswild && !synonyms;

    if (o_index_stripchars) {
	diac_sensitive = case_sensitive = false;
    } else {
	// If we are working with a raw index, apply the rules for case and
	// diacritics sensitivity.

	// If any character has a diacritic, we become
	// diacritic-sensitive. Note that the way that the test is
	// performed (conversion+comparison) will automatically ignore
	// accented characters which are actually a separate letter
	if (getAutoDiac() && unachasaccents(term)) {
	    LOGDEB0("expandTerm: term has accents -> diac-sensitive\n" );
	    diac_sensitive = true;
	}

	// If any character apart the first is uppercase, we become
	// case-sensitive.  The first character is reserved for
	// turning off stemming. You need to use a query language
	// modifier to search for Floor in a case-sensitive way.
	Utf8Iter it(term);
	it++;
	if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
	    LOGDEB0("expandTerm: term has uppercase -> case-sensitive\n" );
	    case_sensitive = true;
	}

	// If we are sensitive to case or diacritics turn stemming off
	if (diac_sensitive || case_sensitive) {
	    LOGDEB0("expandTerm: diac or case sens set -> stemexpand and synonyms off\n" );
	    nostemexp = true;
            synonyms = false;
	}

	if (!case_sensitive || !diac_sensitive)
	    noexpansion = false;
    }


    if (noexpansion) {
	oexp.push_back(prefix + term);
	m_hldata.terms[term] = term;
	LOGDEB("ExpandTerm: noexpansion: final: "  << (stringsToString(oexp)) << "\n" );
	return true;
    }

    int termmatchsens = 0;
    if (case_sensitive)
	termmatchsens |= Db::ET_CASESENS;
    if (diac_sensitive)
	termmatchsens |= Db::ET_DIACSENS;
    if (synonyms)
	termmatchsens |= Db::ET_SYNEXP;

    Db::MatchType mtyp = haswild ? Db::ET_WILD :
	nostemexp ? Db::ET_NONE : Db::ET_STEM;
    TermMatchResult res;
    if (!db.termMatch(mtyp | termmatchsens, getStemLang(),
		      term, res, maxexpand,  m_field, multiwords)) {
	// Let it go through
    }

    // Term match entries to vector of terms
    if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
	ermsg = "Maximum term expansion size exceeded."
	    " Maybe use case/diacritics sensitivity or increase maxTermExpand.";
	return false;
    }
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
	 it != res.entries.end(); it++) {
	oexp.push_back(it->term);
    }
    // If the term does not exist at all in the db, the return from
    // termMatch() is going to be empty, which is not what we want (we
    // would then compute an empty Xapian query)
    if (oexp.empty())
	oexp.push_back(prefix + term);

    // Remember the uterm-to-expansion links
    for (vector<string>::const_iterator it = oexp.begin();
	 it != oexp.end(); it++) {
	m_hldata.terms[strip_prefix(*it)] = term;
    }
    LOGDEB("ExpandTerm: final: "  << (stringsToString(oexp)) << "\n" );
    return true;
}

// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
void multiply_groups(vector<vector<string> >::const_iterator vvit,
		     vector<vector<string> >::const_iterator vvend,
		     vector<string>& comb,
		     vector<vector<string> >&allcombs)
{
    // Remember my string vector and compute next, for recursive calls.
    vector<vector<string> >::const_iterator myvit = vvit++;

    // Walk the string vector I'm called upon and, for each string,
    // add it to current result, an call myself recursively on the
    // next string vector. The last call (last element of the vector of
    // vectors), adds the elementary result to the output

    // Walk my string vector
    for (vector<string>::const_iterator strit = (*myvit).begin();
	 strit != (*myvit).end(); strit++) {

	// Add my current value to the string vector we're building
	comb.push_back(*strit);

	if (vvit == vvend) {
	    // Last call: store current result
	    allcombs.push_back(comb);
	} else {
	    // Call recursively on next string vector
	    multiply_groups(vvit, vvend, comb, allcombs);
	}
	// Pop the value I just added (make room for the next element in my
	// vector)
	comb.pop_back();
    }
}

static void prefix_vector(vector<string>& v, const string& prefix)
{
    for (vector<string>::iterator it = v.begin(); it != v.end(); it++) {
	*it = prefix + *it;
    }
}

void SearchDataClauseSimple::
processSimpleSpan(Rcl::Db &db, string& ermsg,
		  const string& span,
		  int mods, void * pq)
{
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
    LOGDEB0("StringToXapianQ::processSimpleSpan: ["  << (span) << "] mods 0x"  << ((unsigned int)mods) << "\n" );
    vector<string> exp;
    string sterm; // dumb version of user term

    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
	if (ftp->noterms)
	    addModifier(SDCM_NOTERMS); // Don't add terms to highlight data
	prefix = wrap_prefix(ftp->pfx);
    }

    vector<string> multiwords;
    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix, &multiwords))
	return;

    // Set up the highlight data. No prefix should go in there
    for (vector<string>::const_iterator it = exp.begin();
	 it != exp.end(); it++) {
	m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
	m_hldata.slacks.push_back(0);
	m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
    }

    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
    m_curcl += exp.size();

    // If sterm (simplified original user term) is not null, give it a
    // relevance boost. We do this even if no expansion occurred (else
    // the non-expanded terms in a term list would end-up with even
    // less wqf). This does not happen if there are wildcards anywhere
    // in the search.
    // We normally boost the original term in the stem expansion list. Don't
    // do it if there are wildcards anywhere, this would skew the results.
    bool doBoostUserTerm =
	(m_parentSearch && !m_parentSearch->haveWildCards()) ||
	(m_parentSearch == 0 && !m_haveWildCards);
    if (doBoostUserTerm && !sterm.empty()) {
        xq = Xapian::Query(Xapian::Query::OP_OR, xq,
			   Xapian::Query(prefix+sterm,
					 original_term_wqf_booster));
    }

    // Push phrases for the multi-word expansions
    for (vector<string>::const_iterator mwp = multiwords.begin();
	 mwp != multiwords.end(); mwp++) {
	vector<string> phr;
	// We just do a basic split to keep things a bit simpler here
	// (no textsplit). This means though that no punctuation is
	// allowed in multi-word synonyms.
	stringToTokens(*mwp, phr);
	if (!prefix.empty())
	    prefix_vector(phr, prefix);
	xq = Xapian::Query(Xapian::Query::OP_OR, xq,
			   Xapian::Query(Xapian::Query::OP_PHRASE,
					 phr.begin(), phr.end()));
	m_curcl++;
    }

    pqueries.push_back(xq);
}

// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
						 TermProcQ *splitData,
						 int mods, void *pq,
						 bool useNear, int slack)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
	Xapian::Query::OP_PHRASE;
    vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
    bool hadmultiple = false;
#endif
    vector<vector<string> >groups;

    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
	prefix = wrap_prefix(ftp->pfx);
    }

    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
	orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
	slack++;
    }

    // Go through the list and perform stem/wildcard expansion for each element
    vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
    for (vector<string>::const_iterator it = splitData->terms().begin();
	 it != splitData->terms().end(); it++, nxit++) {
	LOGDEB0("ProcessPhrase: processing ["  << *it << "]\n" );
	// Adjust when we do stem expansion. Not if disabled by
	// caller, not inside phrases, and some versions of xapian
	// will accept only one OR clause inside NEAR.
	bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
	    || hadmultiple
#endif // single OR inside NEAR
	    ;
	int lmods = mods;
	if (nostemexp)
	    lmods |= SearchDataClause::SDCM_NOSTEMMING;
	string sterm;
	vector<string> exp;
	if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
	    return;
	LOGDEB0("ProcessPhraseOrNear: exp size "  << (exp.size()) << ", exp: "  << (stringsToString(exp)) << "\n" );
	// groups is used for highlighting, we don't want prefixes in there.
	vector<string> noprefs;
	for (vector<string>::const_iterator it = exp.begin();
	     it != exp.end(); it++) {
	    noprefs.push_back(it->substr(prefix.size()));
	}
	groups.push_back(noprefs);
	orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
					  exp.begin(), exp.end()));
	m_curcl += exp.size();
	if (m_curcl >= getMaxCl())
	    return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
	if (exp.size() > 1)
	    hadmultiple = true;
#endif
    }

    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
	orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
	slack++;
    }

    // Generate an appropriate PHRASE/NEAR query with adjusted slack
    // For phrases, give a relevance boost like we do for original terms
    LOGDEB2("PHRASE/NEAR:  alltermcount "  << (splitData->alltermcount()) << " lastpos "  << (splitData->lastpos()) << "\n" );
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
		     splitData->lastpos() + 1 + slack);
    if (op == Xapian::Query::OP_PHRASE)
	xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
			   original_term_wqf_booster);
    pqueries.push_back(xq);

    // Add all combinations of NEAR/PHRASE groups to the highlighting data.
    vector<vector<string> > allcombs;
    vector<string> comb;
    multiply_groups(groups.begin(), groups.end(), comb, allcombs);

    // Insert the search groups and slacks in the highlight data, with
    // a reference to the user entry that generated them:
    m_hldata.groups.insert(m_hldata.groups.end(),
			   allcombs.begin(), allcombs.end());
    m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
    m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
			      m_hldata.ugroups.size() - 1);
}

// Trim string beginning with ^ or ending with $ and convert to flags
static int stringToMods(string& s)
{
    int mods = 0;
    // Check for an anchored search
    trimstring(s);
    if (s.length() > 0 && s[0] == '^') {
	mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
	s.erase(0, 1);
    }
    if (s.length() > 0 && s[s.length()-1] == '$') {
	mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
	s.erase(s.length()-1);
    }
    return mods;
}

/**
 * Turn user entry string (NOT query language) into a list of xapian queries.
 * We just separate words and phrases, and do wildcard and stem expansion,
 *
 * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
 * the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
 * entry).
 *
 * This appears awful, and it would seem that the split into
 * terms/phrases should be performed in the upper layer so that we
 * only receive pure term or near/phrase pure elements here, but in
 * fact there are things that would appear like terms to naive code,
 * and which will actually may be turned into phrases (ie: tom:jerry),
 * in a manner which intimately depends on the index implementation,
 * so that it makes sense to process this here.
 *
 * The final list contains one query for each term or phrase
 *   - Elements corresponding to a stem-expanded part are an OP_OR
 *     composition of the stem-expanded terms (or a single term query).
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
 *     composition of the phrase terms (no stem expansion in this case)
 * @return the subquery count (either or'd stem-expanded terms or phrase word
 *   count)
 */
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
					       string &ermsg, void *pq,
					       int slack, bool useNear)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    int mods = m_modifiers;

    LOGDEB("StringToXapianQ:pUS:: qstr ["  << (iq) << "] fld ["  << (m_field) << "] mods 0x"  << (mods) << " slack "  << (slack) << " near "  << (useNear) << "\n" );
    ermsg.erase();
    m_curcl = 0;
    const StopList stops = db.getStopList();

    // Simple whitespace-split input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase".
    //
    // The text splitter may further still decide that the resulting
    // "words" are really phrases, this depends on separators:
    // [paul@dom.net] would still be a word (span), but [about:me]
    // will probably be handled as a phrase.
    vector<string> phrases;
    TextSplit::stringToStrings(iq, phrases);

    // Process each element: textsplit into terms, handle stem/wildcard
    // expansion and transform into an appropriate Xapian::Query
    try {
	for (vector<string>::iterator it = phrases.begin();
	     it != phrases.end(); it++) {
	    LOGDEB0("strToXapianQ: phrase/word: ["  << *it << "]\n" );
	    // Anchoring modifiers
	    int amods = stringToMods(*it);
	    int terminc = amods != 0 ? 1 : 0;
	    mods |= amods;
	    // If there are multiple spans in this element, including
	    // at least one composite, we have to increase the slack
	    // else a phrase query including a span would fail.
	    // Ex: "term0@term1 term2" is onlyspans-split as:
	    //   0 term0@term1             0   12
	    //   2 term2                  13   18
	    // The position of term2 is 2, not 1, so a phrase search
	    // would fail.
	    // We used to do  word split, searching for
	    // "term0 term1 term2" instead, which may have worse
	    // performance, but will succeed.
	    // We now adjust the phrase/near slack by comparing the term count
	    // and the last position

	    // The term processing pipeline:
            // split -> [unac/case ->] stops -> store terms
	    TermProcQ tpq;
	    TermProc *nxt = &tpq;
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
            //tpcommon.onlygrams(true);
	    TermProcPrep tpprep(nxt);
	    if (o_index_stripchars)
		nxt = &tpprep;

	    TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
						 TextSplit::TXTS_KEEPWILD),
				nxt);
	    tpq.setTSQ(&splitter);
	    splitter.text_to_words(*it);

	    slack += tpq.lastpos() - int(tpq.terms().size()) + 1;

	    LOGDEB0("strToXapianQ: termcount: "  << (tpq.terms().size()) << "\n" );
	    switch (tpq.terms().size() + terminc) {
	    case 0:
		continue;// ??
	    case 1: {
		int lmods = mods;
		if (tpq.nostemexps().front())
		    lmods |= SearchDataClause::SDCM_NOSTEMMING;
		m_hldata.ugroups.push_back(tpq.terms());
		processSimpleSpan(db, ermsg, tpq.terms().front(),
				  lmods, &pqueries);
	    }
		break;
	    default:
		m_hldata.ugroups.push_back(tpq.terms());
		processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
				    useNear, slack);
	    }
	    if (m_curcl >= getMaxCl()) {
		ermsg = maxXapClauseMsg;
		if (!o_index_stripchars)
		    ermsg += maxXapClauseCaseDiacMsg;
		break;
	    }
	}
    } catch (const Xapian::Error &e) {
	ermsg = e.get_msg();
    } catch (const string &s) {
	ermsg = s;
    } catch (const char *s) {
	ermsg = s;
    } catch (...) {
	ermsg = "Caught unknown exception";
    }
    if (!ermsg.empty()) {
	LOGERR("stringToXapianQueries: "  << (ermsg) << "\n" );
	return false;
    }
    return true;
}

// Translate a simple OR or AND search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB("SearchDataClauseSimple::toNativeQuery: fld ["  << (m_field) << "] val ["  << (m_text) << "] stemlang ["  << (getStemLang()) << "]\n" );

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    Xapian::Query::op op;
    switch (m_tp) {
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
    default:
	LOGERR("SearchDataClauseSimple: bad m_tp "  << (m_tp) << "\n" );
        m_reason = "Internal error";
	return false;
    }

    vector<Xapian::Query> pqueries;
    if (!processUserString(db, m_text, m_reason, &pqueries))
	return false;
    if (pqueries.empty()) {
	LOGERR("SearchDataClauseSimple: resolved to null query\n" );
        m_reason = string("Resolved to null query. Term too long ? : [" +
                          m_text + string("]"));
	return false;
    }

    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
    if  (m_weight != 1.0) {
	*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

// Translate a FILENAME search clause. This always comes
// from a "filename" search from the gui or recollq. A query language
// "filename:"-prefixed field will not go through here, but through
// the generic field-processing code.
//
// We do not split the entry any more (used to do some crazy thing
// about expanding multiple fragments in the past). We just take the
// value blanks and all and expand this against the indexed unsplit
// file names
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    int maxexp = getSoftMaxExp();
    if (maxexp == -1)
	maxexp = getMaxExp();

    vector<string> names;
    db.filenameWildExp(m_text, names, maxexp);
    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());

    if (m_weight != 1.0) {
	*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

// Translate a dir: path filtering clause. See comments in .h
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB("SearchDataClausePath::toNativeQuery: ["  << (m_text) << "]\n" );
    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    if (m_text.empty()) {
	LOGERR("SearchDataClausePath: empty path??\n" );
	m_reason = "Empty path ?";
	return false;
    }

    vector<Xapian::Query> orqueries;

    if (path_isabsolute(m_text))
	orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
    else
        m_text = path_tildexpand(m_text);

    vector<string> vpath;
    stringToTokens(m_text, vpath, "/");

    for (vector<string>::const_iterator pit = vpath.begin();
	 pit != vpath.end(); pit++){

	string sterm;
	vector<string> exp;
	if (!expandTerm(db, m_reason,
			SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
			*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
	    return false;
	}
	LOGDEB0("SDataPath::toNative: exp size "  << (exp.size()) << ". Exp: "  << (stringsToString(exp)) << "\n" );
	if (exp.size() == 1)
	    orqueries.push_back(Xapian::Query(exp[0]));
	else
	    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
					      exp.begin(), exp.end()));
	m_curcl += exp.size();
	if (m_curcl >= getMaxCl())
	    return false;
    }

    *qp = Xapian::Query(Xapian::Query::OP_PHRASE,
			orqueries.begin(), orqueries.end());

    if (m_weight != 1.0) {
	*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

// Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB("SearchDataClauseDist::toNativeQuery\n" );

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    vector<Xapian::Query> pqueries;

    // We produce a single phrase out of the user entry then use
    // stringToXapianQueries() to lowercase and simplify the phrase
    // terms etc. This will result into a single (complex)
    // Xapian::Query.
    if (m_text.find('\"') != string::npos) {
	m_text = neutchars(m_text, "\"");
    }
    string s = cstr_dquote + m_text + cstr_dquote;
    bool useNear = (m_tp == SCLT_NEAR);
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
	return false;
    if (pqueries.empty()) {
	LOGERR("SearchDataClauseDist: resolved to null query\n" );
        m_reason = string("Resolved to null query. Term too long ? : [" +
                          m_text + string("]"));
	return false;
    }

    *qp = *pqueries.begin();
    if (m_weight != 1.0) {
	*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

} // Namespace Rcl