recoll/src/rcldb/rclquery.cpp

/* Copyright (C) 2008 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include <vector>
#include <sstream>
using namespace std;

#include "xapian.h"

#include "cstr.h"
#include "rclconfig.h"
#include "debuglog.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "conftree.h"
#include "smallut.h"
#include "searchdata.h"
#include "unacpp.h"

namespace Rcl {
// This is used as a marker inside the abstract frag lists, but
// normally doesn't remain in final output (which is built with a
// custom sep. by our caller).
static const string cstr_ellipsis("...");

// Field names inside the index data record may differ from the rcldoc ones
// (esp.: caption / title)
static const string& docfToDatf(const string& df)
{
    if (!df.compare(Doc::keytt)) {
	return cstr_caption;
    } else if (!df.compare(Doc::keymt)) {
	return cstr_dmtime;
    } else {
	return df;
    }
}

// Sort helper class. As Xapian sorting is lexicographic, we do some
// special processing for special fields like dates and sizes. User
// custom field data will have to be processed before insertion to
// achieve equivalent results.
#if XAPIAN_MAJOR_VERSION == 1 && XAPIAN_MINOR_VERSION < 2
class QSorter : public Xapian::Sorter {
#else
class QSorter : public Xapian::KeyMaker {
#endif
public:
    QSorter(const string& f)
	: m_fld(docfToDatf(f) + "=")
    {
	m_ismtime = !m_fld.compare("dmtime=");
	if (m_ismtime)
	    m_issize = false;
	else
	    m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes=") ||
		!m_fld.compare("pcbytes=");
    }

    virtual std::string operator()(const Xapian::Document& xdoc) const
    {
	string data = xdoc.get_data();
	// It would be simpler to do the record->Rcl::Doc thing, but
	// hand-doing this will be faster. It makes more assumptions
	// about the format than a ConfTree though:
	string::size_type i1, i2;
	i1 = data.find(m_fld);
	if (i1 == string::npos) {
	    if (m_ismtime) {
		// Ugly: specialcase mtime as it's either dmtime or fmtime
		i1 = data.find("fmtime=");
		if (i1 == string::npos) {
		    return string();
		}
	    } else {
		return string();
	    }
	}
	i1 += m_fld.length();
	if (i1 >= data.length())
	    return string();
	i2 = data.find_first_of("\n\r", i1);
	if (i2 == string::npos)
	    return string();

	string term = data.substr(i1, i2-i1);
	if (m_ismtime) {
	    return term;
	} else if (m_issize) {
	    // Left zeropad values for appropriate numeric sorting
	    leftzeropad(term, 12);
	    return term;
	}

	// Process data for better sorting. We should actually do the
	// unicode thing
	// (http://unicode.org/reports/tr10/#Introduction), but just
	// removing accents and majuscules will remove the most
	// glaring weirdnesses (or not, depending on your national
	// approach to collating...)
	string sortterm;
	// We're not even sure the term is utf8 here (ie: url)
	if (!unacmaybefold(term, sortterm, "UTF-8", UNACOP_UNACFOLD)) {
	    sortterm = term;
	}
	// Also remove some common uninteresting starting characters
	i1 = sortterm.find_first_not_of(" \t\\\"'([*+,.#/");
	if (i1 != 0 && i1 != string::npos) {
	    sortterm = sortterm.substr(i1, sortterm.size()-i1);
	}

	LOGDEB2(("QSorter: [%s] -> [%s]\n", term.c_str(), sortterm.c_str()));
	return sortterm;
    }

private:
    string m_fld;
    bool   m_ismtime;
    bool   m_issize;
};

Query::Query(Db *db)
    : m_nq(new Native(this)), m_db(db), m_sorter(0), m_sortAscending(true),
      m_collapseDuplicates(false), m_resCnt(-1)
{
}

Query::~Query()
{
    deleteZ(m_nq);
    if (m_sorter) {
	delete (QSorter*)m_sorter;
	m_sorter = 0;
    }
}

string Query::getReason() const
{
    return m_reason;
}

Db *Query::whatDb()
{
    return m_db;
}

void Query::setSortBy(const string& fld, bool ascending) {
    if (fld.empty()) {
	m_sortField.erase();
    } else {
	m_sortField = m_db->getConf()->fieldCanon(fld);
	m_sortAscending = ascending;
    }
    LOGDEB0(("RclQuery::setSortBy: [%s] %s\n", m_sortField.c_str(),
	     m_sortAscending ? "ascending" : "descending"));
}

//#define ISNULL(X) (X).isNull()
#define ISNULL(X) !(X)

// Prepare query out of user search data
bool Query::setQuery(RefCntr<SearchData> sdata)
{
    LOGDEB(("Query::setQuery:\n"));

    if (!m_db || ISNULL(m_nq)) {
	LOGERR(("Query::setQuery: not initialised!\n"));
	return false;
    }
    m_resCnt = -1;
    m_reason.erase();

    m_nq->clear();
    m_sd = sdata;

    Xapian::Query xq;
    if (!sdata->toNativeQuery(*m_db, &xq)) {
	m_reason += sdata->getReason();
	return false;
    }

    m_nq->xquery = xq;

    string d;
    for (int tries = 0; tries < 2; tries++) {
	try {
            m_nq->xenquire = new Xapian::Enquire(m_db->m_ndb->xrdb);
            if (m_collapseDuplicates) {
                m_nq->xenquire->set_collapse_key(Rcl::VALUE_MD5);
            } else {
                m_nq->xenquire->set_collapse_key(Xapian::BAD_VALUENO);
            }
	    m_nq->xenquire->set_docid_order(Xapian::Enquire::DONT_CARE);
            if (!m_sortField.empty()) {
                if (m_sorter) {
                    delete (QSorter*)m_sorter;
                    m_sorter = 0;
                }
                m_sorter = new QSorter(m_sortField);
                // It really seems there is a xapian bug about sort order, we
                // invert here.
                m_nq->xenquire->set_sort_by_key((QSorter*)m_sorter,
                                                !m_sortAscending);
            }
            m_nq->xenquire->set_query(m_nq->xquery);
            m_nq->xmset = Xapian::MSet();
            // Get the query description and trim the "Xapian::Query"
            d = m_nq->xquery.get_description();
            m_reason.erase();
            break;
	} catch (const Xapian::DatabaseModifiedError &e) {
            m_reason = e.get_msg();
	    m_db->m_ndb->xrdb.reopen();
            continue;
	} XCATCHERROR(m_reason);
        break;
    }

    if (!m_reason.empty()) {
	LOGDEB(("Query::SetQuery: xapian error %s\n", m_reason.c_str()));
	return false;
    }

    if (d.find("Xapian::Query") == 0)
	d.erase(0, strlen("Xapian::Query"));

    sdata->setDescription(d);
    m_sd = sdata;
    LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
    return true;
}

bool Query::getQueryTerms(vector<string>& terms)
{
    if (ISNULL(m_nq))
	return false;

    terms.clear();
    Xapian::TermIterator it;
    string ermsg;
    try {
	for (it = m_nq->xquery.get_terms_begin();
	     it != m_nq->xquery.get_terms_end(); it++) {
	    terms.push_back(*it);
	}
    } XCATCHERROR(ermsg);
    if (!ermsg.empty()) {
	LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str()));
	return false;
    }
    return true;
}

bool Query::getMatchTerms(const Doc& doc, vector<string>& terms)
{
    return getMatchTerms(doc.xdocid, terms);
}
bool Query::getMatchTerms(unsigned long xdocid, vector<string>& terms)
{
    if (ISNULL(m_nq) || !m_nq->xenquire) {
	LOGERR(("Query::getMatchTerms: no query opened\n"));
	return -1;
    }

    terms.clear();
    Xapian::TermIterator it;
    Xapian::docid id = Xapian::docid(xdocid);

    XAPTRY(terms.insert(terms.begin(),
                        m_nq->xenquire->get_matching_terms_begin(id),
                        m_nq->xenquire->get_matching_terms_end(id)),
           m_db->m_ndb->xrdb, m_reason);

    if (!m_reason.empty()) {
	LOGERR(("getMatchTerms: xapian error: %s\n", m_reason.c_str()));
	return false;
    }

    return true;
}

abstract_result Query::makeDocAbstract(Doc &doc,
				       vector<Snippet>& abstract,
				       int maxoccs, int ctxwords)
{
    LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords));
    if (!m_db || !m_db->m_ndb || !m_db->m_ndb->m_isopen || !m_nq) {
	LOGERR(("Query::makeDocAbstract: no db or no nq\n"));
	return ABSRES_ERROR;
    }
    abstract_result ret = ABSRES_ERROR;
    XAPTRY(ret = m_nq->makeAbstract(doc.xdocid, abstract, maxoccs, ctxwords),
           m_db->m_ndb->xrdb, m_reason);
    if (!m_reason.empty())
	return ABSRES_ERROR;
    return ret;
}

bool Query::makeDocAbstract(Doc &doc, vector<string>& abstract)
{
    vector<Snippet> vpabs;
    if (!makeDocAbstract(doc, vpabs))
	return false;
    for (vector<Snippet>::const_iterator it = vpabs.begin();
	 it != vpabs.end(); it++) {
	string chunk;
	if (it->page > 0) {
	    doc.haspages = true;
	    ostringstream ss;
	    ss << it->page;
	    chunk += string(" [p ") + ss.str() + "] ";
	}
	chunk += it->snippet;
	abstract.push_back(chunk);
    }
    return true;
}

bool Query::makeDocAbstract(Doc &doc, string& abstract)
{
    vector<Snippet> vpabs;
    if (!makeDocAbstract(doc, vpabs))
	return false;
    for (vector<Snippet>::const_iterator it = vpabs.begin();
	 it != vpabs.end(); it++) {
	abstract.append(it->snippet);
	abstract.append(cstr_ellipsis);
    }
    return m_reason.empty() ? true : false;
}

int Query::getFirstMatchPage(Doc &doc, string& term)
{
    LOGDEB1(("Db::getFirstMatchPages\n"));;
    if (!m_nq) {
	LOGERR(("Query::getFirstMatchPage: no nq\n"));
	return false;
    }
    int pagenum = -1;
    XAPTRY(pagenum = m_nq->getFirstMatchPage(Xapian::docid(doc.xdocid), term),
	   m_db->m_ndb->xrdb, m_reason);
    return m_reason.empty() ? pagenum : -1;
}


// Mset size
static const int qquantum = 50;

// Get estimated result count for query. Xapian actually does most of
// the search job in there, this can be long
int Query::getResCnt()
{
    if (ISNULL(m_nq) || !m_nq->xenquire) {
	LOGERR(("Query::getResCnt: no query opened\n"));
	return -1;
    }
    if (m_resCnt >= 0)
	return m_resCnt;

    m_resCnt = -1;
    if (m_nq->xmset.size() <= 0) {
        Chrono chron;

        XAPTRY(m_nq->xmset =
               m_nq->xenquire->get_mset(0, qquantum, 1000);
               m_resCnt = m_nq->xmset.get_matches_lower_bound(),
               m_db->m_ndb->xrdb, m_reason);

        LOGDEB(("Query::getResCnt: %d mS\n", chron.millis()));
	if (!m_reason.empty())
	    LOGERR(("xenquire->get_mset: exception: %s\n", m_reason.c_str()));
    } else {
        m_resCnt = m_nq->xmset.get_matches_lower_bound();
    }
    return m_resCnt;
}


// Get document at rank xapi in query results.  We check if the
// current mset has the doc, else ask for an other one. We use msets
// of qquantum documents.
//
// Note that as stated by a Xapian developer, Enquire searches from
// scratch each time get_mset() is called. So the better performance
// on subsequent calls is probably only due to disk caching.
bool Query::getDoc(int xapi, Doc &doc)
{
    LOGDEB1(("Query::getDoc: xapian enquire index %d\n", xapi));
    if (ISNULL(m_nq) || !m_nq->xenquire) {
	LOGERR(("Query::getDoc: no query opened\n"));
	return false;
    }

    int first = m_nq->xmset.get_firstitem();
    int last = first + m_nq->xmset.size() -1;

    if (!(xapi >= first && xapi <= last)) {
	LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));

	XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum,
						      (const Xapian::RSet *)0),
               m_db->m_ndb->xrdb, m_reason);

        if (!m_reason.empty()) {
            LOGERR(("enquire->get_mset: exception: %s\n", m_reason.c_str()));
            return false;
	}
	if (m_nq->xmset.empty()) {
            LOGDEB(("enquire->get_mset: got empty result\n"));
	    return false;
        }
	first = m_nq->xmset.get_firstitem();
	last = first + m_nq->xmset.size() -1;
    }

    LOGDEB1(("Query::getDoc: Qry [%s] win [%d-%d] Estimated results: %d",
            m_nq->query.get_description().c_str(),
            first, last, m_nq->xmset.get_matches_lower_bound()));

    Xapian::Document xdoc;
    Xapian::docid docid = 0;
    int pc = 0;
    int collapsecount = 0;
    string data;
    string udi;
    m_reason.erase();
    for (int xaptries=0; xaptries < 2; xaptries++) {
        try {
            xdoc = m_nq->xmset[xapi-first].get_document();
	    collapsecount = m_nq->xmset[xapi-first].get_collapse_count();
            docid = *(m_nq->xmset[xapi-first]);
            pc = m_nq->xmset.convert_to_percent(m_nq->xmset[xapi-first]);
            data = xdoc.get_data();
            m_reason.erase();
            Chrono chron;
            Xapian::TermIterator it = xdoc.termlist_begin();
            it.skip_to(wrap_prefix(udi_prefix));
            if (it != xdoc.termlist_end()) {
                udi = *it;
                if (!udi.empty())
                    udi = udi.substr(wrap_prefix(udi_prefix).size());
            }
            LOGDEB2(("Query::getDoc: %d ms for udi [%s], collapse count %d\n",
		     chron.millis(), udi.c_str(), collapsecount));
            break;
        } catch (Xapian::DatabaseModifiedError &error) {
            // retry or end of loop
            m_reason = error.get_msg();
            continue;
        }
        XCATCHERROR(m_reason);
        break;
    }
    if (!m_reason.empty()) {
        LOGERR(("Query::getDoc: %s\n", m_reason.c_str()));
        return false;
    }
    doc.meta[Rcl::Doc::keyudi] = udi;

    doc.pc = pc;
    char buf[200];
    if (collapsecount>0) {
	sprintf(buf,"%3d%% (%d)", pc, collapsecount+1);
    } else {
	sprintf(buf,"%3d%%", pc);
    }
    doc.meta[Doc::keyrr] = buf;

    sprintf(buf, "%d", collapsecount);
    doc.meta[Rcl::Doc::keycc] = buf;


    // Parse xapian document's data and populate doc fields
    return m_db->m_ndb->dbDataToRclDoc(docid, data, doc);
}

vector<string> Query::expand(const Doc &doc)
{
    LOGDEB(("Rcl::Query::expand()\n"));
    vector<string> res;
    if (ISNULL(m_nq) || !m_nq->xenquire) {
	LOGERR(("Query::expand: no query opened\n"));
	return res;
    }

    for (int tries = 0; tries < 2; tries++) {
	try {
	    Xapian::RSet rset;
	    rset.add_document(Xapian::docid(doc.xdocid));
	    // We don't exclude the original query terms.
	    Xapian::ESet eset = m_nq->xenquire->get_eset(20, rset, false);
	    LOGDEB(("ESet terms:\n"));
	    // We filter out the special terms
	    for (Xapian::ESetIterator it = eset.begin();
		 it != eset.end(); it++) {
		LOGDEB((" [%s]\n", (*it).c_str()));
		if ((*it).empty() || has_prefix(*it))
		    continue;
		res.push_back(*it);
		if (res.size() >= 10)
		    break;
	    }
            m_reason.erase();
            break;
	} catch (const Xapian::DatabaseModifiedError &e) {
            m_reason = e.get_msg();
            m_db->m_ndb->xrdb.reopen();
            continue;
	} XCATCHERROR(m_reason);
	break;
    }

    if (!m_reason.empty()) {
        LOGERR(("Query::expand: xapian error %s\n", m_reason.c_str()));
        res.clear();
    }

    return res;
}

}