recollindex and python module building and running

Commit: 220d28c912
915 changed files with 386448 additions and 0 deletions

src/rcldb/rclabstract.cpp (new file, 621 lines)
@@ -0,0 +1,621 @@
/* Copyright (C) 2004 J.F.Dockes
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#include "autoconfig.h"

#include <math.h>

#include <map>
#include "unordered_defs.h"

using namespace std;

#include "debuglog.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "textsplit.h"
#include "searchdata.h"
#include "utf8iter.h"
#include "hldata.h"

namespace Rcl {
// This is used as a marker inside the abstract frag lists, but
// normally doesn't remain in the final output (which is built with a
// custom separator by our caller).
static const string cstr_ellipsis("...");
// This is used to mark positions overlapped by a multi-word match term
static const string occupiedmarker("?");

#undef DEBUGABSTRACT
#ifdef DEBUGABSTRACT
#define LOGABS LOGDEB
static void listList(const string& what, const vector<string>& l)
{
    string a;
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
        a = a + *it + " ";
    }
    LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
}
#else
#define LOGABS LOGDEB2
static void listList(const string&, const vector<string>&)
{
}
#endif

// Unprefix terms. Actually it's not completely clear whether we should
// remove the prefixes and keep all terms, or prune the prefixed
// ones. There is no good way to be sure which will provide the best
// result in general.
static const bool prune_prefixed_terms = true;
static void noPrefixList(const vector<string>& in, vector<string>& out)
{
    for (vector<string>::const_iterator qit = in.begin();
         qit != in.end(); qit++) {
        if (prune_prefixed_terms) {
            if (has_prefix(*qit))
                continue;
        }
        out.push_back(strip_prefix(*qit));
    }
    sort(out.begin(), out.end());
    vector<string>::iterator it = unique(out.begin(), out.end());
    out.resize(it - out.begin());
}
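
#if 0
// A minimal usage sketch for noPrefixList(), illustration only (the
// input terms below are hypothetical, not real index contents). Raw
// index terms may carry a field prefix (e.g. an uppercase prefix as in
// "XSUBJdockes"); the output is the unprefixed, sorted, deduplicated
// list.
static void noPrefixListExample()
{
    vector<string> in;
    in.push_back("dockes");
    in.push_back("XSUBJdockes"); // prefixed variant: pruned when
                                 // prune_prefixed_terms is true
    in.push_back("recoll");
    in.push_back("recoll");      // duplicate: removed by sort()+unique()
    vector<string> out;
    noPrefixList(in, out);
    // out now holds {"dockes", "recoll"}
}
#endif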

bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
{
    if (!xenquire) {
        LOGERR(("Query::getMatchTerms: no query opened\n"));
        return false;
    }

    terms.clear();
    Xapian::TermIterator it;
    Xapian::docid id = Xapian::docid(xdocid);
    vector<string> iterms;
    XAPTRY(iterms.insert(iterms.begin(),
                         xenquire->get_matching_terms_begin(id),
                         xenquire->get_matching_terms_end(id)),
           m_q->m_db->m_ndb->xrdb, m_q->m_reason);
    if (!m_q->m_reason.empty()) {
        LOGERR(("getMatchTerms: xapian error: %s\n", m_q->m_reason.c_str()));
        return false;
    }
    noPrefixList(iterms, terms);
    return true;
}

// Retrieve db-wide frequencies for the query terms and store them in
// the query object. This is done at most once for a query, and the data is
// used while computing abstracts for the different result documents.
void Query::Native::setDbWideQTermsFreqs()
{
    // Do it once only for a given query.
    if (!termfreqs.empty())
        return;

    vector<string> qterms;
    {
        vector<string> iqterms;
        m_q->getQueryTerms(iqterms);
        noPrefixList(iqterms, qterms);
    }
    // listList("Query terms: ", qterms);
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;

    double doccnt = xrdb.get_doccount();
    if (doccnt == 0)
        doccnt = 1;

    for (vector<string>::const_iterator qit = qterms.begin();
         qit != qterms.end(); qit++) {
        termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
        LOGABS(("setDbWideQTermFreqs: [%s] db freq %.1e\n", qit->c_str(),
                termfreqs[*qit]));
    }
}
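
#if 0
// Standalone sketch of the frequency computation above, kept as
// illustration only: the db-wide relative frequency of a term is its
// document frequency divided by the total document count. The database
// path is hypothetical; the Xapian declarations come in through the
// rcldb headers.
static double relativeTermFreq(const string& term)
{
    Xapian::Database db("/path/to/xapiandb");
    double doccnt = db.get_doccount();
    if (doccnt == 0)
        doccnt = 1;
    // get_termfreq(): how many documents contain the term
    return db.get_termfreq(term) / doccnt;
}
#endif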

// Compute matched terms quality coefficients for a matched document by
// retrieving the Within Document Frequencies and multiplying by the
// overall term frequency, then using log-based thresholds.
// 2012: it's not too clear to me why exactly we do the log thresholds thing.
// Preferring terms which are rare in the db, in the document, or both,
// seems reasonable though.
// To avoid setting a high quality for a low frequency expansion of a
// common stem, which seems wrong, we group the terms by
// root, compute a frequency for the group from the sum of member
// occurrences, and let the frequency for each group member be the
// aggregated frequency.
double Query::Native::qualityTerms(Xapian::docid docid,
                                   const vector<string>& terms,
                                   multimap<double, vector<string> >& byQ)
{
    LOGABS(("qualityTerms\n"));
    setDbWideQTermsFreqs();

    double totalweight = 0;

    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
    double doclen = xrdb.get_doclength(docid);
    if (doclen == 0)
        doclen = 1;
    HighlightData hld;
    if (!m_q->m_sd.isNull()) {
        m_q->m_sd->getTerms(hld);
    }

#ifdef DEBUGABSTRACT
    {
        string deb;
        hld.toString(deb);
        LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
    }
#endif

    // Group the input terms by the user term they were possibly expanded from
    map<string, vector<string> > byRoot;
    for (vector<string>::const_iterator qit = terms.begin();
         qit != terms.end(); qit++) {
        map<string, string>::const_iterator eit = hld.terms.find(*qit);
        if (eit != hld.terms.end()) {
            byRoot[eit->second].push_back(*qit);
        } else {
            LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str()));
            byRoot[*qit].push_back(*qit);
        }
    }

#ifdef DEBUGABSTRACT
    {
        string byRootstr;
        for (map<string, vector<string> >::const_iterator debit =
                 byRoot.begin(); debit != byRoot.end(); debit++) {
            byRootstr.append("[").append(debit->first).append("]->");
            for (vector<string>::const_iterator it = debit->second.begin();
                 it != debit->second.end(); it++) {
                byRootstr.append("[").append(*it).append("] ");
            }
            byRootstr.append("\n");
        }
        LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
    }
#endif

    // Compute in-document and global frequencies for the groups.
    map<string, double> grpwdfs;
    map<string, double> grptfreqs;
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
         git != byRoot.end(); git++) {
        for (vector<string>::const_iterator qit = git->second.begin();
             qit != git->second.end(); qit++) {
            Xapian::TermIterator term = xrdb.termlist_begin(docid);
            term.skip_to(*qit);
            if (term != xrdb.termlist_end(docid) && *term == *qit) {
                // Accumulate into the group entry if it exists,
                // initialize it otherwise.
                if (grpwdfs.find(git->first) != grpwdfs.end()) {
                    grpwdfs[git->first] += term.get_wdf() / doclen;
                    grptfreqs[git->first] += termfreqs[*qit];
                } else {
                    grpwdfs[git->first] = term.get_wdf() / doclen;
                    grptfreqs[git->first] = termfreqs[*qit];
                }
            }
        }
    }

    // Build a container of the groups, sorted by quality
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
         git != byRoot.end(); git++) {
        double q = (grpwdfs[git->first]) * grptfreqs[git->first];
        q = -log10(q);
        if (q < 3) {
            q = 0.05;
        } else if (q < 4) {
            q = 0.3;
        } else if (q < 5) {
            q = 0.7;
        } else if (q < 6) {
            q = 0.8;
        } else {
            q = 1;
        }
        totalweight += q;
        byQ.insert(pair<double, vector<string> >(q, git->second));
    }

#ifdef DEBUGABSTRACT
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
         mit != byQ.rend(); mit++) {
        LOGABS(("qualityTerms: group\n"));
        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {
            LOGABS(("%.1e->[%s]\n", mit->first, qit->c_str()));
        }
    }
#endif
    return totalweight;
}
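
#if 0
// The log thresholding above in isolation, illustration only: map the
// raw frequency product (wdf/doclen times db frequency, a small value
// below 1) to a quality coefficient through -log10 buckets, so that
// rarer term groups get coefficients close to 1.
static double qualityCoef(double rawfreq)
{
    double q = -log10(rawfreq);
    if (q < 3)
        return 0.05;     // very common: almost no weight
    else if (q < 4)
        return 0.3;
    else if (q < 5)
        return 0.7;
    else if (q < 6)
        return 0.8;
    return 1.0;          // rare: full weight
}
#endif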

// Return the page number for the first match of a "significant" term.
int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
{
    if (!m_q || !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
        LOGERR(("Query::getFirstMatchPage: no db\n"));
        return -1;
    }
    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
    Xapian::Database& xrdb(ndb->xrdb);

    vector<string> terms;
    getMatchTerms(docid, terms);

    if (terms.empty()) {
        LOGDEB(("getFirstMatchPage: empty match term list (field match?)\n"));
        return -1;
    }

    vector<int> pagepos;
    ndb->getPagePositions(docid, pagepos);
    if (pagepos.empty())
        return -1;

    setDbWideQTermsFreqs();

    // We try to use a page which matches the "best" term. Get a sorted list
    multimap<double, vector<string> > byQ;
    qualityTerms(docid, terms, byQ);

    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
         mit != byQ.rend(); mit++) {
        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {
            string qterm = *qit;
            Xapian::PositionIterator pos;
            try {
                for (pos = xrdb.positionlist_begin(docid, qterm);
                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
                    int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
                    if (pagenum > 0) {
                        term = qterm;
                        return pagenum;
                    }
                }
            } catch (...) {
                // Term does not occur. No problem.
            }
        }
    }
    return -1;
}
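
#if 0
// Sketch of the page lookup idea used above, illustration only: with
// the sorted vector of page-break positions, the page for a term
// position can be found by counting the breaks at or before it. The
// real logic lives in Db::Native::getPageNumberForPosition(); the
// exact numbering convention there may differ.
static int pageForPosition(const vector<int>& pagepos, unsigned int pos)
{
    vector<int>::const_iterator it =
        upper_bound(pagepos.begin(), pagepos.end(), int(pos));
    // 0 means "before the first page break", ie: unknown
    return int(it - pagepos.begin());
}
#endif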

// Build a document abstract by extracting text chunks around the query terms.
// This uses the db termlists, not the original document text.
//
// DatabaseModified and other general exceptions are caught and
// possibly retried by our caller.
int Query::Native::makeAbstract(Xapian::docid docid,
                                vector<Snippet>& vabs,
                                int imaxoccs, int ictxwords)
{
    Chrono chron;
    LOGABS(("makeAbstract: docid %ld imaxoccs %d ictxwords %d\n",
            long(docid), imaxoccs, ictxwords));

    // The (unprefixed) terms matched by this document
    vector<string> matchedTerms;
    getMatchTerms(docid, matchedTerms);
    if (matchedTerms.empty()) {
        LOGDEB(("makeAbstract: empty term list\n"));
        return ABSRES_ERROR;
    }

    listList("Match terms: ", matchedTerms);

    // Retrieve the term frequencies for the query terms. This is
    // actually computed only once for a query, and for all terms in
    // the query (not only the matches for this doc).
    setDbWideQTermsFreqs();

    // Build a quality-sorted container for the match terms. We are
    // going to try and show text around the less common search terms.
    // Terms issued from an original one by stem expansion are
    // aggregated by the qualityTerms() routine.
    multimap<double, vector<string> > byQ;
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
    // This can't happen, but it would crash us if it did
    if (totalweight == 0.0) {
        LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
        return ABSRES_ERROR;
    }

    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
    Xapian::Database& xrdb(ndb->xrdb);

    ///////////////////
    // For each of the query terms, ask xapian for its positions list
    // in the document. For each position entry, insert it and its
    // neighbours in the set of 'interesting' positions.

    // The terms 'array' that we partially populate with the document
    // terms, at their positions around the search terms positions:
    map<unsigned int, string> sparseDoc;
    // Also remember the search term positions separately, so that we
    // can list them with their snippets.
    STD_UNORDERED_SET<unsigned int> searchTermPositions;

    // Remember the max position. Used to stop walking the position
    // lists while populating the adjacent slots.
    unsigned int maxpos = 0;

    // Total number of occurrences for all terms. We stop when we have
    // too many.
    unsigned int totaloccs = 0;

    // Total number of slots we populate. The 7 is taken as the
    // average word size. It was a mistake to have the user max
    // abstract size parameter in characters, we basically only deal
    // with words. We used to limit the character size at the end, but
    // this damaged our careful selection of terms.
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
        m_q->m_db->getAbsLen() / (7 * (m_q->m_db->getAbsCtxLen() + 1));
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
    LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",
            chron.ms(), maxtotaloccs, ctxwords));

    int ret = ABSRES_OK;

    // Let's go populate
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
         mit != byQ.rend(); mit++) {
        unsigned int maxgrpoccs;
        float q;
        if (byQ.size() == 1) {
            maxgrpoccs = maxtotaloccs;
            q = 1.0;
        } else {
            // We give more slots to the better term groups
            q = mit->first / totalweight;
            maxgrpoccs = int(ceil(maxtotaloccs * q));
        }
        unsigned int grpoccs = 0;

        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {

            // Group done?
            if (grpoccs >= maxgrpoccs)
                break;

            string qterm = *qit;

            LOGABS(("makeAbstract: [%s] %d max grp occs (coef %.2f)\n",
                    qterm.c_str(), maxgrpoccs, q));

            // The match term may span several words
            int qtrmwrdcnt =
                TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);

            Xapian::PositionIterator pos;
            // There may be query terms not in this doc: requesting the
            // position list then raises an exception, which we catch.
            // It is not clear how this can happen, because we are
            // walking the match list returned by Xapian. Maybe
            // something with the fields?
            string emptys;
            try {
                for (pos = xrdb.positionlist_begin(docid, qterm);
                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
                    int ipos = *pos;
                    if (ipos < int(baseTextPosition)) // Not in text body
                        continue;
                    LOGABS(("makeAbstract: [%s] at pos %d grpoccs %d maxgrpoccs"
                            " %d\n", qterm.c_str(), ipos, grpoccs, maxgrpoccs));

                    totaloccs++;
                    grpoccs++;

                    // Add adjacent slots to the set to populate at the
                    // next step by inserting empty strings. Special
                    // provisions for adding the ellipsis and for
                    // positions overlapped by the match term.
                    unsigned int sta = MAX(int(baseTextPosition),
                                           ipos - ctxwords);
                    unsigned int sto = ipos + qtrmwrdcnt - 1 +
                        m_q->m_db->getAbsCtxLen();
                    for (unsigned int ii = sta; ii <= sto; ii++) {
                        if (ii == (unsigned int)ipos) {
                            sparseDoc[ii] = qterm;
                            searchTermPositions.insert(ii);
                            if (ii > maxpos)
                                maxpos = ii;
                        } else if (ii > (unsigned int)ipos &&
                                   ii < (unsigned int)ipos + qtrmwrdcnt) {
                            sparseDoc[ii] = occupiedmarker;
                        } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
                            // For an empty slot, the test has the side
                            // effect of inserting an empty string,
                            // which is what we want.
                            sparseDoc[ii] = emptys;
                        }
                    }
                    // Add an ellipsis at the end. This may be replaced
                    // later by an overlapping extract. Take care not to
                    // replace an empty string here, we really want an
                    // empty slot, so use find().
                    if (sparseDoc.find(sto + 1) == sparseDoc.end()) {
                        sparseDoc[sto + 1] = cstr_ellipsis;
                    }

                    // Group done?
                    if (grpoccs >= maxgrpoccs) {
                        ret |= ABSRES_TRUNC;
                        LOGABS(("Db::makeAbstract: max group occs cutoff\n"));
                        break;
                    }
                    // Global done?
                    if (totaloccs >= maxtotaloccs) {
                        ret |= ABSRES_TRUNC;
                        LOGABS(("Db::makeAbstract: max occurrences cutoff\n"));
                        break;
                    }
                }
            } catch (...) {
                // Term does not occur. No problem.
            }

            if (totaloccs >= maxtotaloccs) {
                ret |= ABSRES_TRUNC;
                LOGABS(("Db::makeAbstract: max1 occurrences cutoff\n"));
                break;
            }
        }
    }
    maxpos += ctxwords + 1;

    LOGABS(("makeAbstract:%d: chosen number of positions %d\n",
            chron.millis(), totaloccs));
    // This can happen if there are term occurrences in the keywords
    // etc. but not elsewhere?
    if (totaloccs == 0) {
        LOGDEB(("makeAbstract: no occurrences\n"));
        return ABSRES_OK;
    }

    // Walk all the document's term position lists and populate the
    // slots around the query terms. We arbitrarily truncate the list
    // to avoid taking forever. If we do cut off, the abstract may be
    // inconsistent (missing words, potentially altering the meaning),
    // which is bad.
    {
        Xapian::TermIterator term;
        int cutoff = m_q->m_snipMaxPosWalk;
        for (term = xrdb.termlist_begin(docid);
             term != xrdb.termlist_end(docid); term++) {
            // Ignore prefixed terms
            if (has_prefix(*term))
                continue;
            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                ret |= ABSRES_TERMMISS;
                LOGDEB0(("makeAbstract: max term count cutoff %d\n",
                         m_q->m_snipMaxPosWalk));
                break;
            }

            map<unsigned int, string>::iterator vit;
            Xapian::PositionIterator pos;
            for (pos = xrdb.positionlist_begin(docid, *term);
                 pos != xrdb.positionlist_end(docid, *term); pos++) {
                if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                    ret |= ABSRES_TERMMISS;
                    LOGDEB0(("makeAbstract: max term count cutoff %d\n",
                             m_q->m_snipMaxPosWalk));
                    break;
                }
                // If we are beyond the max possible position, stop
                // for this term
                if (*pos > maxpos) {
                    break;
                }
                if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
                    // Don't replace a term: the term list is in
                    // alphabetical order, and we may have several terms
                    // at the same position; we want to keep only the
                    // first one (ie: dockes and dockes@wanadoo.fr)
                    if (vit->second.empty()) {
                        LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
                                 (*term).c_str(), *pos));
                        sparseDoc[*pos] = *term;
                    }
                }
            }
        }
    }

#if 0
    // Debug only: output the full term[position] vector
    bool epty = false;
    int ipos = 0;
    for (map<unsigned int, string>::iterator it = sparseDoc.begin();
         it != sparseDoc.end();
         it++, ipos++) {
        if (it->second.empty()) {
            if (!epty)
                LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos,
                        it->second.c_str()));
            epty = true;
        } else {
            epty = false;
            LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos,
                    it->second.c_str()));
        }
    }
#endif

    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);

    LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),
            vpbreaks.size()));
    // Finally build the abstract by walking the map (in order of position)
    vabs.clear();
    string chunk;
    bool incjk = false;
    int page = 0;
    string term;
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
         it != sparseDoc.end(); it++) {
        LOGDEB2(("Abstract: output %u -> [%s]\n", it->first,
                 it->second.c_str()));
        if (!occupiedmarker.compare(it->second)) {
            LOGDEB(("Abstract: qtrm position not filled ??\n"));
            continue;
        }
        if (chunk.empty() && !vpbreaks.empty()) {
            page = ndb->getPageNumberForPosition(vpbreaks, it->first);
            if (page < 0)
                page = 0;
            term.clear();
        }
        Utf8Iter uit(it->second);
        bool newcjk = false;
        if (TextSplit::isCJK(*uit))
            newcjk = true;
        if (!incjk || (incjk && !newcjk))
            chunk += " ";
        incjk = newcjk;
        if (searchTermPositions.find(it->first) != searchTermPositions.end())
            term = it->second;
        if (it->second == cstr_ellipsis) {
            vabs.push_back(Snippet(page, chunk).setTerm(term));
            chunk.clear();
        } else {
            if (it->second.compare(end_of_field_term) &&
                it->second.compare(start_of_field_term))
                chunk += it->second;
        }
    }
    if (!chunk.empty())
        vabs.push_back(Snippet(page, chunk).setTerm(term));

    LOGDEB2(("makeAbstract: done in %d mS\n", chron.millis()));
    return ret;
}
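
#if 0
// The final assembly loop above in miniature, illustration only, with
// a hypothetical sparse position->term map: walking the map in
// position order joins the words of each extract, and the ellipsis
// marker flushes the current chunk as one snippet.
static void assembleExample()
{
    map<unsigned int, string> sparse;
    sparse[10] = "some";
    sparse[11] = "query";
    sparse[12] = "context";
    sparse[13] = cstr_ellipsis;
    sparse[40] = "another";
    sparse[41] = "extract";

    vector<string> snippets;
    string chunk;
    for (map<unsigned int, string>::const_iterator it = sparse.begin();
         it != sparse.end(); it++) {
        if (it->second == cstr_ellipsis) {
            snippets.push_back(chunk);
            chunk.clear();
        } else {
            if (!chunk.empty())
                chunk += " ";
            chunk += it->second;
        }
    }
    if (!chunk.empty())
        snippets.push_back(chunk);
    // snippets == {"some query context", "another extract"}
}
#endif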

}