Separate count and context for snippets in the snippets popup from the default values for the result list

This commit is contained in:
Jean-Francois Dockes 2012-09-23 18:19:43 +02:00
parent 694755a2d0
commit 7a3cfa6c77
10 changed files with 99 additions and 33 deletions

View file

@ -6,8 +6,8 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>516</width> <width>640</width>
<height>395</height> <height>400</height>
</rect> </rect>
</property> </property>
<property name="windowTitle"> <property name="windowTitle">

View file

@ -50,6 +50,17 @@ void SnippetsW::init()
if (m_source.isNull()) if (m_source.isNull())
return; return;
// Make title out of file name if none yet
string titleOrFilename;
string utf8fn;
m_doc.getmeta(Rcl::Doc::keytt, &titleOrFilename);
m_doc.getmeta(Rcl::Doc::keyfn, &utf8fn);
if (titleOrFilename.empty()) {
titleOrFilename = utf8fn;
}
setWindowTitle(QString::fromUtf8(titleOrFilename.c_str()));
vector<pair<int, string> > vpabs; vector<pair<int, string> > vpabs;
m_source->getAbstract(m_doc, vpabs); m_source->getAbstract(m_doc, vpabs);

View file

@ -98,7 +98,6 @@ class DocSequence {
virtual bool getAbstract(Rcl::Doc& doc, virtual bool getAbstract(Rcl::Doc& doc,
std::vector<std::pair<int, std::string> >& abs) std::vector<std::pair<int, std::string> >& abs)
{ {
fprintf(stderr, "DocSequence::getAbstract/pair\n");
abs.push_back(std::pair<int, std::string>(0, abs.push_back(std::pair<int, std::string>(0,
doc.meta[Rcl::Doc::keyabs])); doc.meta[Rcl::Doc::keyabs]));
return true; return true;

View file

@ -65,19 +65,32 @@ int DocSequenceDb::getResCnt()
return m_rescnt; return m_rescnt;
} }
// This one only gets called to fill-up the snippets window
// We ignore most abstract/snippets preferences.
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, bool DocSequenceDb::getAbstract(Rcl::Doc &doc,
vector<pair<int, string> >& vpabs) vector<pair<int, string> >& vpabs)
{ {
LOGDEB(("DocSequenceDb::getAbstract/pair\n")); LOGDEB(("DocSequenceDb::getAbstract/pair\n"));
setQuery(); setQuery();
if (m_q->whatDb() &&
m_queryBuildAbstract && (doc.syntabs || m_queryReplaceAbstract)) { // Have to put the limit somewhere.
m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), vpabs); int maxoccs = 500;
Rcl::abstract_result ret = Rcl::ABSRES_ERROR;
if (m_q->whatDb()) {
ret = m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), vpabs,
maxoccs,
m_q->whatDb()->getAbsCtxLen()+ 2);
} }
if (vpabs.empty()) if (vpabs.empty())
vpabs.push_back(pair<int, string>(0, doc.meta[Rcl::Doc::keyabs])); vpabs.push_back(pair<int, string>(0, doc.meta[Rcl::Doc::keyabs]));
// If the list was probably truncated, indicate it.
if (ret == Rcl::ABSRES_TRUNC)
vpabs.push_back(pair<int, string>(-1, "[...]"));
return true; return true;
} }
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs) bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
{ {
setQuery(); setQuery();

View file

@ -31,7 +31,11 @@ class DocSequenceDb : public DocSequence {
virtual bool getDoc(int num, Rcl::Doc &doc, string * = 0); virtual bool getDoc(int num, Rcl::Doc &doc, string * = 0);
virtual int getResCnt(); virtual int getResCnt();
virtual void getTerms(HighlightData& hld); virtual void getTerms(HighlightData& hld);
// Called to fill-up the snippets window. Ignores
// buildabstract/replaceabstract and syntabslen
virtual bool getAbstract(Rcl::Doc &doc, vector<pair<int, string> >&); virtual bool getAbstract(Rcl::Doc &doc, vector<pair<int, string> >&);
virtual bool getAbstract(Rcl::Doc &doc, vector<string>&); virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
virtual int getFirstMatchPage(Rcl::Doc&); virtual int getFirstMatchPage(Rcl::Doc&);
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc); virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);

View file

@ -244,7 +244,7 @@ void Db::Native::setDbWideQTermsFreqs(Query *query)
for (vector<string>::const_iterator qit = qterms.begin(); for (vector<string>::const_iterator qit = qterms.begin();
qit != qterms.end(); qit++) { qit != qterms.end(); qit++) {
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), LOGABS(("set..QTermFreqs: [%s] db freq %.1e\n", qit->c_str(),
query->m_nq->termfreqs[*qit])); query->m_nq->termfreqs[*qit]));
} }
} }
@ -298,6 +298,7 @@ double Db::Native::qualityTerms(Xapian::docid docid,
} }
#ifdef DEBUGABSTRACT #ifdef DEBUGABSTRACT
LOGDEB(("Db::qualityTerms:\n"));
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
qit != byQ.rend(); qit++) { qit != byQ.rend(); qit++) {
LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str())); LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
@ -415,12 +416,13 @@ int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
// //
// DatabaseModified and other general exceptions are catched and // DatabaseModified and other general exceptions are catched and
// possibly retried by our caller // possibly retried by our caller
bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, abstract_result Db::Native::makeAbstract(Xapian::docid docid, Query *query,
vector<pair<int, string> >& vabs) vector<pair<int, string> >& vabs,
int imaxoccs, int ictxwords)
{ {
Chrono chron; Chrono chron;
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d imaxoccs %d\n", chron.ms(),
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen)); m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen, imaxoccs));
// The (unprefixed) terms matched by this document // The (unprefixed) terms matched by this document
vector<string> matchedTerms; vector<string> matchedTerms;
@ -430,7 +432,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
noPrefixList(iterms, matchedTerms); noPrefixList(iterms, matchedTerms);
if (matchedTerms.empty()) { if (matchedTerms.empty()) {
LOGDEB(("makeAbstract::Empty term list\n")); LOGDEB(("makeAbstract::Empty term list\n"));
return false; return ABSRES_ERROR;
} }
} }
listList("Match terms: ", matchedTerms); listList("Match terms: ", matchedTerms);
@ -453,7 +455,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
// This can't happen, but would crash us // This can't happen, but would crash us
if (totalweight == 0.0) { if (totalweight == 0.0) {
LOGERR(("makeAbstract: totalweight == 0.0 !\n")); LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
return false; return ABSRES_ERROR;
} }
/////////////////// ///////////////////
@ -474,13 +476,17 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
// abstract size parameter in characters, we basically only deal // abstract size parameter in characters, we basically only deal
// with words. We used to limit the character size at the end, but // with words. We used to limit the character size at the end, but
// this damaged our careful selection of terms // this damaged our careful selection of terms
const unsigned int maxtotaloccs = const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1)); m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs)); int ctxwords = ictxwords == -1 ? m_rcldb->m_synthAbsWordCtxLen : ictxwords;
LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",
chron.ms(), maxtotaloccs, ctxwords));
// This is used to mark positions overlapped by a multi-word match term // This is used to mark positions overlapped by a multi-word match term
const string occupiedmarker("?"); const string occupiedmarker("?");
abstract_result ret = ABSRES_OK;
// Let's go populate // Let's go populate
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
qit != byQ.rend(); qit++) { qit != byQ.rend(); qit++) {
@ -522,7 +528,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
// step by inserting empty strings. Special provisions // step by inserting empty strings. Special provisions
// for adding ellipsis and for positions overlapped by // for adding ellipsis and for positions overlapped by
// the match term. // the match term.
unsigned int sta = MAX(0, ipos-m_rcldb->m_synthAbsWordCtxLen); unsigned int sta = MAX(0, ipos - ctxwords);
unsigned int sto = ipos + qtrmwrdcnt-1 + unsigned int sto = ipos + qtrmwrdcnt-1 +
m_rcldb->m_synthAbsWordCtxLen; m_rcldb->m_synthAbsWordCtxLen;
for (unsigned int ii = sta; ii <= sto; ii++) { for (unsigned int ii = sta; ii <= sto; ii++) {
@ -548,15 +554,21 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
// Limit to allocated occurences and total size // Limit to allocated occurences and total size
if (++occurrences >= maxoccs || if (++occurrences >= maxoccs ||
totaloccs >= maxtotaloccs) totaloccs >= maxtotaloccs) {
ret = ABSRES_TRUNC;
LOGDEB(("Db::makeAbstract: max occurrences cutoff\n"));
break; break;
} }
}
} catch (...) { } catch (...) {
// Term does not occur. No problem. // Term does not occur. No problem.
} }
if (totaloccs >= maxtotaloccs) if (totaloccs >= maxtotaloccs) {
ret = ABSRES_TRUNC;
LOGDEB(("Db::makeAbstract: max1 occurrences cutoff\n"));
break; break;
} }
}
LOGABS(("makeAbstract:%d:chosen number of positions %d\n", LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
chron.millis(), totaloccs)); chron.millis(), totaloccs));
@ -564,7 +576,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
// etc. but not elsewhere ? // etc. but not elsewhere ?
if (totaloccs == 0) { if (totaloccs == 0) {
LOGDEB1(("makeAbstract: no occurrences\n")); LOGDEB1(("makeAbstract: no occurrences\n"));
return false; return ABSRES_ERROR;
} }
// Walk all document's terms position lists and populate slots // Walk all document's terms position lists and populate slots
@ -582,6 +594,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z') if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
continue; continue;
if (cutoff-- < 0) { if (cutoff-- < 0) {
ret = ABSRES_TRUNC;
LOGDEB0(("makeAbstract: max term count cutoff\n")); LOGDEB0(("makeAbstract: max term count cutoff\n"));
break; break;
} }
@ -590,6 +603,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
for (pos = xrdb.positionlist_begin(docid, *term); for (pos = xrdb.positionlist_begin(docid, *term);
pos != xrdb.positionlist_end(docid, *term); pos++) { pos != xrdb.positionlist_end(docid, *term); pos++) {
if (cutoff-- < 0) { if (cutoff-- < 0) {
ret = ABSRES_TRUNC;
LOGDEB0(("makeAbstract: max term count cutoff\n")); LOGDEB0(("makeAbstract: max term count cutoff\n"));
break; break;
} }
@ -600,7 +614,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
// at the same position, we want to keep only the // at the same position, we want to keep only the
// first one (ie: dockes and dockes@wanadoo.fr) // first one (ie: dockes and dockes@wanadoo.fr)
if (vit->second.empty()) { if (vit->second.empty()) {
LOGABS(("makeAbstract: populating: [%s] at %d\n", LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
(*term).c_str(), *pos)); (*term).c_str(), *pos));
sparseDoc[*pos] = *term; sparseDoc[*pos] = *term;
} }
@ -665,7 +679,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
vabs.push_back(pair<int, string>(page, chunk)); vabs.push_back(pair<int, string>(page, chunk));
LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis())); LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
return true; return ret;
} }
/* Rcl::Db methods ///////////////////////////////// */ /* Rcl::Db methods ///////////////////////////////// */
@ -2119,17 +2133,22 @@ bool Db::stemDiffers(const string& lang, const string& word,
return true; return true;
} }
bool Db::makeDocAbstract(Doc &doc, Query *query, abstract_result Db::makeDocAbstract(Doc &doc, Query *query,
vector<pair<int, string> >& abstract) vector<pair<int, string> >& abstract,
int maxoccs, int ctxwords)
{ {
LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords));
if (!m_ndb || !m_ndb->m_isopen) { if (!m_ndb || !m_ndb->m_isopen) {
LOGERR(("Db::makeDocAbstract: no db\n")); LOGERR(("Db::makeDocAbstract: no db\n"));
return false; return ABSRES_ERROR;
} }
bool ret = false; abstract_result ret = ABSRES_ERROR;
XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract), XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract,
maxoccs, ctxwords),
m_ndb->xrdb, m_reason); m_ndb->xrdb, m_reason);
return (ret && m_reason.empty()) ? true : false; if (!m_reason.empty())
return ABSRES_ERROR;
return ret;
} }
bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract) bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract)

View file

@ -66,6 +66,11 @@ enum value_slot {
VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size
}; };
/**
 * Status code returned by the abstract (snippets) builders.
 * The numeric values are fixed: ABSRES_ERROR is 0, so the result still
 * tests false in a boolean context when no abstract could be produced.
 */
enum abstract_result {
    ABSRES_ERROR = 0, // No abstract built (no db, no match terms, no occurrences, ...)
    ABSRES_OK = 1,    // Abstract built and no cutoff was reached
    ABSRES_TRUNC = 2  // Abstract built, but an occurrence/term count cutoff was reached
};
class SearchData; class SearchData;
class TermIter; class TermIter;
class Query; class Query;
@ -220,6 +225,10 @@ class Db {
/** Set parameters for synthetic abstract generation */ /** Set parameters for synthetic abstract generation */
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
/** Read-only accessor for m_synthAbsWordCtxLen, the number of context
 *  words used around a match term when building synthetic abstracts. */
int getAbsCtxLen() const { return m_synthAbsWordCtxLen; }
/** Build synthetic abstract for document, extracting chunks relevant for /** Build synthetic abstract for document, extracting chunks relevant for
* the input query. This uses index data only (no access to the file) */ * the input query. This uses index data only (no access to the file) */
@ -227,9 +236,10 @@ class Db {
bool makeDocAbstract(Doc &doc, Query *query, string& abstract); bool makeDocAbstract(Doc &doc, Query *query, string& abstract);
// Returned as a snippets vector // Returned as a snippets vector
bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract); bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract);
// Returned as a vector of page,snippet page is 0 if unknown // Returned as a vector of pair<page,snippet> page is 0 if unknown
bool makeDocAbstract(Doc &doc, Query *query, abstract_result makeDocAbstract(Doc &doc, Query *query,
vector<pair<int, string> >& abstract); vector<pair<int, string> >& abstract,
int maxoccs= -1, int ctxwords = -1);
/** Retrieve detected page breaks positions */ /** Retrieve detected page breaks positions */
int getFirstMatchPage(Doc &doc, Query *query); int getFirstMatchPage(Doc &doc, Query *query);

View file

@ -89,8 +89,9 @@ class Db::Native {
const vector<string>& terms, const vector<string>& terms,
std::multimap<double, string>& byQ); std::multimap<double, string>& byQ);
void setDbWideQTermsFreqs(Query *query); void setDbWideQTermsFreqs(Query *query);
bool makeAbstract(Xapian::docid id, Query *query, abstract_result makeAbstract(Xapian::docid id, Query *query,
vector<pair<int, string> >&); vector<pair<int, string> >&, int maxoccs = -1,
int ctxwords = -1);
bool getPagePositions(Xapian::docid docid, vector<int>& vpos); bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
int getFirstMatchPage(Xapian::docid docid, Query *query); int getFirstMatchPage(Xapian::docid docid, Query *query);
int getPageNumberForPosition(const vector<int>& pbreaks, unsigned int pos); int getPageNumberForPosition(const vector<int>& pbreaks, unsigned int pos);

View file

@ -81,6 +81,8 @@ indexstemminglanguages = english
# unac_except_trans = Ää Öö Üü ää öö üü ßss # unac_except_trans = Ää Öö Üü ää öö üü ßss
# In French, you probably want to decompose oe and ae # In French, you probably want to decompose oe and ae
# unac_except_trans = œoe Œoe æae Æae # unac_except_trans = œoe Œoe æae Æae
# Actually, this seems a reasonable default for all until someone protests.
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
# Where to store the database (directory). This may be an absolute path, # Where to store the database (directory). This may be an absolute path,
# else it is taken as relative to the configuration directory (-c argument # else it is taken as relative to the configuration directory (-c argument

View file

@ -86,6 +86,13 @@
<h2>News</h2> <h2>News</h2>
<div class="news"> <div class="news">
<ul> <ul>
<li>2012-09-21: an
<a href="https://bitbucket.org/medoc/recoll/wiki/ElinksBeagle">easy
way</a> to extend the "Beagle queue"
Recoll web history indexing mechanism to other browsers than
Firefox (Elinks in this case).
</li>
<li>2012-09-13: the next Recoll version will maybe acquire switchable <li>2012-09-13: the next Recoll version will maybe acquire switchable
case and diacritics sensitivity. I am writing case and diacritics sensitivity. I am writing
a few pages about the a few pages about the