Snippet generation: limit the positions walk to the max hit position. Return a status code when a truncated walk possibly generated incomplete snippets. Implement a config variable for the max positions walk

This commit is contained in:
Jean-Francois Dockes 2012-10-08 14:30:14 +02:00
parent 46b7f87e51
commit 97bc58201b
5 changed files with 32 additions and 15 deletions

View file

@ -342,6 +342,10 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
// them with their snippets.
unordered_set<unsigned int> searchTermPositions;
// Remember max position. Used to stop walking positions lists while
// populating the adjacent slots.
unsigned int maxpos = 0;
// Total number of occurrences for all terms. We stop when we have too many
unsigned int totaloccs = 0;
@ -419,6 +423,8 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
if (ii == (unsigned int)ipos) {
sparseDoc[ii] = qterm;
searchTermPositions.insert(ii);
if (ii > maxpos)
maxpos = ii;
} else if (ii > (unsigned int)ipos &&
ii < (unsigned int)ipos + qtrmwrdcnt) {
sparseDoc[ii] = occupiedmarker;
@ -460,6 +466,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
}
LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
chron.millis(), totaloccs));
maxpos += ctxwords + 1;
// This can happen if there are term occurrences in the keywords
// etc. but not elsewhere?
@ -475,28 +482,34 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
// which is bad.
{
Xapian::TermIterator term;
int cutoff = 500 * 1000;
int cutoff = m_q->m_snipMaxPosWalk;
for (term = xrdb.termlist_begin(docid);
term != xrdb.termlist_end(docid); term++) {
// Ignore prefixed terms
if (has_prefix(*term))
continue;
if (cutoff-- < 0) {
ret = ABSRES_TRUNC;
LOGDEB0(("makeAbstract: max term count cutoff\n"));
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
ret = ABSRES_TERMMISS;
LOGDEB0(("makeAbstract: max term count cutoff %d\n",
m_q->m_snipMaxPosWalk));
break;
}
map<unsigned int, string>::iterator vit;
Xapian::PositionIterator pos;
for (pos = xrdb.positionlist_begin(docid, *term);
pos != xrdb.positionlist_end(docid, *term); pos++) {
if (cutoff-- < 0) {
ret = ABSRES_TRUNC;
LOGDEB0(("makeAbstract: max term count cutoff\n"));
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
ret = ABSRES_TERMMISS;
LOGDEB0(("makeAbstract: max term count cutoff %d\n",
m_q->m_snipMaxPosWalk));
break;
}
// If we are beyond the max possible position, stop
// for this term
if (*pos > maxpos) {
break;
}
map<unsigned int, string>::iterator vit;
if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
// Don't replace a term: the terms list is in
// alphabetic order, and we may have several terms

View file

@ -1618,7 +1618,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
case 0: is = prefix; break;
default: is = prefix + droot.substr(0, es); break;
}
LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
LOGDEB1(("termMatch: initsec: [%s]\n", is.c_str()));
for (int tries = 0; tries < 2; tries++) {
try {

View file

@ -141,8 +141,10 @@ private:
Query::Query(Db *db)
: m_nq(new Native(this)), m_db(db), m_sorter(0), m_sortAscending(true),
m_collapseDuplicates(false), m_resCnt(-1)
m_collapseDuplicates(false), m_resCnt(-1), m_snipMaxPosWalk(1000000)
{
if (db)
db->getConf()->getConfParam("snippetMaxPosWalk", &m_snipMaxPosWalk);
}
Query::~Query()

View file

@ -32,7 +32,8 @@ class Doc;
enum abstract_result {
ABSRES_ERROR = 0,
ABSRES_OK = 1,
ABSRES_TRUNC = 2
ABSRES_TRUNC = 2,
ABSRES_TERMMISS = 3
};
// Snippet entry for makeDocAbstract
@ -126,6 +127,7 @@ private:
bool m_collapseDuplicates;
int m_resCnt;
RefCntr<SearchData> m_sd;
int m_snipMaxPosWalk;
/* Copy constructor and assignment are private and forbidden */
Query(const Query &) {}

View file

@ -598,7 +598,7 @@ public:
if (m_ts->lastpos < pos)
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) {
m_terms[pos] = term;