Fix the page break recording function for multiple page breaks at the same term position
This commit is contained in:
parent
4733fa826b
commit
44469d723b
1 changed files with 87 additions and 9 deletions
|
@ -22,10 +22,10 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
@ -70,6 +70,8 @@ static const string cstr_RCL_IDX_VERSION("1");
|
||||||
// (abstract, keywords, etc.. are stored before this)
|
// (abstract, keywords, etc.. are stored before this)
|
||||||
static const unsigned int baseTextPosition = 100000;
|
static const unsigned int baseTextPosition = 100000;
|
||||||
|
|
||||||
|
static const string cstr_mbreaks("rclmbreaks");
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
#endif
|
#endif
|
||||||
|
@ -302,6 +304,27 @@ double Db::Native::qualityTerms(Xapian::docid docid,
|
||||||
// Return the positions list for the page break term
|
// Return the positions list for the page break term
|
||||||
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
|
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
|
||||||
{
|
{
|
||||||
|
// Need to retrieve the document record to check for multiple page breaks
|
||||||
|
// that we store there for lack of better place
|
||||||
|
map<int, int> mbreaksmap;
|
||||||
|
try {
|
||||||
|
Xapian::Document xdoc = xrdb.get_document(docid);
|
||||||
|
string data = xdoc.get_data();
|
||||||
|
Doc doc;
|
||||||
|
string mbreaks;
|
||||||
|
if (dbDataToRclDoc(docid, data, doc) &&
|
||||||
|
doc.getmeta(cstr_mbreaks, &mbreaks)) {
|
||||||
|
vector<string> values;
|
||||||
|
stringToTokens(mbreaks, values, ",");
|
||||||
|
for (unsigned int i = 0; i < values.size() / 2; i += 2) {
|
||||||
|
int pos = atoi(values[i].c_str()) + baseTextPosition;
|
||||||
|
int incr = atoi(values[i+1].c_str());
|
||||||
|
mbreaksmap[pos] = incr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (...) {
|
||||||
|
}
|
||||||
|
|
||||||
string qterm = page_break_term;
|
string qterm = page_break_term;
|
||||||
Xapian::PositionIterator pos;
|
Xapian::PositionIterator pos;
|
||||||
try {
|
try {
|
||||||
|
@ -312,6 +335,13 @@ bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
|
||||||
// Not in text body. Strange...
|
// Not in text body. Strange...
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
map<int, int>::iterator it = mbreaksmap.find(ipos);
|
||||||
|
if (it != mbreaksmap.end()) {
|
||||||
|
LOGDEB1(("getPagePositions: found multibreak at %d incr %d\n",
|
||||||
|
ipos, it->second));
|
||||||
|
for (int i = 0 ; i < it->second; i++)
|
||||||
|
vpos.push_back(ipos);
|
||||||
|
}
|
||||||
vpos.push_back(ipos);
|
vpos.push_back(ipos);
|
||||||
}
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
|
@ -357,11 +387,10 @@ int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
|
||||||
if (ipos < int(baseTextPosition)) // Not in text body
|
if (ipos < int(baseTextPosition)) // Not in text body
|
||||||
continue;
|
continue;
|
||||||
// What page ?
|
// What page ?
|
||||||
LOGABS(("getFirstPageMatch: looking for match for [%s]\n",
|
LOGABS(("getFirstPageMatch: search match for [%s] pos %d\n",
|
||||||
qterm.c_str()));
|
qterm.c_str(), ipos));
|
||||||
vector<int>::const_iterator it =
|
vector<int>::const_iterator it =
|
||||||
lower_bound(pagepos.begin(), pagepos.end(), ipos);
|
upper_bound(pagepos.begin(), pagepos.end(), ipos);
|
||||||
if (it != pagepos.end())
|
|
||||||
return it - pagepos.begin() + 1;
|
return it - pagepos.begin() + 1;
|
||||||
}
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
|
@ -1002,7 +1031,7 @@ out:
|
||||||
|
|
||||||
class TermProcIdx : public TermProc {
|
class TermProcIdx : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcIdx() : TermProc(0), m_ts(0) {}
|
TermProcIdx() : TermProc(0), m_ts(0), m_lastpagepos(0), m_pageincr(0) {}
|
||||||
void setTSD(TextSplitDb *ts) {m_ts = ts;}
|
void setTSD(TextSplitDb *ts) {m_ts = ts;}
|
||||||
|
|
||||||
bool takeword(const std::string &term, int pos, int, int)
|
bool takeword(const std::string &term, int pos, int, int)
|
||||||
|
@ -1033,10 +1062,47 @@ public:
|
||||||
// Record a page break at splitter position pos.
// The position is first offset by m_ts->basepos; breaks which fall before
// baseTextPosition are ignored. A posting for page_break_term is added to
// the document at that position. Consecutive breaks at the same position
// are counted in m_pageincr; when a break at a different position arrives,
// a non-zero count is stored in m_pageincrvec as a
// (position relative to baseTextPosition, count) pair and the counter is
// reset.
void newpage(int pos)
|
void newpage(int pos)
|
||||||
{
|
{
|
||||||
pos += m_ts->basepos;
|
pos += m_ts->basepos;
|
||||||
|
LOGDEB2(("newpage: %d\n", pos));
|
||||||
|
if (pos < int(baseTextPosition))
|
||||||
|
return;
|
||||||
|
|
||||||
m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
|
m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
|
||||||
|
// Same position as the previous break: just count the repetition.
if (pos == m_lastpagepos) {
|
||||||
|
m_pageincr++;
|
||||||
|
LOGDEB2(("newpage: same pos, pageincr %d lastpagepos %d\n",
|
||||||
|
m_pageincr, m_lastpagepos));
|
||||||
|
} else {
|
||||||
|
LOGDEB2(("newpage: pos change, pageincr %d lastpagepos %d\n",
|
||||||
|
m_pageincr, m_lastpagepos));
|
||||||
|
if (m_pageincr > 0) {
|
||||||
|
// Remember the multiple page break at this position
|
||||||
|
m_pageincrvec.push_back(
|
||||||
|
pair<int, int>(m_lastpagepos - baseTextPosition,
|
||||||
|
m_pageincr));
|
||||||
}
|
}
|
||||||
private:
|
m_pageincr = 0;
|
||||||
|
}
|
||||||
|
m_lastpagepos = pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool flush()
|
||||||
|
{
|
||||||
|
if (m_pageincr > 0) {
|
||||||
|
m_pageincrvec.push_back(
|
||||||
|
pair<int, int>(m_lastpagepos - baseTextPosition,
|
||||||
|
m_pageincr));
|
||||||
|
m_pageincr = 0;
|
||||||
|
}
|
||||||
|
return TermProc::flush();
|
||||||
|
}
|
||||||
|
|
||||||
TextSplitDb *m_ts;
|
TextSplitDb *m_ts;
|
||||||
|
// Auxiliary page breaks data for positions with multiple page breaks.
|
||||||
|
int m_lastpagepos;
|
||||||
|
// increment of page breaks at same pos. Normally 0, 1.. when several
|
||||||
|
// breaks at the same pos
|
||||||
|
int m_pageincr;
|
||||||
|
vector <pair<int, int> > m_pageincrvec;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -1375,6 +1441,18 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If empty pages (multiple break at same pos) were recorded, save
|
||||||
|
// them (this is because we have no way to record them in the
|
||||||
|
// Xapian list
|
||||||
|
if (!tpidx.m_pageincrvec.empty()) {
|
||||||
|
ostringstream multibreaks;
|
||||||
|
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
|
||||||
|
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
|
||||||
|
tpidx.m_pageincrvec[i].second;
|
||||||
|
}
|
||||||
|
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
|
||||||
|
}
|
||||||
|
|
||||||
// If the file's md5 was computed, add value. This is optionally
|
// If the file's md5 was computed, add value. This is optionally
|
||||||
// used for query result duplicate elimination.
|
// used for query result duplicate elimination.
|
||||||
string& md5 = doc.meta[Doc::keymd5];
|
string& md5 = doc.meta[Doc::keymd5];
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue