add capability to remember page breaks generated by, e.g. pdftotext, and use them to start an external viewer on a match page

This commit is contained in:
Jean-Francois Dockes 2012-08-21 15:03:02 +02:00
parent a0398a6afd
commit c96b5d11f0
15 changed files with 292 additions and 85 deletions

View file

@ -44,6 +44,7 @@ src/doc/user/rcl.search.custom.html
src/doc/user/rcl.search.desktop.html src/doc/user/rcl.search.desktop.html
src/doc/user/rcl.search.history.html src/doc/user/rcl.search.history.html
src/doc/user/rcl.search.html src/doc/user/rcl.search.html
src/doc/user/rcl.search.kio.html
src/doc/user/rcl.search.lang.html src/doc/user/rcl.search.lang.html
src/doc/user/rcl.search.multidb.html src/doc/user/rcl.search.multidb.html
src/doc/user/rcl.search.preview.html src/doc/user/rcl.search.preview.html
@ -55,6 +56,7 @@ src/doc/user/rcl.search.wildcards.html
src/doc/user/rcl.searchkcl.html src/doc/user/rcl.searchkcl.html
src/doc/user/rcl.searchkio.html src/doc/user/rcl.searchkio.html
src/doc/user/rcl.searchkio.searchabledocs.html src/doc/user/rcl.searchkio.searchabledocs.html
src/doc/user/usermanual-xml.html
src/doc/user/usermanual.aux src/doc/user/usermanual.aux
src/doc/user/usermanual.html src/doc/user/usermanual.html
src/doc/user/usermanual.html-text src/doc/user/usermanual.html-text
@ -64,6 +66,7 @@ src/doc/user/usermanual.pdf
src/doc/user/usermanual.tex-pdf src/doc/user/usermanual.tex-pdf
src/doc/user/usermanual.tex-pdf-tmp src/doc/user/usermanual.tex-pdf-tmp
src/doc/user/usermanual.txt src/doc/user/usermanual.txt
src/doc/user/usermanual.xml
src/filters/rclexecm.pyc src/filters/rclexecm.pyc
src/filters/rcllatinclass.pyc src/filters/rcllatinclass.pyc
src/index/alldeps src/index/alldeps

View file

@ -86,7 +86,7 @@ public:
for (i = 0; i < strlen(wild); i++) for (i = 0; i < strlen(wild); i++)
charclasses[int(wild[i])] = WILD; charclasses[int(wild[i])] = WILD;
char special[] = ".@+-,#'_\n\r"; char special[] = ".@+-,#'_\n\r\f";
for (i = 0; i < strlen(special); i++) for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i]; charclasses[int(special[i])] = special[i];
@ -316,6 +316,7 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = false; m_inNumber = false;
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0; m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
int curspanglue = 0; int curspanglue = 0;
bool pagepending = false;
// Running count of non-alphanum chars. Reset when we see one; // Running count of non-alphanum chars. Reset when we see one;
int nonalnumcnt = 0; int nonalnumcnt = 0;
@ -369,6 +370,10 @@ bool TextSplit::text_to_words(const string &in)
return false; return false;
m_inNumber = false; m_inNumber = false;
} }
if (pagepending) {
pagepending = false;
newpage(m_wordpos);
}
break; break;
case WILD: case WILD:
@ -521,7 +526,10 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE; goto SPACE;
} }
break; break;
case '\f':
pagepending = true;
goto SPACE;
break;
#ifdef RCL_SPLIT_CAMELCASE #ifdef RCL_SPLIT_CAMELCASE
// Camelcase handling. // Camelcase handling.
// If we get uppercase ascii after lowercase ascii, emit word. // If we get uppercase ascii after lowercase ascii, emit word.

View file

@ -20,10 +20,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#ifndef NO_NAMESPACES
using std::string; using std::string;
using std::vector; using std::vector;
#endif
class Utf8Iter; class Utf8Iter;
@ -78,6 +76,12 @@ public:
int bte // byte offset of first char after term int bte // byte offset of first char after term
) = 0; ) = 0;
/** Page break notification hook.
 *
 * Called from text_to_words() after a formfeed (\f, 0x0c) was seen in the
 * input. The argument is the splitter's current word position (m_wordpos)
 * at the time of the call. The default implementation ignores the event:
 * override it to make use of page breaks.
 * Mostly or exclusively used with pdftoxx output. Other filters mostly
 * just don't know about pages. */
virtual void newpage(int /*pos*/)
{
}
// Static utility functions: // Static utility functions:

View file

@ -18,8 +18,11 @@
# check if it was actually incorrect or just mis-understood by qtextedit # check if it was actually incorrect or just mis-understood by qtextedit
# (tobedone) # (tobedone)
# Comment the following if you get better results without # Uncomment the following if you get better results without. The
optionraw=-raw # pdftotext manual says that the option is no longer recommended The
# difference in output seems mostly the removal of soft-hyphens when
# -raw is not set
# optionraw=-raw
# set variables # set variables
LANG=C ; export LANG LANG=C ; export LANG

View file

@ -35,6 +35,7 @@ using std::pair;
#include <qscrollbar.h> #include <qscrollbar.h>
#include <qmenu.h> #include <qmenu.h>
#include <qtextedit.h> #include <qtextedit.h>
#include <qtextbrowser.h>
#include <qprogressdialog.h> #include <qprogressdialog.h>
#include <qevent.h> #include <qevent.h>
#include <qlabel.h> #include <qlabel.h>
@ -732,10 +733,10 @@ class LoadThread : public QThread {
FileInterner::FIF_forPreview); FileInterner::FIF_forPreview);
FIMissingStore mst; FIMissingStore mst;
interner.setMissingStore(&mst); interner.setMissingStore(&mst);
// We don't set the interner's target mtype to html because we // Even when previewHtml is set, we don't set the interner's
// do want the html filter to do its work: we won't use the // target mtype to html because we do want the html filter to
// text, but we need the conversion to utf-8 // do its work: we won't use the text/plain, but we want the
// interner.setTargetMType("text/html"); // text/html to be converted to utf-8 (for highlight processing)
try { try {
string ipath = idoc.ipath; string ipath = idoc.ipath;
FileInterner::Status ret = interner.internfile(out, ipath); FileInterner::Status ret = interner.internfile(out, ipath);
@ -883,6 +884,22 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
// while still inserting at bottom // while still inserting at bottom
list<QString> qrichlst; list<QString> qrichlst;
PreviewTextEdit *editor = currentEditor(); PreviewTextEdit *editor = currentEditor();
// For an actual html file, if we want to have the images and
// style loaded in the preview, we need to set the search
// path. Not too sure this is a good idea as I find them rather
// distracting when looking for text, esp. with qtextedit
// relatively limited html support (text sometimes get hidden by
// images).
#if 0
string path = fileurltolocalpath(idoc.url);
if (!path.empty()) {
path = path_getfather(path);
QStringList paths(QString::fromLocal8Bit(path.c_str()));
editor->setSearchPaths(paths);
}
#endif
editor->setHtml(""); editor->setHtml("");
editor->m_format = Qt::RichText; editor->m_format = Qt::RichText;
bool inputishtml = !fdoc.mimetype.compare("text/html"); bool inputishtml = !fdoc.mimetype.compare("text/html");
@ -1073,7 +1090,7 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
} }
PreviewTextEdit::PreviewTextEdit(QWidget* parent, const char* nm, Preview *pv) PreviewTextEdit::PreviewTextEdit(QWidget* parent, const char* nm, Preview *pv)
: QTextEdit(parent), m_preview(pv), : QTextBrowser(parent), m_preview(pv),
m_plaintorich(new PlainToRichQtPreview()), m_plaintorich(new PlainToRichQtPreview()),
m_dspflds(false), m_docnum(-1) m_dspflds(false), m_docnum(-1)
{ {
@ -1081,6 +1098,8 @@ PreviewTextEdit::PreviewTextEdit(QWidget* parent, const char* nm, Preview *pv)
setObjectName(nm); setObjectName(nm);
connect(this, SIGNAL(customContextMenuRequested(const QPoint&)), connect(this, SIGNAL(customContextMenuRequested(const QPoint&)),
this, SLOT(createPopupMenu(const QPoint&))); this, SLOT(createPopupMenu(const QPoint&)));
setOpenExternalLinks(false);
setOpenLinks(false);
} }
PreviewTextEdit::~PreviewTextEdit() PreviewTextEdit::~PreviewTextEdit()

View file

@ -17,11 +17,24 @@
#ifndef _PREVIEW_W_H_INCLUDED_ #ifndef _PREVIEW_W_H_INCLUDED_
#define _PREVIEW_W_H_INCLUDED_ #define _PREVIEW_W_H_INCLUDED_
// Always use a qtextbrowser for now, there is no compelling reason to
// switch to webkit here
#if 1 || defined(RESLIST_TEXTBROWSER)
#define PREVIEW_TEXTBROWSER
#endif
#include <stdio.h> #include <stdio.h>
#include <qvariant.h> #include <qvariant.h>
#include <qwidget.h> #include <qwidget.h>
#include <qtextedit.h>
#ifdef PREVIEW_TEXTBROWSER
#include <QTextBrowser>
#define PREVIEW_PARENTCLASS QTextBrowser
#else
#include <QtWebKit/QWebView>
#define PREVIEW_PARENTCLASS QWebView
#endif
#include <qimage.h> #include <qimage.h>
#include "rcldb.h" #include "rcldb.h"
@ -31,13 +44,12 @@
class QTabWidget; class QTabWidget;
class QLabel; class QLabel;
class QLineEdit;
class QPushButton; class QPushButton;
class QCheckBox; class QCheckBox;
class Preview; class Preview;
class PlainToRichQtPreview; class PlainToRichQtPreview;
class PreviewTextEdit : public QTextEdit { class PreviewTextEdit : public PREVIEW_PARENTCLASS {
Q_OBJECT; Q_OBJECT;
public: public:
PreviewTextEdit(QWidget* parent, const char* name, Preview *pv); PreviewTextEdit(QWidget* parent, const char* name, Preview *pv);

View file

@ -1520,6 +1520,14 @@ void RclMain::startNativeViewer(Rcl::Doc doc)
return; return;
} }
int pagenum = 1;
if (m_source.isNotNull())
pagenum = m_source->getFirstMatchPage(doc);
if (pagenum == -1)
pagenum = 1;
char cpagenum[20];
sprintf(cpagenum, "%d", pagenum);
// Extract possible viewer attributes // Extract possible viewer attributes
ConfSimple attrs; ConfSimple attrs;
string cmd; string cmd;
@ -1657,6 +1665,7 @@ void RclMain::startNativeViewer(Rcl::Doc doc)
subs["F"] = orgfn; subs["F"] = orgfn;
subs["i"] = doc.ipath; subs["i"] = doc.ipath;
subs["M"] = doc.mimetype; subs["M"] = doc.mimetype;
subs["p"] = cpagenum;
subs["U"] = url; subs["U"] = url;
subs["u"] = url; subs["u"] = url;
// Let %(xx) access all metadata. // Let %(xx) access all metadata.

View file

@ -95,6 +95,11 @@ class DocSequence {
abs.push_back(doc.meta[Rcl::Doc::keyabs]); abs.push_back(doc.meta[Rcl::Doc::keyabs]);
return true; return true;
} }
/** Get the page number to be used when opening the document on a match.
 *
 * Default implementation for sequences which have no page information:
 * always returns -1, which callers must handle as "no page known". */
virtual int getFirstMatchPage(Rcl::Doc&)
{
return -1;
}
virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0; virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0;
/** Get estimated total count in results */ /** Get estimated total count in results */

View file

@ -74,10 +74,18 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
} }
if (vabs.empty()) if (vabs.empty())
vabs.push_back(doc.meta[Rcl::Doc::keyabs]); vabs.push_back(doc.meta[Rcl::Doc::keyabs]);
return true; return true;
} }
// Ask the database behind our query for the page number of the first
// match inside the document. Returns -1 when the query has no database.
int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc)
{
    setQuery();
    // No database attached to the query: no page information available
    if (!m_q->whatDb())
        return -1;
    return m_q->whatDb()->getFirstMatchPage(doc, m_q.getptr());
}
bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc) bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
{ {
setQuery(); setQuery();

View file

@ -32,6 +32,7 @@ class DocSequenceDb : public DocSequence {
virtual int getResCnt(); virtual int getResCnt();
virtual void getTerms(HighlightData& hld); virtual void getTerms(HighlightData& hld);
virtual bool getAbstract(Rcl::Doc &doc, vector<string>&); virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
virtual int getFirstMatchPage(Rcl::Doc&);
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc); virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
virtual string getDescription(); virtual string getDescription();
virtual list<string> expand(Rcl::Doc &doc); virtual list<string> expand(Rcl::Doc &doc);

View file

@ -77,6 +77,7 @@ namespace Rcl {
const string pathelt_prefix = "XP"; const string pathelt_prefix = "XP";
const string start_of_field_term = "XXST"; const string start_of_field_term = "XXST";
const string end_of_field_term = "XXND"; const string end_of_field_term = "XXND";
const string page_break_term = "XXPG";
// This is used as a marker inside the abstract frag lists, but // This is used as a marker inside the abstract frag lists, but
// normally doesn't remain in final output (which is built with a // normally doesn't remain in final output (which is built with a
@ -245,31 +246,21 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
return true; return true;
} }
// Remove prefixes (caps) from terms. // Keep only non-prefixed terms. We use to remove prefixes and keep
// the terms instead, but field terms are normally also indexed
// un-prefixed, so this is simpler and better.
static void noPrefixList(const vector<string>& in, vector<string>& out) static void noPrefixList(const vector<string>& in, vector<string>& out)
{ {
for (vector<string>::const_iterator qit = in.begin(); for (vector<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) { qit != in.end(); qit++) {
if ('A' <= qit->at(0) && qit->at(0) <= 'Z') { if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
string term = *qit;
while (term.length() && 'A' <= term.at(0) && term.at(0) <= 'Z')
term.erase(0, 1);
if (term.length())
out.push_back(term);
continue;
} else {
out.push_back(*qit); out.push_back(*qit);
} }
} }
}
//#define DEBUGABSTRACT 1 #undef DEBUGABSTRACT
#ifdef DEBUGABSTRACT #ifdef DEBUGABSTRACT
#define LOGABS LOGDEB #define LOGABS LOGDEB
#else
#define LOGABS LOGDEB2
#endif
#if 0
static void listList(const string& what, const vector<string>&l) static void listList(const string& what, const vector<string>&l)
{ {
string a; string a;
@ -278,58 +269,55 @@ static void listList(const string& what, const vector<string>&l)
} }
LOGDEB(("%s: %s\n", what.c_str(), a.c_str())); LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
} }
#else
#define LOGABS LOGDEB2
static void listList(const string&, const vector<string>&)
{
}
#endif #endif
// Build a document abstract by extracting text chunks around the query terms // Retrieve and store db-wide frequencies for the query terms.
// This uses the db termlists, not the original document. void Db::Native::setDbWideQTermsFreqs(Query *query)
//
// DatabaseModified and other general exceptions are catched and
// possibly retried by our caller
vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
{ {
Chrono chron; // Do it once only for a given query.
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), if (!query->m_nq->termfreqs.empty())
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen)); return;
vector<string> terms;
vector<string> qterms;
{ {
vector<string> iterms; vector<string> iqterms;
query->getMatchTerms(docid, iterms);
noPrefixList(iterms, terms);
if (terms.empty()) {
LOGDEB(("makeAbstract::Empty term list\n"));
return vector<string>();
}
}
// listList("Match terms: ", terms);
// Retrieve db-wide frequencies for the query terms (we do this once per
// query, using all the query terms, not only the document match terms)
if (query->m_nq->termfreqs.empty()) {
vector<string> iqterms, qterms;
query->getQueryTerms(iqterms); query->getQueryTerms(iqterms);
noPrefixList(iqterms, qterms); noPrefixList(iqterms, qterms);
}
// listList("Query terms: ", qterms); // listList("Query terms: ", qterms);
double doccnt = xrdb.get_doccount(); double doccnt = xrdb.get_doccount();
if (doccnt == 0) doccnt = 1; if (doccnt == 0)
doccnt = 1;
for (vector<string>::const_iterator qit = qterms.begin(); for (vector<string>::const_iterator qit = qterms.begin();
qit != qterms.end(); qit++) { qit != qterms.end(); qit++) {
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
query->m_nq->termfreqs[*qit])); query->m_nq->termfreqs[*qit]));
} }
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
} }
// Compute a term quality coefficient by retrieving the term // Compute query terms quality coefficients for a matched document by
// Within Document Frequencies and multiplying by overal term // retrieving the Within Document Frequencies and multiplying by
// frequency, then using log-based thresholds. We are going to try // overal term frequency, then using log-based thresholds.
// and show text around the less common search terms. double Db::Native::qualityTerms(Xapian::docid docid,
Query *query,
const vector<string>& terms,
multimap<double, string>& byQ)
{
map<string, double> termQcoefs; map<string, double> termQcoefs;
double totalweight = 0; double totalweight = 0;
double doclen = xrdb.get_doclength(docid); double doclen = xrdb.get_doclength(docid);
if (doclen == 0) doclen = 1; if (doclen == 0)
doclen = 1;
for (vector<string>::const_iterator qit = terms.begin(); for (vector<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) { qit != terms.end(); qit++) {
Xapian::TermIterator term = xrdb.termlist_begin(docid); Xapian::TermIterator term = xrdb.termlist_begin(docid);
@ -352,10 +340,8 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
totalweight += q; totalweight += q;
} }
} }
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
// Build a sorted by quality term list. // Build a sorted by quality term list.
multimap<double, string> byQ;
for (vector<string>::const_iterator qit = terms.begin(); for (vector<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) { qit != terms.end(); qit++) {
if (termQcoefs.find(*qit) != termQcoefs.end()) if (termQcoefs.find(*qit) != termQcoefs.end())
@ -368,8 +354,128 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str())); LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
} }
#endif #endif
return totalweight;
}
// Append to vpos the positions at which the page break term was indexed
// for the document. Positions located before baseTextPosition are not
// part of the text body and are left out.
// Exceptions raised while walking the position list (e.g. the term does
// not occur in the document) are ignored, and we always return true.
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
{
    string breakterm = page_break_term;
    try {
        for (Xapian::PositionIterator it =
                 xrdb.positionlist_begin(docid, breakterm);
             it != xrdb.positionlist_end(docid, breakterm); it++) {
            int where = *it;
            // Only keep page breaks from the text body. Others would be
            // strange...
            if (where >= int(baseTextPosition))
                vpos.push_back(where);
        }
    } catch (...) {
        // Term does not occur. No problem.
    }
    return true;
}
// Return the page number for the first match of a "significant" term.
//
// The match terms for the document are sorted by quality (see
// qualityTerms()), and we walk them from best to worst. The first text
// body occurrence found determines the result: the page number is the
// count of page breaks located before the occurrence, plus one.
//
// @param docid Xapian id for the document
// @param query the query which yielded the document
// @return 1-based page number, or -1 if there are no usable match terms,
//    no page breaks were recorded for the document, or no term occurrence
//    was found inside the text body.
int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
{
    vector<string> terms;
    {
        vector<string> iterms;
        query->getMatchTerms(docid, iterms);
        noPrefixList(iterms, terms);
    }
    if (terms.empty()) {
        LOGDEB(("getFirstMatchPage: empty match term list (field match?)\n"));
        return -1;
    }

    vector<int> pagepos;
    getPagePositions(docid, pagepos);
    if (pagepos.empty())
        return -1;

    setDbWideQTermsFreqs(query);

    // We try to use a page which matches the "best" term. Get a sorted
    // list. The total weight returned by qualityTerms() is of no use here.
    multimap<double, string> byQ;
    qualityTerms(docid, query, terms, byQ);

    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
         qit != byQ.rend(); qit++) {
        const string& qterm = qit->second;
        try {
            for (Xapian::PositionIterator pos =
                     xrdb.positionlist_begin(docid, qterm);
                 pos != xrdb.positionlist_end(docid, qterm); pos++) {
                int ipos = *pos;
                if (ipos < int(baseTextPosition)) // Not in text body
                    continue;
                // What page ?
                LOGABS(("getFirstPageMatch: looking for match for [%s]\n",
                        qterm.c_str()));
                // Count the page breaks located before the match. This
                // assumes that pagepos is sorted in ascending order
                // (position list order) -- TODO confirm.
                // An occurrence located after the last page break is on
                // the last page: lower_bound() then returns end(), which
                // still yields the right count, so that there is no
                // reason to reject this case.
                // NOTE(review): a term position equal to a page break
                // position is attributed to the page before the break.
                // Check against the splitter's word position semantics.
                vector<int>::const_iterator it =
                    lower_bound(pagepos.begin(), pagepos.end(), ipos);
                return it - pagepos.begin() + 1;
            }
        } catch (...) {
            // Term does not occur. No problem.
        }
    }
    return -1;
}
// Build a document abstract by extracting text chunks around the query terms
// This uses the db termlists, not the original document.
//
// DatabaseModified and other general exceptions are caught and
// possibly retried by our caller
vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
{
Chrono chron;
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
// The (unprefixed) terms matched by this document
vector<string> terms;
{
vector<string> iterms;
query->getMatchTerms(docid, iterms);
noPrefixList(iterms, terms);
if (terms.empty()) {
LOGDEB(("makeAbstract::Empty term list\n"));
return vector<string>();
}
}
listList("Match terms: ", terms);
// Retrieve the term frequencies for the query terms. This is
// actually computed only once for a query, and for all terms in
// the query (not only the matches for this doc)
setDbWideQTermsFreqs(query);
// Build a sorted by quality container for the match terms. We are
// going to try and show text around the less common search terms.
// TOBEDONE: terms issued from an original one by stem expansion
// should be somehow aggregated here, else, it may happen that
// such a group prevents displaying matches for other terms (by
// remaining its meaning to the maximum occurrences per term test
// using while walking the list below)
multimap<double, string> byQ;
double totalweight = qualityTerms(docid, query, terms, byQ);
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
// This can't happen, but would crash us
if (totalweight == 0.0) {
LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
return vector<string>();
}
///////////////////
// For each of the query terms, ask xapian for its positions list // For each of the query terms, ask xapian for its positions list
// in the document. For each position entry, remember it in // in the document. For each position entry, remember it in
// qtermposs and insert it and its neighbours in the set of // qtermposs and insert it and its neighbours in the set of
@ -390,11 +496,6 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
const unsigned int maxtotaloccs = const unsigned int maxtotaloccs =
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1)); m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs)); LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
// This can't happen, but would crash us
if (totalweight == 0.0) {
LOGERR(("makeAbstract: 0 totalweight!\n"));
return vector<string>();
}
// This is used to mark positions overlapped by a multi-word match term // This is used to mark positions overlapped by a multi-word match term
const string occupiedmarker("?"); const string occupiedmarker("?");
@ -1000,7 +1101,11 @@ public:
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false; return false;
} }
// Page break event from the text splitter: record it in the document by
// posting the (prefixed) page break term at the document-wide position,
// which is the splitter position offset by the current base position.
void newpage(int pos)
{
    int docpos = pos + m_ts->basepos;
    m_ts->doc.add_posting(m_ts->prefix + page_break_term, docpos);
}
private: private:
TextSplitDb *m_ts; TextSplitDb *m_ts;
}; };
@ -2014,6 +2119,19 @@ bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
return m_reason.empty() ? true : false; return m_reason.empty() ? true : false;
} }
// Return the page number for the first match of a significant query term
// inside the document.
//
// @param doc the result document. Its xdocid is used for the lookup.
// @param query the query which yielded the document
// @return the page number computed by the native db object, or -1 if the
//    database is not open, no page could be determined, or an error was
//    recorded in m_reason during the lookup.
int Db::getFirstMatchPage(Doc &doc, Query *query)
{
    LOGDEB1(("Db::getFirstMatchPages\n"));
    if (!m_ndb || !m_ndb->m_isopen) {
        LOGERR(("Db::getFirstMatchPage: no db\n"));
        // -1, not false (0): this is the "no page" value which callers
        // test for.
        return -1;
    }
    int pagenum = -1;
    XAPTRY(pagenum = m_ndb->getFirstMatchPage(Xapian::docid(doc.xdocid), query),
           m_ndb->xrdb, m_reason);
    return m_reason.empty() ? pagenum : -1;
}
// Retrieve document defined by Unique doc identifier. This is mainly used // Retrieve document defined by Unique doc identifier. This is mainly used
// by the GUI history feature // by the GUI history feature
bool Db::getDoc(const string &udi, Doc &doc) bool Db::getDoc(const string &udi, Doc &doc)

View file

@ -26,10 +26,8 @@
#include "stoplist.h" #include "stoplist.h"
#include "rclconfig.h" #include "rclconfig.h"
#ifndef NO_NAMESPACES
using std::string; using std::string;
using std::vector; using std::vector;
#endif
// rcldb defines an interface for a 'real' text database. The current // rcldb defines an interface for a 'real' text database. The current
// implementation uses xapian only, and xapian-related code is in rcldb.cpp // implementation uses xapian only, and xapian-related code is in rcldb.cpp
@ -227,6 +225,8 @@ class Db {
* the input query. This uses index data only (no access to the file) */ * the input query. This uses index data only (no access to the file) */
bool makeDocAbstract(Doc &doc, Query *query, string& abstract); bool makeDocAbstract(Doc &doc, Query *query, string& abstract);
bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract); bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract);
/** Return the page number for the first match of a significant query
 * term inside the document. This uses the page break positions
 * recorded at indexing time. -1 is returned if no page could be
 * determined or if an error occurred during the lookup. */
int getFirstMatchPage(Doc &doc, Query *query);
/** Get document for given udi /** Get document for given udi
* *
@ -324,9 +324,8 @@ string version_string();
extern const string pathelt_prefix; extern const string pathelt_prefix;
extern const string start_of_field_term; extern const string start_of_field_term;
extern const string end_of_field_term; extern const string end_of_field_term;
#ifndef NO_NAMESPACES extern const string page_break_term;
}
#endif // NO_NAMESPACES
}
#endif /* _DB_H_INCLUDED_ */ #endif /* _DB_H_INCLUDED_ */

View file

@ -18,6 +18,8 @@
#ifndef _rcldb_p_h_included_ #ifndef _rcldb_p_h_included_
#define _rcldb_p_h_included_ #define _rcldb_p_h_included_
#include <map>
#ifdef IDX_THREADS #ifdef IDX_THREADS
#include "workqueue.h" #include "workqueue.h"
#endif // IDX_THREADS #endif // IDX_THREADS
@ -111,7 +113,14 @@ class Db::Native {
#endif // IDX_THREADS #endif // IDX_THREADS
} }
double qualityTerms(Xapian::docid docid,
Query *query,
const vector<string>& terms,
std::multimap<double, string>& byQ);
void setDbWideQTermsFreqs(Query *query);
vector<string> makeAbstract(Xapian::docid id, Query *query); vector<string> makeAbstract(Xapian::docid id, Query *query);
bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
int getFirstMatchPage(Xapian::docid docid, Query *query);
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc); bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);

View file

@ -245,7 +245,6 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
return true; return true;
} }
bool Query::getQueryTerms(vector<string>& terms) bool Query::getQueryTerms(vector<string>& terms)
{ {
if (ISNULL(m_nq)) if (ISNULL(m_nq))
@ -288,7 +287,7 @@ bool Query::getMatchTerms(unsigned long xdocid, vector<string>& terms)
m_db->m_ndb->xrdb, m_reason); m_db->m_ndb->xrdb, m_reason);
if (!m_reason.empty()) { if (!m_reason.empty()) {
LOGERR(("getQueryTerms: xapian error: %s\n", m_reason.c_str())); LOGERR(("getMatchTerms: xapian error: %s\n", m_reason.c_str()));
return false; return false;
} }

View file

@ -51,6 +51,11 @@ public:
else else
return true; return true;
} }
// Page break event: nothing to do at this level, just pass it on to the
// next processor in the chain, if any.
virtual void newpage(int pos)
{
    if (!m_next)
        return;
    m_next->newpage(pos);
}
virtual bool flush() virtual bool flush()
{ {
if (m_next) if (m_next)
@ -91,6 +96,11 @@ public:
else else
return true; return true;
} }
// Page break event: forward it to the term processor, if one is set.
virtual void newpage(int pos)
{
    if (!m_prc)
        return;
    m_prc->newpage(pos);
}
private: private:
TermProc *m_prc; TermProc *m_prc;