add capability to remember page breaks generated by, e.g. pdftotext, and use them to start an external viewer on a match page
This commit is contained in:
parent
a0398a6afd
commit
c96b5d11f0
15 changed files with 292 additions and 85 deletions
|
@ -44,6 +44,7 @@ src/doc/user/rcl.search.custom.html
|
|||
src/doc/user/rcl.search.desktop.html
|
||||
src/doc/user/rcl.search.history.html
|
||||
src/doc/user/rcl.search.html
|
||||
src/doc/user/rcl.search.kio.html
|
||||
src/doc/user/rcl.search.lang.html
|
||||
src/doc/user/rcl.search.multidb.html
|
||||
src/doc/user/rcl.search.preview.html
|
||||
|
@ -55,6 +56,7 @@ src/doc/user/rcl.search.wildcards.html
|
|||
src/doc/user/rcl.searchkcl.html
|
||||
src/doc/user/rcl.searchkio.html
|
||||
src/doc/user/rcl.searchkio.searchabledocs.html
|
||||
src/doc/user/usermanual-xml.html
|
||||
src/doc/user/usermanual.aux
|
||||
src/doc/user/usermanual.html
|
||||
src/doc/user/usermanual.html-text
|
||||
|
@ -64,6 +66,7 @@ src/doc/user/usermanual.pdf
|
|||
src/doc/user/usermanual.tex-pdf
|
||||
src/doc/user/usermanual.tex-pdf-tmp
|
||||
src/doc/user/usermanual.txt
|
||||
src/doc/user/usermanual.xml
|
||||
src/filters/rclexecm.pyc
|
||||
src/filters/rcllatinclass.pyc
|
||||
src/index/alldeps
|
||||
|
|
|
@ -86,7 +86,7 @@ public:
|
|||
for (i = 0; i < strlen(wild); i++)
|
||||
charclasses[int(wild[i])] = WILD;
|
||||
|
||||
char special[] = ".@+-,#'_\n\r";
|
||||
char special[] = ".@+-,#'_\n\r\f";
|
||||
for (i = 0; i < strlen(special); i++)
|
||||
charclasses[int(special[i])] = special[i];
|
||||
|
||||
|
@ -316,6 +316,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||
m_inNumber = false;
|
||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
||||
int curspanglue = 0;
|
||||
bool pagepending = false;
|
||||
|
||||
// Running count of non-alphanum chars. Reset when we see one;
|
||||
int nonalnumcnt = 0;
|
||||
|
@ -369,6 +370,10 @@ bool TextSplit::text_to_words(const string &in)
|
|||
return false;
|
||||
m_inNumber = false;
|
||||
}
|
||||
if (pagepending) {
|
||||
pagepending = false;
|
||||
newpage(m_wordpos);
|
||||
}
|
||||
break;
|
||||
|
||||
case WILD:
|
||||
|
@ -521,7 +526,10 @@ bool TextSplit::text_to_words(const string &in)
|
|||
goto SPACE;
|
||||
}
|
||||
break;
|
||||
|
||||
case '\f':
|
||||
pagepending = true;
|
||||
goto SPACE;
|
||||
break;
|
||||
#ifdef RCL_SPLIT_CAMELCASE
|
||||
// Camelcase handling.
|
||||
// If we get uppercase ascii after lowercase ascii, emit word.
|
||||
|
|
|
@ -20,10 +20,8 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::vector;
|
||||
#endif
|
||||
|
||||
class Utf8Iter;
|
||||
|
||||
|
@ -78,6 +76,12 @@ public:
|
|||
int bte // byte offset of first char after term
|
||||
) = 0;
|
||||
|
||||
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
|
||||
* Mostly or exclusively used with pdftoxx output. Other filters mostly
|
||||
* just don't know about pages. */
|
||||
virtual void newpage(int /*pos*/)
|
||||
{
|
||||
}
|
||||
|
||||
// Static utility functions:
|
||||
|
||||
|
|
|
@ -18,8 +18,11 @@
|
|||
# check if it was actually incorrect or just mis-understood by qtextedit
|
||||
# (tobedone)
|
||||
|
||||
# Comment the following if you get better results without
|
||||
optionraw=-raw
|
||||
# Uncomment the following if you get better results without. The
|
||||
# pdftotext manual says that the option is no longer recommended. The
|
||||
# difference in output seems mostly the removal of soft-hyphens when
|
||||
# -raw is not set
|
||||
# optionraw=-raw
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
|
|
|
@ -35,6 +35,7 @@ using std::pair;
|
|||
#include <qscrollbar.h>
|
||||
#include <qmenu.h>
|
||||
#include <qtextedit.h>
|
||||
#include <qtextbrowser.h>
|
||||
#include <qprogressdialog.h>
|
||||
#include <qevent.h>
|
||||
#include <qlabel.h>
|
||||
|
@ -732,10 +733,10 @@ class LoadThread : public QThread {
|
|||
FileInterner::FIF_forPreview);
|
||||
FIMissingStore mst;
|
||||
interner.setMissingStore(&mst);
|
||||
// We don't set the interner's target mtype to html because we
|
||||
// do want the html filter to do its work: we won't use the
|
||||
// text, but we need the conversion to utf-8
|
||||
// interner.setTargetMType("text/html");
|
||||
// Even when previewHtml is set, we don't set the interner's
|
||||
// target mtype to html because we do want the html filter to
|
||||
// do its work: we won't use the text/plain, but we want the
|
||||
// text/html to be converted to utf-8 (for highlight processing)
|
||||
try {
|
||||
string ipath = idoc.ipath;
|
||||
FileInterner::Status ret = interner.internfile(out, ipath);
|
||||
|
@ -883,6 +884,22 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
|
|||
// while still inserting at bottom
|
||||
list<QString> qrichlst;
|
||||
PreviewTextEdit *editor = currentEditor();
|
||||
|
||||
// For an actual html file, if we want to have the images and
|
||||
// style loaded in the preview, we need to set the search
|
||||
// path. Not too sure this is a good idea as I find them rather
|
||||
// distracting when looking for text, esp. with qtextedit
|
||||
// relatively limited html support (text sometimes get hidden by
|
||||
// images).
|
||||
#if 0
|
||||
string path = fileurltolocalpath(idoc.url);
|
||||
if (!path.empty()) {
|
||||
path = path_getfather(path);
|
||||
QStringList paths(QString::fromLocal8Bit(path.c_str()));
|
||||
editor->setSearchPaths(paths);
|
||||
}
|
||||
#endif
|
||||
|
||||
editor->setHtml("");
|
||||
editor->m_format = Qt::RichText;
|
||||
bool inputishtml = !fdoc.mimetype.compare("text/html");
|
||||
|
@ -1073,7 +1090,7 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
|
|||
}
|
||||
|
||||
PreviewTextEdit::PreviewTextEdit(QWidget* parent, const char* nm, Preview *pv)
|
||||
: QTextEdit(parent), m_preview(pv),
|
||||
: QTextBrowser(parent), m_preview(pv),
|
||||
m_plaintorich(new PlainToRichQtPreview()),
|
||||
m_dspflds(false), m_docnum(-1)
|
||||
{
|
||||
|
@ -1081,6 +1098,8 @@ PreviewTextEdit::PreviewTextEdit(QWidget* parent, const char* nm, Preview *pv)
|
|||
setObjectName(nm);
|
||||
connect(this, SIGNAL(customContextMenuRequested(const QPoint&)),
|
||||
this, SLOT(createPopupMenu(const QPoint&)));
|
||||
setOpenExternalLinks(false);
|
||||
setOpenLinks(false);
|
||||
}
|
||||
|
||||
PreviewTextEdit::~PreviewTextEdit()
|
||||
|
|
|
@ -17,11 +17,24 @@
|
|||
#ifndef _PREVIEW_W_H_INCLUDED_
|
||||
#define _PREVIEW_W_H_INCLUDED_
|
||||
|
||||
// Always use a qtextbrowser for now, there is no compelling reason to
|
||||
// switch to webkit here
|
||||
#if 1 || defined(RESLIST_TEXTBROWSER)
|
||||
#define PREVIEW_TEXTBROWSER
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <qvariant.h>
|
||||
#include <qwidget.h>
|
||||
#include <qtextedit.h>
|
||||
|
||||
#ifdef PREVIEW_TEXTBROWSER
|
||||
#include <QTextBrowser>
|
||||
#define PREVIEW_PARENTCLASS QTextBrowser
|
||||
#else
|
||||
#include <QtWebKit/QWebView>
|
||||
#define PREVIEW_PARENTCLASS QWebView
|
||||
#endif
|
||||
#include <qimage.h>
|
||||
|
||||
#include "rcldb.h"
|
||||
|
@ -31,13 +44,12 @@
|
|||
|
||||
class QTabWidget;
|
||||
class QLabel;
|
||||
class QLineEdit;
|
||||
class QPushButton;
|
||||
class QCheckBox;
|
||||
class Preview;
|
||||
class PlainToRichQtPreview;
|
||||
|
||||
class PreviewTextEdit : public QTextEdit {
|
||||
class PreviewTextEdit : public PREVIEW_PARENTCLASS {
|
||||
Q_OBJECT;
|
||||
public:
|
||||
PreviewTextEdit(QWidget* parent, const char* name, Preview *pv);
|
||||
|
|
|
@ -1520,6 +1520,14 @@ void RclMain::startNativeViewer(Rcl::Doc doc)
|
|||
return;
|
||||
}
|
||||
|
||||
int pagenum = 1;
|
||||
if (m_source.isNotNull())
|
||||
pagenum = m_source->getFirstMatchPage(doc);
|
||||
if (pagenum == -1)
|
||||
pagenum = 1;
|
||||
char cpagenum[20];
|
||||
sprintf(cpagenum, "%d", pagenum);
|
||||
|
||||
// Extract possible viewer attributes
|
||||
ConfSimple attrs;
|
||||
string cmd;
|
||||
|
@ -1657,6 +1665,7 @@ void RclMain::startNativeViewer(Rcl::Doc doc)
|
|||
subs["F"] = orgfn;
|
||||
subs["i"] = doc.ipath;
|
||||
subs["M"] = doc.mimetype;
|
||||
subs["p"] = cpagenum;
|
||||
subs["U"] = url;
|
||||
subs["u"] = url;
|
||||
// Let %(xx) access all metadata.
|
||||
|
|
|
@ -95,6 +95,11 @@ class DocSequence {
|
|||
abs.push_back(doc.meta[Rcl::Doc::keyabs]);
|
||||
return true;
|
||||
}
|
||||
/** Page number of the first match inside the document. This default
 * implementation has no page information and always answers -1. */
virtual int getFirstMatchPage(Rcl::Doc&) { return -1; }
|
||||
|
||||
virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0;
|
||||
|
||||
/** Get estimated total count in results */
|
||||
|
|
|
@ -74,10 +74,18 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
|
|||
}
|
||||
if (vabs.empty())
|
||||
vabs.push_back(doc.meta[Rcl::Doc::keyabs]);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Forward the "first match page" request for doc to the database the
// query is attached to. Answers -1 when the query has no database.
int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc)
{
    setQuery();
    if (!m_q->whatDb())
        return -1;
    return m_q->whatDb()->getFirstMatchPage(doc, m_q.getptr());
}
|
||||
|
||||
bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
|
||||
{
|
||||
setQuery();
|
||||
|
|
|
@ -32,6 +32,7 @@ class DocSequenceDb : public DocSequence {
|
|||
virtual int getResCnt();
|
||||
virtual void getTerms(HighlightData& hld);
|
||||
virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
|
||||
virtual int getFirstMatchPage(Rcl::Doc&);
|
||||
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
|
||||
virtual string getDescription();
|
||||
virtual list<string> expand(Rcl::Doc &doc);
|
||||
|
|
|
@ -77,6 +77,7 @@ namespace Rcl {
|
|||
const string pathelt_prefix = "XP";
|
||||
const string start_of_field_term = "XXST";
|
||||
const string end_of_field_term = "XXND";
|
||||
const string page_break_term = "XXPG";
|
||||
|
||||
// This is used as a marker inside the abstract frag lists, but
|
||||
// normally doesn't remain in final output (which is built with a
|
||||
|
@ -245,31 +246,21 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
|||
return true;
|
||||
}
|
||||
|
||||
// Remove prefixes (caps) from terms.
|
||||
// Keep only non-prefixed terms. We used to remove prefixes and keep
|
||||
// the terms instead, but field terms are normally also indexed
|
||||
// un-prefixed, so this is simpler and better.
|
||||
static void noPrefixList(const vector<string>& in, vector<string>& out)
|
||||
{
|
||||
for (vector<string>::const_iterator qit = in.begin();
|
||||
qit != in.end(); qit++) {
|
||||
if ('A' <= qit->at(0) && qit->at(0) <= 'Z') {
|
||||
string term = *qit;
|
||||
while (term.length() && 'A' <= term.at(0) && term.at(0) <= 'Z')
|
||||
term.erase(0, 1);
|
||||
if (term.length())
|
||||
out.push_back(term);
|
||||
continue;
|
||||
} else {
|
||||
if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
|
||||
out.push_back(*qit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#define DEBUGABSTRACT 1
|
||||
#undef DEBUGABSTRACT
|
||||
#ifdef DEBUGABSTRACT
|
||||
#define LOGABS LOGDEB
|
||||
#else
|
||||
#define LOGABS LOGDEB2
|
||||
#endif
|
||||
#if 0
|
||||
static void listList(const string& what, const vector<string>&l)
|
||||
{
|
||||
string a;
|
||||
|
@ -278,58 +269,55 @@ static void listList(const string& what, const vector<string>&l)
|
|||
}
|
||||
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
|
||||
}
|
||||
#else
|
||||
#define LOGABS LOGDEB2
|
||||
static void listList(const string&, const vector<string>&)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
// Build a document abstract by extracting text chunks around the query terms
|
||||
// This uses the db termlists, not the original document.
|
||||
//
|
||||
// DatabaseModified and other general exceptions are caught and
|
||||
// possibly retried by our caller
|
||||
vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
||||
// Retrieve and store db-wide frequencies for the query terms.
|
||||
void Db::Native::setDbWideQTermsFreqs(Query *query)
|
||||
{
|
||||
Chrono chron;
|
||||
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
||||
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
|
||||
|
||||
vector<string> terms;
|
||||
// Do it once only for a given query.
|
||||
if (!query->m_nq->termfreqs.empty())
|
||||
return;
|
||||
|
||||
vector<string> qterms;
|
||||
{
|
||||
vector<string> iterms;
|
||||
query->getMatchTerms(docid, iterms);
|
||||
noPrefixList(iterms, terms);
|
||||
if (terms.empty()) {
|
||||
LOGDEB(("makeAbstract::Empty term list\n"));
|
||||
return vector<string>();
|
||||
}
|
||||
}
|
||||
// listList("Match terms: ", terms);
|
||||
|
||||
// Retrieve db-wide frequencies for the query terms (we do this once per
|
||||
// query, using all the query terms, not only the document match terms)
|
||||
if (query->m_nq->termfreqs.empty()) {
|
||||
vector<string> iqterms, qterms;
|
||||
vector<string> iqterms;
|
||||
query->getQueryTerms(iqterms);
|
||||
noPrefixList(iqterms, qterms);
|
||||
}
|
||||
// listList("Query terms: ", qterms);
|
||||
|
||||
double doccnt = xrdb.get_doccount();
|
||||
if (doccnt == 0) doccnt = 1;
|
||||
if (doccnt == 0)
|
||||
doccnt = 1;
|
||||
|
||||
for (vector<string>::const_iterator qit = qterms.begin();
|
||||
qit != qterms.end(); qit++) {
|
||||
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
||||
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
|
||||
query->m_nq->termfreqs[*qit]));
|
||||
}
|
||||
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
|
||||
}
|
||||
|
||||
// Compute a term quality coefficient by retrieving the term
|
||||
// Within Document Frequencies and multiplying by overal term
|
||||
// frequency, then using log-based thresholds. We are going to try
|
||||
// and show text around the less common search terms.
|
||||
// Compute query terms quality coefficients for a matched document by
|
||||
// retrieving the Within Document Frequencies and multiplying by
|
||||
// overall term frequency, then using log-based thresholds.
|
||||
double Db::Native::qualityTerms(Xapian::docid docid,
|
||||
Query *query,
|
||||
const vector<string>& terms,
|
||||
multimap<double, string>& byQ)
|
||||
{
|
||||
map<string, double> termQcoefs;
|
||||
double totalweight = 0;
|
||||
|
||||
double doclen = xrdb.get_doclength(docid);
|
||||
if (doclen == 0) doclen = 1;
|
||||
if (doclen == 0)
|
||||
doclen = 1;
|
||||
|
||||
for (vector<string>::const_iterator qit = terms.begin();
|
||||
qit != terms.end(); qit++) {
|
||||
Xapian::TermIterator term = xrdb.termlist_begin(docid);
|
||||
|
@ -352,10 +340,8 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||
totalweight += q;
|
||||
}
|
||||
}
|
||||
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
|
||||
|
||||
// Build a sorted by quality term list.
|
||||
multimap<double, string> byQ;
|
||||
for (vector<string>::const_iterator qit = terms.begin();
|
||||
qit != terms.end(); qit++) {
|
||||
if (termQcoefs.find(*qit) != termQcoefs.end())
|
||||
|
@ -368,8 +354,128 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||
LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
|
||||
}
|
||||
#endif
|
||||
return totalweight;
|
||||
}
|
||||
|
||||
// Return the positions list for the page break term
|
||||
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
|
||||
{
|
||||
string qterm = page_break_term;
|
||||
Xapian::PositionIterator pos;
|
||||
try {
|
||||
for (pos = xrdb.positionlist_begin(docid, qterm);
|
||||
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
||||
int ipos = *pos;
|
||||
if (ipos < int(baseTextPosition)) {
|
||||
// Not in text body. Strange...
|
||||
continue;
|
||||
}
|
||||
vpos.push_back(ipos);
|
||||
}
|
||||
} catch (...) {
|
||||
// Term does not occur. No problem.
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Return page number for first match of "significant" term.
|
||||
int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
|
||||
{
|
||||
vector<string> terms;
|
||||
{
|
||||
vector<string> iterms;
|
||||
query->getMatchTerms(docid, iterms);
|
||||
noPrefixList(iterms, terms);
|
||||
}
|
||||
if (terms.empty()) {
|
||||
LOGDEB(("getFirstMatchPage: empty match term list (field match?)\n"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
vector<int> pagepos;
|
||||
getPagePositions(docid, pagepos);
|
||||
if (pagepos.empty())
|
||||
return -1;
|
||||
|
||||
setDbWideQTermsFreqs(query);
|
||||
|
||||
// We try to use a page which matches the "best" term. Get a sorted list
|
||||
multimap<double, string> byQ;
|
||||
double totalweight = qualityTerms(docid, query, terms, byQ);
|
||||
|
||||
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
||||
qit != byQ.rend(); qit++) {
|
||||
string qterm = qit->second;
|
||||
Xapian::PositionIterator pos;
|
||||
string emptys;
|
||||
try {
|
||||
for (pos = xrdb.positionlist_begin(docid, qterm);
|
||||
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
||||
int ipos = *pos;
|
||||
if (ipos < int(baseTextPosition)) // Not in text body
|
||||
continue;
|
||||
// What page ?
|
||||
LOGABS(("getFirstPageMatch: looking for match for [%s]\n",
|
||||
qterm.c_str()));
|
||||
vector<int>::const_iterator it =
|
||||
lower_bound(pagepos.begin(), pagepos.end(), ipos);
|
||||
if (it != pagepos.end())
|
||||
return it - pagepos.begin() + 1;
|
||||
}
|
||||
} catch (...) {
|
||||
// Term does not occur. No problem.
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Build a document abstract by extracting text chunks around the query terms
|
||||
// This uses the db termlists, not the original document.
|
||||
//
|
||||
// DatabaseModified and other general exceptions are caught and
|
||||
// possibly retried by our caller
|
||||
vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
||||
{
|
||||
Chrono chron;
|
||||
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
||||
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
|
||||
|
||||
// The (unprefixed) terms matched by this document
|
||||
vector<string> terms;
|
||||
|
||||
{
|
||||
vector<string> iterms;
|
||||
query->getMatchTerms(docid, iterms);
|
||||
noPrefixList(iterms, terms);
|
||||
if (terms.empty()) {
|
||||
LOGDEB(("makeAbstract::Empty term list\n"));
|
||||
return vector<string>();
|
||||
}
|
||||
}
|
||||
listList("Match terms: ", terms);
|
||||
|
||||
// Retrieve the term frequencies for the query terms. This is
|
||||
// actually computed only once for a query, and for all terms in
|
||||
// the query (not only the matches for this doc)
|
||||
setDbWideQTermsFreqs(query);
|
||||
|
||||
// Build a sorted by quality container for the match terms We are
|
||||
// going to try and show text around the less common search terms.
|
||||
// TOBEDONE: terms issued from an original one by stem expansion
|
||||
// should be somehow aggregated here, else, it may happen that
|
||||
// such a group prevents displaying matches for other terms (by
|
||||
// remaining its meaning to the maximum occurrences per term test
|
||||
// using while walking the list below)
|
||||
multimap<double, string> byQ;
|
||||
double totalweight = qualityTerms(docid, query, terms, byQ);
|
||||
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
|
||||
// This can't happen, but would crash us
|
||||
if (totalweight == 0.0) {
|
||||
LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
|
||||
return vector<string>();
|
||||
}
|
||||
|
||||
///////////////////
|
||||
// For each of the query terms, ask xapian for its positions list
|
||||
// in the document. For each position entry, remember it in
|
||||
// qtermposs and insert it and its neighbours in the set of
|
||||
|
@ -390,11 +496,6 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
|||
const unsigned int maxtotaloccs =
|
||||
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
|
||||
LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
|
||||
// This can't happen, but would crash us
|
||||
if (totalweight == 0.0) {
|
||||
LOGERR(("makeAbstract: 0 totalweight!\n"));
|
||||
return vector<string>();
|
||||
}
|
||||
|
||||
// This is used to mark positions overlapped by a multi-word match term
|
||||
const string occupiedmarker("?");
|
||||
|
@ -1000,7 +1101,11 @@ public:
|
|||
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
void newpage(int pos)
|
||||
{
|
||||
pos += m_ts->basepos;
|
||||
m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
|
||||
}
|
||||
private:
|
||||
TextSplitDb *m_ts;
|
||||
};
|
||||
|
@ -2014,6 +2119,19 @@ bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
|
|||
return m_reason.empty() ? true : false;
|
||||
}
|
||||
|
||||
// Return the page number of the first match for query in doc, as
// computed by the native database object from the index data.
// Returns -1 if the index is not open, if a Xapian error was reported
// (m_reason not empty), or if the native call itself returned -1.
int Db::getFirstMatchPage(Doc &doc, Query *query)
{
    LOGDEB1(("Db::getFirstMatchPages\n"));
    if (!m_ndb || !m_ndb->m_isopen) {
        LOGERR(("Db::getFirstMatchPage: no db\n"));
        // Use the -1 sentinel like every other failure path. Returning
        // false would yield page 0, which callers testing for -1 would
        // take for a real page number.
        return -1;
    }
    int pagenum = -1;
    XAPTRY(pagenum = m_ndb->getFirstMatchPage(Xapian::docid(doc.xdocid), query),
           m_ndb->xrdb, m_reason);
    return m_reason.empty() ? pagenum : -1;
}
|
||||
|
||||
// Retrieve document defined by Unique doc identifier. This is mainly used
|
||||
// by the GUI history feature
|
||||
bool Db::getDoc(const string &udi, Doc &doc)
|
||||
|
|
|
@ -26,10 +26,8 @@
|
|||
#include "stoplist.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::vector;
|
||||
#endif
|
||||
|
||||
// rcldb defines an interface for a 'real' text database. The current
|
||||
// implementation uses xapian only, and xapian-related code is in rcldb.cpp
|
||||
|
@ -227,6 +225,8 @@ class Db {
|
|||
* the input query. This uses index data only (no access to the file) */
|
||||
bool makeDocAbstract(Doc &doc, Query *query, string& abstract);
|
||||
bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract);
|
||||
/** Return the number of the page holding the first match for the query, computed from the page break positions stored in the index */
|
||||
int getFirstMatchPage(Doc &doc, Query *query);
|
||||
|
||||
/** Get document for given udi
|
||||
*
|
||||
|
@ -324,9 +324,8 @@ string version_string();
|
|||
extern const string pathelt_prefix;
|
||||
extern const string start_of_field_term;
|
||||
extern const string end_of_field_term;
|
||||
#ifndef NO_NAMESPACES
|
||||
}
|
||||
#endif // NO_NAMESPACES
|
||||
extern const string page_break_term;
|
||||
|
||||
}
|
||||
|
||||
#endif /* _DB_H_INCLUDED_ */
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
#ifndef _rcldb_p_h_included_
|
||||
#define _rcldb_p_h_included_
|
||||
|
||||
#include <map>
|
||||
|
||||
#ifdef IDX_THREADS
|
||||
#include "workqueue.h"
|
||||
#endif // IDX_THREADS
|
||||
|
@ -111,7 +113,14 @@ class Db::Native {
|
|||
#endif // IDX_THREADS
|
||||
}
|
||||
|
||||
double qualityTerms(Xapian::docid docid,
|
||||
Query *query,
|
||||
const vector<string>& terms,
|
||||
std::multimap<double, string>& byQ);
|
||||
void setDbWideQTermsFreqs(Query *query);
|
||||
vector<string> makeAbstract(Xapian::docid id, Query *query);
|
||||
bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
|
||||
int getFirstMatchPage(Xapian::docid docid, Query *query);
|
||||
|
||||
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
|
||||
|
||||
|
|
|
@ -245,7 +245,6 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
|
|||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool Query::getQueryTerms(vector<string>& terms)
|
||||
{
|
||||
if (ISNULL(m_nq))
|
||||
|
@ -288,7 +287,7 @@ bool Query::getMatchTerms(unsigned long xdocid, vector<string>& terms)
|
|||
m_db->m_ndb->xrdb, m_reason);
|
||||
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("getQueryTerms: xapian error: %s\n", m_reason.c_str()));
|
||||
LOGERR(("getMatchTerms: xapian error: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -51,6 +51,11 @@ public:
|
|||
else
|
||||
return true;
|
||||
}
|
||||
virtual void newpage(int pos)
|
||||
{
|
||||
if (m_next)
|
||||
m_next->newpage(pos);
|
||||
}
|
||||
virtual bool flush()
|
||||
{
|
||||
if (m_next)
|
||||
|
@ -91,6 +96,11 @@ public:
|
|||
else
|
||||
return true;
|
||||
}
|
||||
virtual void newpage(int pos)
|
||||
{
|
||||
if (m_prc)
|
||||
return m_prc->newpage(pos);
|
||||
}
|
||||
|
||||
private:
|
||||
TermProc *m_prc;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue