add capability to remember page breaks generated by, e.g., pdftotext, and use them to start an external viewer on the page of the first match
parent a0398a6afd
commit c96b5d11f0

15 changed files with 292 additions and 85 deletions
@@ -44,6 +44,7 @@ src/doc/user/rcl.search.custom.html
 src/doc/user/rcl.search.desktop.html
 src/doc/user/rcl.search.history.html
 src/doc/user/rcl.search.html
+src/doc/user/rcl.search.kio.html
 src/doc/user/rcl.search.lang.html
 src/doc/user/rcl.search.multidb.html
 src/doc/user/rcl.search.preview.html
@@ -55,6 +56,7 @@ src/doc/user/rcl.search.wildcards.html
 src/doc/user/rcl.searchkcl.html
 src/doc/user/rcl.searchkio.html
 src/doc/user/rcl.searchkio.searchabledocs.html
+src/doc/user/usermanual-xml.html
 src/doc/user/usermanual.aux
 src/doc/user/usermanual.html
 src/doc/user/usermanual.html-text
@@ -64,6 +66,7 @@ src/doc/user/usermanual.pdf
 src/doc/user/usermanual.tex-pdf
 src/doc/user/usermanual.tex-pdf-tmp
 src/doc/user/usermanual.txt
+src/doc/user/usermanual.xml
 src/filters/rclexecm.pyc
 src/filters/rcllatinclass.pyc
 src/index/alldeps
@@ -86,7 +86,7 @@ public:
         for (i = 0; i < strlen(wild); i++)
             charclasses[int(wild[i])] = WILD;

-        char special[] = ".@+-,#'_\n\r";
+        char special[] = ".@+-,#'_\n\r\f";
         for (i = 0; i < strlen(special); i++)
             charclasses[int(special[i])] = special[i];

@@ -316,6 +316,7 @@ bool TextSplit::text_to_words(const string &in)
     m_inNumber = false;
     m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
     int curspanglue = 0;
+    bool pagepending = false;

     // Running count of non-alphanum chars. Reset when we see one;
     int nonalnumcnt = 0;
@@ -369,6 +370,10 @@ bool TextSplit::text_to_words(const string &in)
                     return false;
                 m_inNumber = false;
             }
+            if (pagepending) {
+                pagepending = false;
+                newpage(m_wordpos);
+            }
             break;

         case WILD:
@@ -521,7 +526,10 @@ bool TextSplit::text_to_words(const string &in)
                 goto SPACE;
             }
             break;
-
+        case '\f':
+            pagepending = true;
+            goto SPACE;
+            break;
#ifdef RCL_SPLIT_CAMELCASE
             // Camelcase handling.
             // If we get uppercase ascii after lowercase ascii, emit word.
@@ -20,10 +20,8 @@
 #include <string>
 #include <vector>

-#ifndef NO_NAMESPACES
 using std::string;
 using std::vector;
-#endif

 class Utf8Iter;

@@ -78,6 +76,12 @@ public:
                          int bte // byte offset of first char after term
                          ) = 0;

+    /** Called when we encounter formfeed \f 0x0c. Override to use the event.
+     * Mostly or exclusively used with pdftoxx output. Other filters mostly
+     * just don't know about pages. */
+    virtual void newpage(int /*pos*/)
+    {
+    }

     // Static utility functions:

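For context, a minimal sketch of how a splitter client could consume the new hook, assuming the TextSplit interface shown above (a pure virtual takeword() plus the new no-op newpage()); the PageAwareSplitter class and pagestarts member are illustrative names, not code from this commit:

    // Illustrative sketch: record the word position at which each page starts.
    // newpage() is invoked by TextSplit::text_to_words() whenever it meets a
    // formfeed ('\f') in the input, e.g. in pdftotext output.
    #include <string>
    #include <vector>
    #include "textsplit.h"

    class PageAwareSplitter : public TextSplit {
    public:
        std::vector<int> pagestarts; // word positions where new pages begin

        virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
            // a real client would index or otherwise process the term here
            return true;
        }
        virtual void newpage(int pos) {
            pagestarts.push_back(pos);
        }
    };
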
@@ -18,8 +18,11 @@
 # check if it was actually incorrect or just mis-understood by qtextedit
 # (tobedone)

-# Comment the following if you get better results without
-optionraw=-raw
+# Uncomment the following if you get better results without. The
+# pdftotext manual says that the option is no longer recommended The
+# difference in output seems mostly the removal of soft-hyphens when
+# -raw is not set
+# optionraw=-raw

 # set variables
 LANG=C ; export LANG
@@ -35,6 +35,7 @@ using std::pair;
 #include <qscrollbar.h>
 #include <qmenu.h>
 #include <qtextedit.h>
+#include <qtextbrowser.h>
 #include <qprogressdialog.h>
 #include <qevent.h>
 #include <qlabel.h>
@@ -732,10 +733,10 @@ class LoadThread : public QThread {
                                   FileInterner::FIF_forPreview);
         FIMissingStore mst;
         interner.setMissingStore(&mst);
-        // We don't set the interner's target mtype to html because we
-        // do want the html filter to do its work: we won't use the
-        // text, but we need the conversion to utf-8
-        // interner.setTargetMType("text/html");
+        // Even when previewHtml is set, we don't set the interner's
+        // target mtype to html because we do want the html filter to
+        // do its work: we won't use the text/plain, but we want the
+        // text/html to be converted to utf-8 (for highlight processing)
         try {
             string ipath = idoc.ipath;
             FileInterner::Status ret = interner.internfile(out, ipath);
@@ -883,6 +884,22 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
     // while still inserting at bottom
     list<QString> qrichlst;
     PreviewTextEdit *editor = currentEditor();
+
+    // For an actual html file, if we want to have the images and
+    // style loaded in the preview, we need to set the search
+    // path. Not too sure this is a good idea as I find them rather
+    // distracting when looking for text, esp. with qtextedit
+    // relatively limited html support (text sometimes get hidden by
+    // images).
+#if 0
+    string path = fileurltolocalpath(idoc.url);
+    if (!path.empty()) {
+        path = path_getfather(path);
+        QStringList paths(QString::fromLocal8Bit(path.c_str()));
+        editor->setSearchPaths(paths);
+    }
+#endif
+
     editor->setHtml("");
     editor->m_format = Qt::RichText;
     bool inputishtml = !fdoc.mimetype.compare("text/html");
@@ -1073,7 +1090,7 @@ bool Preview::loadDocInCurrentTab(const Rcl::Doc &idoc, int docnum)
 }

 PreviewTextEdit::PreviewTextEdit(QWidget* parent, const char* nm, Preview *pv)
-    : QTextEdit(parent), m_preview(pv),
+    : QTextBrowser(parent), m_preview(pv),
       m_plaintorich(new PlainToRichQtPreview()),
       m_dspflds(false), m_docnum(-1)
 {
@@ -1081,6 +1098,8 @@ PreviewTextEdit::PreviewTextEdit(QWidget* parent, const char* nm, Preview *pv)
     setObjectName(nm);
     connect(this, SIGNAL(customContextMenuRequested(const QPoint&)),
             this, SLOT(createPopupMenu(const QPoint&)));
+    setOpenExternalLinks(false);
+    setOpenLinks(false);
 }

 PreviewTextEdit::~PreviewTextEdit()
@@ -17,11 +17,24 @@
 #ifndef _PREVIEW_W_H_INCLUDED_
 #define _PREVIEW_W_H_INCLUDED_

+// Always use a qtextbrowser for now, there is no compelling reason to
+// switch to webkit here
+#if 1 || defined(RESLIST_TEXTBROWSER)
+#define PREVIEW_TEXTBROWSER
+#endif
+
 #include <stdio.h>

 #include <qvariant.h>
 #include <qwidget.h>
-#include <qtextedit.h>
+
+#ifdef PREVIEW_TEXTBROWSER
+#include <QTextBrowser>
+#define PREVIEW_PARENTCLASS QTextBrowser
+#else
+#include <QtWebKit/QWebView>
+#define PREVIEW_PARENTCLASS QWebView
+#endif
 #include <qimage.h>

 #include "rcldb.h"
@@ -31,13 +44,12 @@

 class QTabWidget;
 class QLabel;
-class QLineEdit;
 class QPushButton;
 class QCheckBox;
 class Preview;
 class PlainToRichQtPreview;

-class PreviewTextEdit : public QTextEdit {
+class PreviewTextEdit : public PREVIEW_PARENTCLASS {
     Q_OBJECT;
 public:
     PreviewTextEdit(QWidget* parent, const char* name, Preview *pv);
@@ -1520,6 +1520,14 @@ void RclMain::startNativeViewer(Rcl::Doc doc)
         return;
     }

+    int pagenum = 1;
+    if (m_source.isNotNull())
+        pagenum = m_source->getFirstMatchPage(doc);
+    if (pagenum == -1)
+        pagenum = 1;
+    char cpagenum[20];
+    sprintf(cpagenum, "%d", pagenum);
+
     // Extract possible viewer attributes
     ConfSimple attrs;
     string cmd;
@@ -1657,6 +1665,7 @@ void RclMain::startNativeViewer(Rcl::Doc doc)
     subs["F"] = orgfn;
     subs["i"] = doc.ipath;
     subs["M"] = doc.mimetype;
+    subs["p"] = cpagenum;
     subs["U"] = url;
     subs["u"] = url;
     // Let %(xx) access all metadata.
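With the "p" substitution added above, a viewer command template configured for a MIME type can reference %p to receive the page of the first match (falling back to 1 when the index holds no page information for the document). For example, a hypothetical viewer entry along the lines of "application/pdf = xpdf %f %p" would open the file directly on that page; the exact command-line syntax for jumping to a page depends on the viewer and is not defined by this commit.
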
@@ -95,6 +95,11 @@ class DocSequence {
         abs.push_back(doc.meta[Rcl::Doc::keyabs]);
         return true;
     }
+    virtual int getFirstMatchPage(Rcl::Doc&)
+    {
+        return -1;
+    }
+
     virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0;

     /** Get estimated total count in results */
@@ -74,10 +74,18 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
     }
     if (vabs.empty())
         vabs.push_back(doc.meta[Rcl::Doc::keyabs]);

     return true;
 }

+int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc)
+{
+    setQuery();
+    if (m_q->whatDb()) {
+        return m_q->whatDb()->getFirstMatchPage(doc, m_q.getptr());
+    }
+    return -1;
+}
+
 bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
 {
     setQuery();
@@ -32,6 +32,7 @@ class DocSequenceDb : public DocSequence {
     virtual int getResCnt();
     virtual void getTerms(HighlightData& hld);
     virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
+    virtual int getFirstMatchPage(Rcl::Doc&);
     virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
     virtual string getDescription();
     virtual list<string> expand(Rcl::Doc &doc);
@@ -77,6 +77,7 @@ namespace Rcl {
 const string pathelt_prefix = "XP";
 const string start_of_field_term = "XXST";
 const string end_of_field_term = "XXND";
+const string page_break_term = "XXPG";

 // This is used as a marker inside the abstract frag lists, but
 // normally doesn't remain in final output (which is built with a
@@ -245,31 +246,21 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
     return true;
 }

-// Remove prefixes (caps) from terms.
+// Keep only non-prefixed terms. We use to remove prefixes and keep
+// the terms instead, but field terms are normally also indexed
+// un-prefixed, so this is simpler and better.
 static void noPrefixList(const vector<string>& in, vector<string>& out)
 {
     for (vector<string>::const_iterator qit = in.begin();
          qit != in.end(); qit++) {
-        if ('A' <= qit->at(0) && qit->at(0) <= 'Z') {
-            string term = *qit;
-            while (term.length() && 'A' <= term.at(0) && term.at(0) <= 'Z')
-                term.erase(0, 1);
-            if (term.length())
-                out.push_back(term);
-            continue;
-        } else {
+        if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
             out.push_back(*qit);
-        }
     }
 }

-//#define DEBUGABSTRACT 1
+#undef DEBUGABSTRACT
 #ifdef DEBUGABSTRACT
 #define LOGABS LOGDEB
-#else
-#define LOGABS LOGDEB2
-#endif
-#if 0
 static void listList(const string& what, const vector<string>&l)
 {
     string a;
@@ -278,58 +269,55 @@ static void listList(const string& what, const vector<string>&l)
     }
     LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
 }
+#else
+#define LOGABS LOGDEB2
+static void listList(const string&, const vector<string>&)
+{
+}
 #endif

-// Build a document abstract by extracting text chunks around the query terms
-// This uses the db termlists, not the original document.
-//
-// DatabaseModified and other general exceptions are catched and
-// possibly retried by our caller
-vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
+// Retrieve and store db-wide frequencies for the query terms.
+void Db::Native::setDbWideQTermsFreqs(Query *query)
 {
-    Chrono chron;
-    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
-             m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
-
-    vector<string> terms;
+    // Do it once only for a given query.
+    if (!query->m_nq->termfreqs.empty())
+        return;

+    vector<string> qterms;
     {
-        vector<string> iterms;
-        query->getMatchTerms(docid, iterms);
-        noPrefixList(iterms, terms);
-        if (terms.empty()) {
-            LOGDEB(("makeAbstract::Empty term list\n"));
-            return vector<string>();
-        }
-    }
-    // listList("Match terms: ", terms);
-
-    // Retrieve db-wide frequencies for the query terms (we do this once per
-    // query, using all the query terms, not only the document match terms)
-    if (query->m_nq->termfreqs.empty()) {
-        vector<string> iqterms, qterms;
+        vector<string> iqterms;
         query->getQueryTerms(iqterms);
         noPrefixList(iqterms, qterms);
+    }
     // listList("Query terms: ", qterms);

     double doccnt = xrdb.get_doccount();
-    if (doccnt == 0) doccnt = 1;
+    if (doccnt == 0)
+        doccnt = 1;

     for (vector<string>::const_iterator qit = qterms.begin();
          qit != qterms.end(); qit++) {
         query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
         LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
                 query->m_nq->termfreqs[*qit]));
     }
-    LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
 }

-// Compute a term quality coefficient by retrieving the term
-// Within Document Frequencies and multiplying by overal term
-// frequency, then using log-based thresholds. We are going to try
-// and show text around the less common search terms.
+// Compute query terms quality coefficients for a matched document by
+// retrieving the Within Document Frequencies and multiplying by
+// overal term frequency, then using log-based thresholds.
+double Db::Native::qualityTerms(Xapian::docid docid,
+                                Query *query,
+                                const vector<string>& terms,
+                                multimap<double, string>& byQ)
+{
     map<string, double> termQcoefs;
     double totalweight = 0;

     double doclen = xrdb.get_doclength(docid);
-    if (doclen == 0) doclen = 1;
+    if (doclen == 0)
+        doclen = 1;

     for (vector<string>::const_iterator qit = terms.begin();
          qit != terms.end(); qit++) {
         Xapian::TermIterator term = xrdb.termlist_begin(docid);
@@ -352,10 +340,8 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
             totalweight += q;
         }
     }
-    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));

     // Build a sorted by quality term list.
-    multimap<double, string> byQ;
     for (vector<string>::const_iterator qit = terms.begin();
          qit != terms.end(); qit++) {
         if (termQcoefs.find(*qit) != termQcoefs.end())
@@ -368,8 +354,128 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
         LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
     }
 #endif
+    return totalweight;
+}
+
+// Return the positions list for the page break term
+bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
+{
+    string qterm = page_break_term;
+    Xapian::PositionIterator pos;
+    try {
+        for (pos = xrdb.positionlist_begin(docid, qterm);
+             pos != xrdb.positionlist_end(docid, qterm); pos++) {
+            int ipos = *pos;
+            if (ipos < int(baseTextPosition)) {
+                // Not in text body. Strange...
+                continue;
+            }
+            vpos.push_back(ipos);
+        }
+    } catch (...) {
+        // Term does not occur. No problem.
+    }
+    return true;
+}
+
+// Return page number for first match of "significant" term.
+int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
+{
+    vector<string> terms;
+    {
+        vector<string> iterms;
+        query->getMatchTerms(docid, iterms);
+        noPrefixList(iterms, terms);
+    }
+    if (terms.empty()) {
+        LOGDEB(("getFirstMatchPage: empty match term list (field match?)\n"));
+        return -1;
+    }
+
+    vector<int> pagepos;
+    getPagePositions(docid, pagepos);
+    if (pagepos.empty())
+        return -1;
+
+    setDbWideQTermsFreqs(query);
+
+    // We try to use a page which matches the "best" term. Get a sorted list
+    multimap<double, string> byQ;
+    double totalweight = qualityTerms(docid, query, terms, byQ);
+
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+         qit != byQ.rend(); qit++) {
+        string qterm = qit->second;
+        Xapian::PositionIterator pos;
+        string emptys;
+        try {
+            for (pos = xrdb.positionlist_begin(docid, qterm);
+                 pos != xrdb.positionlist_end(docid, qterm); pos++) {
+                int ipos = *pos;
+                if (ipos < int(baseTextPosition)) // Not in text body
+                    continue;
+                // What page ?
+                LOGABS(("getFirstPageMatch: looking for match for [%s]\n",
+                        qterm.c_str()));
+                vector<int>::const_iterator it =
+                    lower_bound(pagepos.begin(), pagepos.end(), ipos);
+                if (it != pagepos.end())
+                    return it - pagepos.begin() + 1;
+            }
+        } catch (...) {
+            // Term does not occur. No problem.
+        }
+    }
+    return -1;
+}
+
+// Build a document abstract by extracting text chunks around the query terms
+// This uses the db termlists, not the original document.
+//
+// DatabaseModified and other general exceptions are catched and
+// possibly retried by our caller
+vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
+{
+    Chrono chron;
+    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
+             m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
+
+    // The (unprefixed) terms matched by this document
+    vector<string> terms;
+
+    {
+        vector<string> iterms;
+        query->getMatchTerms(docid, iterms);
+        noPrefixList(iterms, terms);
+        if (terms.empty()) {
+            LOGDEB(("makeAbstract::Empty term list\n"));
+            return vector<string>();
+        }
+    }
+    listList("Match terms: ", terms);
+
+    // Retrieve the term freqencies for the query terms. This is
+    // actually computed only once for a query, and for all terms in
+    // the query (not only the matches for this doc)
+    setDbWideQTermsFreqs(query);
+
+    // Build a sorted by quality container for the match terms We are
+    // going to try and show text around the less common search terms.
+    // TOBEDONE: terms issued from an original one by stem expansion
+    // should be somehow aggregated here, else, it may happen that
+    // such a group prevents displaying matches for other terms (by
+    // remaining its meaning to the maximum occurrences per term test
+    // using while walking the list below)
+    multimap<double, string> byQ;
+    double totalweight = qualityTerms(docid, query, terms, byQ);
+    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
+    // This can't happen, but would crash us
+    if (totalweight == 0.0) {
+        LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
+        return vector<string>();
+    }
+
+    ///////////////////
     // For each of the query terms, ask xapian for its positions list
     // in the document. For each position entry, remember it in
     // qtermposs and insert it and its neighbours in the set of
@@ -390,11 +496,6 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
     const unsigned int maxtotaloccs =
         m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
     LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
-    // This can't happen, but would crash us
-    if (totalweight == 0.0) {
-        LOGERR(("makeAbstract: 0 totalweight!\n"));
-        return vector<string>();
-    }

     // This is used to mark positions overlapped by a multi-word match term
     const string occupiedmarker("?");
@@ -1000,7 +1101,11 @@ public:
             LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
             return false;
         }
+    void newpage(int pos)
+    {
+        pos += m_ts->basepos;
+        m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
+    }
 private:
     TextSplitDb *m_ts;
 };
@@ -2014,6 +2119,19 @@ bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
     return m_reason.empty() ? true : false;
 }

+int Db::getFirstMatchPage(Doc &doc, Query *query)
+{
+    LOGDEB1(("Db::getFirstMatchPages\n"));;
+    if (!m_ndb || !m_ndb->m_isopen) {
+        LOGERR(("Db::getFirstMatchPage: no db\n"));
+        return false;
+    }
+    int pagenum = -1;
+    XAPTRY(pagenum = m_ndb->getFirstMatchPage(Xapian::docid(doc.xdocid), query),
+           m_ndb->xrdb, m_reason);
+    return m_reason.empty() ? pagenum : -1;
+}
+
 // Retrieve document defined by Unique doc identifier. This is mainly used
 // by the GUI history feature
 bool Db::getDoc(const string &udi, Doc &doc)
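To make the page arithmetic in Db::Native::getFirstMatchPage() above concrete: pagepos holds the sorted positions at which the page-break term was posted, so the lower_bound() index counts how many recorded page breaks lie below the match position, and adding one turns that count into a 1-based page number. A small self-contained illustration with made-up position values (not taken from a real index):

    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main()
    {
        // Made-up page-break positions, as getPagePositions() might return them.
        std::vector<int> pagepos;
        pagepos.push_back(100150);
        pagepos.push_back(100420);
        pagepos.push_back(100803);

        int ipos = 100500; // made-up position of a matched term

        // Two recorded breaks are below ipos, so the match is reported on page 3.
        std::vector<int>::const_iterator it =
            std::lower_bound(pagepos.begin(), pagepos.end(), ipos);
        std::cout << "page " << it - pagepos.begin() + 1 << "\n";
        return 0;
    }
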
@@ -26,10 +26,8 @@
 #include "stoplist.h"
 #include "rclconfig.h"

-#ifndef NO_NAMESPACES
 using std::string;
 using std::vector;
-#endif

 // rcldb defines an interface for a 'real' text database. The current
 // implementation uses xapian only, and xapian-related code is in rcldb.cpp
@@ -227,6 +225,8 @@ class Db {
      * the input query. This uses index data only (no access to the file) */
     bool makeDocAbstract(Doc &doc, Query *query, string& abstract);
     bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract);
+    /** Retrieve detected page breaks positions */
+    int getFirstMatchPage(Doc &doc, Query *query);

     /** Get document for given udi
      *
@@ -324,9 +324,8 @@ string version_string();
 extern const string pathelt_prefix;
 extern const string start_of_field_term;
 extern const string end_of_field_term;
-#ifndef NO_NAMESPACES
-}
-#endif // NO_NAMESPACES
+extern const string page_break_term;

+}
+
 #endif /* _DB_H_INCLUDED_ */
@@ -18,6 +18,8 @@
 #ifndef _rcldb_p_h_included_
 #define _rcldb_p_h_included_

+#include <map>
+
 #ifdef IDX_THREADS
 #include "workqueue.h"
 #endif // IDX_THREADS
@@ -111,7 +113,14 @@ class Db::Native {
 #endif // IDX_THREADS
     }

+    double qualityTerms(Xapian::docid docid,
+                        Query *query,
+                        const vector<string>& terms,
+                        std::multimap<double, string>& byQ);
+    void setDbWideQTermsFreqs(Query *query);
     vector<string> makeAbstract(Xapian::docid id, Query *query);
+    bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
+    int getFirstMatchPage(Xapian::docid docid, Query *query);

     bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
@@ -245,7 +245,6 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
     return true;
 }

-
 bool Query::getQueryTerms(vector<string>& terms)
 {
     if (ISNULL(m_nq))
@@ -288,7 +287,7 @@ bool Query::getMatchTerms(unsigned long xdocid, vector<string>& terms)
            m_db->m_ndb->xrdb, m_reason);

     if (!m_reason.empty()) {
-        LOGERR(("getQueryTerms: xapian error: %s\n", m_reason.c_str()));
+        LOGERR(("getMatchTerms: xapian error: %s\n", m_reason.c_str()));
         return false;
     }

@@ -51,6 +51,11 @@ public:
         else
             return true;
     }
+    virtual void newpage(int pos)
+    {
+        if (m_next)
+            m_next->newpage(pos);
+    }
     virtual bool flush()
     {
         if (m_next)
@@ -91,6 +96,11 @@ public:
         else
             return true;
     }
+    virtual void newpage(int pos)
+    {
+        if (m_prc)
+            return m_prc->newpage(pos);
+    }

 private:
     TermProc *m_prc;
