add functions and interface to display the duplicates of a result document
This commit is contained in:
parent
c4656c1d10
commit
a2f4bc9cf5
11 changed files with 217 additions and 9 deletions
|
@ -8,8 +8,8 @@ LIBS = librecoll.a $(LIBRECOLL)
|
|||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o uncomp.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rclquery.o rclterms.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o strmatcher.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp uncomp.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp rclterms.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp strmatcher.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o uncomp.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rcldups.o rclquery.o rclterms.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o strmatcher.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp uncomp.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rcldups.dep.stamp rclquery.dep.stamp rclterms.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp strmatcher.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
|
||||
librecoll.a : $(DEPS) $(OBJS)
|
||||
ar ru librecoll.a $(OBJS)
|
||||
|
@ -109,6 +109,8 @@ rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
|||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
|
||||
rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldoc.cpp
|
||||
rcldups.o : ../rcldb/rcldups.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldups.cpp
|
||||
rclquery.o : ../rcldb/rclquery.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp
|
||||
rclterms.o : ../rcldb/rclterms.cpp $(depth)/mk/localdefs
|
||||
|
@ -320,6 +322,9 @@ rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
|
|||
rcldoc.dep.stamp : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldoc.cpp > rcldoc.dep
|
||||
touch rcldoc.dep.stamp
|
||||
rcldups.dep.stamp : ../rcldb/rcldups.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldups.cpp > rcldups.dep
|
||||
touch rcldups.dep.stamp
|
||||
rclquery.dep.stamp : ../rcldb/rclquery.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep
|
||||
touch rclquery.dep.stamp
|
||||
|
@ -456,6 +461,7 @@ include expansiondbs.dep
|
|||
include rclabstract.dep
|
||||
include rcldb.dep
|
||||
include rcldoc.dep
|
||||
include rcldups.dep
|
||||
include rclquery.dep
|
||||
include rclterms.dep
|
||||
include searchdata.dep
|
||||
|
|
|
@ -48,6 +48,7 @@ ${depth}/rcldb/expansiondbs.cpp \
|
|||
${depth}/rcldb/rclabstract.cpp \
|
||||
${depth}/rcldb/rcldb.cpp \
|
||||
${depth}/rcldb/rcldoc.cpp \
|
||||
${depth}/rcldb/rcldups.cpp \
|
||||
${depth}/rcldb/rclquery.cpp \
|
||||
${depth}/rcldb/rclterms.cpp \
|
||||
${depth}/rcldb/searchdata.cpp \
|
||||
|
|
|
@ -52,7 +52,7 @@
|
|||
#include "internfile.h"
|
||||
#include "indexer.h"
|
||||
#include "snippets_w.h"
|
||||
|
||||
#include "listdialog.h"
|
||||
#include "reslist.h"
|
||||
#include "moc_reslist.cpp"
|
||||
#include "rclhelp.h"
|
||||
|
@ -815,6 +815,32 @@ void ResList::newSnippetsW(const Rcl::Doc& doc)
|
|||
sp->show();
|
||||
}
|
||||
|
||||
/**
 * Run a dialog listing the documents which share the same content as a
 * result document.
 *
 * The dialog's list widget is replaced by a read-only text editor so that
 * the entries can be selected and copied. Each entry is the document url,
 * followed by " | " and the ipath when the ipath is not empty.
 *
 * @param dups the documents to list. The first (unnamed) parameter is not
 *     used.
 */
void ResList::newDupsW(const Rcl::Doc&, const vector<Rcl::Doc>& dups)
{
    ListDialog dlg;
    dlg.setWindowTitle(tr("Duplicate documents"));
    dlg.groupBox->setTitle(tr("These Urls ( | ipath) share the same"
                              " content:"));

    // Swap the list widget for a text editor: this lets the user copy/paste
    delete dlg.listWidget;
    QTextEdit *textw = new QTextEdit(dlg.groupBox);
    textw->setReadOnly(true);
    dlg.horizontalLayout->addWidget(textw);

    for (vector<Rcl::Doc>::size_type idx = 0; idx < dups.size(); ++idx) {
        const Rcl::Doc& dup = dups[idx];
        // Urls are decoded with the local 8 bit encoding, ipaths as UTF-8
        QString entry = QString::fromLocal8Bit(dup.url.c_str());
        if (!dup.ipath.empty()) {
            entry += " | ";
            entry += QString::fromUtf8(dup.ipath.c_str());
        }
        textw->append(entry);
    }

    // append() leaves the cursor at the end: go back to the first entry
    textw->moveCursor(QTextCursor::Start);
    textw->ensureCursorVisible();
    dlg.exec();
}
|
||||
|
||||
void ResList::linkWasClicked(const QUrl &url)
|
||||
{
|
||||
string ascurl = (const char *)url.toString().toAscii();;
|
||||
|
@ -822,6 +848,7 @@ void ResList::linkWasClicked(const QUrl &url)
|
|||
|
||||
int what = ascurl[0];
|
||||
switch (what) {
|
||||
|
||||
// Open abstract/snippets window
|
||||
case 'A':
|
||||
{
|
||||
|
@ -837,10 +864,30 @@ void ResList::linkWasClicked(const QUrl &url)
|
|||
}
|
||||
break;
|
||||
|
||||
// Show duplicates
|
||||
case 'D':
|
||||
{
|
||||
if (m_source.isNull())
|
||||
return;
|
||||
int i = atoi(ascurl.c_str()+1) - 1;
|
||||
Rcl::Doc doc;
|
||||
if (!getDoc(i, doc)) {
|
||||
LOGERR(("ResList::linkWasClicked: can't get doc for %d\n", i));
|
||||
return;
|
||||
}
|
||||
vector<Rcl::Doc> dups;
|
||||
if (m_source->docDups(doc, dups)) {
|
||||
newDupsW(doc, dups);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
// Show query details
|
||||
case 'H':
|
||||
{
|
||||
emit headerClicked();
|
||||
break;
|
||||
}
|
||||
|
||||
// Preview and edit
|
||||
case 'P':
|
||||
|
|
|
@ -146,6 +146,7 @@ class ResList : public RESLIST_PARENTCLASS
|
|||
bool scrollIsAtBottom();
|
||||
void setupArrows();
|
||||
void newSnippetsW(const Rcl::Doc &doc);
|
||||
void newDupsW(const Rcl::Doc& doc, const std::vector<Rcl::Doc>& dups);
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -108,6 +108,11 @@ class DocSequence {
|
|||
{
|
||||
return -1;
|
||||
}
|
||||
/** Get duplicates. This default implementation returns false and never touches the output vector; sequences which can look up duplicates (e.g. DocSequenceDb) override it. */
|
||||
virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0;
|
||||
|
||||
|
@ -185,6 +190,13 @@ public:
|
|||
return false;
|
||||
return m_seq->getAbstract(doc, abs);
|
||||
}
|
||||
/** Get duplicates. */
|
||||
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return false;
|
||||
return m_seq->docDups(doc, dups);
|
||||
}
|
||||
|
||||
virtual bool snippetsCapable()
|
||||
{
|
||||
|
|
|
@ -239,3 +239,12 @@ bool DocSequenceDb::setQuery()
|
|||
}
|
||||
return m_lastSQStatus;
|
||||
}
|
||||
|
||||
/**
 * Retrieve the duplicates of a result document by asking the database
 * attached to this sequence's query.
 * @param doc the document to find duplicates for.
 * @param dups output: filled by the database's docDups().
 * @return false if the query has no database, else the value returned by
 *     the database's docDups().
 */
bool DocSequenceDb::docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
{
    // No database behind the query: nothing to look up
    if (!m_q->whatDb())
        return false;
    return m_q->whatDb()->docDups(doc, dups);
}
|
||||
|
|
|
@ -39,6 +39,7 @@ class DocSequenceDb : public DocSequence {
|
|||
virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
|
||||
virtual int getFirstMatchPage(Rcl::Doc&, std::string& term);
|
||||
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
|
||||
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups);
|
||||
virtual string getDescription();
|
||||
virtual list<string> expand(Rcl::Doc &doc);
|
||||
virtual bool canFilter() {return true;}
|
||||
|
|
|
@ -255,6 +255,15 @@ void ResListPager::displayDoc(RclConfig *config, int i, Rcl::Doc& doc,
|
|||
linksbuf << " " << snipsbuf.str();
|
||||
}
|
||||
|
||||
string collapscnt;
|
||||
if (doc.getmeta(Rcl::Doc::keycc, &collapscnt) && !collapscnt.empty()) {
|
||||
ostringstream collpsbuf;
|
||||
int clc = atoi(collapscnt.c_str()) + 1;
|
||||
collpsbuf << "<a href=\"D" << docnumforlinks << "\">"
|
||||
<< trans("Dups") << "(" << clc << ")" << "</a> ";
|
||||
linksbuf << " " << collpsbuf.str();
|
||||
}
|
||||
|
||||
// Build the result list paragraph:
|
||||
|
||||
// Subheader: this is used by history
|
||||
|
|
|
@ -353,6 +353,9 @@ class Db {
|
|||
*/
|
||||
bool getDoc(const string &udi, Doc &doc);
|
||||
|
||||
/** Get duplicates (md5) of document */
|
||||
bool docDups(const Doc& idoc, std::vector<Doc>& odocs);
|
||||
|
||||
/* The following are mainly for the aspell module */
|
||||
/** Whole term list walking. */
|
||||
TermIter *termWalkOpen();
|
||||
|
|
117
src/rcldb/rcldups.cpp
Normal file
117
src/rcldb/rcldups.cpp
Normal file
|
@ -0,0 +1,117 @@
|
|||
/* Copyright (C) 2013 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
||||
/** Retrieve the dups of a given document. The input has to be a query result
|
||||
* because we use the xdocid */
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
#include <xapian.h>
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "xmacros.h"
|
||||
#include "md5.h"
|
||||
#include "searchdata.h"
|
||||
#include "rclquery.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
// File name wild card expansion. This is a specialisation ot termMatch
|
||||
bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
|
||||
{
|
||||
if (m_ndb == 0) {
|
||||
LOGERR(("Db::docDups: no db\n"));
|
||||
return false;
|
||||
}
|
||||
if (idoc.xdocid == 0) {
|
||||
LOGERR(("Db::docDups: null xdocid in input doc\n"));
|
||||
return false;
|
||||
}
|
||||
// Get the xapian doc
|
||||
Xapian::Document xdoc;
|
||||
XAPTRY(xdoc = m_ndb->xrdb.get_document(Xapian::docid(idoc.xdocid)),
|
||||
m_ndb->xrdb, m_reason);
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("Db::docDups: xapian error: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the md5
|
||||
string digest;
|
||||
XAPTRY(digest = xdoc.get_value(VALUE_MD5), m_ndb->xrdb, m_reason);
|
||||
if (!m_reason.empty()) {
|
||||
LOGERR(("Db::docDups: xapian error: %s\n", m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
if (digest.empty()) {
|
||||
LOGDEB(("Db::docDups: doc has no md5\n"));
|
||||
return false;
|
||||
}
|
||||
string md5;
|
||||
MD5HexPrint(digest, md5);
|
||||
|
||||
SearchData *sdp = new SearchData();
|
||||
RefCntr<SearchData> sd(sdp);
|
||||
SearchDataClauseSimple *sdc =
|
||||
new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
|
||||
sdc->addModifier(SearchDataClause::SDCM_CASESENS);
|
||||
sdc->addModifier(SearchDataClause::SDCM_DIACSENS);
|
||||
sd->addClause(sdc);
|
||||
Query query(this);
|
||||
query.setCollapseDuplicates(0);
|
||||
if (!query.setQuery(sd)) {
|
||||
LOGERR(("Db::docDups: setQuery failed\n"));
|
||||
return false;
|
||||
}
|
||||
int cnt = query.getResCnt();
|
||||
for (int i = 0; i < cnt; i++) {
|
||||
Doc doc;
|
||||
if (!query.getDoc(i, doc)) {
|
||||
LOGERR(("Db::docDups: getDoc failed at %d (cnt %d)\n", i, cnt));
|
||||
return false;
|
||||
}
|
||||
odocs.push_back(doc);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#if 0
|
||||
{
|
||||
vector<Doc> dups;
|
||||
bool ret;
|
||||
LOGDEB(("DOCDUPS\n"));
|
||||
ret = m_db->docDups(doc, dups);
|
||||
if (!ret) {
|
||||
LOGDEB(("docDups failed\n"));
|
||||
} else if (dups.size() == 1) {
|
||||
LOGDEB(("No dups\n"));
|
||||
} else {
|
||||
for (unsigned int i = 0; i < dups.size(); i++) {
|
||||
LOGDEB(("Dup: %s\n", dups[i].url.c_str()));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
|
@ -447,15 +447,17 @@ bool Query::getDoc(int xapi, Doc &doc)
|
|||
|
||||
doc.pc = pc;
|
||||
char buf[200];
|
||||
if (collapsecount>0) {
|
||||
sprintf(buf,"%3d%% (%d)", pc, collapsecount+1);
|
||||
if (collapsecount > 0) {
|
||||
sprintf(buf,"%3d%% (%d)", pc, collapsecount + 1);
|
||||
} else {
|
||||
sprintf(buf,"%3d%%", pc);
|
||||
}
|
||||
doc.meta[Doc::keyrr] = buf;
|
||||
|
||||
if (collapsecount > 0) {
|
||||
sprintf(buf, "%d", collapsecount);
|
||||
doc.meta[Rcl::Doc::keycc] = buf;
|
||||
}
|
||||
|
||||
// Parse xapian document's data and populate doc fields
|
||||
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue