add functions and interface to display the duplicates of a result document

This commit is contained in:
Jean-Francois Dockes 2013-04-17 09:36:46 +02:00
parent c4656c1d10
commit a2f4bc9cf5
11 changed files with 217 additions and 9 deletions

View file

@ -8,8 +8,8 @@ LIBS = librecoll.a $(LIBRECOLL)
all: $(LIBS) all: $(LIBS)
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o uncomp.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rclquery.o rclterms.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o strmatcher.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o uncomp.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o daterange.o expansiondbs.o rclabstract.o rcldb.o rcldoc.o rcldups.o rclquery.o rclterms.o searchdata.o searchdataxml.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o strmatcher.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp uncomp.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp rclterms.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp strmatcher.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp 
mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp uncomp.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp daterange.dep.stamp expansiondbs.dep.stamp rclabstract.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rcldups.dep.stamp rclquery.dep.stamp rclterms.dep.stamp searchdata.dep.stamp searchdataxml.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp strmatcher.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
librecoll.a : $(DEPS) $(OBJS) librecoll.a : $(DEPS) $(OBJS)
ar ru librecoll.a $(OBJS) ar ru librecoll.a $(OBJS)
@ -109,6 +109,8 @@ rcldb.o : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs rcldoc.o : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldoc.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldoc.cpp
rcldups.o : ../rcldb/rcldups.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldups.cpp
rclquery.o : ../rcldb/rclquery.cpp $(depth)/mk/localdefs rclquery.o : ../rcldb/rclquery.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp
rclterms.o : ../rcldb/rclterms.cpp $(depth)/mk/localdefs rclterms.o : ../rcldb/rclterms.cpp $(depth)/mk/localdefs
@ -320,6 +322,9 @@ rcldb.dep.stamp : ../rcldb/rcldb.cpp $(depth)/mk/localdefs
rcldoc.dep.stamp : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs rcldoc.dep.stamp : ../rcldb/rcldoc.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldoc.cpp > rcldoc.dep $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldoc.cpp > rcldoc.dep
touch rcldoc.dep.stamp touch rcldoc.dep.stamp
rcldups.dep.stamp : ../rcldb/rcldups.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldups.cpp > rcldups.dep
touch rcldups.dep.stamp
rclquery.dep.stamp : ../rcldb/rclquery.cpp $(depth)/mk/localdefs rclquery.dep.stamp : ../rcldb/rclquery.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep
touch rclquery.dep.stamp touch rclquery.dep.stamp
@ -456,6 +461,7 @@ include expansiondbs.dep
include rclabstract.dep include rclabstract.dep
include rcldb.dep include rcldb.dep
include rcldoc.dep include rcldoc.dep
include rcldups.dep
include rclquery.dep include rclquery.dep
include rclterms.dep include rclterms.dep
include searchdata.dep include searchdata.dep

View file

@ -48,6 +48,7 @@ ${depth}/rcldb/expansiondbs.cpp \
${depth}/rcldb/rclabstract.cpp \ ${depth}/rcldb/rclabstract.cpp \
${depth}/rcldb/rcldb.cpp \ ${depth}/rcldb/rcldb.cpp \
${depth}/rcldb/rcldoc.cpp \ ${depth}/rcldb/rcldoc.cpp \
${depth}/rcldb/rcldups.cpp \
${depth}/rcldb/rclquery.cpp \ ${depth}/rcldb/rclquery.cpp \
${depth}/rcldb/rclterms.cpp \ ${depth}/rcldb/rclterms.cpp \
${depth}/rcldb/searchdata.cpp \ ${depth}/rcldb/searchdata.cpp \

View file

@ -52,7 +52,7 @@
#include "internfile.h" #include "internfile.h"
#include "indexer.h" #include "indexer.h"
#include "snippets_w.h" #include "snippets_w.h"
#include "listdialog.h"
#include "reslist.h" #include "reslist.h"
#include "moc_reslist.cpp" #include "moc_reslist.cpp"
#include "rclhelp.h" #include "rclhelp.h"
@ -815,6 +815,32 @@ void ResList::newSnippetsW(const Rcl::Doc& doc)
sp->show(); sp->show();
} }
/**
 * Show a dialog listing the documents which share the same content.
 *
 * The first (unnamed) parameter is the document the request was made
 * for; it is not used here.
 * @param dups the documents to list. One line is displayed per entry:
 *   the url alone when the ipath is empty, else the url followed
 *   by " | " and the ipath.
 */
void ResList::newDupsW(const Rcl::Doc&, const vector<Rcl::Doc>& dups)
{
    ListDialog dialog;
    dialog.setWindowTitle(tr("Duplicate documents"));
    dialog.groupBox->setTitle(tr("These Urls ( | ipath) share the same"
                                 " content:"));
    // We replace the list with an editor so that the user can copy/paste
    delete dialog.listWidget;
    QTextEdit *editor = new QTextEdit(dialog.groupBox);
    // Plain C++ literal instead of the TRUE compatibility macro, which
    // newer Qt versions do not define.
    editor->setReadOnly(true);
    dialog.horizontalLayout->addWidget(editor);

    for (vector<Rcl::Doc>::const_iterator it = dups.begin();
         it != dups.end(); ++it) {
        // The url is converted from the local 8 bit encoding, the ipath
        // from UTF-8.
        if (it->ipath.empty())
            editor->append(QString::fromLocal8Bit(it->url.c_str()));
        else
            editor->append(QString::fromLocal8Bit(it->url.c_str()) + " | " +
                           QString::fromUtf8(it->ipath.c_str()));
    }

    // Bring the view back to the top of the list before showing it
    editor->moveCursor(QTextCursor::Start);
    editor->ensureCursorVisible();
    dialog.exec();
}
void ResList::linkWasClicked(const QUrl &url) void ResList::linkWasClicked(const QUrl &url)
{ {
string ascurl = (const char *)url.toString().toAscii();; string ascurl = (const char *)url.toString().toAscii();;
@ -822,6 +848,7 @@ void ResList::linkWasClicked(const QUrl &url)
int what = ascurl[0]; int what = ascurl[0];
switch (what) { switch (what) {
// Open abstract/snippets window // Open abstract/snippets window
case 'A': case 'A':
{ {
@ -837,10 +864,30 @@ void ResList::linkWasClicked(const QUrl &url)
} }
break; break;
// Show duplicates
case 'D':
{
if (m_source.isNull())
return;
int i = atoi(ascurl.c_str()+1) - 1;
Rcl::Doc doc;
if (!getDoc(i, doc)) {
LOGERR(("ResList::linkWasClicked: can't get doc for %d\n", i));
return;
}
vector<Rcl::Doc> dups;
if (m_source->docDups(doc, dups)) {
newDupsW(doc, dups);
}
}
break;
// Show query details // Show query details
case 'H': case 'H':
{
emit headerClicked(); emit headerClicked();
break; break;
}
// Preview and edit // Preview and edit
case 'P': case 'P':

View file

@ -146,6 +146,7 @@ class ResList : public RESLIST_PARENTCLASS
bool scrollIsAtBottom(); bool scrollIsAtBottom();
void setupArrows(); void setupArrows();
void newSnippetsW(const Rcl::Doc &doc); void newSnippetsW(const Rcl::Doc &doc);
/** Open a dialog listing the urls (and ipaths, when set) of the
 *  documents in dups. */
void newDupsW(const Rcl::Doc& doc, const std::vector<Rcl::Doc>& dups);
}; };

View file

@ -108,6 +108,11 @@ class DocSequence {
{ {
return -1; return -1;
} }
/** Get the duplicates of a document. This default implementation does
 *  not touch the output vector and always returns false. */
virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) { return false; }
virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0; virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0;
@ -185,6 +190,13 @@ public:
return false; return false;
return m_seq->getAbstract(doc, abs); return m_seq->getAbstract(doc, abs);
} }
/** Get the duplicates of a document by forwarding the request to the
 *  underlying sequence. Returns false if there is no underlying
 *  sequence (m_seq is null). */
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
{
    return !m_seq.isNull() && m_seq->docDups(doc, dups);
}
virtual bool snippetsCapable() virtual bool snippetsCapable()
{ {

View file

@ -239,3 +239,12 @@ bool DocSequenceDb::setQuery()
} }
return m_lastSQStatus; return m_lastSQStatus;
} }
// Get the duplicates of doc by asking the object returned by the query's
// whatDb(). Returns false when whatDb() yields nothing to ask.
bool DocSequenceDb::docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
{
    if (!m_q->whatDb())
        return false;
    return m_q->whatDb()->docDups(doc, dups);
}

View file

@ -39,6 +39,7 @@ class DocSequenceDb : public DocSequence {
virtual bool getAbstract(Rcl::Doc &doc, vector<string>&); virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
virtual int getFirstMatchPage(Rcl::Doc&, std::string& term); virtual int getFirstMatchPage(Rcl::Doc&, std::string& term);
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc); virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
/** Get the duplicates of doc into dups. Returns false on failure. */
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups);
virtual string getDescription(); virtual string getDescription();
virtual list<string> expand(Rcl::Doc &doc); virtual list<string> expand(Rcl::Doc &doc);
virtual bool canFilter() {return true;} virtual bool canFilter() {return true;}

View file

@ -255,6 +255,15 @@ void ResListPager::displayDoc(RclConfig *config, int i, Rcl::Doc& doc,
linksbuf << "&nbsp;&nbsp;" << snipsbuf.str(); linksbuf << "&nbsp;&nbsp;" << snipsbuf.str();
} }
string collapscnt;
if (doc.getmeta(Rcl::Doc::keycc, &collapscnt) && !collapscnt.empty()) {
ostringstream collpsbuf;
int clc = atoi(collapscnt.c_str()) + 1;
collpsbuf << "<a href=\"D" << docnumforlinks << "\">"
<< trans("Dups") << "(" << clc << ")" << "</a>&nbsp;&nbsp;";
linksbuf << "&nbsp;&nbsp;" << collpsbuf.str();
}
// Build the result list paragraph: // Build the result list paragraph:
// Subheader: this is used by history // Subheader: this is used by history

View file

@ -353,6 +353,9 @@ class Db {
*/ */
bool getDoc(const string &udi, Doc &doc); bool getDoc(const string &udi, Doc &doc);
/** Get duplicates (md5) of document.
 * @param idoc the input document. It must have a non-zero xdocid.
 * @param odocs the documents found are appended to this vector.
 *   NOTE(review): the list presumably includes idoc itself - confirm.
 * @return false on error or if the input document has no md5 value.
 */
bool docDups(const Doc& idoc, std::vector<Doc>& odocs);
/* The following are mainly for the aspell module */ /* The following are mainly for the aspell module */
/** Whole term list walking. */ /** Whole term list walking. */
TermIter *termWalkOpen(); TermIter *termWalkOpen();

117
src/rcldb/rcldups.cpp Normal file
View file

@ -0,0 +1,117 @@
/* Copyright (C) 2013 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
////////////////////////////////////////////////////////////////////
/** Retrieve the dups of a given document. The input has to be a query result
* because we use the xdocid */
#include "autoconfig.h"
#include <string>
using namespace std;
#include <xapian.h>
#include "debuglog.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "xmacros.h"
#include "md5.h"
#include "searchdata.h"
#include "rclquery.h"
namespace Rcl {
// File name wild card expansion. This is a specialisation ot termMatch
bool Db::docDups(const Doc& idoc, vector<Doc>& odocs)
{
if (m_ndb == 0) {
LOGERR(("Db::docDups: no db\n"));
return false;
}
if (idoc.xdocid == 0) {
LOGERR(("Db::docDups: null xdocid in input doc\n"));
return false;
}
// Get the xapian doc
Xapian::Document xdoc;
XAPTRY(xdoc = m_ndb->xrdb.get_document(Xapian::docid(idoc.xdocid)),
m_ndb->xrdb, m_reason);
if (!m_reason.empty()) {
LOGERR(("Db::docDups: xapian error: %s\n", m_reason.c_str()));
return false;
}
// Get the md5
string digest;
XAPTRY(digest = xdoc.get_value(VALUE_MD5), m_ndb->xrdb, m_reason);
if (!m_reason.empty()) {
LOGERR(("Db::docDups: xapian error: %s\n", m_reason.c_str()));
return false;
}
if (digest.empty()) {
LOGDEB(("Db::docDups: doc has no md5\n"));
return false;
}
string md5;
MD5HexPrint(digest, md5);
SearchData *sdp = new SearchData();
RefCntr<SearchData> sd(sdp);
SearchDataClauseSimple *sdc =
new SearchDataClauseSimple(SCLT_AND, md5, "rclmd5");
sdc->addModifier(SearchDataClause::SDCM_CASESENS);
sdc->addModifier(SearchDataClause::SDCM_DIACSENS);
sd->addClause(sdc);
Query query(this);
query.setCollapseDuplicates(0);
if (!query.setQuery(sd)) {
LOGERR(("Db::docDups: setQuery failed\n"));
return false;
}
int cnt = query.getResCnt();
for (int i = 0; i < cnt; i++) {
Doc doc;
if (!query.getDoc(i, doc)) {
LOGERR(("Db::docDups: getDoc failed at %d (cnt %d)\n", i, cnt));
return false;
}
odocs.push_back(doc);
}
return true;
}
#if 0
{
vector<Doc> dups;
bool ret;
LOGDEB(("DOCDUPS\n"));
ret = m_db->docDups(doc, dups);
if (!ret) {
LOGDEB(("docDups failed\n"));
} else if (dups.size() == 1) {
LOGDEB(("No dups\n"));
} else {
for (unsigned int i = 0; i < dups.size(); i++) {
LOGDEB(("Dup: %s\n", dups[i].url.c_str()));
}
}
}
#endif
}

View file

@ -447,15 +447,17 @@ bool Query::getDoc(int xapi, Doc &doc)
doc.pc = pc; doc.pc = pc;
char buf[200]; char buf[200];
if (collapsecount>0) { if (collapsecount > 0) {
sprintf(buf,"%3d%% (%d)", pc, collapsecount+1); sprintf(buf,"%3d%% (%d)", pc, collapsecount + 1);
} else { } else {
sprintf(buf,"%3d%%", pc); sprintf(buf,"%3d%%", pc);
} }
doc.meta[Doc::keyrr] = buf; doc.meta[Doc::keyrr] = buf;
if (collapsecount > 0) {
sprintf(buf, "%d", collapsecount); sprintf(buf, "%d", collapsecount);
doc.meta[Rcl::Doc::keycc] = buf; doc.meta[Rcl::Doc::keycc] = buf;
}
// Parse xapian document's data and populate doc fields // Parse xapian document's data and populate doc fields
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc); return m_db->m_ndb->dbDataToRclDoc(docid, data, doc);