Add a new environment variable "RECOLL_ACTIVE_EXTRA_DBS", which helps
choose the active external indexes list.
This commit is contained in:
commit
5bd071c5a6
788 changed files with 332998 additions and 0 deletions
54
src/query/Makefile
Normal file
54
src/query/Makefile
Normal file
|
@ -0,0 +1,54 @@
|
|||
# Build rules for the query-side command line tools (xadump, recollq) and
# for the stand-alone test drivers of this directory.
depth = ..
include $(depth)/mk/sysconf

# Programs built by "all". The names after the '#' are not built by default.
PROGS = xadump recollq #trhist qtry qxtry
# Test drivers which have explicit rules below but are not part of PROGS.
# They are listed here so that "clean" removes them too.
TESTPROGS = trhist trwasastrtoq

# None of these targets names a real file.
.PHONY: all force depend clean

all: $(PROGS)

SRCS = xadump.cpp rclqlang.cpp
.cpp.o :
	$(CXX) -c $(ALL_CXXFLAGS) -o $@ $<

XADUMP_OBJS= xadump.o $(BIGLIB)
xadump : $(XADUMP_OBJS)
	$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
	       $(LIBICONV) $(LIBXAPIAN) $(LIBSYS)

RECOLLQ_OBJS= recollq.o $(BIGLIB)
recollq : $(RECOLLQ_OBJS)
	$(CXX) $(ALL_CXXFLAGS) -o recollq $(RECOLLQ_OBJS) \
	       $(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
# recollq.cpp only provides a main() when TEST_RECOLLQ is defined
recollq.o : recollq.cpp
	$(CXX) $(ALL_CXXFLAGS) -DTEST_RECOLLQ -c recollq.cpp

HISTORY_OBJS= trhist.o $(BIGLIB)
trhist : $(HISTORY_OBJS)
	$(CXX) $(ALL_CXXFLAGS) -o trhist $(HISTORY_OBJS) \
	       $(LIBICONV) $(LIBXAPIAN)
trhist.o : history.cpp history.h
	$(CXX) $(ALL_CXXFLAGS) -DTEST_HISTORY -c -o trhist.o history.cpp

WASASTRINGTOQUERY_OBJS= trwasastrtoq.o $(BIGLIB)
trwasastrtoq : $(WASASTRINGTOQUERY_OBJS)
	$(CXX) $(ALL_CXXFLAGS) -o trwasastrtoq $(WASASTRINGTOQUERY_OBJS) \
	       $(LIBICONV) $(LIBXAPIAN)
trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h
	$(CXX) $(ALL_CXXFLAGS) -DTEST_WASASTRINGTOQUERY -c \
	       -o trwasastrtoq.o wasastringtoquery.cpp

# The library is always (re)checked by running make in its own directory
$(BIGLIB): force
	cd $(depth)/lib;$(MAKE)
force:

depend: alldeps.stamp
alldeps.stamp : $(SRCS)
	$(CXX) -M $(ALL_CXXFLAGS) $(SRCS) > alldeps
	touch alldeps.stamp

clean:
	cp /dev/null alldeps
	rm -f alldeps.stamp
	rm -f *.o $(PROGS) $(TESTPROGS)

include alldeps
|
109
src/query/docseq.cpp
Normal file
109
src/query/docseq.cpp
Normal file
|
@ -0,0 +1,109 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "docseq.h"
|
||||
#include "filtseq.h"
|
||||
#include "sortseq.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
string DocSequence::o_sort_trans;
|
||||
string DocSequence::o_filt_trans;
|
||||
|
||||
int DocSequence::getSeqSlice(int offs, int cnt, vector<ResListEntry>& result)
|
||||
{
|
||||
int ret = 0;
|
||||
for (int num = offs; num < offs + cnt; num++, ret++) {
|
||||
result.push_back(ResListEntry());
|
||||
if (!getDoc(num, result.back().doc, &result.back().subHeader)) {
|
||||
result.pop_back();
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Remove stacked modifying sources (sort, filter) until we get to a real one
|
||||
void DocSource::stripStack()
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return;
|
||||
while (m_seq->getSourceSeq().isNotNull()) {
|
||||
m_seq = m_seq->getSourceSeq();
|
||||
}
|
||||
}
|
||||
|
||||
bool DocSource::buildStack()
|
||||
{
|
||||
LOGDEB2(("DocSource::buildStack()\n"));
|
||||
stripStack();
|
||||
|
||||
if (m_seq.isNull())
|
||||
return false;
|
||||
|
||||
// Filtering must be done before sorting, (which may
|
||||
// truncates the original list)
|
||||
if (m_seq->canFilter()) {
|
||||
if (!m_seq->setFiltSpec(m_fspec)) {
|
||||
LOGERR(("DocSource::buildStack: setfiltspec failed\n"));
|
||||
}
|
||||
} else {
|
||||
if (m_fspec.isNotNull()) {
|
||||
m_seq =
|
||||
RefCntr<DocSequence>(new DocSeqFiltered(m_config, m_seq, m_fspec));
|
||||
}
|
||||
}
|
||||
|
||||
if (m_seq->canSort()) {
|
||||
if (!m_seq->setSortSpec(m_sspec)) {
|
||||
LOGERR(("DocSource::buildStack: setsortspec failed\n"));
|
||||
}
|
||||
} else {
|
||||
if (m_sspec.isNotNull()) {
|
||||
m_seq = RefCntr<DocSequence>(new DocSeqSorted(m_seq, m_sspec));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Title of the underlying sequence, followed by a parenthesized qualifier
// built from the translated "sorted" / "filtered" words when a sort and/or
// filter spec is set. Empty string if there is no sequence.
string DocSource::title()
{
    if (m_seq.isNull())
        return string();

    const bool filtered = m_fspec.isNotNull();
    const bool sorted = m_sspec.isNotNull();

    string qual;
    if (filtered && sorted) {
        qual = string(" (") + o_sort_trans + string(",") + o_filt_trans +
            string(")");
    } else if (sorted) {
        qual = string(" (") + o_sort_trans + string(")");
    } else if (filtered) {
        qual = string(" (") + o_filt_trans + string(")");
    }
    return m_seq->title() + qual;
}
|
||||
|
||||
bool DocSource::setFiltSpec(const DocSeqFiltSpec &f)
|
||||
{
|
||||
LOGDEB2(("DocSource::setFiltSpec\n"));
|
||||
m_fspec = f;
|
||||
buildStack();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DocSource::setSortSpec(const DocSeqSortSpec &s)
|
||||
{
|
||||
LOGDEB2(("DocSource::setSortSpec\n"));
|
||||
m_sspec = s;
|
||||
buildStack();
|
||||
return true;
|
||||
}
|
227
src/query/docseq.h
Normal file
227
src/query/docseq.h
Normal file
|
@ -0,0 +1,227 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _DOCSEQ_H_INCLUDED_
|
||||
#define _DOCSEQ_H_INCLUDED_
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::list;
|
||||
using std::vector;
|
||||
#endif
|
||||
|
||||
#include "rcldoc.h"
|
||||
#include "refcntr.h"
|
||||
|
||||
// A result list entry.
|
||||
struct ResListEntry {
|
||||
Rcl::Doc doc;
|
||||
string subHeader;
|
||||
};
|
||||
|
||||
/** Sort specification: name of the field to sort on, and direction.
 *  The spec is considered null (no sorting) when the field name is empty. */
class DocSeqSortSpec {
public:
    DocSeqSortSpec()
        : desc(false)
    {
    }

    /** True if a sort field is set */
    bool isNotNull() const
    {
        return !field.empty();
    }

    /** Back to null spec. Only the field name is cleared, desc is kept. */
    void reset()
    {
        field.erase();
    }

    /** Name of the field to sort on */
    string field;
    /** True for descending order */
    bool desc;
};
|
||||
|
||||
/** Filtering spec. This is only used to filter by doc category for now,
 *  hence the rather specialized interface: a list of (criterion, value)
 *  pairs kept in two parallel vectors. */
class DocSeqFiltSpec {
public:
    DocSeqFiltSpec()
    {
    }

    enum Crit {DSFS_MIMETYPE, DSFS_QLANG, DSFS_PASSALL};

    /** Add a criterion and its value at the end of the lists */
    void orCrit(Crit crit, const string& value)
    {
        crits.push_back(crit);
        values.push_back(value);
    }

    /** Criteria, values[i] is the value for crits[i] */
    std::vector<Crit> crits;
    std::vector<string> values;

    /** Remove all criteria */
    void reset()
    {
        crits.clear();
        values.clear();
    }

    /** True if at least one criterion is set */
    bool isNotNull() const
    {
        return !crits.empty();
    }
};
|
||||
|
||||
/** Interface for a list of documents coming from some source.
|
||||
|
||||
The result list display data may come from different sources (ie:
|
||||
history or Db query), and be post-processed (DocSeqSorted).
|
||||
Additional functionality like filtering/sorting can either be
|
||||
obtained by stacking DocSequence objects (ie: sorting history), or
|
||||
by native capability (ex: docseqdb can sort and filter). The
|
||||
implementation might be nicer by using more sophisticated c++ with
|
||||
multiple inheritance of sort and filter virtual interfaces, but
|
||||
the current one will have to do for now.
|
||||
*/
|
||||
class DocSequence {
|
||||
public:
|
||||
DocSequence(const string &t) : m_title(t) {}
|
||||
virtual ~DocSequence() {}
|
||||
|
||||
/** Get document at given rank.
|
||||
*
|
||||
* @param num document rank in sequence
|
||||
* @param doc return data
|
||||
* @param sh subheader to display before this result (ie: date change
|
||||
* inside history)
|
||||
* @return true if ok, false for error or end of data
|
||||
*/
|
||||
virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0) = 0;
|
||||
|
||||
/** Get next page of documents. This accumulates entries into the result
|
||||
* list parameter (doesn't reset it). */
|
||||
virtual int getSeqSlice(int offs, int cnt, vector<ResListEntry>& result);
|
||||
|
||||
/** Get abstract for document. This is special because it may take time.
|
||||
* The default is to return the input doc's abstract fields, but some
|
||||
* sequences can compute a better value (ie: docseqdb) */
|
||||
virtual bool getAbstract(Rcl::Doc& doc, vector<string>& abs) {
|
||||
abs.push_back(doc.meta[Rcl::Doc::keyabs]);
|
||||
return true;
|
||||
}
|
||||
virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0;
|
||||
|
||||
/** Get estimated total count in results */
|
||||
virtual int getResCnt() = 0;
|
||||
|
||||
/** Get title for result list */
|
||||
virtual string title() {return m_title;}
|
||||
|
||||
/** Get description for underlying query */
|
||||
virtual string getDescription() = 0;
|
||||
|
||||
/** Get search terms (for highlighting abstracts). Some sequences
|
||||
* may have no associated search terms. Implement this for them. */
|
||||
virtual bool getTerms(vector<string>& terms,
|
||||
vector<vector<string> >& groups,
|
||||
vector<int>& gslks)
|
||||
{
|
||||
terms.clear(); groups.clear(); gslks.clear(); return true;
|
||||
}
|
||||
/** Get user-input terms (before stemming etc.) */
|
||||
virtual void getUTerms(vector<string>& terms)
|
||||
{
|
||||
terms.clear();
|
||||
}
|
||||
virtual list<string> expand(Rcl::Doc &) {return list<string>();}
|
||||
|
||||
/** Optional functionality. */
|
||||
virtual bool canFilter() {return false;}
|
||||
virtual bool canSort() {return false;}
|
||||
virtual bool setFiltSpec(const DocSeqFiltSpec &) {return false;}
|
||||
virtual bool setSortSpec(const DocSeqSortSpec &) {return false;}
|
||||
virtual RefCntr<DocSequence> getSourceSeq() {return RefCntr<DocSequence>();}
|
||||
|
||||
static void set_translations(const string& sort, const string& filt)
|
||||
{
|
||||
o_sort_trans = sort;
|
||||
o_filt_trans = filt;
|
||||
}
|
||||
protected:
|
||||
static string o_sort_trans;
|
||||
static string o_filt_trans;
|
||||
private:
|
||||
string m_title;
|
||||
};
|
||||
|
||||
/** A modifier has a child sequence which does the real work and does
|
||||
* something with the results. Some operations are just delegated
|
||||
*/
|
||||
class DocSeqModifier : public DocSequence {
|
||||
public:
|
||||
DocSeqModifier(RefCntr<DocSequence> iseq)
|
||||
: DocSequence(""), m_seq(iseq)
|
||||
{}
|
||||
virtual ~DocSeqModifier() {}
|
||||
|
||||
virtual bool getAbstract(Rcl::Doc& doc, vector<string>& abs)
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return false;
|
||||
return m_seq->getAbstract(doc, abs);
|
||||
}
|
||||
virtual string getDescription()
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return "";
|
||||
return m_seq->getDescription();
|
||||
}
|
||||
virtual bool getTerms(vector<string>& terms,
|
||||
vector<vector<string> >& groups,
|
||||
vector<int>& gslks)
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return false;
|
||||
return m_seq->getTerms(terms, groups, gslks);
|
||||
}
|
||||
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return false;
|
||||
return m_seq->getEnclosing(doc, pdoc);
|
||||
}
|
||||
virtual void getUTerms(vector<string>& terms)
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return;
|
||||
m_seq->getUTerms(terms);
|
||||
}
|
||||
virtual string title() {return m_seq->title();}
|
||||
virtual RefCntr<DocSequence> getSourceSeq() {return m_seq;}
|
||||
|
||||
protected:
|
||||
RefCntr<DocSequence> m_seq;
|
||||
};
|
||||
|
||||
class RclConfig;
|
||||
// A DocSource can juggle docseqs of different kinds to implement
|
||||
// sorting and filtering in ways depending on the base seqs capabilities
|
||||
class DocSource : public DocSeqModifier {
|
||||
public:
|
||||
DocSource(RclConfig *config, RefCntr<DocSequence> iseq)
|
||||
: DocSeqModifier(iseq), m_config(config)
|
||||
{}
|
||||
virtual bool canFilter() {return true;}
|
||||
virtual bool canSort() {return true;}
|
||||
virtual bool setFiltSpec(const DocSeqFiltSpec &);
|
||||
virtual bool setSortSpec(const DocSeqSortSpec &);
|
||||
virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0)
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return false;
|
||||
return m_seq->getDoc(num, doc, sh);
|
||||
}
|
||||
virtual int getResCnt()
|
||||
{
|
||||
if (m_seq.isNull())
|
||||
return 0;
|
||||
return m_seq->getResCnt();
|
||||
}
|
||||
virtual string title();
|
||||
private:
|
||||
bool buildStack();
|
||||
void stripStack();
|
||||
RclConfig *m_config;
|
||||
DocSeqFiltSpec m_fspec;
|
||||
DocSeqSortSpec m_sspec;
|
||||
};
|
||||
|
||||
#endif /* _DOCSEQ_H_INCLUDED_ */
|
183
src/query/docseqdb.cpp
Normal file
183
src/query/docseqdb.cpp
Normal file
|
@ -0,0 +1,183 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "docseqdb.h"
|
||||
#include "rcldb.h"
|
||||
#include "debuglog.h"
|
||||
#include "internfile.h"
|
||||
#include "wasatorcl.h"
|
||||
|
||||
// Initially the filtered search data is the same object as the base one,
// the result count is unknown (-1), and no query needs to be (re)set.
DocSequenceDb::DocSequenceDb(RefCntr<Rcl::Query> q, const string &t,
                             RefCntr<Rcl::SearchData> sdata)
    : DocSequence(t),
      m_q(q),
      m_sdata(sdata),
      m_fsdata(sdata),
      m_rescnt(-1),
      m_queryBuildAbstract(true),
      m_queryReplaceAbstract(false),
      m_isFiltered(false),
      m_isSorted(false),
      m_needSetQuery(false)
{
}
|
||||
|
||||
// Nothing to release explicitly: the members take care of themselves.
DocSequenceDb::~DocSequenceDb() {}
|
||||
|
||||
// Search terms, as reported by the filtered search data
bool DocSequenceDb::getTerms(vector<string>& terms,
                             vector<vector<string> >& groups,
                             vector<int>& gslks)
{
    const bool ok = m_fsdata->getTerms(terms, groups, gslks);
    return ok;
}
|
||||
|
||||
// User-input terms, as reported by the base (unfiltered) search data
void DocSequenceDb::getUTerms(vector<string>& terms)
{
    Rcl::SearchData& base = *m_sdata;
    base.getUTerms(terms);
}
|
||||
|
||||
// Query description, as reported by the filtered search data
string DocSequenceDb::getDescription()
{
    Rcl::SearchData& filtered = *m_fsdata;
    return filtered.getDescription();
}
|
||||
|
||||
bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, string *sh)
|
||||
{
|
||||
setQuery();
|
||||
if (sh) sh->erase();
|
||||
return m_q->getDoc(num, doc);
|
||||
}
|
||||
|
||||
int DocSequenceDb::getResCnt()
|
||||
{
|
||||
setQuery();
|
||||
if (m_rescnt < 0) {
|
||||
m_rescnt= m_q->getResCnt();
|
||||
}
|
||||
return m_rescnt;
|
||||
}
|
||||
|
||||
// Get an abstract for the document. If the query has a db, and the abstract
// parameters and the doc's syntabs flag allow it, the db is asked to build
// one. If nothing was produced, the doc's stored abstract field is used.
// Always returns true.
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
{
    setQuery();
    if (m_q->whatDb()) {
        const bool wantBuilt = m_queryBuildAbstract &&
            (doc.syntabs || m_queryReplaceAbstract);
        if (wantBuilt) {
            m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), vabs);
        }
    }
    if (vabs.empty()) {
        vabs.push_back(doc.meta[Rcl::Doc::keyabs]);
    }
    return true;
}
|
||||
|
||||
// Get the document enclosing the input one: FileInterner computes the
// parent url, ipath and udi, and the parent data is then fetched from the
// db by udi.
// @param doc the document whose parent is wanted
// @param pdoc output: the parent document
// @return false if the parent could not be determined or fetched, or if
//   the query has no db.
bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
{
    setQuery();
    string udi;
    if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath,
                                    udi))
        return false;
    // whatDb() can be null (getAbstract() tests for this too): fail
    // instead of dereferencing a null pointer.
    if (!m_q->whatDb())
        return false;
    return m_q->whatDb()->getDoc(udi, pdoc);
}
|
||||
|
||||
// Delegated to the query, after (re)applying the search data if needed
list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
    setQuery();
    Rcl::Query& query = *m_q;
    return query.expand(doc);
}
|
||||
|
||||
// Base title, followed by a parenthesized qualifier built from the
// translated "sorted" / "filtered" words when the sequence is currently
// sorted and/or filtered.
string DocSequenceDb::title()
{
    string qual;
    if (m_isFiltered && m_isSorted) {
        qual = string(" (") + o_sort_trans + string(",") + o_filt_trans +
            string(")");
    } else if (m_isSorted) {
        qual = string(" (") + o_sort_trans + string(")");
    } else if (m_isFiltered) {
        qual = string(" (") + o_filt_trans + string(")");
    }
    return DocSequence::title() + qual;
}
|
||||
|
||||
bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
|
||||
{
|
||||
LOGDEB(("DocSequenceDb::setFiltSpec\n"));
|
||||
if (fs.isNotNull()) {
|
||||
// We build a search spec by adding a filtering layer to the base one.
|
||||
m_fsdata = RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
|
||||
Rcl::SearchDataClauseSub *cl =
|
||||
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata);
|
||||
m_fsdata->addClause(cl);
|
||||
|
||||
for (unsigned int i = 0; i < fs.crits.size(); i++) {
|
||||
switch (fs.crits[i]) {
|
||||
case DocSeqFiltSpec::DSFS_MIMETYPE:
|
||||
m_fsdata->addFiletype(fs.values[i]);
|
||||
break;
|
||||
case DocSeqFiltSpec::DSFS_QLANG:
|
||||
{
|
||||
if (m_q.isNull())
|
||||
break;
|
||||
|
||||
string reason;
|
||||
Rcl::SearchData *sd =
|
||||
wasaStringToRcl(m_q->whatDb()->getConf(),
|
||||
fs.values[i], reason);
|
||||
if (sd) {
|
||||
Rcl::SearchDataClauseSub *cl1 =
|
||||
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB,
|
||||
RefCntr<Rcl::SearchData>(sd));
|
||||
m_fsdata->addClause(cl1);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
m_isFiltered = true;
|
||||
} else {
|
||||
m_fsdata = m_sdata;
|
||||
m_isFiltered = false;
|
||||
}
|
||||
m_needSetQuery = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DocSequenceDb::setSortSpec(const DocSeqSortSpec &spec)
|
||||
{
|
||||
LOGDEB(("DocSequenceDb::setSortSpec: fld [%s] %s\n",
|
||||
spec.field.c_str(), spec.desc ? "desc" : "asc"));
|
||||
if (spec.isNotNull()) {
|
||||
m_q->setSortBy(spec.field, !spec.desc);
|
||||
m_isSorted = true;
|
||||
} else {
|
||||
m_q->setSortBy(string(), true);
|
||||
m_isSorted = false;
|
||||
}
|
||||
m_needSetQuery = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DocSequenceDb::setQuery()
|
||||
{
|
||||
if (!m_needSetQuery)
|
||||
return true;
|
||||
m_rescnt = -1;
|
||||
m_needSetQuery = !m_q->setQuery(m_fsdata);
|
||||
return !m_needSetQuery;
|
||||
}
|
65
src/query/docseqdb.h
Normal file
65
src/query/docseqdb.h
Normal file
|
@ -0,0 +1,65 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _DOCSEQDB_H_INCLUDED_
|
||||
#define _DOCSEQDB_H_INCLUDED_
|
||||
#include "docseq.h"
|
||||
#include "refcntr.h"
|
||||
|
||||
#include "searchdata.h"
|
||||
#include "rclquery.h"
|
||||
|
||||
/** A DocSequence from a Db query */
|
||||
class DocSequenceDb : public DocSequence {
|
||||
public:
|
||||
DocSequenceDb(RefCntr<Rcl::Query> q, const string &t,
|
||||
RefCntr<Rcl::SearchData> sdata);
|
||||
virtual ~DocSequenceDb();
|
||||
virtual bool getDoc(int num, Rcl::Doc &doc, string * = 0);
|
||||
virtual int getResCnt();
|
||||
virtual bool getTerms(vector<string>& terms,
|
||||
vector<vector<string> >& groups,
|
||||
vector<int>& gslks);
|
||||
virtual void getUTerms(vector<string>& terms);
|
||||
virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
|
||||
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
|
||||
virtual string getDescription();
|
||||
virtual list<string> expand(Rcl::Doc &doc);
|
||||
virtual bool canFilter() {return true;}
|
||||
virtual bool setFiltSpec(const DocSeqFiltSpec &filtspec);
|
||||
virtual bool canSort() {return true;}
|
||||
virtual bool setSortSpec(const DocSeqSortSpec &sortspec);
|
||||
virtual void setAbstractParams(bool qba, bool qra)
|
||||
{
|
||||
m_queryBuildAbstract = qba;
|
||||
m_queryReplaceAbstract = qra;
|
||||
}
|
||||
virtual string title();
|
||||
|
||||
private:
|
||||
RefCntr<Rcl::Query> m_q;
|
||||
RefCntr<Rcl::SearchData> m_sdata;
|
||||
RefCntr<Rcl::SearchData> m_fsdata; // Filtered
|
||||
int m_rescnt;
|
||||
bool m_queryBuildAbstract;
|
||||
bool m_queryReplaceAbstract;
|
||||
bool m_isFiltered;
|
||||
bool m_isSorted;
|
||||
bool m_needSetQuery; // search data changed, need to reapply before fetch
|
||||
bool setQuery();
|
||||
};
|
||||
|
||||
#endif /* _DOCSEQDB_H_INCLUDED_ */
|
159
src/query/docseqhist.cpp
Normal file
159
src/query/docseqhist.cpp
Normal file
|
@ -0,0 +1,159 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <cmath>
|
||||
|
||||
#include "docseqhist.h"
|
||||
#include "rcldb.h"
|
||||
#include "fileudi.h"
|
||||
#include "internfile.h"
|
||||
#include "base64.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
|
||||
// Encode document history entry:
|
||||
// U + Unix time + base64 of udi
|
||||
// The U distinguishes udi-based entries from older fn+ipath ones
|
||||
bool RclDHistoryEntry::encode(string& value)
|
||||
{
|
||||
char chartime[30];
|
||||
sprintf(chartime,"%ld", unixtime);
|
||||
string budi;
|
||||
base64_encode(udi, budi);
|
||||
value = string("U ") + string(chartime) + " " + budi;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Decode. We support historical entries which were like "time b64fn [b64ipath]"
|
||||
// Current entry format is "U time b64udi"
|
||||
bool RclDHistoryEntry::decode(const string &value)
|
||||
{
|
||||
list<string> vall;
|
||||
stringToStrings(value, vall);
|
||||
|
||||
list<string>::const_iterator it = vall.begin();
|
||||
udi.erase();
|
||||
string fn, ipath;
|
||||
switch (vall.size()) {
|
||||
case 2:
|
||||
// Old fn+ipath, null ipath case
|
||||
unixtime = atol((*it++).c_str());
|
||||
base64_decode(*it++, fn);
|
||||
break;
|
||||
case 3:
|
||||
if (!it->compare("U")) {
|
||||
// New udi-based entry
|
||||
it++;
|
||||
unixtime = atol((*it++).c_str());
|
||||
base64_decode(*it++, udi);
|
||||
} else {
|
||||
// Old fn + ipath. We happen to know how to build an udi
|
||||
unixtime = atol((*it++).c_str());
|
||||
base64_decode(*it++, fn);
|
||||
base64_decode(*it, ipath);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!fn.empty()) {
|
||||
// Old style entry found, make an udi, using the fs udi maker
|
||||
make_udi(fn, ipath, udi);
|
||||
}
|
||||
LOGDEB1(("RclDHistoryEntry::decode: udi [%s]\n", udi.c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool RclDHistoryEntry::equal(const DynConfEntry& other)
|
||||
{
|
||||
const RclDHistoryEntry& e = dynamic_cast<const RclDHistoryEntry&>(other);
|
||||
return e.udi == udi;
|
||||
}
|
||||
|
||||
// Record an access to the document with the given udi, at the current
// time, in the document history section of the dynamic configuration. The
// call asks insertNew() for a maximum list length of 200.
bool historyEnterDoc(RclDynConf *dncf, const string& udi)
{
    LOGDEB1(("historyEnterDoc: [%s] into %s\n",
             udi.c_str(), dncf->getFilename().c_str()));
    RclDHistoryEntry fresh(time(0), udi);
    // Work area needed by insertNew() for decoding the existing entries
    RclDHistoryEntry work;
    const bool ok = dncf->insertNew(docHistSubKey, fresh, work, 200);
    return ok;
}
|
||||
|
||||
// Return the entries of the document history section of the dynamic
// configuration, as a list.
list<RclDHistoryEntry> getDocHistory(RclDynConf* dncf)
{
    list<RclDHistoryEntry> entries =
        dncf->getList<RclDHistoryEntry>(docHistSubKey);
    return entries;
}
|
||||
|
||||
|
||||
bool DocSequenceHistory::getDoc(int num, Rcl::Doc &doc, string *sh)
|
||||
{
|
||||
// Retrieve history list
|
||||
if (!m_hist)
|
||||
return false;
|
||||
if (m_hlist.empty())
|
||||
m_hlist = getDocHistory(m_hist);
|
||||
|
||||
if (num < 0 || num >= (int)m_hlist.size())
|
||||
return false;
|
||||
int skip;
|
||||
if (m_prevnum >= 0 && num >= m_prevnum) {
|
||||
skip = num - m_prevnum;
|
||||
} else {
|
||||
skip = num;
|
||||
m_it = m_hlist.begin();
|
||||
m_prevtime = -1;
|
||||
}
|
||||
m_prevnum = num;
|
||||
while (skip--)
|
||||
m_it++;
|
||||
if (sh) {
|
||||
if (m_prevtime < 0 ||
|
||||
abs (float(m_prevtime) - float(m_it->unixtime)) > 86400) {
|
||||
m_prevtime = m_it->unixtime;
|
||||
time_t t = (time_t)(m_it->unixtime);
|
||||
*sh = string(ctime(&t));
|
||||
// Get rid of the final \n in ctime
|
||||
sh->erase(sh->length()-1);
|
||||
} else
|
||||
sh->erase();
|
||||
}
|
||||
bool ret = m_db->getDoc(m_it->udi, doc);
|
||||
if (!ret) {
|
||||
doc.url = "UNKNOWN";
|
||||
doc.ipath = "";
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Get the document enclosing the input one: FileInterner computes the
// parent url, ipath and udi, and the parent data is then fetched from the
// db by udi.
// @param doc the document whose parent is wanted
// @param pdoc output: the parent document
// @return false if there is no db object, or if the parent could not be
//   determined or fetched.
bool DocSequenceHistory::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
{
    // The db pointer comes from the caller of the constructor and is not
    // checked there: don't dereference it blindly.
    if (!m_db)
        return false;
    string udi;
    if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath,
                                    udi))
        return false;
    return m_db->getDoc(udi, pdoc);
}
|
||||
|
||||
int DocSequenceHistory::getResCnt()
|
||||
{
|
||||
if (m_hlist.empty())
|
||||
m_hlist = getDocHistory(m_hist);
|
||||
return m_hlist.size();
|
||||
}
|
67
src/query/docseqhist.h
Normal file
67
src/query/docseqhist.h
Normal file
|
@ -0,0 +1,67 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _DOCSEQHIST_H_INCLUDED_
|
||||
#define _DOCSEQHIST_H_INCLUDED_
|
||||
|
||||
#include "docseq.h"
|
||||
#include "dynconf.h"
|
||||
|
||||
namespace Rcl {
|
||||
class Db;
|
||||
}
|
||||
|
||||
/** DynConf Document history entry */
|
||||
class RclDHistoryEntry : public DynConfEntry {
|
||||
public:
|
||||
RclDHistoryEntry() : unixtime(0) {}
|
||||
RclDHistoryEntry(long t, const string& u)
|
||||
: unixtime(t), udi(u) {}
|
||||
virtual ~RclDHistoryEntry() {}
|
||||
virtual bool decode(const string &value);
|
||||
virtual bool encode(string& value);
|
||||
virtual bool equal(const DynConfEntry& other);
|
||||
long unixtime;
|
||||
string udi;
|
||||
};
|
||||
|
||||
/** A DocSequence coming from the history file.
|
||||
* History is kept as a list of urls. This queries the db to fetch
|
||||
* metadata for an url key */
|
||||
class DocSequenceHistory : public DocSequence {
|
||||
public:
|
||||
DocSequenceHistory(Rcl::Db *d, RclDynConf *h, const string &t)
|
||||
: DocSequence(t), m_db(d), m_hist(h), m_prevnum(-1), m_prevtime(-1) {}
|
||||
virtual ~DocSequenceHistory() {}
|
||||
|
||||
virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0);
|
||||
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
|
||||
virtual int getResCnt();
|
||||
virtual string getDescription() {return m_description;}
|
||||
void setDescription(const string& desc) {m_description = desc;}
|
||||
private:
|
||||
Rcl::Db *m_db;
|
||||
RclDynConf *m_hist;
|
||||
int m_prevnum;
|
||||
long m_prevtime;
|
||||
string m_description; // This is just an nls translated 'doc history'
|
||||
list<RclDHistoryEntry> m_hlist;
|
||||
list<RclDHistoryEntry>::const_iterator m_it;
|
||||
};
|
||||
|
||||
extern bool historyEnterDoc(RclDynConf *dncf, const string& udi);
|
||||
|
||||
#endif /* _DOCSEQHIST_H_INCLUDED_ */
|
234
src/query/dynconf.cpp
Normal file
234
src/query/dynconf.cpp
Normal file
|
@ -0,0 +1,234 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef TEST_HISTORY
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "dynconf.h"
|
||||
#include "base64.h"
|
||||
#include "smallut.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
// Well known keys for history and external indexes.
|
||||
const string docHistSubKey = "docs";
|
||||
const string allEdbsSk = "allExtDbs";
|
||||
const string actEdbsSk = "actExtDbs";
|
||||
|
||||
|
||||
// @param sk section this is for
|
||||
// @param n new entry
|
||||
// @param s a scratch entry used for decoding and comparisons.
|
||||
// This avoids templating this routine for the actual entry type.
|
||||
bool RclDynConf::insertNew(const string &sk, DynConfEntry &n, DynConfEntry &s,
|
||||
int maxlen)
|
||||
{
|
||||
// Is this doc already in list ? If it is we remove the old entry
|
||||
list<string> names = m_data.getNames(sk);
|
||||
list<string>::const_iterator it;
|
||||
bool changed = false;
|
||||
for (it = names.begin(); it != names.end(); it++) {
|
||||
string oval;
|
||||
if (!m_data.get(*it, oval, sk)) {
|
||||
LOGDEB(("No data for %s\n", (*it).c_str()));
|
||||
continue;
|
||||
}
|
||||
s.decode(oval);
|
||||
|
||||
if (s.equal(n)) {
|
||||
LOGDEB(("Erasing old entry\n"));
|
||||
m_data.erase(*it, sk);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Maybe reget list
|
||||
if (changed)
|
||||
names = m_data.getNames(sk);
|
||||
|
||||
// Need to prune ?
|
||||
if (maxlen > 0 && names.size() >= (unsigned int)maxlen) {
|
||||
// Need to erase entries until we're back to size. Note that
|
||||
// we don't ever reset numbers. Problems will arise when
|
||||
// history is 4 billion entries old
|
||||
it = names.begin();
|
||||
for (unsigned int i = 0; i < names.size() - maxlen + 1; i++, it++) {
|
||||
m_data.erase(*it, sk);
|
||||
}
|
||||
}
|
||||
|
||||
// Increment highest number
|
||||
unsigned int hi = names.empty() ? 0 :
|
||||
(unsigned int)atoi(names.back().c_str());
|
||||
hi++;
|
||||
char nname[20];
|
||||
sprintf(nname, "%010u", hi);
|
||||
|
||||
string value;
|
||||
n.encode(value);
|
||||
LOGDEB1(("Encoded value [%s] (%d)\n", value.c_str(), value.size()));
|
||||
if (!m_data.set(string(nname), value, sk)) {
|
||||
LOGERR(("RclDHistory::insertNew: set failed\n"));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool RclDynConf::eraseAll(const string &sk)
|
||||
{
|
||||
list<string> names = m_data.getNames(sk);
|
||||
list<string>::const_iterator it;
|
||||
for (it = names.begin(); it != names.end(); it++) {
|
||||
m_data.erase(*it, sk);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Generic string list specialization ///////////////////////////////////
|
||||
|
||||
// Encode/decode simple string. base64 used to avoid problems with
|
||||
// strange chars
|
||||
// Produce the stored form of the entry: the base64 encoding of the string
// value is written to enc. Always returns true.
bool RclSListEntry::encode(string& enc)
{
    base64_encode(this->value, enc);
    return true;
}
|
||||
bool RclSListEntry::decode(const string &enc)
|
||||
{
|
||||
base64_decode(enc, value);
|
||||
return true;
|
||||
}
|
||||
bool RclSListEntry::equal(const DynConfEntry& other)
|
||||
{
|
||||
const RclSListEntry& e = dynamic_cast<const RclSListEntry&>(other);
|
||||
return e.value == value;
|
||||
}
|
||||
bool RclDynConf::enterString(const string sk, const string value, int maxlen)
|
||||
{
|
||||
RclSListEntry ne(value);
|
||||
RclSListEntry scratch;
|
||||
return insertNew(sk, ne, scratch, maxlen);
|
||||
}
|
||||
// Return the string values stored under section sk, in the order in which
// getList() returns the entries.
list<string> RclDynConf::getStringList(const string sk)
{
    list<RclSListEntry> entries = getList<RclSListEntry>(sk);
    list<string> values;
    list<RclSListEntry>::const_iterator cur = entries.begin();
    while (cur != entries.end()) {
        values.push_back((*cur).value);
        cur++;
    }
    return values;
}
|
||||
|
||||
#else
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
#include "history.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
// Program name as invoked, set from argv[0] in main()
static string thisprog;

// Help text printed by Usage()
static string usage =
"trhist [opts] <filename>\n"
" [-s <subkey>]: specify subkey (default: RclDynConf::docHistSubKey)\n"
" [-e] : erase all\n"
" [-a <string>] enter string (needs -s, no good for history entries)\n"
"\n"
;
|
||||
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
cerr << thisprog << ": usage:\n" << usage;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Bit set of the options seen on the command line
static int op_flags;
// One bit per option letter (-e, -s, -a)
enum {
    OPT_e = 0x2,
    OPT_s = 0x4,
    OPT_a = 0x8
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
string sk = "docs";
|
||||
string value;
|
||||
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
while (argc > 0 && **argv == '-') {
|
||||
(*argv)++;
|
||||
if (!(**argv))
|
||||
/* Cas du "adb - core" */
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'a': op_flags |= OPT_a; if (argc < 2) Usage();
|
||||
value = *(++argv); argc--;
|
||||
goto b1;
|
||||
case 's': op_flags |= OPT_s; if (argc < 2) Usage();
|
||||
sk = *(++argv); argc--;
|
||||
goto b1;
|
||||
case 'e': op_flags |= OPT_e; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
b1: argc--; argv++;
|
||||
}
|
||||
if (argc != 1)
|
||||
Usage();
|
||||
string filename = *argv++;argc--;
|
||||
|
||||
RclDynConf hist(filename, 5);
|
||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||
DebugLog::setfilename("stderr");
|
||||
|
||||
if (op_flags & OPT_e) {
|
||||
hist.eraseAll(sk);
|
||||
} else if (op_flags & OPT_a) {
|
||||
if (!(op_flags & OPT_s))
|
||||
Usage();
|
||||
hist.enterString(sk, value);
|
||||
} else {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
char docname[100];
|
||||
sprintf(docname, "A very long document document name"
|
||||
"is very long indeed and this is the end of "
|
||||
"it here and exactly here:\n%d", i);
|
||||
hist.enterDoc(string(docname), "ipathx");
|
||||
}
|
||||
|
||||
list<RclDHistoryEntry> hlist = hist.getDocHistory();
|
||||
for (list<RclDHistoryEntry>::const_iterator it = hlist.begin();
|
||||
it != hlist.end(); it++) {
|
||||
printf("[%ld] [%s] [%s]\n", it->unixtime,
|
||||
it->fn.c_str(), it->ipath.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
122
src/query/dynconf.h
Normal file
122
src/query/dynconf.h
Normal file
|
@ -0,0 +1,122 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _DYNCONF_H_INCLUDED_
|
||||
#define _DYNCONF_H_INCLUDED_
|
||||
|
||||
/**
|
||||
* Dynamic configuration storage
|
||||
*
|
||||
* This used to be called "history" because of the initial usage.
|
||||
* Used to store some parameters which would fit neither in recoll.conf,
|
||||
* basically because they change a lot, nor in the QT preferences file, mostly
|
||||
* because they are specific to a configuration directory.
|
||||
* Examples:
|
||||
* - History of documents selected for preview
|
||||
* - Active and inactive external databases (depend on the
|
||||
* configuration directory)
|
||||
* - ...
|
||||
*
|
||||
* The storage is performed in a ConfSimple file, with subkeys and
|
||||
* encodings which depend on the data stored. Under each section, the keys
|
||||
* are sequential numeric, so this basically manages a set of lists.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <utility>
|
||||
|
||||
#include "conftree.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
// Interface for the values stored in the dynamic configuration lists.
// An entry must be able to convert itself to and from its stored string
// form, and to compare itself to another entry.
class DynConfEntry {
public:
    virtual ~DynConfEntry() {}

    // Set the entry from its stored form. Returns false on failure.
    virtual bool decode(const string &value) = 0;

    // Write the stored form of the entry to value.
    virtual bool encode(string& value) = 0;

    // Return true if other holds the same data as this entry.
    virtual bool equal(const DynConfEntry &other) = 0;
};
|
||||
|
||||
|
||||
/** String storage generic object */
|
||||
class RclSListEntry : public DynConfEntry {
|
||||
public:
|
||||
RclSListEntry() {}
|
||||
RclSListEntry(const string& v) : value(v) {}
|
||||
virtual ~RclSListEntry() {}
|
||||
virtual bool decode(const string &enc);
|
||||
virtual bool encode(string& enc);
|
||||
virtual bool equal(const DynConfEntry& other);
|
||||
|
||||
string value;
|
||||
};
|
||||
|
||||
/** The dynamic configuration class */
|
||||
class RclDynConf {
|
||||
public:
|
||||
RclDynConf(const string &fn)
|
||||
: m_data(fn.c_str()) {}
|
||||
bool ok() {return m_data.getStatus() == ConfSimple::STATUS_RW;}
|
||||
string getFilename() {return m_data.getFilename();}
|
||||
|
||||
// Generic methods
|
||||
bool eraseAll(const string& sk);
|
||||
bool insertNew(const string& sk, DynConfEntry &n, DynConfEntry &s,
|
||||
int maxlen = -1);
|
||||
template<typename Tp> list<Tp> getList(const string& sk);
|
||||
|
||||
// Specialized methods for simple string lists, designated by the
|
||||
// subkey value
|
||||
bool enterString(const string sk, const string value, int maxlen = -1);
|
||||
list<string> getStringList(const string sk);
|
||||
|
||||
private:
|
||||
unsigned int m_mlen;
|
||||
ConfSimple m_data;
|
||||
|
||||
};
|
||||
|
||||
// Return the entries stored under section sk, decoded as Tp objects.
// Values which can't be fetched or decoded are skipped. Each decoded entry
// is inserted at the front of the result, so the returned list is in the
// reverse of the order in which the keys are enumerated.
template<typename Tp> list<Tp> RclDynConf::getList(const string &sk)
{
    list<Tp> result;
    // A single object is reused for decoding all the values
    Tp scratch;
    list<string> keys = m_data.getNames(sk);
    list<string>::const_iterator key = keys.begin();
    while (key != keys.end()) {
        string encoded;
        if (m_data.get(*key, encoded, sk) && scratch.decode(encoded)) {
            result.push_front(scratch);
        }
        key++;
    }
    return result;
}
|
||||
|
||||
// Defined subkeys. Values in dynconf.cpp
|
||||
// History
|
||||
extern const string docHistSubKey;
|
||||
// All external indexes
|
||||
extern const string allEdbsSk;
|
||||
// Active external indexes
|
||||
extern const string actEdbsSk;
|
||||
|
||||
#endif /* _DYNCONF_H_INCLUDED_ */
|
125
src/query/filtseq.cpp
Normal file
125
src/query/filtseq.cpp
Normal file
|
@ -0,0 +1,125 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "filtseq.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
using std::string;
|
||||
|
||||
// Decide if a document passes the filter. The criteria are or'ed: the
// document is accepted as soon as one MIME type criterion matches, or if a
// pass-all criterion is met. QLANG criteria are only logged here and never
// match.
static bool filter(const DocSeqFiltSpec& fs, const Rcl::Doc *x)
{
    LOGDEB2((" Filter: ncrits %d\n", fs.crits.size()));
    for (unsigned int idx = 0; idx < fs.crits.size(); idx++) {
        if (fs.crits[idx] == DocSeqFiltSpec::DSFS_MIMETYPE) {
            LOGDEB2((" filter: MIMETYPE: me [%s] doc [%s]\n",
                     fs.values[idx].c_str(), x->mimetype.c_str()));
            if (x->mimetype == fs.values[idx])
                return true;
        } else if (fs.crits[idx] == DocSeqFiltSpec::DSFS_QLANG) {
            LOGDEB((" filter: QLANG [%s]!!\n", fs.values[idx].c_str()));
        } else if (fs.crits[idx] == DocSeqFiltSpec::DSFS_PASSALL) {
            return true;
        }
    }
    // No criterion accepted the document
    return false;
}
|
||||
|
||||
// Build a filtered view of iseq. The configuration pointer is kept: it is
// used by setFiltSpec() to expand categories into MIME types. The initial
// filter is installed through setFiltSpec().
DocSeqFiltered::DocSeqFiltered(RclConfig *conf, RefCntr<DocSequence> iseq,
                               DocSeqFiltSpec &filtspec)
    : DocSeqModifier(iseq), m_config(conf)
{
    this->setFiltSpec(filtspec);
}
|
||||
|
||||
// Translate the input filter specification into the internal one, which
// only holds MIME type criteria (or a single pass-all criterion), then
// forget the cached backend indices. Always returns true.
// NOTE(review): nothing in this function resets m_spec before adding to
// it, so criteria from a previous call appear to accumulate - confirm
// against how callers use this method.
bool DocSeqFiltered::setFiltSpec(DocSeqFiltSpec &filtspec)
{
    LOGDEB0(("DocSeqFiltered::setFiltSpec\n"));
    for (unsigned int idx = 0; idx < filtspec.crits.size(); idx++) {
        if (filtspec.crits[idx] == DocSeqFiltSpec::DSFS_MIMETYPE) {
            // MIME type criteria are copied as is
            m_spec.orCrit(filtspec.crits[idx], filtspec.values[idx]);
        } else if (filtspec.crits[idx] == DocSeqFiltSpec::DSFS_QLANG) {
            // There are very few lang constructs that we can
            // interpret. The default config uses rclcat:value
            // only. That will be all for now...
            string val = filtspec.values[idx];
            if (val.find("rclcat:") != 0)
                continue;
            // Replace the category by the list of its MIME types
            string catg = val.substr(7);
            list<string> mimetypes;
            m_config->getMimeCatTypes(catg, mimetypes);
            list<string>::const_iterator mt = mimetypes.begin();
            while (mt != mimetypes.end()) {
                LOGDEB2(("Adding mime: [%s]\n", mt->c_str()));
                m_spec.orCrit(DocSeqFiltSpec::DSFS_MIMETYPE, *mt);
                mt++;
            }
        }
        // Other criteria types are ignored
    }
    // If m_spec ends up empty, pass everything, better than filtering all.
    if (m_spec.crits.empty()) {
        m_spec.orCrit(DocSeqFiltSpec::DSFS_PASSALL, "");
    }
    m_dbindices.clear();
    return true;
}
|
||||
|
||||
bool DocSeqFiltered::getDoc(int idx, Rcl::Doc &doc, string *)
|
||||
{
|
||||
LOGDEB2(("DocSeqFiltered::getDoc() fetching %d\n", idx));
|
||||
|
||||
if (idx >= (int)m_dbindices.size()) {
|
||||
// Have to fetch docs and filter until we get enough or
|
||||
// fail
|
||||
m_dbindices.reserve(idx+1);
|
||||
|
||||
// First backend seq doc we fetch is the one after last stored
|
||||
int backend_idx = m_dbindices.size() > 0 ? m_dbindices.back() + 1 : 0;
|
||||
|
||||
// Loop until we get enough docs
|
||||
Rcl::Doc tdoc;
|
||||
int i = 0;
|
||||
while (idx >= (int)m_dbindices.size()) {
|
||||
if (!m_seq->getDoc(backend_idx, tdoc))
|
||||
return false;
|
||||
if (filter(m_spec, &tdoc)) {
|
||||
m_dbindices.push_back(backend_idx);
|
||||
}
|
||||
backend_idx++;
|
||||
}
|
||||
doc = tdoc;
|
||||
} else {
|
||||
// The corresponding backend indice is already known
|
||||
if (!m_seq->getDoc(m_dbindices[idx], doc))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
49
src/query/filtseq.h
Normal file
49
src/query/filtseq.h
Normal file
|
@ -0,0 +1,49 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _FILTSEQ_H_INCLUDED_
|
||||
#define _FILTSEQ_H_INCLUDED_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
#include "refcntr.h"
|
||||
#include "docseq.h"
|
||||
|
||||
class RclConfig;
|
||||
|
||||
/**
|
||||
* A filtered sequence is created from another one by selecting entries
|
||||
* according to the given criteria.
|
||||
*/
|
||||
class DocSeqFiltered : public DocSeqModifier {
|
||||
public:
|
||||
DocSeqFiltered(RclConfig *conf, RefCntr<DocSequence> iseq,
|
||||
DocSeqFiltSpec &filtspec);
|
||||
virtual ~DocSeqFiltered() {}
|
||||
virtual bool canFilter() {return true;}
|
||||
virtual bool setFiltSpec(DocSeqFiltSpec &filtspec);
|
||||
virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0);
|
||||
virtual int getResCnt() {return m_seq->getResCnt();}
|
||||
private:
|
||||
RclConfig *m_config;
|
||||
DocSeqFiltSpec m_spec;
|
||||
vector<int> m_dbindices;
|
||||
};
|
||||
|
||||
#endif /* _FILTSEQ_H_INCLUDED_ */
|
523
src/query/plaintorich.cpp
Normal file
523
src/query/plaintorich.cpp
Normal file
|
@ -0,0 +1,523 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <list>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::vector;
|
||||
using std::list;
|
||||
using std::pair;
|
||||
using std::set;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "rclconfig.h"
|
||||
#include "debuglog.h"
|
||||
#include "textsplit.h"
|
||||
#include "utf8iter.h"
|
||||
#include "smallut.h"
|
||||
#include "plaintorich.h"
|
||||
#include "cancelcheck.h"
|
||||
#include "unacpp.h"
|
||||
|
||||
// Debug helper: concatenate the elements of a string vector, each one
// enclosed in square brackets and followed by a space.
static string vecStringToString(const vector<string>& t)
{
    string joined;
    for (vector<string>::size_type idx = 0; idx < t.size(); idx++) {
        joined.append("[");
        joined.append(t[idx]);
        joined.append("] ");
    }
    return joined;
}
|
||||
|
||||
// Text splitter callback used to take note of the position of query terms
|
||||
// inside the result text. This is then used to insert highlight tags.
|
||||
class TextSplitPTR : public TextSplit {
|
||||
public:
|
||||
|
||||
// Out: begin and end byte positions of query terms/groups in text
|
||||
vector<pair<int, int> > tboffs;
|
||||
|
||||
TextSplitPTR(const vector<string>& its,
|
||||
const vector<vector<string> >&groups,
|
||||
const vector<int>& slacks)
|
||||
: m_wcount(0), m_groups(groups), m_slacks(slacks)
|
||||
{
|
||||
for (vector<string>::const_iterator it = its.begin();
|
||||
it != its.end(); it++) {
|
||||
m_terms.insert(*it);
|
||||
}
|
||||
for (vector<vector<string> >::const_iterator vit = m_groups.begin();
|
||||
vit != m_groups.end(); vit++) {
|
||||
for (vector<string>::const_iterator it = (*vit).begin();
|
||||
it != (*vit).end(); it++) {
|
||||
m_gterms.insert(*it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Callback called by the text-to-words breaker for each word
|
||||
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
||||
string dumb;
|
||||
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
||||
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
|
||||
term.c_str()));
|
||||
return true;
|
||||
}
|
||||
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
||||
// pos, bts, bte));
|
||||
|
||||
// If this word is a search term, remember its byte-offset span.
|
||||
if (m_terms.find(dumb) != m_terms.end()) {
|
||||
tboffs.push_back(pair<int, int>(bts, bte));
|
||||
}
|
||||
|
||||
if (m_gterms.find(dumb) != m_gterms.end()) {
|
||||
// Term group (phrase/near) handling
|
||||
m_plists[dumb].push_back(pos);
|
||||
m_gpostobytes[pos] = pair<int,int>(bts, bte);
|
||||
//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
|
||||
}
|
||||
if ((m_wcount++ & 0xfff) == 0)
|
||||
CancelCheck::instance().checkCancel();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Must be called after the split to find the phrase/near match positions
|
||||
virtual bool matchGroups();
|
||||
|
||||
private:
|
||||
virtual bool matchGroup(const vector<string>& terms, int dist);
|
||||
|
||||
int m_wcount;
|
||||
|
||||
// In: user query terms
|
||||
set<string> m_terms;
|
||||
|
||||
// In: user query groups, for near/phrase searches.
|
||||
const vector<vector<string> >& m_groups;
|
||||
const vector<int>& m_slacks;
|
||||
set<string> m_gterms;
|
||||
|
||||
// group/near terms word positions.
|
||||
map<string, vector<int> > m_plists;
|
||||
map<int, pair<int, int> > m_gpostobytes;
|
||||
};
|
||||
|
||||
|
||||
/** Comparison object used to order vectors of integers by length */
class VecIntCmpShorter {
public:
    /** True exactly when b holds more elements than a */
    bool operator()(const vector<int> *a, const vector<int> *b) {
        return b->size() > a->size();
    }
};
|
||||
|
||||
// Extend the [STA, STO] interval so that it includes POS. The two tests
// are independent: both bounds may change for a single position.
#define SETMINMAX(POS, STA, STO) do {                   \
        if ((POS) < (STA)) (STA) = (POS);               \
        if ((POS) > (STO)) (STO) = (POS);               \
    } while (0)
|
||||
|
||||
// Recursively check that each term is inside the window (which is
|
||||
// readjusted as the successive terms are found). i is the index for
|
||||
// the next position list to use (initially 1)
|
||||
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
|
||||
unsigned int i, int min, int max,
|
||||
int *sp, int *ep)
|
||||
{
|
||||
int tmp = max + 1;
|
||||
// take care to avoid underflow
|
||||
if (window <= tmp)
|
||||
tmp -= window;
|
||||
else
|
||||
tmp = 0;
|
||||
vector<int>::iterator it = plists[i]->begin();
|
||||
|
||||
// Find 1st position bigger than window start
|
||||
while (it != plists[i]->end() && *it < tmp)
|
||||
it++;
|
||||
|
||||
// Try each position inside window in turn for match with other lists
|
||||
while (it != plists[i]->end()) {
|
||||
int pos = *it;
|
||||
if (pos > min + window - 1)
|
||||
return false;
|
||||
if (i + 1 == plists.size()) {
|
||||
SETMINMAX(pos, *sp, *ep);
|
||||
return true;
|
||||
}
|
||||
if (pos < min) {
|
||||
min = pos;
|
||||
} else if (pos > max) {
|
||||
max = pos;
|
||||
}
|
||||
if (do_proximity_test(window, plists, i + 1, min, max, sp, ep)) {
|
||||
SETMINMAX(pos, *sp, *ep);
|
||||
return true;
|
||||
}
|
||||
it++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if there is a NEAR match for the group of terms
|
||||
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
|
||||
{
|
||||
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
||||
vecStringToString(terms).c_str()));
|
||||
|
||||
// The position lists we are going to work with. We extract them from the
|
||||
// (string->plist) map
|
||||
vector<vector<int>* > plists;
|
||||
// A revert plist->term map. This is so that we can find who is who after
|
||||
// sorting the plists by length.
|
||||
map<vector<int>*, string> plistToTerm;
|
||||
// For traces
|
||||
vector<string> realgroup;
|
||||
|
||||
// Find the position list for each term in the group. Not all
|
||||
// necessarily exist (esp for NEAR where terms have been
|
||||
// stem-expanded: we don't know which matched)
|
||||
for (vector<string>::const_iterator it = terms.begin();
|
||||
it != terms.end(); it++) {
|
||||
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
||||
if (pl == m_plists.end()) {
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
|
||||
(*it).c_str()));
|
||||
continue;
|
||||
}
|
||||
plists.push_back(&(pl->second));
|
||||
plistToTerm[&(pl->second)] = *it;
|
||||
realgroup.push_back(*it);
|
||||
}
|
||||
LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n",
|
||||
window, vecStringToString(realgroup).c_str()));
|
||||
if (plists.size() < 2) {
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
|
||||
return false;
|
||||
}
|
||||
// Sort the positions lists so that the shorter is first
|
||||
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
||||
|
||||
{ // Debug
|
||||
map<vector<int>*, string>::iterator it;
|
||||
it = plistToTerm.find(plists[0]);
|
||||
if (it == plistToTerm.end()) {
|
||||
// SuperWeird
|
||||
LOGERR(("matchGroup: term for first list not found !?!\n"));
|
||||
return false;
|
||||
}
|
||||
LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
|
||||
it->second.c_str(), plists[0]->size()));
|
||||
}
|
||||
|
||||
// Walk the shortest plist and look for matches
|
||||
for (vector<int>::iterator it = plists[0]->begin();
|
||||
it != plists[0]->end(); it++) {
|
||||
int pos = *it;
|
||||
int sta = int(10E9), sto = 0;
|
||||
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
||||
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
||||
sta, sto));
|
||||
// Maybe extend the window by 1st term position, this was not
|
||||
// done by do_prox..
|
||||
SETMINMAX(pos, sta, sto);
|
||||
// Translate the position window into a byte offset window
|
||||
int bs = 0;
|
||||
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
||||
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
||||
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
||||
LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
|
||||
i1->second.first, i2->second.second));
|
||||
tboffs.push_back(pair<int, int>(i1->second.first,
|
||||
i2->second.second));
|
||||
bs = i1->second.first;
|
||||
} else {
|
||||
LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Ordering for integer pairs: increasing first value, and, for equal
 * first values, decreasing second value.
 */
class PairIntCmpFirst {
public:
    bool operator()(pair<int,int> a, pair<int, int>b) {
        return (a.first == b.first) ? (b.second < a.second)
                                    : (a.first < b.first);
    }
};
|
||||
|
||||
// Do the phrase match thing, then merge the highlight lists
|
||||
bool TextSplitPTR::matchGroups()
|
||||
{
|
||||
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
||||
vector<int>::const_iterator sit = m_slacks.begin();
|
||||
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
|
||||
matchGroup(*vit, *sit + (*vit).size());
|
||||
}
|
||||
|
||||
// Sort by start and end offsets. The merging of overlapping entries
|
||||
// will be handled during output.
|
||||
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Fix result text for display inside the gui text window.
|
||||
//
|
||||
// To compute the term character positions in the output text, we used
|
||||
// to emulate how qt's textedit counts chars (ignoring tags and
|
||||
// duplicate whitespace etc...). This was tricky business, dependant
|
||||
// on qtextedit internals, and we don't do it any more, so we finally
|
||||
// don't know the term par/car positions in the editor text.
|
||||
// Instead, we now mark the search term positions with html anchors
|
||||
//
|
||||
// We output the result in chunks, arranging not to cut in the middle of
|
||||
// a tag, which would confuse qtextedit.
|
||||
bool PlainToRich::plaintorich(const string& in,
|
||||
list<string>& out, // Output chunk list
|
||||
const HiliteData& hdata,
|
||||
int chunksize)
|
||||
{
|
||||
Chrono chron;
|
||||
const vector<string>& terms(hdata.terms);
|
||||
const vector<vector<string> >& groups(hdata.groups);
|
||||
const vector<int>& slacks(hdata.gslks);
|
||||
|
||||
if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
|
||||
LOGDEB0(("plaintorich: terms: \n"));
|
||||
string sterms = vecStringToString(terms);
|
||||
LOGDEB0((" %s\n", sterms.c_str()));
|
||||
sterms = "\n";
|
||||
LOGDEB0(("plaintorich: groups: \n"));
|
||||
for (vector<vector<string> >::const_iterator vit = groups.begin();
|
||||
vit != groups.end(); vit++) {
|
||||
sterms += "GROUP: ";
|
||||
sterms += vecStringToString(*vit);
|
||||
sterms += "\n";
|
||||
}
|
||||
LOGDEB0((" %s", sterms.c_str()));
|
||||
LOGDEB2((" TEXT:[%s]\n", in.c_str()));
|
||||
}
|
||||
|
||||
// Compute the positions for the query terms. We use the text
|
||||
// splitter to break the text into words, and compare the words to
|
||||
// the search terms,
|
||||
TextSplitPTR splitter(terms, groups, slacks);
|
||||
// Note: the splitter returns the term locations in byte, not
|
||||
// character, offsets.
|
||||
splitter.text_to_words(in);
|
||||
LOGDEB2(("plaintorich: split done %d mS\n", chron.millis()));
|
||||
|
||||
// Compute the positions for NEAR and PHRASE groups.
|
||||
splitter.matchGroups();
|
||||
|
||||
out.clear();
|
||||
out.push_back("");
|
||||
list<string>::iterator olit = out.begin();
|
||||
|
||||
// Rich text output
|
||||
*olit = header();
|
||||
|
||||
// Iterator for the list of input term positions. We use it to
|
||||
// output highlight tags and to compute term positions in the
|
||||
// output text
|
||||
vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
|
||||
vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
|
||||
|
||||
#if 0
|
||||
for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
|
||||
it != splitter.tboffs.end(); it++) {
|
||||
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Input character iterator
|
||||
Utf8Iter chariter(in);
|
||||
|
||||
// State variables used to limit the number of consecutive empty lines,
|
||||
// convert all eol to '\n', and preserve some indentation
|
||||
int eol = 0;
|
||||
int hadcr = 0;
|
||||
int inindent = 1;
|
||||
|
||||
// Value for numbered anchors at each term match
|
||||
int anchoridx = 1;
|
||||
// HTML state
|
||||
bool intag = false, inparamvalue = false;
|
||||
// My tag state
|
||||
int inrcltag = 0;
|
||||
|
||||
string::size_type headend = 0;
|
||||
if (m_inputhtml) {
|
||||
headend = in.find("</head>");
|
||||
if (headend == string::npos)
|
||||
headend = in.find("</HEAD>");
|
||||
if (headend != string::npos)
|
||||
headend += 7;
|
||||
}
|
||||
|
||||
for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
|
||||
// Check from time to time if we need to stop
|
||||
if ((pos & 0xfff) == 0) {
|
||||
CancelCheck::instance().checkCancel();
|
||||
}
|
||||
|
||||
// If we still have terms positions, check (byte) position. If
|
||||
// we are at or after a term match, mark.
|
||||
if (tPosIt != tPosEnd) {
|
||||
int ibyteidx = chariter.getBpos();
|
||||
if (ibyteidx == tPosIt->first) {
|
||||
if (!intag && ibyteidx > (int)headend) {
|
||||
*olit += startAnchor(anchoridx);
|
||||
*olit += startMatch();
|
||||
}
|
||||
anchoridx++;
|
||||
inrcltag = 1;
|
||||
} else if (ibyteidx == tPosIt->second) {
|
||||
// Output end of match region tags
|
||||
if (!intag && ibyteidx > (int)headend) {
|
||||
*olit += endMatch();
|
||||
*olit += endAnchor();
|
||||
}
|
||||
// Skip all highlight areas that would overlap this one
|
||||
int crend = tPosIt->second;
|
||||
while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
|
||||
tPosIt++;
|
||||
inrcltag = 0;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int car = *chariter;
|
||||
|
||||
if (car == '\n') {
|
||||
if (!hadcr)
|
||||
eol++;
|
||||
hadcr = 0;
|
||||
continue;
|
||||
} else if (car == '\r') {
|
||||
hadcr++;
|
||||
eol++;
|
||||
continue;
|
||||
} else if (eol) {
|
||||
// Got non eol char in line break state. Do line break;
|
||||
inindent = 1;
|
||||
hadcr = 0;
|
||||
if (eol > 2)
|
||||
eol = 2;
|
||||
while (eol) {
|
||||
if (!m_inputhtml && m_eolbr)
|
||||
*olit += "<br>";
|
||||
*olit += "\n";
|
||||
eol--;
|
||||
}
|
||||
// Maybe end this chunk, begin next. Don't do it on html
|
||||
// there is just no way to do it right (qtextedit cant grok
|
||||
// chunks cut in the middle of <a></a> for example).
|
||||
if (!m_inputhtml && !inrcltag &&
|
||||
olit->size() > (unsigned int)chunksize) {
|
||||
out.push_back(string(startChunk()));
|
||||
olit++;
|
||||
}
|
||||
}
|
||||
|
||||
switch (car) {
|
||||
case '<':
|
||||
inindent = 0;
|
||||
if (m_inputhtml) {
|
||||
if (!inparamvalue)
|
||||
intag = true;
|
||||
chariter.appendchartostring(*olit);
|
||||
} else {
|
||||
*olit += "<";
|
||||
}
|
||||
break;
|
||||
case '>':
|
||||
inindent = 0;
|
||||
if (m_inputhtml) {
|
||||
if (!inparamvalue)
|
||||
intag = false;
|
||||
}
|
||||
chariter.appendchartostring(*olit);
|
||||
break;
|
||||
case '&':
|
||||
inindent = 0;
|
||||
if (m_inputhtml) {
|
||||
chariter.appendchartostring(*olit);
|
||||
} else {
|
||||
*olit += "&";
|
||||
}
|
||||
break;
|
||||
case '"':
|
||||
inindent = 0;
|
||||
if (m_inputhtml && intag) {
|
||||
inparamvalue = !inparamvalue;
|
||||
}
|
||||
chariter.appendchartostring(*olit);
|
||||
break;
|
||||
|
||||
case ' ':
|
||||
if (m_eolbr && inindent) {
|
||||
*olit += " ";
|
||||
} else {
|
||||
chariter.appendchartostring(*olit);
|
||||
}
|
||||
break;
|
||||
case '\t':
|
||||
if (m_eolbr && inindent) {
|
||||
*olit += " ";
|
||||
} else {
|
||||
chariter.appendchartostring(*olit);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
inindent = 0;
|
||||
chariter.appendchartostring(*olit);
|
||||
}
|
||||
|
||||
} // End chariter loop
|
||||
|
||||
#if 0
|
||||
{
|
||||
FILE *fp = fopen("/tmp/debugplaintorich", "a");
|
||||
fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
|
||||
for (list<string>::iterator it = out.begin();
|
||||
it != out.end(); it++) {
|
||||
fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n");
|
||||
fprintf(fp, "%s", it->c_str());
|
||||
fprintf(fp, "ENDOFPLAINTORICHCHUNK\n");
|
||||
}
|
||||
fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
|
||||
fclose(fp);
|
||||
}
|
||||
#endif
|
||||
LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
|
||||
return true;
|
||||
}
|
95
src/query/plaintorich.h
Normal file
95
src/query/plaintorich.h
Normal file
|
@ -0,0 +1,95 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _PLAINTORICH_H_INCLUDED_
|
||||
#define _PLAINTORICH_H_INCLUDED_
|
||||
|
||||
#include <string>
|
||||
#include <list>
#include <vector>
|
||||
using std::list;
|
||||
using std::string;
|
||||
|
||||
/// Holder for plaintorich() input data: words and groups of words to
/// be highlighted.
///
/// The container types are spelled with their std:: qualification so that
/// this header does not depend on the including file having declared
/// 'using std::vector' beforehand.
struct HiliteData {
    /// Single terms.
    std::vector<std::string> terms;
    /// NEAR and PHRASE elements: each entry is one group of words.
    std::vector<std::vector<std::string> > groups;
    /// Group slacks (number of permitted non-matched words).
    /// Parallel vector to the above 'groups'.
    std::vector<int> gslks;

    /// Empty the three vectors so that the object can be filled again.
    void reset()
    {
        terms.clear();
        groups.clear();
        gslks.clear();
    }
};
|
||||
|
||||
/**
|
||||
* A class for highlighting search results. Overridable methods allow
|
||||
* for different styles. We can handle plain text or html input. In the latter
|
||||
* case, we may fail to highligt term groups if they are mixed with html tags.
|
||||
*/
|
||||
class PlainToRich {
|
||||
public:
|
||||
PlainToRich() : m_inputhtml(false) {}
|
||||
virtual ~PlainToRich() {}
|
||||
void set_inputhtml(bool v) {m_inputhtml = v;}
|
||||
|
||||
/**
|
||||
* Transform plain text for highlighting search terms, ie in the
|
||||
* preview window or result list entries.
|
||||
*
|
||||
* The actual tags used for highlighting and anchoring are
|
||||
* determined by deriving from this class which handles the searching for
|
||||
* terms and groups, but there is an assumption that the output will be
|
||||
* html-like: we escape characters like < or &
|
||||
*
|
||||
* Finding the search terms is relatively complicated because of
|
||||
* phrase/near searches, which need group highlights. As a matter
|
||||
* of simplification, we handle "phrase" as "near", not filtering
|
||||
* on word order.
|
||||
*
|
||||
* @param in raw text out of internfile.
|
||||
* @param out rich text output, divided in chunks (to help our caller
|
||||
* avoid inserting half tags into textedit which doesnt like it)
|
||||
* @param hdata terms and groups to be highlighted. These are
|
||||
* lowercase and unaccented.
|
||||
* @param chunksize max size of chunks in output list
|
||||
*/
|
||||
virtual bool plaintorich(const string &in, list<string> &out,
|
||||
const HiliteData& hdata,
|
||||
int chunksize = 50000
|
||||
);
|
||||
|
||||
/* Methods to ouput headers, highlighting and marking tags */
|
||||
virtual string header() {return snull;}
|
||||
virtual string startMatch() {return snull;}
|
||||
virtual string endMatch() {return snull;}
|
||||
virtual string startAnchor(int) {return snull;}
|
||||
virtual string endAnchor() {return snull;}
|
||||
virtual string startChunk() {return snull;}
|
||||
|
||||
protected:
|
||||
const string snull;
|
||||
bool m_inputhtml;
|
||||
// Use <br> to break plain text lines (else caller has used a <pre> tag)
|
||||
bool m_eolbr;
|
||||
};
|
||||
|
||||
#endif /* _PLAINTORICH_H_INCLUDED_ */
|
411
src/query/recollq.cpp
Normal file
411
src/query/recollq.cpp
Normal file
|
@ -0,0 +1,411 @@
|
|||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
// Takes a query and runs it: no GUI, results go to stdout
|
||||
|
||||
#ifndef TEST_RECOLLQ
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "rclquery.h"
|
||||
#include "rclconfig.h"
|
||||
#include "pathut.h"
|
||||
#include "rclinit.h"
|
||||
#include "debuglog.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "internfile.h"
|
||||
#include "wipedir.h"
|
||||
#include "transcode.h"
|
||||
#include "textsplit.h"
|
||||
#include "smallut.h"
|
||||
#include "base64.h"
|
||||
|
||||
// Convert a result document to text and print the text on stdout.
// If the conversion fails, a message naming the document url and ipath is
// printed on stdout instead. The function returns true in both cases.
bool dump_contents(RclConfig *rclconfig, TempDir& tmpdir, Rcl::Doc& idoc)
{
    FileInterner interner(idoc, rclconfig, tmpdir,
                          FileInterner::FIF_forPreview);
    Rcl::Doc converted;
    string subpath(idoc.ipath);

    if (!interner.internfile(converted, subpath)) {
        cout << "Cant turn to text:" << idoc.url << " | " << idoc.ipath << endl;
        return true;
    }

    cout << converted.text << endl;
    return true;
}
|
||||
|
||||
void output_fields(const vector<string>fields, Rcl::Doc& doc,
|
||||
Rcl::Query& query, Rcl::Db& rcldb)
|
||||
{
|
||||
for (vector<string>::const_iterator it = fields.begin();
|
||||
it != fields.end(); it++) {
|
||||
string out;
|
||||
if (!it->compare("abstract")) {
|
||||
string abstract;
|
||||
rcldb.makeDocAbstract(doc, &query, abstract);
|
||||
base64_encode(abstract, out);
|
||||
} else {
|
||||
base64_encode(doc.meta[*it], out);
|
||||
}
|
||||
cout << out << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
// Program name as invoked: set from argv[0] by recollq(), printed by Usage()
static char *thisprog;
|
||||
// Help text written to stderr by Usage(). One string literal per output
// line, concatenated by the compiler.
static char usage [] =
|
||||
" -P: Show the date span for all the documents present in the index\n"
|
||||
" [-o|-a|-f] [-q] <query string>\n"
|
||||
" Runs a recoll query and displays result lines. \n"
|
||||
" Default: will interpret the argument(s) as a xesam query string\n"
|
||||
" query may be like: \n"
|
||||
" implicit AND, Exclusion, field spec: t1 -t2 title:t3\n"
|
||||
" OR has priority: t1 OR t2 t3 OR t4 means (t1 OR t2) AND (t3 OR t4)\n"
|
||||
" Phrase: \"t1 t2\" (needs additional quoting on cmd line)\n"
|
||||
" -o Emulate the GUI simple search in ANY TERM mode\n"
|
||||
" -a Emulate the GUI simple search in ALL TERMS mode\n"
|
||||
" -f Emulate the GUI simple search in filename mode\n"
|
||||
" -q is just ignored (compatibility with the recoll GUI command line)\n"
|
||||
"Common options:\n"
|
||||
" -c <configdir> : specify config directory, overriding $RECOLL_CONFDIR\n"
|
||||
" -d also dump file contents\n"
|
||||
" -n [first-]<cnt> define the result slice. The default value for [first]\n"
|
||||
" is 0. Without the option, the default max count is 2000.\n"
|
||||
" Use n=0 for no limit\n"
|
||||
" -b : basic. Just output urls, no mime types or titles\n"
|
||||
" -Q : no result lines, just the processed query and result count\n"
|
||||
" -m : dump the whole document meta[] array for each result\n"
|
||||
" -A : output the document abstracts\n"
|
||||
" -S fld : sort by field <fld>\n"
|
||||
" -D : sort descending\n"
|
||||
" -i <dbdir> : additional index, several can be given\n"
|
||||
" -e use url encoding (%xx) for urls\n"
|
||||
" -F <field name list> : output exactly these fields for each result.\n"
|
||||
" The field values are encoded in base64, output in one line and \n"
|
||||
" separated by one space character. This is the recommended format \n"
|
||||
" for use by other programs. Use a normal query with option -m to \n"
|
||||
" see the field names.\n"
|
||||
;
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
cerr << thisprog << ": usage:" << endl << usage;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// BEWARE: keep these options compatible with the ones of the recoll program
// -q, -t and -l are accepted and ignored
// -a/f/o -c have the same meaning
// -h is not used

// Bit mask of the options seen on the command line, one OPT_x bit per
// option letter x.
static int op_flags;
#define OPT_A (1 << 0)   /* -A: output the document abstracts */
#define OPT_a (1 << 1)   /* -a: simple search, ALL TERMS mode */
#define OPT_b (1 << 2)   /* -b: basic output, urls only */
#define OPT_c (1 << 3)   /* -c <configdir> */
#define OPT_D (1 << 4)   /* -D: sort descending */
#define OPT_d (1 << 5)   /* -d: also dump file contents */
#define OPT_f (1 << 6)   /* -f: simple search, filename mode */
#define OPT_i (1 << 7)   /* -i <dbdir>: additional index */
#define OPT_l (1 << 8)   /* -l: accepted and ignored */
#define OPT_m (1 << 9)   /* -m: dump the document meta[] array */
#define OPT_n (1 << 10)  /* -n [first-]<cnt>: result slice */
#define OPT_o (1 << 11)  /* -o: simple search, ANY TERM mode */
#define OPT_P (1 << 12)  /* -P: show the index date span */
#define OPT_Q (1 << 13)  /* -Q: no result lines */
#define OPT_q (1 << 14)  /* -q: accepted and ignored */
#define OPT_S (1 << 15)  /* -S <fld>: sort by field */
#define OPT_s (1 << 16)  /* -s <lang>: stemming language */
#define OPT_t (1 << 17)  /* -t: accepted and ignored */
#define OPT_e (1 << 18)  /* -e: url-encode the urls */
#define OPT_F (1 << 19)  /* -F <fields>: output exactly these fields */
|
||||
|
||||
/**
 * Run the query described by the command line and print the results on
 * stdout (no GUI).
 *
 * @param cfp  output: receives the configuration object returned by
 *             recollinit().
 * @param argc argument count, argv[0] being the program name.
 * @param argv argument vector. Option strings are consumed in place: the
 *             pointers stored in argv are advanced during parsing.
 * @return 0 on success, 1 if the query string could not be interpreted or
 *         if the temporary directory needed by -d could not be created.
 *         Usage errors, initialisation errors and index opening errors
 *         terminate the process through exit(1); -P terminates it through
 *         exit(0) after printing the date span.
 */
int recollq(RclConfig **cfp, int argc, char **argv)
{
    string a_config;
    string sortfield;
    string stemlang("english");
    list<string> extra_dbs;
    string sf;
    vector<string> fields;

    int firstres = 0;
    int maxcount = 2000;
    thisprog = argv[0];
    argc--; argv++;

    // Option parsing. Options taking a value use the next argv element and
    // jump to b1 to skip the rest of the current option string.
    while (argc > 0 && **argv == '-') {
        (*argv)++;
        if (!(**argv))
            /* Lone "-" argument (the "adb - core" case) */
            Usage();
        while (**argv)
            switch (*(*argv)++) {
            case 'A': op_flags |= OPT_A; break;
            case 'a': op_flags |= OPT_a; break;
            case 'b': op_flags |= OPT_b; break;
            case 'c': op_flags |= OPT_c; if (argc < 2) Usage();
                a_config = *(++argv);
                argc--; goto b1;
            case 'd': op_flags |= OPT_d; break;
            case 'D': op_flags |= OPT_D; break;
            case 'e': op_flags |= OPT_e; break;
            case 'f': op_flags |= OPT_f; break;
            case 'F': op_flags |= OPT_F; if (argc < 2) Usage();
                sf = *(++argv);
                argc--; goto b1;
            case 'i': op_flags |= OPT_i; if (argc < 2) Usage();
                extra_dbs.push_back(*(++argv));
                argc--; goto b1;
            case 'l': op_flags |= OPT_l; break;
            case 'm': op_flags |= OPT_m; break;
            case 'n': op_flags |= OPT_n; if (argc < 2) Usage();
                {
                    // Value is either <cnt> or <first>-<cnt>
                    string rescnt = *(++argv);
                    string::size_type dash = rescnt.find("-");
                    if (dash != string::npos) {
                        firstres = atoi(rescnt.substr(0, dash).c_str());
                        if (dash < rescnt.size()-1) {
                            maxcount = atoi(rescnt.substr(dash+1).c_str());
                        }
                    } else {
                        maxcount = atoi(rescnt.c_str());
                    }
                    // A null or negative count means no limit
                    if (maxcount <= 0) maxcount = INT_MAX;
                }
                argc--; goto b1;
            case 'o': op_flags |= OPT_o; break;
            case 'P': op_flags |= OPT_P; break;
            case 'q': op_flags |= OPT_q; break;
            case 'Q': op_flags |= OPT_Q; break;
            case 'S': op_flags |= OPT_S; if (argc < 2) Usage();
                sortfield = *(++argv);
                argc--; goto b1;
            case 's': op_flags |= OPT_s; if (argc < 2) Usage();
                stemlang = *(++argv);
                argc--; goto b1;
            case 't': op_flags |= OPT_t; break;
            default: Usage(); break;
            }
    b1: argc--; argv++;
    }

    string reason;
    *cfp = recollinit(0, 0, reason, &a_config);
    RclConfig *rclconfig = *cfp;
    if (!rclconfig || !rclconfig->ok()) {
        fprintf(stderr, "Recoll init failed: %s\n", reason.c_str());
        exit(1);
    }

    // A query string is needed except for -P
    if (argc < 1 && !(op_flags & OPT_P)) {
        Usage();
    }
    if (op_flags & OPT_F) {
        // -F defines the whole output format: the other output options
        // make no sense with it
        if (op_flags & (OPT_b|OPT_d|OPT_Q|OPT_m|OPT_A))
            Usage();
        stringToStrings(sf, fields);
    }
    Rcl::Db rcldb(rclconfig);
    if (!extra_dbs.empty()) {
        for (list<string>::iterator it = extra_dbs.begin();
             it != extra_dbs.end(); it++) {
            if (!rcldb.addQueryDb(*it)) {
                cerr << "Can't add index: " << *it << endl;
                exit(1);
            }
        }
    }

    if (!rcldb.open(Rcl::Db::DbRO)) {
        cerr << "Cant open database in " << rclconfig->getDbDir() <<
            " reason: " << rcldb.getReason() << endl;
        exit(1);
    }

    if (op_flags & OPT_P) {
        int minyear, maxyear;
        if (!rcldb.maxYearSpan(&minyear, &maxyear)) {
            cerr << "maxYearSpan failed: " << rcldb.getReason() << endl;
            exit(1);
        } else {
            cout << "Min year " << minyear << " Max year " << maxyear << endl;
            exit(0);
        }
    }

    if (argc < 1) {
        Usage();
    }
    // The query is made of all the remaining arguments, space-separated
    string qs = *argv++;argc--;
    while (argc > 0) {
        qs += string(" ") + *argv++;argc--;
    }

    {
        // Convert the query from the default character set to UTF-8
        string uq;
        string charset = rclconfig->getDefCharset(true);
        int ercnt;
        if (!transcode(qs, uq, charset, "UTF-8", &ercnt)) {
            fprintf(stderr, "Can't convert command line args to utf-8\n");
            exit(1);
        } else if (ercnt) {
            fprintf(stderr, "%d errors while converting arguments from %s "
                    "to utf-8\n", ercnt, charset.c_str());
        }
        qs = uq;
    }

    Rcl::SearchData *sd = 0;

    if (op_flags & (OPT_a|OPT_o|OPT_f)) {
        sd = new Rcl::SearchData(Rcl::SCLT_OR);
        Rcl::SearchDataClause *clp = 0;
        if (op_flags & OPT_f) {
            clp = new Rcl::SearchDataClauseFilename(qs);
        } else {
            // If there is no white space inside the query, then the user
            // certainly means it as a phrase.
            bool isreallyaphrase = false;
            if (!TextSplit::hasVisibleWhite(qs))
                isreallyaphrase = true;
            clp = isreallyaphrase ?
                new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :
                new Rcl::SearchDataClauseSimple((op_flags & OPT_o)?
                                                Rcl::SCLT_OR : Rcl::SCLT_AND,
                                                qs);
        }
        if (sd)
            sd->addClause(clp);
    } else {
        sd = wasaStringToRcl(rclconfig, qs, reason);
    }

    if (!sd) {
        cerr << "Query string interpretation failed: " << reason << endl;
        return 1;
    }
    sd->setStemlang(stemlang);

    RefCntr<Rcl::SearchData> rq(sd);
    Rcl::Query query(&rcldb);
    if (op_flags & OPT_S) {
        query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true);
    }
    Chrono chron;
    query.setQuery(rq);
    int cnt = query.getResCnt();
    if (!(op_flags & OPT_b)) {
        cout << "Recoll query: " << rq->getDescription() << endl;
        if (firstres == 0) {
            if (cnt <= maxcount)
                cout << cnt << " results" << endl;
            else
                cout << cnt << " results (printing  " << maxcount << " max):"
                     << endl;
        } else {
            // The slice holds at most maxcount entries, and no more than
            // what the reported count leaves after skipping firstres.
            int avail = cnt > firstres ? cnt - firstres : 0;
            int atmost = avail < maxcount ? avail : maxcount;
            cout << "Printing at most " << atmost <<
                " results from first " << firstres << endl;
        }
    }
    if (op_flags & OPT_Q) {
        cout << "Query setup took " << chron.millis() << " mS" << endl;
        return(0);
    }

    // End of the result slice. maxcount is INT_MAX when there is no limit,
    // so the sum has to be guarded against signed overflow.
    int endres = (firstres > 0 && maxcount > INT_MAX - firstres) ?
        INT_MAX : firstres + maxcount;

    for (int i = firstres; i < endres; i++) {
        Rcl::Doc doc;
        if (!query.getDoc(i, doc))
            break;

        if (op_flags & OPT_F) {
            output_fields(fields, doc, query, rcldb);
            continue;
        }

        if (op_flags & OPT_e)
            doc.url = url_encode(doc.url);

        if (op_flags & OPT_b) {
            cout << doc.url << endl;
        } else {
            // Use the file name when the document has no title
            string titleorfn = doc.meta[Rcl::Doc::keytt];
            if (titleorfn.empty())
                titleorfn = doc.meta[Rcl::Doc::keyfn];

            cout
                << doc.mimetype << "\t"
                << "[" << doc.url << "]" << "\t"
                << "[" << titleorfn << "]" << "\t"
                << doc.fbytes << "\tbytes" << "\t"
                <<  endl;
            if (op_flags & OPT_m) {
                for (map<string,string>::const_iterator it = doc.meta.begin();
                     it != doc.meta.end(); it++) {
                    cout << it->first << " = " << it->second << endl;
                }
            }
            if (op_flags & OPT_A) {
                string abstract;
                if (rcldb.makeDocAbstract(doc, &query, abstract)) {
                    cout << "ABSTRACT" << endl;
                    cout << abstract << endl;
                    cout << "/ABSTRACT" << endl;
                }
            }
        }
        if (op_flags & OPT_d) {
            // One temporary directory is shared by all the results
            static TempDir tmpdir;
            if (!tmpdir.ok()) {
                cerr << "Can't create temporary directory: " <<
                    tmpdir.getreason() << endl;
                return(1);
            }
            dump_contents(rclconfig, tmpdir, doc);
        }
    }

    return 0;
}
|
||||
|
||||
#else // TEST_RECOLLQ The test driver is actually the useful program...
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "recollq.h"
|
||||
|
||||
static RclConfig *rclconfig;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
return(recollq(&rclconfig, argc, argv));
|
||||
}
|
||||
#endif // TEST_RECOLLQ
|
26
src/query/recollq.h
Normal file
26
src/query/recollq.h
Normal file
|
@ -0,0 +1,26 @@
|
|||
/* Copyright (C) 2007 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _recollq_h_included_
|
||||
#define _recollq_h_included_
|
||||
|
||||
/// Execute query, print results to stdout. This is just an api to the
|
||||
/// recollq command line program.
|
||||
class RclConfig;
|
||||
extern int recollq(RclConfig **cfp, int argc, char **argv);
|
||||
|
||||
#endif /* _recollq_h_included_ */
|
438
src/query/reslistpager.cpp
Normal file
438
src/query/reslistpager.cpp
Normal file
|
@ -0,0 +1,438 @@
|
|||
/* Copyright (C) 2007 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "autoconfig.h"
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <sstream>
|
||||
using std::ostringstream;
|
||||
using std::endl;
|
||||
|
||||
#include "cstr.h"
|
||||
#include "reslistpager.h"
|
||||
#include "debuglog.h"
|
||||
#include "rclconfig.h"
|
||||
#include "smallut.h"
|
||||
#include "plaintorich.h"
|
||||
#include "mimehandler.h"
|
||||
|
||||
// Default highlighter. No need for locking, this is query-only.
|
||||
static const string cstr_hlfontcolor("<font color=\"blue\">");
|
||||
static const string cstr_hlendfont("</font>");
|
||||
class PlainToRichHtReslist : public PlainToRich {
|
||||
public:
|
||||
virtual ~PlainToRichHtReslist() {}
|
||||
virtual string startMatch() {return cstr_hlfontcolor;}
|
||||
virtual string endMatch() {return cstr_hlendfont;}
|
||||
};
|
||||
static PlainToRichHtReslist g_hiliter;
|
||||
|
||||
// Build a pager with the given page size. There is no current page yet
// (m_winfirst is -1) and highlighting is done by the file-level default
// highlighter.
ResListPager::ResListPager(int pagesize)
    : m_pagesize(pagesize), m_newpagesize(pagesize), m_winfirst(-1),
      m_hasNext(false), m_hiliter(&g_hiliter)
{
}
|
||||
|
||||
void ResListPager::resultPageNext()
|
||||
{
|
||||
if (m_docSource.isNull()) {
|
||||
LOGDEB(("ResListPager::resultPageNext: null source\n"));
|
||||
return;
|
||||
}
|
||||
|
||||
int resCnt = m_docSource->getResCnt();
|
||||
LOGDEB(("ResListPager::resultPageNext: rescnt %d, winfirst %d\n",
|
||||
resCnt, m_winfirst));
|
||||
|
||||
if (m_winfirst < 0) {
|
||||
m_winfirst = 0;
|
||||
} else {
|
||||
m_winfirst += m_respage.size();
|
||||
}
|
||||
// Get the next page of results.
|
||||
vector<ResListEntry> npage;
|
||||
int pagelen = m_docSource->getSeqSlice(m_winfirst, m_pagesize, npage);
|
||||
|
||||
// If page was truncated, there is no next
|
||||
m_hasNext = (pagelen == m_pagesize);
|
||||
|
||||
if (pagelen <= 0) {
|
||||
// No results ? This can only happen on the first page or if the
|
||||
// actual result list size is a multiple of the page pref (else
|
||||
// there would have been no Next on the last page)
|
||||
if (m_winfirst > 0) {
|
||||
// Have already results. Let them show, just disable the
|
||||
// Next button. We'd need to remove the Next link from the page
|
||||
// too.
|
||||
// Restore the m_winfirst value, let the current result vector alone
|
||||
m_winfirst -= m_respage.size();
|
||||
} else {
|
||||
// No results at all (on first page)
|
||||
m_winfirst = -1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
m_respage = npage;
|
||||
}
|
||||
|
||||
void ResListPager::resultPageFor(int docnum)
|
||||
{
|
||||
if (m_docSource.isNull()) {
|
||||
LOGDEB(("ResListPager::resultPageFor: null source\n"));
|
||||
return;
|
||||
}
|
||||
|
||||
int resCnt = m_docSource->getResCnt();
|
||||
LOGDEB(("ResListPager::resultPageFor(%d): rescnt %d, winfirst %d\n",
|
||||
docnum, resCnt, m_winfirst));
|
||||
m_winfirst = (docnum / m_pagesize) * m_pagesize;
|
||||
|
||||
// Get the next page of results.
|
||||
vector<ResListEntry> npage;
|
||||
int pagelen = m_docSource->getSeqSlice(m_winfirst, m_pagesize, npage);
|
||||
|
||||
// If page was truncated, there is no next
|
||||
m_hasNext = (pagelen == m_pagesize);
|
||||
|
||||
if (pagelen <= 0) {
|
||||
m_winfirst = -1;
|
||||
return;
|
||||
}
|
||||
m_respage = npage;
|
||||
}
|
||||
|
||||
/**
 * Format one result list entry as an html paragraph and hand it to append().
 *
 * @param config configuration, used for the icon, the default character
 *               set, and the preview / viewer capabilities.
 * @param i      index of the entry inside the current page.
 * @param doc    the document. Its meta array may get new entries as a side
 *               effect (abstract for an unavailable document, empty values
 *               for the keys looked up here).
 * @param hdata  terms and groups to highlight inside the abstract.
 * @param sh     optional subheader printed above the entry (used by
 *               history).
 */
void ResListPager::displayDoc(RclConfig *config,
                              int i, Rcl::Doc& doc, const HiliteData& hdata,
                              const string& sh)
{
    ostringstream chunk;
    if (doc.pc == -1) {
        // Document not available, maybe other further, will go on.
        doc.meta[Rcl::Doc::keyabs] = string(trans("Unavailable document"));
    }

    // Determine icon to display if any
    string iconurl = iconUrl(config, doc);

    // Printable url: either utf-8 if transcoding succeeds, or url-encoded
    string url;
    printableUrl(config->getDefCharset(), doc.url, url);

    // Make title out of file name if none yet
    string titleOrFilename;
    string utf8fn;
    doc.getmeta(Rcl::Doc::keytt, &titleOrFilename);
    doc.getmeta(Rcl::Doc::keyfn, &utf8fn);
    if (utf8fn.empty()) {
        utf8fn = path_getsimple(url);
    }
    if (titleOrFilename.empty()) {
        titleOrFilename = utf8fn;
    }

    // Result number
    char numbuf[20];
    int docnumforlinks = m_winfirst + 1 + i;
    sprintf(numbuf, "%d", docnumforlinks);

    // Document date: either doc or file modification time
    char datebuf[100];
    datebuf[0] = 0;
    if (!doc.dmtime.empty() || !doc.fmtime.empty()) {
        time_t mtime = doc.dmtime.empty() ?
            atol(doc.fmtime.c_str()) : atol(doc.dmtime.c_str());
        struct tm *tm = localtime(&mtime);
        // localtime() can fail, and the buffer contents are indeterminate
        // when strftime() returns 0: show no date in both cases.
        if (tm == 0 || strftime(datebuf, 99, dateFormat().c_str(), tm) == 0) {
            datebuf[0] = 0;
        }
    }

    // Size information. We print both doc and file if they differ a lot
    off_t fsize = -1, dsize = -1;
    if (!doc.dbytes.empty())
        dsize = atol(doc.dbytes.c_str());
    if (!doc.fbytes.empty())
        fsize = atol(doc.fbytes.c_str());
    string sizebuf;
    if (dsize > 0) {
        sizebuf = displayableBytes(dsize);
        if (fsize > 10 * dsize && fsize - dsize > 1000)
            sizebuf += string(" / ") + displayableBytes(fsize);
    } else if (fsize >= 0) {
        sizebuf = displayableBytes(fsize);
    }

    // The abstract is only computed if the paragraph format uses it
    string richabst;
    bool needabstract = parFormat().find("%A") != string::npos;
    if (needabstract && m_docSource.isNotNull()) {
        vector<string> vabs;
        m_docSource->getAbstract(doc, vabs);

        for (vector<string>::const_iterator it = vabs.begin();
             it != vabs.end(); it++) {
            if (!it->empty()) {
                // No need to call escapeHtml(), plaintorich handles it
                list<string> lr;
                m_hiliter->set_inputhtml(false);
                m_hiliter->plaintorich(*it, lr, hdata);
                // Do not touch front() if the highlighter produced nothing
                if (!lr.empty())
                    richabst += lr.front();
                richabst += absSep();
            }
        }
    }

    // Links;
    ostringstream linksbuf;
    if (canIntern(doc.mimetype, config)) {
        linksbuf << "<a href=\"P" << docnumforlinks << "\">"
                 << trans("Preview") << "</a> ";
    }

    string apptag;
    doc.getmeta(Rcl::Doc::keyapptg, &apptag);

    if (!config->getMimeViewerDef(doc.mimetype, apptag).empty()) {
        linksbuf << "<a href=\"E" << docnumforlinks << "\">"
                 << trans("Open") << "</a>";
    }

    // Build the result list paragraph:

    // Subheader: this is used by history. The bold tag is closed before
    // the end of the subheader paragraph.
    if (!sh.empty())
        chunk << "<p style='clear: both;'><b>" << sh << "</b></p>\n<p>";
    else
        chunk << "<p style='margin: 0px;padding: 0px;clear: both;'>";

    // Configurable stuff
    map<string,string> subs;
    subs["A"] = richabst;
    subs["D"] = datebuf;
    subs["I"] = iconurl;
    subs["i"] = doc.ipath;
    subs["K"] = !doc.meta[Rcl::Doc::keykw].empty() ?
        string("[") + escapeHtml(doc.meta[Rcl::Doc::keykw]) + "]" : "";
    subs["L"] = linksbuf.rdbuf()->str();
    subs["N"] = numbuf;
    subs["M"] = doc.mimetype;
    subs["R"] = doc.meta[Rcl::Doc::keyrr];
    subs["S"] = sizebuf;
    subs["T"] = escapeHtml(titleOrFilename);
    subs["t"] = escapeHtml(doc.meta[Rcl::Doc::keytt]);
    subs["U"] = url;

    // Let %(xx) access all metadata. insert() does not replace the
    // entries set above.
    subs.insert(doc.meta.begin(), doc.meta.end());

    string formatted;
    pcSubst(parFormat(), formatted, subs);
    chunk << formatted;

    chunk << "</p>" << endl;
    // This was to force qt 4.x to clear the margins (which it should do
    // anyway because of the paragraph's style), but we finally took
    // the table approach for 1.15 for now (in guiutils.cpp)
    //	chunk << "<br style='clear:both;height:0;line-height:0;'>" << endl;

    LOGDEB2(("Chunk: [%s]\n", (const char *)chunk.rdbuf()->str().c_str()));
    append(chunk.rdbuf()->str(), i, doc);
}
|
||||
|
||||
// Emit the current result page as html through append(): page header with
// the result range and the Previous/Next links, one paragraph per result,
// then a footer repeating the links. If the page is empty, a "no results"
// header is emitted (with spelling suggestions if there are any) and
// nothing else.
void ResListPager::displayPage(RclConfig *config)
{
    LOGDEB(("ResListPager::displayPage\n"));
    if (m_docSource.isNull()) {
        LOGDEB(("ResListPager::displayPage: null source\n"));
        return;
    }
    if (m_winfirst < 0 && !pageEmpty()) {
        LOGDEB(("ResListPager::displayPage: sequence error: winfirst < 0\n"));
        return;
    }

    // Text accumulator. We have to append text in chunks that make sense
    // html-wise: if we break things up too much, the editor gets confused.
    ostringstream html;

    // Display list header
    // We could use a <title> but the textedit doesnt display
    // it prominently
    // Also note that there can be results beyond the estimated resCnt.
    html << "<html><head>" << endl;
    html << "<meta http-equiv=\"content-type\"";
    html << " content=\"text/html; charset=utf-8\">" << endl;
    html << headerContent();
    html << "</head><body>" << endl;
    html << pageTop();
    html << "<p><font size=+1><b>";
    html << m_docSource->title();
    html << "</b></font> ";

    if (pageEmpty()) {
        html << trans("<p><b>No results found</b><br>");
        vector<string> userterms;
        m_docSource->getUTerms(userterms);
        if (!userterms.empty()) {
            map<string, vector<string> > spellings;
            suggest(userterms, spellings);
            if (!spellings.empty()) {
                html << trans("<p><i>Alternate spellings (accents suppressed): </i>");
                html << "<br /><blockquote>";

                // One line per term, listing its suggested spellings
                map<string, vector<string> >::const_iterator termit;
                for (termit = spellings.begin(); termit != spellings.end();
                     termit++) {
                    html << "<b>" << termit->first << "</b> : ";
                    const vector<string>& alts = termit->second;
                    vector<string>::const_iterator altit;
                    for (altit = alts.begin(); altit != alts.end(); altit++) {
                        html << *altit << " ";
                    }
                    html << "<br />";
                }
                html << "</blockquote></p>";
            }
        }
    } else {
        unsigned int resCnt = m_docSource->getResCnt();
        // The range of the results shown comes first, the total only
        // follows if there are results beyond this page.
        html << trans("Documents") << " <b>" << m_winfirst + 1
             << "-" << m_winfirst + m_respage.size() << "</b> ";
        if (m_winfirst + m_respage.size() < resCnt) {
            html << trans("out of at least") << " " << resCnt << " ";
        }
        html << trans("for") << " ";
    }

    html << detailsLink();
    if (hasPrev() || hasNext()) {
        html << " ";
        if (hasPrev()) {
            html << "<a href=\"" << prevUrl() << "\"><b>";
            html << trans("Previous");
            html << "</b></a> ";
        }
        if (hasNext()) {
            html << "<a href=\"" << nextUrl() << "\"><b>";
            html << trans("Next");
            html << "</b></a>";
        }
    }
    html << "</p>" << endl;

    // Flush the header and restart with an empty accumulator
    append(html.str());
    html.str("");
    if (pageEmpty())
        return;

    HiliteData hdata;
    m_docSource->getTerms(hdata.terms, hdata.groups, hdata.gslks);

    // Emit data for result entry paragraph. Do it in chunks that make sense
    // html-wise, else our client may get confused
    const int pagelen = (int)m_respage.size();
    for (int idx = 0; idx < pagelen; idx++) {
        displayDoc(config, idx, m_respage[idx].doc, hdata,
                   m_respage[idx].subHeader);
    }

    // Footer
    html << "<p align=\"center\">";
    if (hasPrev() || hasNext()) {
        if (hasPrev()) {
            html << "<a href=\"" << prevUrl() << "\"><b>";
            html << trans("Previous");
            html << "</b></a> ";
        }
        if (hasNext()) {
            html << "<a href=\"" << nextUrl() << "\"><b>";
            html << trans("Next");
            html << "</b></a>";
        }
    }
    html << "</p>" << endl;
    html << "</body></html>" << endl;
    append(html.str());
}
|
||||
|
||||
// Default implementations for things that should be implemented by
|
||||
// specializations
|
||||
// Default target for the "Next" page link. Specializations may
// override this to produce their own url scheme.
string ResListPager::nextUrl()
{
    static const char *const defaultNextUrl = "n-1";
    return string(defaultNextUrl);
}
|
||||
|
||||
// Default target for the "Previous" page link. Specializations may
// override this to produce their own url scheme.
string ResListPager::prevUrl()
{
    static const char *const defaultPrevUrl = "p-1";
    return string(defaultPrevUrl);
}
|
||||
|
||||
// Default icon url for a result entry: the icon name returned by the
// configuration for the document mime type, prefixed with cstr_fileu.
string ResListPager::iconUrl(RclConfig *config, Rcl::Doc& doc)
{
    string iconname;
    config->getMimeIconName(doc.mimetype, &iconname);
    return cstr_fileu + iconname;
}
|
||||
|
||||
bool ResListPager::append(const string& data)
|
||||
{
|
||||
fprintf(stderr, "%s", data.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Default translation hook: no translation, the input text is returned
// unchanged.
string ResListPager::trans(const string& in)
{
    string untranslated(in);
    return untranslated;
}
|
||||
|
||||
// Default "show query" link: an anchor on the "H-1" url with a
// translated label.
string ResListPager::detailsLink()
{
    string link("<a href=\"H-1\">");
    link += trans("(show query)");
    link += "</a>";
    return link;
}
|
||||
|
||||
// Default format string for a result paragraph. The %X sequences are
// placeholders; they are not interpreted in this function.
const string &ResListPager::parFormat()
{
    static const string defaultParFormat(
        "<img src=\"%I\" align=\"left\">"
        "%R %S %L &nbsp;&nbsp;<b>%T</b><br>"
        "%M&nbsp;%D&nbsp;&nbsp;&nbsp;<i>%U</i><br>"
        "%A %K");
    return defaultParFormat;
}
|
||||
|
||||
// Default date format string for result entries.
// NOTE(review): the % sequences look like strftime() conversions --
// confirm against the code which consumes this value.
const string &ResListPager::dateFormat()
{
    static const string defaultDateFormat(
        "&nbsp;%Y-%m-%d&nbsp;%H:%M:%S&nbsp;%z");
    return defaultDateFormat;
}
|
||||
|
131
src/query/reslistpager.h
Normal file
131
src/query/reslistpager.h
Normal file
|
@ -0,0 +1,131 @@
|
|||
/* Copyright (C) 2007 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _reslistpager_h_included_
|
||||
#define _reslistpager_h_included_
|
||||
|
||||
#include <vector>
|
||||
using std::vector;
|
||||
|
||||
#include "refcntr.h"
|
||||
#include "docseq.h"
|
||||
|
||||
class RclConfig;
|
||||
class PlainToRich;
|
||||
class HiliteData;
|
||||
|
||||
/**
|
||||
* Manage a paged HTML result list.
|
||||
*/
|
||||
class ResListPager {
|
||||
public:
|
||||
ResListPager(int pagesize=10);
|
||||
virtual ~ResListPager() {}
|
||||
|
||||
void setHighLighter(PlainToRich *ptr)
|
||||
{
|
||||
m_hiliter = ptr;
|
||||
}
|
||||
void setDocSource(RefCntr<DocSequence> src, int winfirst = -1)
|
||||
{
|
||||
m_pagesize = m_newpagesize;
|
||||
m_winfirst = winfirst;
|
||||
m_hasNext = false;
|
||||
m_docSource = src;
|
||||
m_respage.clear();
|
||||
}
|
||||
void setPageSize(int ps)
|
||||
{
|
||||
m_newpagesize = ps;
|
||||
}
|
||||
int pageNumber()
|
||||
{
|
||||
if (m_winfirst < 0 || m_pagesize <= 0)
|
||||
return -1;
|
||||
return m_winfirst / m_pagesize;
|
||||
}
|
||||
int pageFirstDocNum() {
|
||||
return m_winfirst;
|
||||
}
|
||||
int pageLastDocNum() {
|
||||
if (m_winfirst < 0 || m_respage.size() == 0)
|
||||
return -1;
|
||||
return m_winfirst + m_respage.size() - 1;
|
||||
}
|
||||
virtual int pageSize() const {return m_pagesize;}
|
||||
void pageNext();
|
||||
bool hasNext() {return m_hasNext;}
|
||||
bool hasPrev() {return m_winfirst > 0;}
|
||||
bool atBot() {return m_winfirst <= 0;}
|
||||
void resultPageFirst() {
|
||||
m_winfirst = -1;
|
||||
m_pagesize = m_newpagesize;
|
||||
resultPageNext();
|
||||
}
|
||||
void resultPageBack() {
|
||||
if (m_winfirst <= 0) return;
|
||||
m_winfirst -= 2 * m_pagesize;
|
||||
resultPageNext();
|
||||
}
|
||||
void resultPageNext();
|
||||
void resultPageFor(int docnum);
|
||||
void displayPage(RclConfig *);
|
||||
void displayDoc(RclConfig *, int idx, Rcl::Doc& doc,
|
||||
const HiliteData& hdata, const string& sh = "");
|
||||
bool pageEmpty() {return m_respage.size() == 0;}
|
||||
|
||||
string queryDescription() {return m_docSource.isNull() ? "" :
|
||||
m_docSource->getDescription();}
|
||||
|
||||
// Things that need to be reimplemented in the subclass:
|
||||
virtual bool append(const string& data);
|
||||
virtual bool append(const string& data, int, const Rcl::Doc&)
|
||||
{
|
||||
return append(data);
|
||||
}
|
||||
// Translation function. This is reimplemented in the qt reslist
|
||||
// object For this to work, the strings must be duplicated inside
|
||||
// reslist.cpp (see the QT_TR_NOOP in there). Very very unwieldy.
|
||||
// To repeat: any change to a string used with trans() inside
|
||||
// reslistpager.cpp must be reflected in the string table inside
|
||||
// reslist.cpp for translation to work.
|
||||
virtual string trans(const string& in);
|
||||
virtual string detailsLink();
|
||||
virtual const string &parFormat();
|
||||
virtual const string &dateFormat();
|
||||
virtual string nextUrl();
|
||||
virtual string prevUrl();
|
||||
virtual string pageTop() {return string();}
|
||||
virtual string headerContent() {return string();}
|
||||
virtual string iconUrl(RclConfig *, Rcl::Doc& doc);
|
||||
virtual void suggest(const vector<string>,
|
||||
map<string, vector<string> >& sugg) {
|
||||
sugg.clear();
|
||||
}
|
||||
virtual string absSep() {return "…";}
|
||||
private:
|
||||
int m_pagesize;
|
||||
int m_newpagesize;
|
||||
// First docnum (from docseq) in current page
|
||||
int m_winfirst;
|
||||
bool m_hasNext;
|
||||
PlainToRich *m_hiliter;
|
||||
RefCntr<DocSequence> m_docSource;
|
||||
vector<ResListEntry> m_respage;
|
||||
};
|
||||
|
||||
#endif /* _reslistpager_h_included_ */
|
76
src/query/sortseq.cpp
Normal file
76
src/query/sortseq.cpp
Normal file
|
@ -0,0 +1,76 @@
|
|||
/* Copyright (C) 2005 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include <algorithm>
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "sortseq.h"
|
||||
|
||||
using std::string;
|
||||
|
||||
class CompareDocs {
|
||||
DocSeqSortSpec ss;
|
||||
public:
|
||||
CompareDocs(const DocSeqSortSpec &sortspec) : ss(sortspec) {}
|
||||
|
||||
// It's not too clear in the std::sort doc what this should do. This
|
||||
// behaves as operator<
|
||||
int operator()(const Rcl::Doc *x, const Rcl::Doc *y)
|
||||
{
|
||||
LOGDEB1(("Comparing .. \n"));
|
||||
|
||||
map<string,string>::const_iterator xit, yit;
|
||||
xit = x->meta.find(ss.field);
|
||||
yit = y->meta.find(ss.field);
|
||||
if (xit == x->meta.end() || yit == y->meta.end())
|
||||
return 0;
|
||||
return ss.desc ? yit->second < xit->second : xit->second < yit->second;
|
||||
}
|
||||
};
|
||||
|
||||
// Set the sort specification and (re)build the sorted view.
//
// All documents are fetched from the underlying sequence into m_docs
// (stopping at the first failure), then an array of pointers to them
// (m_docsp) is sorted according to the specification.
//
// @param sortspec the sort criteria (copied into m_spec).
// @return always true.
bool DocSeqSorted::setSortSpec(DocSeqSortSpec &sortspec)
{
    LOGDEB(("DocSeqSorted::setSortSpec\n"));
    m_spec = sortspec;
    int count = m_seq->getResCnt();
    LOGDEB(("DocSeqSorted:: count %d\n", count));
    // A negative count must not reach resize(): it would be converted to
    // a huge unsigned size.
    if (count < 0)
        count = 0;
    m_docs.resize(count);
    int i;
    for (i = 0; i < count; i++) {
        if (!m_seq->getDoc(i, m_docs[i])) {
            LOGERR(("DocSeqSorted: getDoc failed for doc %d\n", i));
            // Only keep the documents which could actually be fetched
            count = i;
            break;
        }
    }
    m_docs.resize(count);

    // m_docs does not change size any more from here on, so pointers to
    // its elements stay valid.
    m_docsp.resize(count);
    for (i = 0; i < count; i++)
        m_docsp[i] = &m_docs[i];

    CompareDocs cmp(sortspec);
    std::sort(m_docsp.begin(), m_docsp.end(), cmp);
    return true;
}
|
||||
|
||||
bool DocSeqSorted::getDoc(int num, Rcl::Doc &doc, string *)
|
||||
{
|
||||
LOGDEB(("DocSeqSorted::getDoc(%d)\n", num));
|
||||
if (num < 0 || num >= int(m_docsp.size()))
|
||||
return false;
|
||||
doc = *m_docsp[num];
|
||||
return true;
|
||||
}
|
48
src/query/sortseq.h
Normal file
48
src/query/sortseq.h
Normal file
|
@ -0,0 +1,48 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _SORTSEQ_H_INCLUDED_
|
||||
#define _SORTSEQ_H_INCLUDED_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "refcntr.h"
|
||||
#include "docseq.h"
|
||||
|
||||
/**
|
||||
* A sorted sequence is created from the first N documents of another one,
|
||||
* and sorts them according to the given criteria.
|
||||
*/
|
||||
class DocSeqSorted : public DocSeqModifier {
|
||||
public:
|
||||
DocSeqSorted(RefCntr<DocSequence> iseq, DocSeqSortSpec &sortspec)
|
||||
: DocSeqModifier(iseq)
|
||||
{
|
||||
setSortSpec(sortspec);
|
||||
}
|
||||
virtual ~DocSeqSorted() {}
|
||||
virtual bool canSort() {return true;}
|
||||
virtual bool setSortSpec(DocSeqSortSpec &sortspec);
|
||||
virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0);
|
||||
virtual int getResCnt() {return m_docsp.size();}
|
||||
private:
|
||||
DocSeqSortSpec m_spec;
|
||||
std::vector<Rcl::Doc> m_docs;
|
||||
std::vector<Rcl::Doc *> m_docsp;
|
||||
};
|
||||
|
||||
#endif /* _SORTSEQ_H_INCLUDED_ */
|
510
src/query/wasastringtoquery.cpp
Normal file
510
src/query/wasastringtoquery.cpp
Normal file
|
@ -0,0 +1,510 @@
|
|||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef TEST_WASASTRINGTOQUERY
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <regex.h>
|
||||
|
||||
#include "smallut.h"
|
||||
#include "wasastringtoquery.h"
|
||||
|
||||
#undef DEB_WASASTRINGTOQ
|
||||
#ifdef DEB_WASASTRINGTOQ
|
||||
#define DPRINT(X) fprintf X
|
||||
#define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
|
||||
#else
|
||||
#define DPRINT(X)
|
||||
#define DUMPQ(Q)
|
||||
#endif
|
||||
|
||||
// A query owns its subqueries: delete them all, in list order.
WasaQuery::~WasaQuery()
{
    for (vector<WasaQuery*>::size_type i = 0; i < m_subs.size(); i++) {
        delete m_subs[i];
    }
    m_subs.clear();
}
|
||||
|
||||
// Translate a relation code to its textual operator, for describe().
// Anything which is not a known relation yields "?".
static const char* reltosrel(WasaQuery::Rel rel)
{
    if (rel == WasaQuery::REL_EQUALS)
        return "=";
    if (rel == WasaQuery::REL_CONTAINS)
        return ":";
    if (rel == WasaQuery::REL_LT)
        return "<";
    if (rel == WasaQuery::REL_LTE)
        return "<=";
    if (rel == WasaQuery::REL_GT)
        return ">";
    if (rel == WasaQuery::REL_GTE)
        return ">=";
    return "?";
}
|
||||
|
||||
// Append a textual description of the query tree rooted here to desc.
//
// Format: "(" + content + ")" + modifier names separated by '|' + " ".
// The content is the field/relation prefix and value for leaf elements,
// or the recursive descriptions of the subqueries joined by OR / AND.
void WasaQuery::describe(string &desc) const
{
    desc += "(";

    // Field name and relation operator, only if a field was specified
    string prefix;
    if (!m_fieldspec.empty())
        prefix = m_fieldspec + reltosrel(m_rel);

    if (m_op == OP_NULL) {
        desc += "NULL";
    } else if (m_op == OP_LEAF) {
        desc += prefix + m_value;
    } else if (m_op == OP_EXCL) {
        desc += string("NOT (") + prefix + m_value + ") ";
    } else if (m_op == OP_OR || m_op == OP_AND) {
        const char *joiner = (m_op == OP_OR) ? "OR " : "AND ";
        const vector<WasaQuery *>::size_type nsubs = m_subs.size();
        for (vector<WasaQuery *>::size_type i = 0; i < nsubs; i++) {
            m_subs[i]->describe(desc);
            // No joiner after the last subquery
            if (i + 1 != nsubs)
                desc += joiner;
        }
    }

    // Drop the trailing space possibly left by the above before closing
    if (desc[desc.length() - 1] == ' ')
        desc.erase(desc.length() - 1);
    desc += ")";

    if (m_modifiers != 0) {
        // Modifier names are listed in a fixed order. PHRASESLACK comes
        // between the two tables because it also prints a value.
        static const struct {
            unsigned int bit;
            const char *label;
        } beforeSlack[] = {
            {WQM_BOOST, "BOOST|"},
            {WQM_CASESENS, "CASESENS|"},
            {WQM_DIACSENS, "DIACSENS|"},
            {WQM_FUZZY, "FUZZY|"},
            {WQM_NOSTEM, "NOSTEM|"},
        }, afterSlack[] = {
            {WQM_PROX, "PROX|"},
            {WQM_REGEX, "REGEX|"},
            {WQM_SLOPPY, "SLOPPY|"},
            {WQM_WORDS, "WORDS|"},
        };

        for (unsigned int i = 0;
             i < sizeof(beforeSlack) / sizeof(beforeSlack[0]); i++) {
            if (m_modifiers & beforeSlack[i].bit)
                desc += beforeSlack[i].label;
        }
        if (m_modifiers & WQM_PHRASESLACK) {
            char buf[100];
            sprintf(buf, "%d", m_slack);
            desc += "PHRASESLACK(" + string(buf) + string(")|");
        }
        for (unsigned int i = 0;
             i < sizeof(afterSlack) / sizeof(afterSlack[0]); i++) {
            if (m_modifiers & afterSlack[i].bit)
                desc += afterSlack[i].label;
        }

        // Remove the separator following the last modifier name
        if (desc.length() > 0 && desc[desc.length() - 1] == '|')
            desc.erase(desc.length() - 1);
    }
    desc += " ";
}
|
||||
|
||||
// The string query parser code:
|
||||
|
||||
/* Shamelessly lifted from Beagle:
|
||||
* This is our regular Expression Pattern:
|
||||
* we expect something like this:
|
||||
* -key:"Value String"modifiers
|
||||
* key:Value
|
||||
* or
|
||||
* Value
|
||||
*/
|
||||
|
||||
/* The master regular expression used to parse a query string
|
||||
* Sub-expressions in parenthesis are numbered from 1. Each opening
|
||||
 * parenthesis increases the index, but we're not interested in all of them.
|
||||
* Deviations from standard:
|
||||
* Relation: the standard-conformant line read as (release<1.16):
|
||||
"(:|=|<|>|<=|>=)" //7 Relation
|
||||
but we are not actually making use of the relation type
|
||||
(interpreting all as ":"), and this can produce unexpected results
|
||||
as a (ie pasted) search for nonexfield=value will silently drop
|
||||
the nonexfield part, while the user probably was not aware of
|
||||
triggering a field search (expecting just ':' to do this).
|
||||
*/
|
||||
// The pieces below are adjacent string literals: they are concatenated by
// the compiler into one single regular expression, which is compiled with
// REG_EXTENDED by StringToWasaQuery::Internal::stringToQuery(). The number
// in each trailing comment is the index of the parenthesized sub-expression
// opened on that line; the interesting ones have symbolic names in
// enum SbMatchIdx.
static const char * parserExpr =
|
||||
"(OR|\\|\\|)[[:space:]]*" //1 OR,||
|
||||
"|"
|
||||
"(AND|&&)[[:space:]]*" // 2 AND,&& (ignored, default)
|
||||
"|"
|
||||
// Third alternative: a search clause, with optional +/- and field parts
"(" //3
|
||||
"([+-])?" //4 Force or exclude indicator
|
||||
"(" //5
|
||||
"([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre"
|
||||
"[[:space:]]*"
|
||||
// Every alternative in the relation group is a single character: a
// two-character operator (<= or >=) is never captured whole by group 7.
"(:|=|>|<)" //7 Relation
|
||||
"[[:space:]]*)?"
|
||||
// The value: either a double-quoted string followed by optional
// modifier characters, or a plain term without spaces or double quotes.
"(" //8
|
||||
"(\"" //9
|
||||
"([^\"]+)" //10 "A quoted term"
|
||||
"\")"
|
||||
"([bcCdDeflLoprsw.0-9]*)" //11 modifiers
|
||||
"|"
|
||||
"([^[:space:]\"]+)" //12 ANormalTerm
|
||||
")"
|
||||
")[[:space:]]*"
|
||||
;
|
||||
|
||||
// Names of the regular expression sub-matches, indexed by sub-expression
// number. Only used for debug traces; unnamed entries are for
// sub-expressions we are not interested in. The size of this table also
// defines NMATCH.
static const char *matchNames[] = {
    "",            //  0
    "OR",          //  1
    "AND",         //  2
    "",            //  3
    "+-",          //  4
    "",            //  5
    "FIELD",       //  6
    "RELATION",    //  7
    "",            //  8
    "",            //  9
    "QUOTEDTERM",  // 10
    "MODIFIERS",   // 11
    "TERM",        // 12
};
// Number of entries in matchNames
#define NMATCH (sizeof(matchNames) / sizeof(matchNames[0]))

// Symbolic names for the interesting submatch indices
enum SbMatchIdx {
    SMI_OR = 1,
    SMI_AND = 2,
    SMI_PM = 4,
    SMI_FIELD = 6,
    SMI_REL = 7,
    SMI_QUOTED = 10,
    SMI_MODIF = 11,
    SMI_TERM = 12
};

// Sizes (without the final null byte) of the buffers used for sub-match
// copies and for regular expression error messages
static const int maxmatchlen = 1024;
static const int errbuflen = 300;
|
||||
|
||||
// Private implementation of the query string parser: holds the compiled
// regular expression, the sub-match array from the latest regexec() and
// the current position inside the string being parsed.
class StringToWasaQuery::Internal {
public:
    Internal()
        : m_cp(0), m_rxneedsfree(false)
    {}
    ~Internal()
    {
        if (m_rxneedsfree)
            regfree(&m_rx);
    }

    // Copy the text for sub-match i of the latest regexec() to 'match',
    // as a null-terminated string.
    //
    // @param i sub-expression index (see enum SbMatchIdx).
    // @param match output buffer. It must have room for at least
    //    maxmatchlen + 1 bytes.
    // @param reason set to an error message when the sub-match exists
    //    but can't be used (empty, or too long for the buffer).
    // @return true if the sub-match exists and was copied. False if the
    //    index is out of range, the sub-expression did not match, or on
    //    error (in which case reason is set).
    bool checkSubMatch(int i, char *match, string& reason)
    {
        if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) {
            //DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n",
            //i, m_pmatch[i].rm_so));
            return false;
        }
        const long len = long(m_pmatch[i].rm_eo) - long(m_pmatch[i].rm_so);
        if (len <= 0) {
            // weird and fatal
            reason = "Internal regular expression handling error";
            return false;
        }
        if (len > maxmatchlen) {
            // The text comes from the user query string: never write
            // past the end of the caller's buffer.
            reason = "Query element too long";
            return false;
        }
        //DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so,
        //m_pmatch[i].rm_eo));
        memcpy(match, m_cp + m_pmatch[i].rm_so, len);
        match[len] = 0;
        return true;
    }

    WasaQuery *stringToQuery(const string& str, string& reason);

    friend class StringToWasaQuery;
private:
    // Current position in the string being parsed. Sub-match offsets
    // are relative to this.
    const char *m_cp;
    regex_t m_rx;
    // True while m_rx holds a compiled expression which must be freed
    bool m_rxneedsfree;
    regmatch_t m_pmatch[NMATCH];
};
|
||||
|
||||
// Create the parser along with its private implementation object.
StringToWasaQuery::StringToWasaQuery()
    : internal(new StringToWasaQuery::Internal())
{
}
|
||||
|
||||
// Release the private implementation object.
StringToWasaQuery::~StringToWasaQuery()
{
    Internal *impl = internal;
    delete impl;
}
|
||||
|
||||
// Parse a query string into a WasaQuery tree.
//
// @param str the query string.
// @param reason set to an error message if parsing fails.
// @return the query tree, or 0 on error or if there is no implementation
//   object.
WasaQuery *
StringToWasaQuery::stringToQuery(const string& str, string& reason)
{
    if (internal == 0)
        return 0;
    WasaQuery *wq = internal->stringToQuery(str, reason);
    // The debug dump dereferences the pointer: skip it after a failed parse
    if (wq != 0) {
        DUMPQ(wq);
    }
    return wq;
}
|
||||
|
||||
WasaQuery *
|
||||
StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
||||
{
|
||||
if (m_rxneedsfree)
|
||||
regfree(&m_rx);
|
||||
|
||||
char errbuf[errbuflen+1];
|
||||
int errcode;
|
||||
if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) {
|
||||
regerror(errcode, &m_rx, errbuf, errbuflen);
|
||||
reason = errbuf;
|
||||
return 0;
|
||||
}
|
||||
m_rxneedsfree = true;
|
||||
|
||||
const char *cpe;
|
||||
m_cp = str.c_str();
|
||||
cpe = str.c_str() + str.length();
|
||||
|
||||
WasaQuery *query = new WasaQuery;
|
||||
query->m_op = WasaQuery::OP_AND;
|
||||
WasaQuery *orChain = 0;
|
||||
bool prev_or = false;
|
||||
|
||||
// Loop on repeated regexp matches on the main string.
|
||||
for (int loop = 0;;loop++) {
|
||||
if ((errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0))) {
|
||||
regerror(errcode, &m_rx, errbuf, errbuflen);
|
||||
reason = errbuf;
|
||||
return 0;
|
||||
}
|
||||
if (m_pmatch[0].rm_eo <= 0) {
|
||||
// weird and fatal
|
||||
reason = "Internal regular expression handling error";
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef DEB_WASASTRINGTOQ
|
||||
DPRINT((stderr, "Next part:\n"));
|
||||
for (unsigned int i = 0; i < NMATCH; i++) {
|
||||
if (m_pmatch[i].rm_so == -1) continue;
|
||||
char match[maxmatchlen+1];
|
||||
memcpy(match, m_cp + m_pmatch[i].rm_so,
|
||||
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
|
||||
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
|
||||
if (matchNames[i][0])
|
||||
DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
|
||||
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
|
||||
}
|
||||
#endif
|
||||
|
||||
char match[maxmatchlen+1];
|
||||
if (checkSubMatch(SMI_OR, match, reason)) {
|
||||
if (prev_or) {
|
||||
// Bad syntax
|
||||
reason = "Bad syntax: consecutive OR";
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (orChain == 0) {
|
||||
// Fist OR seen: start OR subclause.
|
||||
if ((orChain = new WasaQuery()) == 0) {
|
||||
reason = "Out of memory";
|
||||
return 0;
|
||||
}
|
||||
orChain->m_op = WasaQuery::OP_OR;
|
||||
}
|
||||
|
||||
// For the first OR, we need to transfer the previous
|
||||
// query from the main vector to the OR subquery
|
||||
if (orChain->m_subs.empty() && !query->m_subs.empty()) {
|
||||
orChain->m_subs.push_back(query->m_subs.back());
|
||||
query->m_subs.pop_back();
|
||||
}
|
||||
prev_or = true;
|
||||
|
||||
} else if (checkSubMatch(SMI_AND, match, reason)) {
|
||||
// Do nothing, AND is the default. We might want to check for
|
||||
// errors like consecutive ANDs, or OR AND
|
||||
|
||||
} else {
|
||||
|
||||
WasaQuery *nclause = new WasaQuery;
|
||||
if (nclause == 0) {
|
||||
reason = "Out of memory";
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Check for quoted or unquoted value
|
||||
unsigned int mods = 0;
|
||||
if (checkSubMatch(SMI_QUOTED, match, reason)) {
|
||||
nclause->m_value = match;
|
||||
} else if (checkSubMatch(SMI_TERM, match, reason)) {
|
||||
nclause->m_value = match;
|
||||
}
|
||||
|
||||
if (nclause->m_value.empty()) {
|
||||
// Isolated +- or fieldname: without a value. Ignore until
|
||||
// told otherwise.
|
||||
DPRINT((stderr, "Clause with empty value, skipping\n"));
|
||||
delete nclause;
|
||||
goto nextfield;
|
||||
}
|
||||
|
||||
if (checkSubMatch(SMI_MODIF, match, reason)) {
|
||||
DPRINT((stderr, "Got modifiers: [%s]\n", match));
|
||||
for (unsigned int i = 0; i < strlen(match); i++) {
|
||||
switch (match[i]) {
|
||||
case 'b':
|
||||
mods |= WasaQuery::WQM_BOOST;
|
||||
nclause->m_weight = 10.0;
|
||||
break;
|
||||
case 'c': break;
|
||||
case 'C': mods |= WasaQuery::WQM_CASESENS; break;
|
||||
case 'd': break;
|
||||
case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
|
||||
case 'e': mods |= WasaQuery::WQM_CASESENS |
|
||||
WasaQuery::WQM_DIACSENS |
|
||||
WasaQuery::WQM_NOSTEM;
|
||||
break;
|
||||
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
|
||||
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
|
||||
case 'L': break;
|
||||
case 'o':
|
||||
mods |= WasaQuery::WQM_PHRASESLACK;
|
||||
// Default slack if specified only by 'o' is 10.
|
||||
nclause->m_slack = 10;
|
||||
if (i < strlen(match) - 1) {
|
||||
char *endptr;
|
||||
int slack = strtol(match+i+1, &endptr, 10);
|
||||
if (endptr != match+i+1) {
|
||||
i += endptr - (match+i+1);
|
||||
nclause->m_slack = slack;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'p':
|
||||
mods |= WasaQuery::WQM_PROX;
|
||||
nclause->m_slack = 10;
|
||||
break;
|
||||
case 'r': mods |= WasaQuery::WQM_REGEX; break;
|
||||
case 's': mods |= WasaQuery::WQM_SLOPPY; break;
|
||||
case 'w': mods |= WasaQuery::WQM_WORDS; break;
|
||||
case '.':case '0':case '1':case '2':case '3':case '4':
|
||||
case '5':case '6':case '7':case '8':case '9':
|
||||
{
|
||||
int n;
|
||||
float factor;
|
||||
if (sscanf(match+i, "%f %n", &factor, &n)) {
|
||||
nclause->m_weight = factor;
|
||||
DPRINT((stderr, "Got factor %.2f len %d\n",
|
||||
factor, n));
|
||||
}
|
||||
if (n)
|
||||
i += n-1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
nclause->m_modifiers = WasaQuery::Modifier(mods);
|
||||
|
||||
// Field indicator ?
|
||||
if (checkSubMatch(SMI_FIELD, match, reason)) {
|
||||
// We used Check for special fields indicating sorting
|
||||
// etc. here but this went away from the spec. See 1.4
|
||||
// if it comes back
|
||||
nclause->m_fieldspec = match;
|
||||
if (checkSubMatch(SMI_REL, match, reason)) {
|
||||
switch (match[0]) {
|
||||
case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break;
|
||||
case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break;
|
||||
case '<':
|
||||
if (match[1] == '=')
|
||||
nclause->m_rel = WasaQuery::REL_LTE;
|
||||
else
|
||||
nclause->m_rel = WasaQuery::REL_LT;
|
||||
break;
|
||||
case '>':
|
||||
if (match[1] == '=')
|
||||
nclause->m_rel = WasaQuery::REL_GTE;
|
||||
else
|
||||
nclause->m_rel = WasaQuery::REL_GT;
|
||||
break;
|
||||
default:
|
||||
nclause->m_rel = WasaQuery::REL_CONTAINS;
|
||||
}
|
||||
} else {
|
||||
// ?? If field matched we should have a relation
|
||||
nclause->m_rel = WasaQuery::REL_CONTAINS;
|
||||
}
|
||||
}
|
||||
|
||||
// +- indicator ?
|
||||
if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') {
|
||||
nclause->m_op = WasaQuery::OP_EXCL;
|
||||
} else {
|
||||
nclause->m_op = WasaQuery::OP_LEAF;
|
||||
}
|
||||
|
||||
if (prev_or) {
|
||||
// The precedent token was an OR, add new clause to or chain
|
||||
//DPRINT((stderr, "Adding to OR chain\n"));
|
||||
orChain->m_subs.push_back(nclause);
|
||||
} else {
|
||||
if (orChain) {
|
||||
// Getting out of OR. Add the OR subquery to the main one
|
||||
//DPRINT((stderr, "Adding OR chain to main\n"));
|
||||
query->m_subs.push_back(orChain);
|
||||
orChain = 0;
|
||||
}
|
||||
//DPRINT((stderr, "Adding to main chain\n"));
|
||||
// Add new clause to main query
|
||||
query->m_subs.push_back(nclause);
|
||||
}
|
||||
|
||||
prev_or = false;
|
||||
}
|
||||
|
||||
nextfield:
|
||||
// Advance current string position. We checked earlier that
|
||||
// the increment is strictly positive, so we won't loop
|
||||
// forever
|
||||
m_cp += m_pmatch[0].rm_eo;
|
||||
if (m_cp >= cpe)
|
||||
break;
|
||||
}
|
||||
|
||||
if (orChain) {
|
||||
// Getting out of OR. Add the OR subquery to the main one
|
||||
DPRINT((stderr, "Adding OR chain to main.Before: \n"));
|
||||
DUMPQ(query);
|
||||
DUMPQ(orChain);
|
||||
query->m_subs.push_back(orChain);
|
||||
}
|
||||
|
||||
regfree(&m_rx);
|
||||
m_rxneedsfree = false;
|
||||
return query;
|
||||
}
|
||||
|
||||
#else // TEST
|
||||
|
||||
#include <stdio.h>
|
||||
#include "wasastringtoquery.h"
|
||||
|
||||
static char *thisprog;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
if (argc != 1) {
|
||||
fprintf(stderr, "need one arg\n");
|
||||
exit(1);
|
||||
}
|
||||
const string str = *argv++;argc--;
|
||||
string reason;
|
||||
StringToWasaQuery qparser;
|
||||
WasaQuery *q = qparser.stringToQuery(str, reason);
|
||||
if (q == 0) {
|
||||
fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str());
|
||||
exit(1);
|
||||
}
|
||||
string desc;
|
||||
q->describe(desc);
|
||||
fprintf(stderr, "Finally: %s\n", desc.c_str());
|
||||
exit(0);
|
||||
}
|
||||
|
||||
#endif // TEST_WASASTRINGTOQUERY
|
108
src/query/wasastringtoquery.h
Normal file
108
src/query/wasastringtoquery.h
Normal file
|
@ -0,0 +1,108 @@
|
|||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
|
||||
#define _WASASTRINGTOQUERY_H_INCLUDED_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
/* Note: Xesam used to be named wasabi. We changed the references to wasabi in
|
||||
the comments, but not the code */
|
||||
|
||||
/**
 * A simple class to represent a parsed Xesam user language element.
 * Can hold one leaf element or an array of subqueries to be joined by AND/OR
 *
 * The complete query is represented by a top WasaQuery holding a
 * chain of ANDed subclauses. Some of the subclauses may be themselves
 * OR'ed lists (it doesn't go deeper). Entries in the AND list may be
 * negated (AND NOT).
 *
 * For LEAF elements, the value can hold one or several words. In the
 * latter case, it should be interpreted as a phrase (comes from a
 * user-entered "quoted string"), except if the modifier flags say otherwise.
 *
 * Some fields only make sense either for compound or LEAF queries. This
 * is commented for each. We should subclass really.
 *
 * Note that wasaStringToQuery supposedly parses the whole Xesam
 * User Search Language v 0.95, but that some elements are dropped or
 * ignored during the translation to a native Recoll query in wasaToRcl
 *
 * Ownership: the subqueries in m_subs are deleted by the destructor, so
 * an object with subqueries must not be copied (both copies would end up
 * deleting the same subqueries).
 */
class WasaQuery {
public:
    /** Type of this element: leaf or AND/OR chain */
    enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND};
    /** Relation to be searched between field and value. Recoll actually only
        supports "contain" except for a size field */
    enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE,
              REL_GT, REL_GTE};
    /** Modifiers for term handling: case/diacritics handling,
        stemming control */
    enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
                   WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
                   WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
                   WQM_FUZZY = 0x200};

    typedef std::vector<WasaQuery*> subqlist_t;

    /** Create an empty element: null operation and relation, no
        modifiers, no slack, weight 1.0. Every scalar member gets a
        defined value. */
    WasaQuery()
        : m_op(OP_NULL), m_rel(REL_NULL), m_modifiers(0), m_slack(0),
          m_weight(1.0)
    {}

    ~WasaQuery();

    /** Get string describing the query tree from this point */
    void describe(std::string &desc) const;

    /** Op to be performed on either value (may be LEAF or EXCL, or subqs */
    WasaQuery::Op m_op;

    /** Field specification if any (ie: title, author ...) Only OPT_LEAF */
    std::string m_fieldspec;
    /** Relation between field and value: =, :, <,>,<=, >= */
    WasaQuery::Rel m_rel;

    /* String value. Valid for op == OP_LEAF or EXCL */
    std::string m_value;

    /** Subqueries. Valid for conjunctions */
    std::vector<WasaQuery*> m_subs;

    /** OR of Modifier values */
    unsigned int m_modifiers;
    /** Slack value associated with the PHRASESLACK and PROX modifiers */
    int m_slack;
    /** Weight factor. Default 1.0 */
    float m_weight;
};
|
||||
|
||||
/**
|
||||
* Wasabi query string parser class. Could be a simple function
|
||||
* really, but there might be some parser initialization work done in
|
||||
* the constructor.
|
||||
*/
|
||||
class StringToWasaQuery {
|
||||
public:
|
||||
StringToWasaQuery();
|
||||
~StringToWasaQuery();
|
||||
WasaQuery *stringToQuery(const string& str, string& reason);
|
||||
class Internal;
|
||||
private:
|
||||
Internal *internal;
|
||||
};
|
||||
|
||||
#endif /* _WASASTRINGTOQUERY_H_INCLUDED_ */
|
293
src/query/wasatorcl.cpp
Normal file
293
src/query/wasatorcl.cpp
Normal file
|
@ -0,0 +1,293 @@
|
|||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <algorithm>
|
||||
using std::string;
|
||||
using std::list;
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "rcldb.h"
|
||||
#include "searchdata.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "rclconfig.h"
|
||||
#include "refcntr.h"
|
||||
#include "textsplit.h"
|
||||
|
||||
/**
 * Translate a parsed WasaQuery tree into a newly allocated Rcl::SearchData.
 *
 * The top element must be an AND or OR chain. Each subclause is examined
 * in turn:
 *  - pseudo-fields "mime"/"format", "rclcat"/"type", "dir", "date" and
 *    "size" set filtering data on the SearchData itself instead of
 *    adding a clause;
 *  - OP_LEAF elements become a phrase/near clause (several words or any
 *    modifier set) or a simple AND clause;
 *  - OP_EXCL elements become an exclusion clause (only inside AND chains);
 *  - OP_OR elements are translated recursively into a sub-query clause.
 *
 * @param config used to map rclcat/type categories to mime types. May be
 *        null, in which case any rclcat/type clause is reported as an error.
 * @param wasa the query tree to translate. Leaf elements may be modified
 *        (autosuffs matches get their field set to "ext").
 * @param autosuffs list of suffixes (parsed with stringToStrings()): a
 *        fieldless leaf term found in this list is turned into an "ext"
 *        field query with stemming disabled.
 * @param reason set to an explanation when the translation fails.
 * @return the search data, or 0 on error. The partially built search data
 *         is released on every error path.
 */
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
                                       const string& autosuffs, string& reason)
{
    if (wasa == 0) {
        reason = "NULL query";
        return 0;
    }
    if (wasa->m_op != WasaQuery::OP_AND && wasa->m_op != WasaQuery::OP_OR) {
        reason = "Top query neither AND nor OR ?";
        LOGERR(("wasaQueryToRcl: top query neither AND nor OR!\n"));
        return 0;
    }

    Rcl::SearchData *sdata = new
        Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
                        Rcl::SCLT_OR);
    LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
             "AND" : "OR"));

    WasaQuery::subqlist_t::iterator it;
    Rcl::SearchDataClause *nclause;

    // Walk the list of clauses. Some pseudo-field types need special
    // processing, which results in setting data in the top struct
    // instead of adding a clause. We check for these first
    for (it = wasa->m_subs.begin(); it != wasa->m_subs.end(); it++) {
        WasaQuery *elt = *it;

        if (!stringicmp("mime", elt->m_fieldspec) ||
            !stringicmp("format", elt->m_fieldspec)) {
            if (elt->m_op == WasaQuery::OP_LEAF) {
                sdata->addFiletype(elt->m_value);
            } else if (elt->m_op == WasaQuery::OP_EXCL) {
                sdata->remFiletype(elt->m_value);
            } else {
                reason = "internal error: mime clause neither leaf nor excl??";
                delete sdata;
                return 0;
            }
            continue;
        }

        // Xesam uses "type", we also support "rclcat", for broad
        // categories like "audio", "presentation", etc.
        if (!stringicmp("rclcat", elt->m_fieldspec) ||
            !stringicmp("type", elt->m_fieldspec)) {
            if (elt->m_op != WasaQuery::OP_LEAF &&
                elt->m_op != WasaQuery::OP_EXCL) {
                reason = "internal error: rclcat/type clause neither leaf "
                    "nor excl??";
                delete sdata;
                return 0;
            }
            list<string> mtypes;
            if (config && config->getMimeCatTypes(elt->m_value, mtypes)
                && !mtypes.empty()) {
                for (list<string>::iterator mit = mtypes.begin();
                     mit != mtypes.end(); mit++) {
                    if (elt->m_op == WasaQuery::OP_LEAF)
                        sdata->addFiletype(*mit);
                    else
                        sdata->remFiletype(*mit);
                }
            } else {
                reason = "Unknown rclcat/type value: no mime types found";
                delete sdata;
                return 0;
            }
            continue;
        }

        // Filtering on location
        if (!stringicmp("dir", elt->m_fieldspec)) {
            string dir = path_tildexpand(elt->m_value);
            sdata->setTopdir(dir, elt->m_op == WasaQuery::OP_EXCL,
                             elt->m_weight);
            continue;
        }

        // Handle "date" spec
        if (!stringicmp("date", elt->m_fieldspec)) {
            if (elt->m_op != WasaQuery::OP_LEAF) {
                reason = "Negative date filtering not supported";
                delete sdata;
                return 0;
            }
            DateInterval di;
            if (!parsedateinterval(elt->m_value, &di)) {
                LOGERR(("wasaQueryToRcl: bad date interval format\n"));
                reason = "Bad date interval format";
                delete sdata;
                return 0;
            }
            LOGDEB(("wasaQueryToRcl:: date span:  %d-%d-%d/%d-%d-%d\n",
                    di.y1,di.m1,di.d1, di.y2,di.m2,di.d2));
            sdata->setDateSpan(&di);
            continue;
        }

        // Handle "size" spec
        if (!stringicmp("size", elt->m_fieldspec)) {
            if (elt->m_op != WasaQuery::OP_LEAF) {
                reason = "Negative size filtering not supported";
                delete sdata;
                return 0;
            }
            char *cp;
            size_t size = strtoll(elt->m_value.c_str(), &cp, 10);
            // An optional single-letter decimal multiplier may follow
            // the number.
            if (*cp != 0) {
                switch (*cp) {
                case 'k': case 'K': size *= 1E3;break;
                case 'm': case 'M': size *= 1E6;break;
                case 'g': case 'G': size *= 1E9;break;
                case 't': case 'T': size *= 1E12;break;
                default:
                    reason = string("Bad multiplier suffix: ") + *cp;
                    delete sdata;
                    return 0;
                }
            }

            switch (elt->m_rel) {
            case WasaQuery::REL_EQUALS:
                sdata->setMaxSize(size);
                sdata->setMinSize(size);
                break;
            case WasaQuery::REL_LT:
            case WasaQuery::REL_LTE:
                sdata->setMaxSize(size);
                break;
            case WasaQuery::REL_GT:
            case WasaQuery::REL_GTE:
                sdata->setMinSize(size);
                break;
            default:
                reason = "Bad relation operator with size query. Use > < or =";
                delete sdata;
                return 0;
            }
            continue;
        }

        // "Regular" processing follows:
        switch (elt->m_op) {
        case WasaQuery::OP_NULL:
        case WasaQuery::OP_AND:
        default:
            // Not fatal: the bad element is skipped, but the reason is
            // recorded.
            reason = "Found bad NULL or AND query type in list";
            LOGERR(("wasaQueryToRcl: found bad NULL or AND q type in list\n"));
            continue;

        case WasaQuery::OP_LEAF: {
            LOGDEB(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n",
                    elt->m_fieldspec.c_str(), elt->m_value.c_str(),
                    elt->m_slack));

            // Change terms found in the "autosuffs" list into "ext"
            // field queries
            if (elt->m_fieldspec.empty() && !autosuffs.empty()) {
                vector<string> asfv;
                if (stringToStrings(autosuffs, asfv)) {
                    if (find_if(asfv.begin(), asfv.end(),
                                StringIcmpPred(elt->m_value)) != asfv.end()) {
                        elt->m_fieldspec = "ext";
                        elt->m_modifiers |= WasaQuery::WQM_NOSTEM;
                    }
                }
            }

            unsigned int mods = (unsigned int)elt->m_modifiers;

            // I'm not sure I understand the phrase/near detection
            // thereafter anymore, maybe it would be better to have an
            // explicit flag. Mods can only be set after a double
            // quote.
            if (TextSplit::hasVisibleWhite(elt->m_value) || mods) {
                Rcl::SClType tp = Rcl::SCLT_PHRASE;
                if (mods & WasaQuery::WQM_PROX) {
                    tp = Rcl::SCLT_NEAR;
                }
                nclause = new Rcl::SearchDataClauseDist(tp, elt->m_value,
                                                        elt->m_slack,
                                                        elt->m_fieldspec);
            } else {
                nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
                                                          elt->m_value,
                                                          elt->m_fieldspec);
            }
            if (nclause == 0) {
                reason = "Out of memory";
                LOGERR(("wasaQueryToRcl: out of memory\n"));
                delete sdata;
                return 0;
            }
            if (mods & WasaQuery::WQM_NOSTEM) {
                nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            }
            if (elt->m_weight != 1.0)
                nclause->setWeight(elt->m_weight);
            sdata->addClause(nclause);
        }
            break;

        case WasaQuery::OP_EXCL:
            LOGDEB2(("wasaQueryToRcl: excl clause [%s]:[%s]\n",
                     elt->m_fieldspec.c_str(), elt->m_value.c_str()));
            if (wasa->m_op != WasaQuery::OP_AND) {
                LOGERR(("wasaQueryToRcl: negative clause inside OR list!\n"));
                continue;
            }
            // Note: have to add dquotes which will be translated to
            // phrase if there are several words in there. Not pretty
            // but should work. If there is actually a single
            // word, it will not be taken as a phrase, and
            // stem-expansion will work normally
            // Have to do this because searchdata has nothing like and_not
            nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
                                                      string("\"") +
                                                      elt->m_value + "\"",
                                                      elt->m_fieldspec);

            if (nclause == 0) {
                reason = "Out of memory";
                LOGERR(("wasaQueryToRcl: out of memory\n"));
                delete sdata;
                return 0;
            }
            if (elt->m_modifiers & WasaQuery::WQM_NOSTEM)
                nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            if (elt->m_weight != 1.0)
                nclause->setWeight(elt->m_weight);
            sdata->addClause(nclause);
            break;

        case WasaQuery::OP_OR: {
            LOGDEB2(("wasaQueryToRcl: OR clause [%s]:[%s]\n",
                     elt->m_fieldspec.c_str(), elt->m_value.c_str()));
            // Create a subquery.
            Rcl::SearchData *sub =
                wasaQueryToRcl(config, elt, autosuffs, reason);
            if (sub == 0) {
                // The failed sub-list is skipped; reason was set by the
                // recursive call.
                continue;
            }
            nclause =
                new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB,
                                             RefCntr<Rcl::SearchData>(sub));
            if (nclause == 0) {
                LOGERR(("wasaQueryToRcl: out of memory\n"));
                reason = "Out of memory";
                delete sdata;
                return 0;
            }
            if (elt->m_modifiers & WasaQuery::WQM_NOSTEM)
                nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            sdata->addClause(nclause);
        }
            break;
        }
    }

    return sdata;
}
|
||||
|
||||
/**
 * Parse a query-language string and translate it to Recoll search data.
 *
 * @param config passed through to the translation step.
 * @param qs the user query string.
 * @param reason receives an explanation on failure.
 * @param autosuffs passed through to the translation step.
 * @return the search data, or 0 if parsing or translation failed.
 */
Rcl::SearchData *wasaStringToRcl(RclConfig *config,
                                 const string &qs, string &reason,
                                 const string& autosuffs)
{
    StringToWasaQuery parser;
    // NOTE(review): the parsed tree is never released here; confirm
    // whether the parser or the caller owns it.
    WasaQuery *parsed = parser.stringToQuery(qs, reason);
    return parsed ? wasaQueryToRcl(config, parsed, autosuffs, reason) : 0;
}
|
31
src/query/wasatorcl.h
Normal file
31
src/query/wasatorcl.h
Normal file
|
@ -0,0 +1,31 @@
|
|||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _WASATORCL_H_INCLUDED_
|
||||
#define _WASATORCL_H_INCLUDED_
|
||||
#include <string>
|
||||
using std::string;
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "searchdata.h"
|
||||
|
||||
class RclConfig;
|
||||
|
||||
/**
 * Translate a query-language string into Recoll search data.
 * Returns 0 on failure, with an explanation stored in reason.
 * autosuffs is optional and defaults to an empty string.
 */
extern Rcl::SearchData *wasaStringToRcl(RclConfig *config,
                                        const string& query,
                                        string &reason,
                                        const string& autosuffs = string());
|
||||
#endif /* _WASATORCL_H_INCLUDED_ */
|
319
src/query/xadump.cpp
Normal file
319
src/query/xadump.cpp
Normal file
|
@ -0,0 +1,319 @@
|
|||
/* Copyright (C) 2004 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <signal.h>
|
||||
#include <strings.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "pathut.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
#include "utf8iter.h"
|
||||
|
||||
#include "xapian.h"
|
||||
|
||||
static string thisprog;
|
||||
|
||||
static string usage =
|
||||
" -d <dbdir> -e <output encoding>\n"
|
||||
" -i docid -D : get document data for docid\n"
|
||||
" -i docid -X : delete document docid\n"
|
||||
" -i docid -b : 'rebuild' document from term positions\n"
|
||||
" -i docid -T : term list for doc docid\n"
|
||||
" -t term -E : term existence test\n"
|
||||
" -t term -F : retrieve term frequency data for given term\n"
|
||||
" -t term -P : retrieve postings for term\n"
|
||||
" -T : list all terms\n"
|
||||
" -f : precede each term in the list with its occurrence counts\n"
|
||||
" -n : raw data (no [])\n"
|
||||
" -l : don't list prefixed terms\n"
|
||||
" -x : separate each output char with a space\n"
|
||||
" -s : special mode to dump recoll stem db\n"
|
||||
" -q term [term ...] : perform AND query\n"
|
||||
" \n\n"
|
||||
;
|
||||
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
cerr << thisprog << ": usage:\n" << usage;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static int op_flags;
|
||||
#define OPT_D 0x1
|
||||
#define OPT_E 0x2
|
||||
#define OPT_F 0x4
|
||||
#define OPT_P 0x8
|
||||
#define OPT_T 0x10
|
||||
#define OPT_X 0x20
|
||||
#define OPT_b 0x40
|
||||
#define OPT_d 0x80
|
||||
#define OPT_e 0x100
|
||||
#define OPT_f 0x200
|
||||
#define OPT_i 0x400
|
||||
#define OPT_n 0x800
|
||||
#define OPT_q 0x1000
|
||||
#define OPT_s 0x2000
|
||||
#define OPT_t 0x4000
|
||||
#define OPT_x 0x8000
|
||||
#define OPT_l 0x10000
|
||||
|
||||
// Compute an exploded version of string, inserting a space between each char.
|
||||
// (no character combining possible)
|
||||
// Return a copy of the input with a space inserted between successive
// characters (as iterated by Utf8Iter) when the -x option is set.
// Without -x the input is returned unchanged.
static string detailstring(const string& in)
{
    if ((op_flags & OPT_x) == 0) {
        return in;
    }

    string exploded;
    Utf8Iter cursor(in);
    while (!cursor.eof()) {
        cursor.appendchartostring(exploded);
        exploded += ' ';
        cursor++;
    }

    // Every character was followed by a space: drop the trailing one
    if (exploded.empty()) {
        return exploded;
    }
    exploded.resize(exploded.size() - 1);
    return exploded;
}
|
||||
|
||||
Xapian::Database *db;
|
||||
|
||||
static void cleanup()
|
||||
{
|
||||
delete db;
|
||||
}
|
||||
|
||||
static void sigcleanup(int sig)
|
||||
{
|
||||
fprintf(stderr, "sigcleanup\n");
|
||||
cleanup();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
string dbdir = path_cat(path_home(), ".recoll/xapiandb");
|
||||
string outencoding = "ISO8859-1";
|
||||
int docid = 1;
|
||||
string aterm;
|
||||
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
while (argc > 0 && **argv == '-') {
|
||||
(*argv)++;
|
||||
if (!(**argv))
|
||||
/* Cas du "adb - core" */
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'b': op_flags |= OPT_b; break;
|
||||
case 'D': op_flags |= OPT_D; break;
|
||||
case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
|
||||
dbdir = *(++argv);
|
||||
argc--;
|
||||
goto b1;
|
||||
case 'E': op_flags |= OPT_E; break;
|
||||
case 'e': op_flags |= OPT_d; if (argc < 2) Usage();
|
||||
outencoding = *(++argv);
|
||||
argc--;
|
||||
goto b1;
|
||||
case 'F': op_flags |= OPT_F; break;
|
||||
case 'f': op_flags |= OPT_f; break;
|
||||
case 'i': op_flags |= OPT_i; if (argc < 2) Usage();
|
||||
if (sscanf(*(++argv), "%d", &docid) != 1) Usage();
|
||||
argc--;
|
||||
goto b1;
|
||||
case 'l': op_flags |= OPT_l; break;
|
||||
case 'n': op_flags |= OPT_n; break;
|
||||
case 'P': op_flags |= OPT_P; break;
|
||||
case 'q': op_flags |= OPT_q; break;
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
case 'T': op_flags |= OPT_T; break;
|
||||
case 't': op_flags |= OPT_t; if (argc < 2) Usage();
|
||||
aterm = *(++argv);
|
||||
argc--;
|
||||
goto b1;
|
||||
case 'X': op_flags |= OPT_X; break;
|
||||
case 'x': op_flags |= OPT_x; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
b1: argc--; argv++;
|
||||
}
|
||||
|
||||
vector<string> qterms;
|
||||
if (op_flags & OPT_q) {
|
||||
fprintf(stderr, "q argc %d\n", argc);
|
||||
if (argc < 1)
|
||||
Usage();
|
||||
while (argc > 0) {
|
||||
qterms.push_back(*argv++); argc--;
|
||||
}
|
||||
}
|
||||
|
||||
if (argc != 0)
|
||||
Usage();
|
||||
|
||||
atexit(cleanup);
|
||||
if (signal(SIGHUP, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGHUP, sigcleanup);
|
||||
if (signal(SIGINT, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGINT, sigcleanup);
|
||||
if (signal(SIGQUIT, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGQUIT, sigcleanup);
|
||||
if (signal(SIGTERM, SIG_IGN) != SIG_IGN)
|
||||
signal(SIGTERM, sigcleanup);
|
||||
|
||||
try {
|
||||
db = new Xapian::Database(dbdir);
|
||||
|
||||
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
|
||||
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
|
||||
|
||||
if (op_flags & OPT_T) {
|
||||
Xapian::TermIterator term;
|
||||
string printable;
|
||||
string op = (op_flags & OPT_n) ? string(): "[";
|
||||
string cl = (op_flags & OPT_n) ? string(): "]";
|
||||
if (op_flags & OPT_i) {
|
||||
for (term = db->termlist_begin(docid);
|
||||
term != db->termlist_end(docid);term++) {
|
||||
const string& s = *term;
|
||||
if ((op_flags&OPT_l) &&
|
||||
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
|
||||
continue;
|
||||
cout << op << detailstring(s) << cl << endl;
|
||||
}
|
||||
} else {
|
||||
for (term = db->allterms_begin();
|
||||
term != db->allterms_end();term++) {
|
||||
const string& s = *term;
|
||||
if ((op_flags&OPT_l) &&
|
||||
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
|
||||
continue;
|
||||
if (op_flags & OPT_f)
|
||||
cout << db->get_collection_freq(*term) << " "
|
||||
<< term.get_termfreq() << " ";
|
||||
cout << op << detailstring(s) << cl << endl;
|
||||
}
|
||||
}
|
||||
} else if (op_flags & OPT_s) {
|
||||
for (unsigned int docid = 1;
|
||||
docid < db->get_lastdocid(); docid++) {
|
||||
// cout << docid << ": ";
|
||||
Xapian::TermIterator term;
|
||||
for (term = db->termlist_begin(docid);
|
||||
term != db->termlist_end(docid);term++) {
|
||||
cout << detailstring(*term) << " ";
|
||||
Xapian::Document doc = db->get_document(docid);
|
||||
string data = doc.get_data();
|
||||
cout << data;
|
||||
}
|
||||
}
|
||||
} else if (op_flags & OPT_D) {
|
||||
Xapian::Document doc = db->get_document(docid);
|
||||
string data = doc.get_data();
|
||||
cout << data << endl;
|
||||
} else if (op_flags & OPT_X) {
|
||||
Xapian::Document doc = db->get_document(docid);
|
||||
string data = doc.get_data();
|
||||
cout << data << endl;
|
||||
cout << "Really delete xapian document ?" << endl;
|
||||
string rep;
|
||||
cin >> rep;
|
||||
if (!rep.empty() && (rep[0] == 'y' || rep[0] == 'Y')) {
|
||||
Xapian::WritableDatabase wdb(dbdir, Xapian::DB_OPEN);
|
||||
cout << "Deleting" << endl;
|
||||
wdb.delete_document(docid);
|
||||
}
|
||||
} else if (op_flags & OPT_b) {
|
||||
if (!(op_flags & OPT_i))
|
||||
Usage();
|
||||
vector<string> buf;
|
||||
Xapian::TermIterator term;
|
||||
for (term = db->termlist_begin(docid);
|
||||
term != db->termlist_end(docid); term++) {
|
||||
Xapian::PositionIterator pos;
|
||||
for (pos = db->positionlist_begin(docid, *term);
|
||||
pos != db->positionlist_end(docid, *term); pos++) {
|
||||
if (buf.size() <= *pos)
|
||||
buf.resize((*pos)+100);
|
||||
buf[(*pos)] = detailstring(*term);
|
||||
}
|
||||
}
|
||||
for (vector<string>::iterator it = buf.begin(); it != buf.end();
|
||||
it++) {
|
||||
cout << *it << " ";
|
||||
}
|
||||
} else if (op_flags & OPT_P) {
|
||||
Xapian::PostingIterator doc;
|
||||
for (doc = db->postlist_begin(aterm);
|
||||
doc != db->postlist_end(aterm); doc++) {
|
||||
cout << *doc << "(" << doc.get_wdf() << ") : " ;
|
||||
Xapian::PositionIterator pos;
|
||||
for (pos = doc.positionlist_begin();
|
||||
pos != doc.positionlist_end(); pos++) {
|
||||
cout << *pos << " " ;
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
} else if (op_flags & OPT_F) {
|
||||
cout << "FreqFor " << aterm << " : " <<
|
||||
db->get_termfreq(aterm) << endl;
|
||||
} else if (op_flags & OPT_E) {
|
||||
cout << "Exists [" << aterm << "] : " <<
|
||||
db->term_exists(aterm) << endl;
|
||||
} else if (op_flags & OPT_q) {
|
||||
Xapian::Enquire enquire(*db);
|
||||
|
||||
Xapian::Query query(Xapian::Query::OP_AND, qterms.begin(),
|
||||
qterms.end());
|
||||
cout << "Performing query `" <<
|
||||
query.get_description() << "'" << endl;
|
||||
enquire.set_query(query);
|
||||
|
||||
Xapian::MSet matches = enquire.get_mset(0, 10);
|
||||
cout << "Estimated results: " <<
|
||||
matches.get_matches_lower_bound() << endl;
|
||||
Xapian::MSetIterator i;
|
||||
for (i = matches.begin(); i != matches.end(); ++i) {
|
||||
cout << "Document ID " << *i << "\t";
|
||||
cout << i.get_percent() << "% ";
|
||||
Xapian::Document doc = i.get_document();
|
||||
cout << "[" << doc.get_data() << "]" << endl;
|
||||
}
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
cout << "Exception: " << e.get_msg() << endl;
|
||||
} catch (const string &s) {
|
||||
cout << "Exception: " << s << endl;
|
||||
} catch (const char *s) {
|
||||
cout << "Exception: " << s << endl;
|
||||
} catch (...) {
|
||||
cout << "Caught unknown exception" << endl;
|
||||
}
|
||||
exit(0);
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue