Add a new environment variable "RECOLL_ACTIVE_EXTRA_DBS", which helps

choose the active external indexes list.
This commit is contained in:
hxcan 2012-03-29 16:56:38 +08:00
commit 5bd071c5a6
788 changed files with 332998 additions and 0 deletions

54
src/query/Makefile Normal file
View file

@ -0,0 +1,54 @@
# Build rules for the query-layer command line tools and unit-test drivers.
# Site-specific settings come from the included sysconf file.
depth = ..
include $(depth)/mk/sysconf
# Programs built by default. Everything after the '#' is commented out, so
# trhist, qtry and qxtry are not part of the default build.
PROGS = xadump recollq #trhist qtry qxtry
all: $(PROGS)
# Sources scanned for dependency generation (see alldeps.stamp below)
SRCS = xadump.cpp rclqlang.cpp
# Suffix rule: how to compile any .cpp into a .o
.cpp.o :
$(CXX) -c $(ALL_CXXFLAGS) -o $@ $<
# xadump: links against the library designated by $(BIGLIB)
XADUMP_OBJS= xadump.o $(BIGLIB)
xadump : $(XADUMP_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
# recollq: recollq.cpp is compiled with TEST_RECOLLQ defined
RECOLLQ_OBJS= recollq.o $(BIGLIB)
recollq : $(RECOLLQ_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o recollq $(RECOLLQ_OBJS) \
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
recollq.o : recollq.cpp
$(CXX) $(ALL_CXXFLAGS) -DTEST_RECOLLQ -c recollq.cpp
# trhist: test driver built from history.cpp with TEST_HISTORY defined.
# Not listed in PROGS, so it is only built on request and 'clean' does not
# remove the binary.
HISTORY_OBJS= trhist.o $(BIGLIB)
trhist : $(HISTORY_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o trhist $(HISTORY_OBJS) \
$(LIBICONV) $(LIBXAPIAN)
trhist.o : history.cpp history.h
$(CXX) $(ALL_CXXFLAGS) -DTEST_HISTORY -c -o trhist.o history.cpp
# trwasastrtoq: test driver built from wasastringtoquery.cpp with
# TEST_WASASTRINGTOQUERY defined. Also not listed in PROGS.
WASASTRINGTOQUERY_OBJS= trwasastrtoq.o $(BIGLIB)
trwasastrtoq : $(WASASTRINGTOQUERY_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o trwasastrtoq $(WASASTRINGTOQUERY_OBJS) \
$(LIBICONV) $(LIBXAPIAN)
trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h
$(CXX) $(ALL_CXXFLAGS) -DTEST_WASASTRINGTOQUERY -c \
-o trwasastrtoq.o wasastringtoquery.cpp
# The library depends on the always out-of-date 'force' target, so make is
# run in the lib directory on every build.
$(BIGLIB): force
cd $(depth)/lib;$(MAKE)
force:
# Dependency generation: 'make depend' regenerates the alldeps file from SRCS
depend: alldeps.stamp
alldeps.stamp : $(SRCS)
$(CXX) -M $(ALL_CXXFLAGS) $(SRCS) > alldeps
touch alldeps.stamp
# clean empties alldeps instead of deleting it: the file is included below
clean:
cp /dev/null alldeps
rm -f alldeps.stamp
rm -f *.o $(PROGS)
include alldeps

109
src/query/docseq.cpp Normal file
View file

@ -0,0 +1,109 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "docseq.h"
#include "filtseq.h"
#include "sortseq.h"
#include "debuglog.h"
// Storage for the static qualifier strings which the title() methods append
// to result list titles for sorted / filtered sequences. They are assigned
// through DocSequence::set_translations().
string DocSequence::o_sort_trans;
string DocSequence::o_filt_trans;
int DocSequence::getSeqSlice(int offs, int cnt, vector<ResListEntry>& result)
{
int ret = 0;
for (int num = offs; num < offs + cnt; num++, ret++) {
result.push_back(ResListEntry());
if (!getDoc(num, result.back().doc, &result.back().subHeader)) {
result.pop_back();
return ret;
}
}
return ret;
}
// Remove stacked modifying sources (sort, filter) until we get to a real one
void DocSource::stripStack()
{
if (m_seq.isNull())
return;
while (m_seq->getSourceSeq().isNotNull()) {
m_seq = m_seq->getSourceSeq();
}
}
// Rebuild the sequence stack from the base sequence and the current filter
// and sort specifications. Sequences which can filter / sort natively get the
// spec passed down, others get a DocSeqFiltered / DocSeqSorted stacked on top
// (only when the corresponding spec is set).
// @return false if there is no sequence to work on, else true.
bool DocSource::buildStack()
{
    LOGDEB2(("DocSource::buildStack()\n"));
    stripStack();

    if (m_seq.isNull())
        return false;

    // Filtering must be done before sorting (which may truncate the
    // original list)
    if (!m_seq->canFilter()) {
        if (m_fspec.isNotNull()) {
            m_seq = RefCntr<DocSequence>(
                new DocSeqFiltered(m_config, m_seq, m_fspec));
        }
    } else if (!m_seq->setFiltSpec(m_fspec)) {
        LOGERR(("DocSource::buildStack: setfiltspec failed\n"));
    }

    if (!m_seq->canSort()) {
        if (m_sspec.isNotNull()) {
            m_seq = RefCntr<DocSequence>(new DocSeqSorted(m_seq, m_sspec));
        }
    } else if (!m_seq->setSortSpec(m_sspec)) {
        LOGERR(("DocSource::buildStack: setsortspec failed\n"));
    }
    return true;
}
// Title of the current sequence, followed by a parenthesized qualifier when
// a sort and/or filter specification is set. Empty if there is no sequence.
string DocSource::title()
{
    if (m_seq.isNull())
        return string();

    const bool filtered = m_fspec.isNotNull();
    const bool sorted = m_sspec.isNotNull();

    string qual;
    if (sorted && filtered) {
        qual = string(" (") + o_sort_trans + string(",") + o_filt_trans +
            string(")");
    } else if (sorted) {
        qual = string(" (") + o_sort_trans + string(")");
    } else if (filtered) {
        qual = string(" (") + o_filt_trans + string(")");
    }
    return m_seq->title() + qual;
}
// Store the new filtering specification, then rebuild the sequence stack.
// Always returns true: the result of buildStack() is not propagated.
bool DocSource::setFiltSpec(const DocSeqFiltSpec &f)
{
    LOGDEB2(("DocSource::setFiltSpec\n"));
    m_fspec = f;
    (void)buildStack();
    return true;
}
// Store the new sort specification, then rebuild the sequence stack.
// Always returns true: the result of buildStack() is not propagated.
bool DocSource::setSortSpec(const DocSeqSortSpec &s)
{
    LOGDEB2(("DocSource::setSortSpec\n"));
    m_sspec = s;
    (void)buildStack();
    return true;
}

227
src/query/docseq.h Normal file
View file

@ -0,0 +1,227 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _DOCSEQ_H_INCLUDED_
#define _DOCSEQ_H_INCLUDED_
#include <string>
#include <list>
#include <vector>
#ifndef NO_NAMESPACES
using std::string;
using std::list;
using std::vector;
#endif
#include "rcldoc.h"
#include "refcntr.h"
// One result list entry: the document, and the subheader string which
// DocSequence::getSeqSlice() has getDoc() fill in for this result.
struct ResListEntry {
    Rcl::Doc doc;
    string subHeader;
};
/** Sort specification: the field to sort on and the direction. An empty
 *  field name means that no sorting is requested. */
class DocSeqSortSpec {
public:
    DocSeqSortSpec()
        : desc(false)
    {
    }
    /** True if a sort field is set. */
    bool isNotNull() const
    {
        return field.size() != 0;
    }
    /** Forget the sort field. The direction flag is left untouched. */
    void reset()
    {
        field.clear();
    }
    /** Sort field name, empty for no sorting. */
    string field;
    /** True for descending order. */
    bool desc;
};
/** Filtering specification. This is only used to filter by doc category for
 *  now, hence the rather specialized interface: a list of (criterion, value)
 *  pairs, stored as two parallel vectors. */
class DocSeqFiltSpec {
public:
    DocSeqFiltSpec() {}
    enum Crit {DSFS_MIMETYPE, DSFS_QLANG, DSFS_PASSALL};
    /** Append a (criterion, value) pair to the specification. */
    void orCrit(Crit crit, const string& value)
    {
        crits.insert(crits.end(), crit);
        values.insert(values.end(), value);
    }
    /** Criteria: crits[i] goes with values[i]. */
    std::vector<Crit> crits;
    std::vector<string> values;
    /** Remove all criteria. */
    void reset()
    {
        crits.clear();
        values.clear();
    }
    /** True if at least one criterion is set. */
    bool isNotNull() const
    {
        return !crits.empty();
    }
};
/** Interface for a list of documents coming from some source.
 *
 * The result list display data may come from different sources (ie:
 * history or Db query), and be post-processed (DocSeqSorted).
 * Additional functionality like filtering/sorting can either be
 * obtained by stacking DocSequence objects (ie: sorting history), or
 * by native capability (ex: docseqdb can sort and filter). The
 * implementation might be nicer by using more sophisticated c++ with
 * multiple inheritance of sort and filter virtual interfaces, but
 * the current one will have to do for now.
 */
class DocSequence {
public:
    DocSequence(const string &t)
        : m_title(t)
    {
    }
    virtual ~DocSequence() {}

    /** Get document at given rank.
     *
     * @param num document rank in sequence
     * @param doc return data
     * @param sh subheader to display before this result (ie: date change
     *           inside history)
     * @return true if ok, false for error or end of data
     */
    virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0) = 0;

    /** Get next page of documents. The entries are appended to the result
     *  parameter, which is not reset. */
    virtual int getSeqSlice(int offs, int cnt, vector<ResListEntry>& result);

    /** Get abstract for document. This is special because it may take time.
     *  The default returns the abstract stored in the input doc's metadata,
     *  but some sequences can compute a better value (ie: docseqdb). */
    virtual bool getAbstract(Rcl::Doc& doc, vector<string>& abs)
    {
        abs.push_back(doc.meta[Rcl::Doc::keyabs]);
        return true;
    }

    virtual bool getEnclosing(Rcl::Doc&, Rcl::Doc&) = 0;

    /** Get estimated total count in results */
    virtual int getResCnt() = 0;

    /** Get title for result list */
    virtual string title()
    {
        return m_title;
    }

    /** Get description for underlying query */
    virtual string getDescription() = 0;

    /** Get search terms (for highlighting abstracts). This default is for
     *  sequences which have no associated search terms: it empties the
     *  output parameters and reports success. */
    virtual bool getTerms(vector<string>& terms,
                          vector<vector<string> >& groups,
                          vector<int>& gslks)
    {
        terms.clear();
        groups.clear();
        gslks.clear();
        return true;
    }

    /** Get user-input terms (before stemming etc.). Default: none. */
    virtual void getUTerms(vector<string>& terms)
    {
        terms.clear();
    }

    virtual list<string> expand(Rcl::Doc &)
    {
        return list<string>();
    }

    /** Optional functionality: the defaults report no native capability
     *  and no underlying source sequence. */
    virtual bool canFilter() {return false;}
    virtual bool canSort() {return false;}
    virtual bool setFiltSpec(const DocSeqFiltSpec &) {return false;}
    virtual bool setSortSpec(const DocSeqSortSpec &) {return false;}
    virtual RefCntr<DocSequence> getSourceSeq()
    {
        return RefCntr<DocSequence>();
    }

    /** Set the strings used to qualify the titles of sorted / filtered
     *  result lists. */
    static void set_translations(const string& sort, const string& filt)
    {
        o_sort_trans = sort;
        o_filt_trans = filt;
    }

protected:
    static string o_sort_trans;
    static string o_filt_trans;

private:
    string m_title;
};
/** A modifier has a child sequence which does the real work and does
 * something with the results. Some operations are just delegated to the
 * child. All delegating methods cope with a null child sequence by returning
 * a failure / empty value.
 */
class DocSeqModifier : public DocSequence {
public:
    DocSeqModifier(RefCntr<DocSequence> iseq)
        : DocSequence(""), m_seq(iseq)
    {}
    virtual ~DocSeqModifier() {}

    virtual bool getAbstract(Rcl::Doc& doc, vector<string>& abs)
    {
        if (m_seq.isNull())
            return false;
        return m_seq->getAbstract(doc, abs);
    }
    virtual string getDescription()
    {
        if (m_seq.isNull())
            return "";
        return m_seq->getDescription();
    }
    virtual bool getTerms(vector<string>& terms,
                          vector<vector<string> >& groups,
                          vector<int>& gslks)
    {
        if (m_seq.isNull())
            return false;
        return m_seq->getTerms(terms, groups, gslks);
    }
    virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
    {
        if (m_seq.isNull())
            return false;
        return m_seq->getEnclosing(doc, pdoc);
    }
    virtual void getUTerms(vector<string>& terms)
    {
        if (m_seq.isNull())
            return;
        m_seq->getUTerms(terms);
    }
    /** Title of the child sequence, empty if there is none. The null check
     *  keeps this consistent with the other delegating methods instead of
     *  dereferencing a null reference. */
    virtual string title()
    {
        if (m_seq.isNull())
            return string();
        return m_seq->title();
    }
    /** The child sequence: this is what identifies a modifier when a
     *  stack of sequences is walked through getSourceSeq(). */
    virtual RefCntr<DocSequence> getSourceSeq() {return m_seq;}

protected:
    RefCntr<DocSequence> m_seq;
};
class RclConfig;
// A DocSource can juggle docseqs of different kinds to implement
// sorting and filtering in ways depending on the base seqs capabilities.
// It always reports being able to filter and sort: the sequence stack is
// rebuilt by buildStack() each time a specification is set.
class DocSource : public DocSeqModifier {
public:
    DocSource(RclConfig *config, RefCntr<DocSequence> iseq)
        : DocSeqModifier(iseq), m_config(config)
    {
    }

    virtual bool canFilter() {return true;}
    virtual bool canSort() {return true;}
    virtual bool setFiltSpec(const DocSeqFiltSpec &);
    virtual bool setSortSpec(const DocSeqSortSpec &);

    // Delegated to the current top of stack, fails if there is none
    virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0)
    {
        return m_seq.isNull() ? false : m_seq->getDoc(num, doc, sh);
    }
    // Delegated to the current top of stack, 0 if there is none
    virtual int getResCnt()
    {
        return m_seq.isNull() ? 0 : m_seq->getResCnt();
    }
    virtual string title();

private:
    // Rebuild the stack from the base sequence and the current specs
    bool buildStack();
    // Remove the stacked modifiers, back to the base sequence
    void stripStack();

    RclConfig *m_config;
    DocSeqFiltSpec m_fspec;
    DocSeqSortSpec m_sspec;
};
#endif /* _DOCSEQ_H_INCLUDED_ */

183
src/query/docseqdb.cpp Normal file
View file

@ -0,0 +1,183 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <math.h>
#include <time.h>
#include "docseqdb.h"
#include "rcldb.h"
#include "debuglog.h"
#include "internfile.h"
#include "wasatorcl.h"
// Build a sequence on top of a query object and its search data. The
// filtered search data starts out identical to the base one, nothing is
// filtered or sorted, and no result count is cached.
DocSequenceDb::DocSequenceDb(RefCntr<Rcl::Query> q, const string &t,
                             RefCntr<Rcl::SearchData> sdata)
    : DocSequence(t),
      m_q(q),
      m_sdata(sdata),
      m_fsdata(sdata),
      m_rescnt(-1),
      m_queryBuildAbstract(true),
      m_queryReplaceAbstract(false),
      m_isFiltered(false),
      m_isSorted(false),
      m_needSetQuery(false)
{
}

// Nothing to release explicitly: the members take care of themselves.
DocSequenceDb::~DocSequenceDb()
{
}
// Search terms come from the filtered search data, which is the one
// actually used for running the query.
bool DocSequenceDb::getTerms(vector<string>& terms,
                             vector<vector<string> >& groups,
                             vector<int>& gslks)
{
    const bool ok = m_fsdata->getTerms(terms, groups, gslks);
    return ok;
}
// User-input terms are taken from the base (unfiltered) search data.
void DocSequenceDb::getUTerms(vector<string>& terms)
{
    Rcl::SearchData& base = *m_sdata;
    base.getUTerms(terms);
}
// The description is the one of the filtered search data.
string DocSequenceDb::getDescription()
{
    Rcl::SearchData& filtered = *m_fsdata;
    return filtered.getDescription();
}
// Fetch the document at rank num from the query, after (re)applying the
// search data if it changed. The subheader, if requested, is always emptied.
bool DocSequenceDb::getDoc(int num, Rcl::Doc &doc, string *sh)
{
    setQuery();
    if (sh != 0) {
        sh->clear();
    }
    return m_q->getDoc(num, doc);
}
// Result count, asked from the query only once: the cached value is reset
// by setQuery() when the search data has to be reapplied.
int DocSequenceDb::getResCnt()
{
    setQuery();
    if (m_rescnt >= 0)
        return m_rescnt;
    m_rescnt = m_q->getResCnt();
    return m_rescnt;
}
// Get the abstract for a document. The db builds one from the query if there
// is a db, building is enabled, and either the document flags a synthetic
// abstract or replacing is enabled. If nothing was produced, fall back to
// the abstract stored in the document metadata. Always returns true.
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
{
    setQuery();
    if (m_q->whatDb() && m_queryBuildAbstract) {
        if (doc.syntabs || m_queryReplaceAbstract) {
            m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), vabs);
        }
    }
    if (vabs.empty()) {
        vabs.push_back(doc.meta[Rcl::Doc::keyabs]);
    }
    return true;
}
// Find the document enclosing doc: FileInterner computes the parent's url,
// ipath and udi, then the parent data is fetched from the db by udi.
// @param doc the document for which we want the parent
// @param pdoc output: the parent document
// @return false if the parent can't be determined, if the query has no db,
//   or if the db fetch fails.
bool DocSequenceDb::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
{
    setQuery();
    string udi;
    if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath,
                                    udi))
        return false;
    // whatDb() can be null (getAbstract() tests for this too): fail instead
    // of dereferencing a null pointer.
    if (!m_q->whatDb()) {
        LOGERR(("DocSequenceDb::getEnclosing: no db\n"));
        return false;
    }
    return m_q->whatDb()->getDoc(udi, pdoc);
}
// Delegated to the query's expand(), after making sure that the current
// search data is applied.
list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
    setQuery();
    list<string> expansion = m_q->expand(doc);
    return expansion;
}
// Base title, followed by a parenthesized qualifier when the sequence is
// currently sorted and/or filtered.
string DocSequenceDb::title()
{
    string qual;
    if (m_isSorted && m_isFiltered) {
        qual = string(" (") + o_sort_trans + string(",") + o_filt_trans +
            string(")");
    } else if (m_isSorted) {
        qual = string(" (") + o_sort_trans + string(")");
    } else if (m_isFiltered) {
        qual = string(" (") + o_filt_trans + string(")");
    }
    return DocSequence::title() + qual;
}
bool DocSequenceDb::setFiltSpec(const DocSeqFiltSpec &fs)
{
LOGDEB(("DocSequenceDb::setFiltSpec\n"));
if (fs.isNotNull()) {
// We build a search spec by adding a filtering layer to the base one.
m_fsdata = RefCntr<Rcl::SearchData>(new Rcl::SearchData(Rcl::SCLT_AND));
Rcl::SearchDataClauseSub *cl =
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB, m_sdata);
m_fsdata->addClause(cl);
for (unsigned int i = 0; i < fs.crits.size(); i++) {
switch (fs.crits[i]) {
case DocSeqFiltSpec::DSFS_MIMETYPE:
m_fsdata->addFiletype(fs.values[i]);
break;
case DocSeqFiltSpec::DSFS_QLANG:
{
if (m_q.isNull())
break;
string reason;
Rcl::SearchData *sd =
wasaStringToRcl(m_q->whatDb()->getConf(),
fs.values[i], reason);
if (sd) {
Rcl::SearchDataClauseSub *cl1 =
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB,
RefCntr<Rcl::SearchData>(sd));
m_fsdata->addClause(cl1);
}
}
break;
default:
break;
}
}
m_isFiltered = true;
} else {
m_fsdata = m_sdata;
m_isFiltered = false;
}
m_needSetQuery = true;
return true;
}
// Set the sort specification: passed on to the query object. A null spec
// resets the query to an empty sort field. The query is flagged as needing
// to be reapplied before the next fetch.
// @return always true
bool DocSequenceDb::setSortSpec(const DocSeqSortSpec &spec)
{
    LOGDEB(("DocSequenceDb::setSortSpec: fld [%s] %s\n",
            spec.field.c_str(), spec.desc ? "desc" : "asc"));
    const bool active = spec.isNotNull();
    if (active) {
        // Second setSortBy() argument is the inverse of our 'desc' flag
        m_q->setSortBy(spec.field, !spec.desc);
    } else {
        m_q->setSortBy(string(), true);
    }
    m_isSorted = active;
    m_needSetQuery = true;
    return true;
}
// Reapply the (possibly filtered) search data to the query if it was flagged
// as changed. This also invalidates the cached result count. The flag stays
// set if the query refuses the data, so that the next call tries again.
// @return true if the query is up to date
bool DocSequenceDb::setQuery()
{
    if (m_needSetQuery) {
        m_rescnt = -1;
        const bool applied = m_q->setQuery(m_fsdata);
        m_needSetQuery = !applied;
        return applied;
    }
    return true;
}

65
src/query/docseqdb.h Normal file
View file

@ -0,0 +1,65 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _DOCSEQDB_H_INCLUDED_
#define _DOCSEQDB_H_INCLUDED_
#include "docseq.h"
#include "refcntr.h"
#include "searchdata.h"
#include "rclquery.h"
/** A DocSequence from a Db query. It can filter and sort natively. */
class DocSequenceDb : public DocSequence {
public:
    DocSequenceDb(RefCntr<Rcl::Query> q, const string &t,
                  RefCntr<Rcl::SearchData> sdata);
    virtual ~DocSequenceDb();

    virtual bool getDoc(int num, Rcl::Doc &doc, string * = 0);
    virtual int getResCnt();
    virtual bool getTerms(vector<string>& terms,
                          vector<vector<string> >& groups,
                          vector<int>& gslks);
    virtual void getUTerms(vector<string>& terms);
    virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
    virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
    virtual string getDescription();
    virtual list<string> expand(Rcl::Doc &doc);

    virtual bool canFilter() {return true;}
    virtual bool setFiltSpec(const DocSeqFiltSpec &filtspec);
    virtual bool canSort() {return true;}
    virtual bool setSortSpec(const DocSeqSortSpec &sortspec);

    /** Set the two flags consulted by getAbstract().
     * @param qba value for the "build abstract" flag
     * @param qra value for the "replace abstract" flag
     */
    virtual void setAbstractParams(bool qba, bool qra)
    {
        m_queryReplaceAbstract = qra;
        m_queryBuildAbstract = qba;
    }
    virtual string title();

private:
    RefCntr<Rcl::Query> m_q;
    RefCntr<Rcl::SearchData> m_sdata;  // Base search data
    RefCntr<Rcl::SearchData> m_fsdata; // Filtered
    int m_rescnt;                      // Cached result count, < 0 if unset
    bool m_queryBuildAbstract;
    bool m_queryReplaceAbstract;
    bool m_isFiltered;
    bool m_isSorted;
    bool m_needSetQuery; // search data changed, need to reapply before fetch

    bool setQuery();
};
#endif /* _DOCSEQDB_H_INCLUDED_ */

159
src/query/docseqhist.cpp Normal file
View file

@ -0,0 +1,159 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <cmath>
#include "docseqhist.h"
#include "rcldb.h"
#include "fileudi.h"
#include "internfile.h"
#include "base64.h"
#include "debuglog.h"
#include "smallut.h"
// Encode document history entry:
// U + Unix time + base64 of udi
// The U distinguishes udi-based entries from older fn+ipath ones
// @param value output: the encoded entry
// @return always true
bool RclDHistoryEntry::encode(string& value)
{
    char chartime[30];
    // Bounded formatting: can't write past the end of the buffer
    snprintf(chartime, sizeof(chartime), "%ld", unixtime);
    string budi;
    base64_encode(udi, budi);
    value = string("U ") + string(chartime) + " " + budi;
    return true;
}
// Decode. We support historical entries which were like "time b64fn [b64ipath]"
// Current entry format is "U time b64udi"
bool RclDHistoryEntry::decode(const string &value)
{
list<string> vall;
stringToStrings(value, vall);
list<string>::const_iterator it = vall.begin();
udi.erase();
string fn, ipath;
switch (vall.size()) {
case 2:
// Old fn+ipath, null ipath case
unixtime = atol((*it++).c_str());
base64_decode(*it++, fn);
break;
case 3:
if (!it->compare("U")) {
// New udi-based entry
it++;
unixtime = atol((*it++).c_str());
base64_decode(*it++, udi);
} else {
// Old fn + ipath. We happen to know how to build an udi
unixtime = atol((*it++).c_str());
base64_decode(*it++, fn);
base64_decode(*it, ipath);
}
break;
default:
return false;
}
if (!fn.empty()) {
// Old style entry found, make an udi, using the fs udi maker
make_udi(fn, ipath, udi);
}
LOGDEB1(("RclDHistoryEntry::decode: udi [%s]\n", udi.c_str()));
return true;
}
bool RclDHistoryEntry::equal(const DynConfEntry& other)
{
const RclDHistoryEntry& e = dynamic_cast<const RclDHistoryEntry&>(other);
return e.udi == udi;
}
// Record a document, designated by its udi and stamped with the current
// time, in the document history section of the dynamic configuration.
// @return the result of RclDynConf::insertNew()
bool historyEnterDoc(RclDynConf *dncf, const string& udi)
{
    LOGDEB1(("historyEnterDoc: [%s] into %s\n",
             udi.c_str(), dncf->getFilename().c_str()));
    // Value passed as the maxlen argument of insertNew()
    const int maxEntries = 200;
    RclDHistoryEntry newEntry(time(0), udi);
    // Work area used by insertNew() for decoding the existing entries
    RclDHistoryEntry work;
    return dncf->insertNew(docHistSubKey, newEntry, work, maxEntries);
}
// Return the decoded contents of the document history section.
list<RclDHistoryEntry> getDocHistory(RclDynConf* dncf)
{
    list<RclDHistoryEntry> history =
        dncf->getList<RclDHistoryEntry>(docHistSubKey);
    return history;
}
// Get the document for history entry number num.
// The history list is loaded on first use. The list iterator position is
// kept between calls so that forward sequential access does not rescan from
// the beginning.
// @param num entry rank in the history list
// @param doc output: document data fetched from the db by udi. If the fetch
//   fails, the url is set to "UNKNOWN" and the ipath is emptied.
// @param sh if not null, output: set to the entry date when it differs by
//   more than one day from the last date output (or when the scan
//   restarted), else emptied.
// @return false if there is no history, if num is out of range, or if the
//   db fetch failed.
bool DocSequenceHistory::getDoc(int num, Rcl::Doc &doc, string *sh)
{
    // Retrieve history list
    if (!m_hist)
        return false;
    if (m_hlist.empty())
        m_hlist = getDocHistory(m_hist);

    if (num < 0 || num >= (int)m_hlist.size())
        return false;

    // Move forward from the previous position if possible, else restart
    // from the beginning of the list.
    int skip;
    if (m_prevnum >= 0 && num >= m_prevnum) {
        skip = num - m_prevnum;
    } else {
        skip = num;
        m_it = m_hlist.begin();
        m_prevtime = -1;
    }
    m_prevnum = num;
    while (skip--)
        ++m_it;

    if (sh) {
        // Compare the times as integers: a float does not have enough
        // precision to hold a current Unix time exactly.
        bool newday = m_prevtime < 0;
        if (!newday) {
            long delta = m_prevtime - m_it->unixtime;
            if (delta < 0)
                delta = -delta;
            newday = delta > 86400;
        }
        if (newday) {
            m_prevtime = m_it->unixtime;
            time_t t = (time_t)(m_it->unixtime);
            const char *ct = ctime(&t);
            if (ct) {
                *sh = string(ct);
                // Get rid of the final \n in ctime
                if (!sh->empty())
                    sh->erase(sh->length() - 1);
            } else {
                // ctime() could not convert this value
                sh->erase();
            }
        } else {
            sh->erase();
        }
    }

    bool ret = m_db->getDoc(m_it->udi, doc);
    if (!ret) {
        doc.url = "UNKNOWN";
        doc.ipath = "";
    }
    return ret;
}
// Find the document enclosing doc: FileInterner computes the parent's url,
// ipath and udi, then the parent data is fetched from the db by udi.
bool DocSequenceHistory::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc)
{
    string parentUdi;
    if (FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath,
                                   parentUdi)) {
        return m_db->getDoc(parentUdi, pdoc);
    }
    return false;
}
// Number of entries in the history list, which is loaded on first use.
// @return the entry count, 0 if there is no history object
int DocSequenceHistory::getResCnt()
{
    // Same protection as in getDoc(): getDocHistory() dereferences its
    // argument, so it must not be called with a null history.
    if (m_hlist.empty() && m_hist)
        m_hlist = getDocHistory(m_hist);
    return (int)m_hlist.size();
}

67
src/query/docseqhist.h Normal file
View file

@ -0,0 +1,67 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _DOCSEQHIST_H_INCLUDED_
#define _DOCSEQHIST_H_INCLUDED_
#include "docseq.h"
#include "dynconf.h"
namespace Rcl {
class Db;
}
/** DynConf document history entry: a time and the udi designating the
 *  document. */
class RclDHistoryEntry : public DynConfEntry {
public:
    RclDHistoryEntry()
        : unixtime(0)
    {
    }
    RclDHistoryEntry(long t, const string& u)
        : unixtime(t), udi(u)
    {
    }
    virtual ~RclDHistoryEntry() {}

    virtual bool decode(const string &value);
    virtual bool encode(string& value);
    /** Entries are equal if their udis are equal */
    virtual bool equal(const DynConfEntry& other);

    long unixtime;
    string udi;
};
/** A DocSequence coming from the history file.
 *  The history entries only hold a time and a document identifier (udi):
 *  the db is queried to fetch the document data for each entry. */
class DocSequenceHistory : public DocSequence {
public:
    DocSequenceHistory(Rcl::Db *d, RclDynConf *h, const string &t)
        : DocSequence(t), m_db(d), m_hist(h), m_prevnum(-1), m_prevtime(-1)
    {
    }
    virtual ~DocSequenceHistory() {}

    virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0);
    virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
    virtual int getResCnt();
    virtual string getDescription()
    {
        return m_description;
    }
    void setDescription(const string& desc)
    {
        m_description = desc;
    }

private:
    Rcl::Db *m_db;
    RclDynConf *m_hist;
    // Rank and time used by the last getDoc() call, -1 if unset
    int m_prevnum;
    long m_prevtime;
    string m_description; // This is just an nls translated 'doc history'
    // History entries, loaded on first use, and current position in them
    list<RclDHistoryEntry> m_hlist;
    list<RclDHistoryEntry>::const_iterator m_it;
};
extern bool historyEnterDoc(RclDynConf *dncf, const string& udi);
#endif /* _DOCSEQHIST_H_INCLUDED_ */

234
src/query/dynconf.cpp Normal file
View file

@ -0,0 +1,234 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef TEST_HISTORY
#include <stdio.h>
#include <time.h>
#include <cstdlib>
#include "dynconf.h"
#include "base64.h"
#include "smallut.h"
#include "debuglog.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif
// Well known keys for history and external indexes.
// These are the values passed as the subkey ('sk') argument of the
// RclDynConf methods to select the list to operate on: document history,
// all external indexes, active external indexes.
const string docHistSubKey = "docs";
const string allEdbsSk = "allExtDbs";
const string actEdbsSk = "actExtDbs";
// Insert a new entry in a list, after removing any existing entry equal to
// it, and after pruning the oldest entries if the list is full.
// @param sk section this is for
// @param n  new entry
// @param s  a scratch entry used for decoding and comparisons.
//           This avoids templating this routine for the actual entry type.
// @param maxlen maximum list size, no limit if <= 0
// @return false if storing the new entry failed, else true
bool RclDynConf::insertNew(const string &sk, DynConfEntry &n, DynConfEntry &s,
                           int maxlen)
{
    // Is this doc already in list ? If it is we remove the old entry
    list<string> names = m_data.getNames(sk);
    list<string>::const_iterator it;
    bool changed = false;
    for (it = names.begin(); it != names.end(); it++) {
        string oval;
        if (!m_data.get(*it, oval, sk)) {
            LOGDEB(("No data for %s\n", (*it).c_str()));
            continue;
        }
        // An entry which can't be decoded can't be compared: the scratch
        // object contents would be stale or partial, and a chance match
        // would erase the wrong entry.
        if (!s.decode(oval)) {
            LOGDEB(("RclDynConf::insertNew: can't decode [%s]\n",
                    oval.c_str()));
            continue;
        }
        if (s.equal(n)) {
            LOGDEB(("Erasing old entry\n"));
            m_data.erase(*it, sk);
            changed = true;
        }
    }

    // Maybe reget list
    if (changed)
        names = m_data.getNames(sk);

    // Need to prune ?
    if (maxlen > 0 && names.size() >= (unsigned int)maxlen) {
        // Need to erase entries until we're back to size. Note that
        // we don't ever reset numbers. Problems will arise when
        // history is 4 billion entries old
        it = names.begin();
        for (unsigned int i = 0; i < names.size() - maxlen + 1; i++, it++) {
            m_data.erase(*it, sk);
        }
    }

    // Increment highest number. The names list is not refreshed after
    // pruning, but pruning starts from the front of the list, so the last
    // name is unaffected.
    unsigned int hi = names.empty() ? 0 :
        (unsigned int)atoi(names.back().c_str());
    hi++;
    char nname[20];
    snprintf(nname, sizeof(nname), "%010u", hi);

    string value;
    n.encode(value);
    // size() is not an int: cast to match the %d format
    LOGDEB1(("Encoded value [%s] (%d)\n", value.c_str(), (int)value.size()));
    if (!m_data.set(string(nname), value, sk)) {
        LOGERR(("RclDynConf::insertNew: set failed\n"));
        return false;
    }
    return true;
}
bool RclDynConf::eraseAll(const string &sk)
{
list<string> names = m_data.getNames(sk);
list<string>::const_iterator it;
for (it = names.begin(); it != names.end(); it++) {
m_data.erase(*it, sk);
}
return true;
}
// Generic string list specialization ///////////////////////////////////

// Simple strings are stored base64-encoded, which avoids problems with
// strange chars.
// @param enc output: the encoded value
// @return always true
bool RclSListEntry::encode(string& enc)
{
    const string& plain = value;
    base64_encode(plain, enc);
    return true;
}
bool RclSListEntry::decode(const string &enc)
{
base64_decode(enc, value);
return true;
}
bool RclSListEntry::equal(const DynConfEntry& other)
{
const RclSListEntry& e = dynamic_cast<const RclSListEntry&>(other);
return e.value == value;
}
bool RclDynConf::enterString(const string sk, const string value, int maxlen)
{
RclSListEntry ne(value);
RclSListEntry scratch;
return insertNew(sk, ne, scratch, maxlen);
}
// Return the string values stored in the list designated by sk, in the
// order in which getList() yields the entries.
list<string> RclDynConf::getStringList(const string sk)
{
    const list<RclSListEntry> entries = getList<RclSListEntry>(sk);
    list<string> strings;
    list<RclSListEntry>::const_iterator ent = entries.begin();
    while (ent != entries.end()) {
        strings.push_back(ent->value);
        ++ent;
    }
    return strings;
}
#else
#include <string>
#include <iostream>
#include "history.h"
#include "debuglog.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif
// Test driver support (this part is only compiled when TEST_HISTORY is
// defined).
// Program name, set from argv[0] in main(), used in the usage message
static string thisprog;
// Command line help text
static string usage =
"trhist [opts] <filename>\n"
" [-s <subkey>]: specify subkey (default: RclDynConf::docHistSubKey)\n"
" [-e] : erase all\n"
" [-a <string>] enter string (needs -s, no good for history entries\n"
"\n"
;
// Print the usage message to cerr and exit with status 1: never returns
static void
Usage(void)
{
cerr << thisprog << ": usage:\n" << usage;
exit(1);
}
// Bit mask of the options seen on the command line, one OPT_x bit per option
// letter
static int op_flags;
#define OPT_e 0x2
#define OPT_s 0x4
#define OPT_a 0x8
// Test driver entry point: trhist [opts] <filename>
//  -e erases all entries under the subkey, -a <string> enters a string
//  (requires -s <subkey>). With neither, 10 test documents are entered, and
//  the document history is then printed.
// NOTE(review): this driver looks out of date. It uses a two-argument
// RclDynConf constructor, RclDynConf::enterDoc() / getDocHistory(), and
// RclDHistoryEntry 'fn' / 'ipath' members, none of which appear in the
// current dynconf.h / docseqhist.h declarations. Confirm whether it still
// builds before relying on it.
int main(int argc, char **argv)
{
string sk = "docs";
string value;
thisprog = argv[0];
argc--; argv++;
// Option parsing: each argument beginning with '-' may hold several option
// letters
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* A lone "-" argument (as in "adb - core") is an error */
Usage();
while (**argv)
switch (*(*argv)++) {
// Options with a value consume the next argument, then jump to b1 to go
// on with the argument following the value
case 'a': op_flags |= OPT_a; if (argc < 2) Usage();
value = *(++argv); argc--;
goto b1;
case 's': op_flags |= OPT_s; if (argc < 2) Usage();
sk = *(++argv); argc--;
goto b1;
case 'e': op_flags |= OPT_e; break;
default: Usage(); break;
}
b1: argc--; argv++;
}
// Exactly one non-option argument must remain: the file name
if (argc != 1)
Usage();
string filename = *argv++;argc--;
RclDynConf hist(filename, 5);
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr");
if (op_flags & OPT_e) {
hist.eraseAll(sk);
} else if (op_flags & OPT_a) {
// Entering a string needs an explicit subkey
if (!(op_flags & OPT_s))
Usage();
hist.enterString(sk, value);
} else {
// Default action: enter 10 test documents, then dump the history
for (int i = 0; i < 10; i++) {
char docname[100];
sprintf(docname, "A very long document document name"
"is very long indeed and this is the end of "
"it here and exactly here:\n%d", i);
hist.enterDoc(string(docname), "ipathx");
}
list<RclDHistoryEntry> hlist = hist.getDocHistory();
for (list<RclDHistoryEntry>::const_iterator it = hlist.begin();
it != hlist.end(); it++) {
printf("[%ld] [%s] [%s]\n", it->unixtime,
it->fn.c_str(), it->ipath.c_str());
}
}
}
#endif

122
src/query/dynconf.h Normal file
View file

@ -0,0 +1,122 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _DYNCONF_H_INCLUDED_
#define _DYNCONF_H_INCLUDED_
/**
* Dynamic configuration storage
*
* This used to be called "history" because of the initial usage.
* Used to store some parameters which would fit neither in recoll.conf,
* basically because they change a lot, nor in the QT preferences file, mostly
* because they are specific to a configuration directory.
* Examples:
* - History of documents selected for preview
* - Active and inactive external databases (depend on the
* configuration directory)
* - ...
*
* The storage is performed in a ConfSimple file, with subkeys and
* encodings which depend on the data stored. Under each section, the keys
* are sequential numeric, so this basically manages a set of lists.
*
*/
#include <string>
#include <list>
#include <utility>
#include "conftree.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif
// Entry interface: what RclDynConf needs to know about the objects it
// stores. An entry can convert itself to and from the string stored in the
// file, and compare itself to another entry.
class DynConfEntry {
public:
    virtual ~DynConfEntry() {}

    // Set the object contents from a stored string
    virtual bool decode(const string &value) = 0;
    // Compute the string to store for this object
    virtual bool encode(string& value) = 0;
    // Compare with another entry
    virtual bool equal(const DynConfEntry &other) = 0;
};
/** String storage generic object: an entry holding a plain string value */
class RclSListEntry : public DynConfEntry {
public:
    RclSListEntry() {}
    RclSListEntry(const string& v)
        : value(v)
    {
    }
    virtual ~RclSListEntry() {}

    virtual bool decode(const string &enc);
    virtual bool encode(string& enc);
    /** Entries are equal if their values are equal */
    virtual bool equal(const DynConfEntry& other);

    /** The string value, not encoded */
    string value;
};
/** The dynamic configuration class: manages a set of lists, each stored
 *  under a subkey of a ConfSimple file. */
class RclDynConf {
public:
    /** @param fn name of the storage file */
    RclDynConf(const string &fn)
        : m_mlen(0), m_data(fn.c_str())
    {
    }
    /** True if the storage status is read/write */
    bool ok() {return m_data.getStatus() == ConfSimple::STATUS_RW;}
    string getFilename() {return m_data.getFilename();}

    // Generic methods
    /** Erase all the entries under a subkey */
    bool eraseAll(const string& sk);
    /** Insert a new entry under a subkey.
     * @param sk subkey
     * @param n the new entry
     * @param s scratch entry of the same type, used for decoding and
     *          comparing the existing entries
     * @param maxlen maximum list size, no limit by default
     */
    bool insertNew(const string& sk, DynConfEntry &n, DynConfEntry &s,
                   int maxlen = -1);
    /** Get the decoded entries stored under a subkey */
    template<typename Tp> list<Tp> getList(const string& sk);

    // Specialized methods for simple string lists, designated by the
    // subkey value
    bool enterString(const string sk, const string value, int maxlen = -1);
    list<string> getStringList(const string sk);

private:
    // Not referenced by any of the methods declared here. It is initialized
    // so that copying the object never reads an indeterminate value.
    unsigned int m_mlen;
    ConfSimple m_data;
};
/**
 * Return the entries stored under subkey sk, decoded as Tp objects.
 *
 * Entries which can't be read or decoded are skipped. Each decoded entry
 * is inserted at the front of the result, so the output order is the
 * reverse of the order of the names returned by the storage object.
 */
template<typename Tp> list<Tp> RclDynConf::getList(const string &sk)
{
    list<Tp> result;
    // A single scratch object is reused for all the decode() calls
    Tp scratch;
    list<string> keys = m_data.getNames(sk);
    list<string>::const_iterator kit = keys.begin();
    while (kit != keys.end()) {
        string encoded;
        if (m_data.get(*kit, encoded, sk) && scratch.decode(encoded)) {
            result.push_front(scratch);
        }
        kit++;
    }
    return result;
}
// Defined subkeys. Values in dynconf.cpp
// History
extern const string docHistSubKey;
// All external indexes
extern const string allEdbsSk;
// Active external indexes
extern const string actEdbsSk;
#endif /* _DYNCONF_H_INCLUDED_ */

125
src/query/filtseq.cpp Normal file
View file

@ -0,0 +1,125 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "debuglog.h"
#include "filtseq.h"
#include "rclconfig.h"
using std::string;
// Decide if a document passes the filter. The criteria are ORed: the
// first one which accepts the document ends the test. QLANG criteria are
// only logged here and never accept anything.
static bool filter(const DocSeqFiltSpec& fs, const Rcl::Doc *x)
{
    LOGDEB2((" Filter: ncrits %d\n", fs.crits.size()));
    for (unsigned int idx = 0; idx < fs.crits.size(); idx++) {
        if (fs.crits[idx] == DocSeqFiltSpec::DSFS_PASSALL) {
            return true;
        }
        if (fs.crits[idx] == DocSeqFiltSpec::DSFS_MIMETYPE) {
            LOGDEB2((" filter: MIMETYPE: me [%s] doc [%s]\n",
                     fs.values[idx].c_str(), x->mimetype.c_str()));
            if (x->mimetype == fs.values[idx]) {
                return true;
            }
        } else if (fs.crits[idx] == DocSeqFiltSpec::DSFS_QLANG) {
            LOGDEB((" filter: QLANG [%s]!!\n", fs.values[idx].c_str()));
        }
    }
    // No criterion accepted the document
    return false;
}
// Build a filtered sequence on top of iseq. conf is kept for later use
// when translating filter specifications.
DocSeqFiltered::DocSeqFiltered(RclConfig *conf, RefCntr<DocSequence> iseq,
                               DocSeqFiltSpec &filtspec)
    : DocSeqModifier(iseq),
      m_config(conf)
{
    // Translate the caller's specification into our internal one
    this->setFiltSpec(filtspec);
}
// Add the criteria from filtspec to our internal specification, and
// forget the already computed backend indices. MIMETYPE criteria are
// copied as is. QLANG criteria of the form "rclcat:<category>" are
// expanded to the mime types of the category; other QLANG values and
// other criteria types are ignored. Always returns true.
bool DocSeqFiltered::setFiltSpec(DocSeqFiltSpec &filtspec)
{
    LOGDEB0(("DocSeqFiltered::setFiltSpec\n"));
    for (unsigned int n = 0; n < filtspec.crits.size(); n++) {
        if (filtspec.crits[n] == DocSeqFiltSpec::DSFS_MIMETYPE) {
            m_spec.orCrit(filtspec.crits[n], filtspec.values[n]);
            continue;
        }
        if (filtspec.crits[n] != DocSeqFiltSpec::DSFS_QLANG) {
            continue;
        }
        // There are very few lang constructs that we can
        // interpret. The default config uses rclcat:value
        // only. That will be all for now...
        string val = filtspec.values[n];
        if (val.find("rclcat:") != 0) {
            continue;
        }
        string catg = val.substr(7);
        list<string> tps;
        m_config->getMimeCatTypes(catg, tps);
        list<string>::const_iterator tit = tps.begin();
        while (tit != tps.end()) {
            LOGDEB2(("Adding mime: [%s]\n", tit->c_str()));
            m_spec.orCrit(DocSeqFiltSpec::DSFS_MIMETYPE, *tit);
            tit++;
        }
    }
    // An empty specification would reject everything: better to let
    // everything through.
    if (m_spec.crits.empty()) {
        m_spec.orCrit(DocSeqFiltSpec::DSFS_PASSALL, "");
    }
    m_dbindices.clear();
    return true;
}
// Fetch the document at position idx of the filtered sequence.
//
// m_dbindices[n] holds the backend sequence index of the nth document
// which passed the filter. It is extended on demand by fetching and
// filtering backend documents.
//
// @param idx index in the filtered sequence. Negative values are
//    rejected (they would otherwise index m_dbindices out of bounds).
// @param doc output document
// @return false if idx is negative or the backend has no more documents
bool DocSeqFiltered::getDoc(int idx, Rcl::Doc &doc, string *)
{
    LOGDEB2(("DocSeqFiltered::getDoc() fetching %d\n", idx));
    if (idx < 0)
        return false;

    if (idx >= (int)m_dbindices.size()) {
        // Have to fetch docs and filter until we get enough or
        // fail
        m_dbindices.reserve(idx+1);

        // First backend seq doc we fetch is the one after last stored
        int backend_idx = m_dbindices.size() > 0 ? m_dbindices.back() + 1 : 0;

        // Loop until we get enough docs. The loop ends right after the
        // document for idx is accepted, so tdoc is the one we want.
        Rcl::Doc tdoc;
        while (idx >= (int)m_dbindices.size()) {
            if (!m_seq->getDoc(backend_idx, tdoc))
                return false;
            if (filter(m_spec, &tdoc)) {
                m_dbindices.push_back(backend_idx);
            }
            backend_idx++;
        }
        doc = tdoc;
    } else {
        // The corresponding backend index is already known
        if (!m_seq->getDoc(m_dbindices[idx], doc))
            return false;
    }
    return true;
}

49
src/query/filtseq.h Normal file
View file

@ -0,0 +1,49 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _FILTSEQ_H_INCLUDED_
#define _FILTSEQ_H_INCLUDED_
#include <vector>
#include <string>
using std::string;
using std::vector;
#include "refcntr.h"
#include "docseq.h"
class RclConfig;
/**
 * A document sequence built from another one by keeping only the
 * entries which match the given filtering criteria.
 */
class DocSeqFiltered : public DocSeqModifier {
public:
    DocSeqFiltered(RclConfig *conf, RefCntr<DocSequence> iseq,
                   DocSeqFiltSpec &filtspec);

    virtual ~DocSeqFiltered()
    {
    }

    /** This sequence type supports filtering */
    virtual bool canFilter()
    {
        return true;
    }

    /** Add the criteria from filtspec and reset the computed indices */
    virtual bool setFiltSpec(DocSeqFiltSpec &filtspec);

    /** Get the document at position num in the filtered sequence */
    virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0);

    /** Result count, as reported by the underlying sequence */
    virtual int getResCnt()
    {
        return m_seq->getResCnt();
    }

private:
    RclConfig *m_config;
    // Our internal filtering specification
    DocSeqFiltSpec m_spec;
    // Backend sequence indices of the documents which passed the filter
    vector<int> m_dbindices;
};
#endif /* _FILTSEQ_H_INCLUDED_ */

523
src/query/plaintorich.cpp Normal file
View file

@ -0,0 +1,523 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <string>
#include <utility>
#include <list>
#include <set>
#include <vector>
#include <map>
#include <algorithm>
#ifndef NO_NAMESPACES
using std::vector;
using std::list;
using std::pair;
using std::set;
#endif /* NO_NAMESPACES */
#include "rcldb.h"
#include "rclconfig.h"
#include "debuglog.h"
#include "textsplit.h"
#include "utf8iter.h"
#include "smallut.h"
#include "plaintorich.h"
#include "cancelcheck.h"
#include "unacpp.h"
// Debug printing helper: return the elements of t, each one enclosed in
// square brackets and followed by a space.
static string vecStringToString(const vector<string>& t)
{
    string joined;
    for (vector<string>::size_type n = 0; n < t.size(); n++) {
        joined.append("[").append(t[n]).append("] ");
    }
    return joined;
}
// Text splitter used to record where the query terms appear in the
// result text. The recorded byte offsets are later used to insert the
// highlight tags.
class TextSplitPTR : public TextSplit {
public:
    // Out: begin and end byte positions of query terms/groups in text
    vector<pair<int, int> > tboffs;

    // its: single query terms. groups/slacks: parallel vectors
    // describing the phrase/near groups. Only references to groups and
    // slacks are kept, so they must outlive this object.
    TextSplitPTR(const vector<string>& its,
                 const vector<vector<string> >& groups,
                 const vector<int>& slacks)
        : m_wcount(0), m_terms(its.begin(), its.end()),
          m_groups(groups), m_slacks(slacks)
    {
        // Flatten all the group terms into a single lookup set
        for (unsigned int g = 0; g < m_groups.size(); g++) {
            m_gterms.insert(m_groups[g].begin(), m_groups[g].end());
        }
    }

    // Called by the text-to-words breaker for each word. Always returns
    // true.
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
        string dumb;
        if (!unacmaybefold(term, dumb, "UTF-8", true)) {
            LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
                     term.c_str()));
            return true;
        }

        const pair<int, int> span(bts, bte);

        // Single search term: remember its byte-offset span.
        if (m_terms.count(dumb) != 0) {
            tboffs.push_back(span);
        }

        // Member of a phrase/near group: remember the word position, and
        // how to translate it to byte offsets.
        if (m_gterms.count(dumb) != 0) {
            m_plists[dumb].push_back(pos);
            m_gpostobytes[pos] = span;
        }

        // Check for cancellation every 4096 words
        if ((m_wcount++ & 0xfff) == 0)
            CancelCheck::instance().checkCancel();

        return true;
    }

    // Must be called after the split to find the phrase/near match positions
    virtual bool matchGroups();

private:
    virtual bool matchGroup(const vector<string>& terms, int dist);

    // Count of words seen so far
    int m_wcount;

    // In: user query terms
    set<string> m_terms;

    // In: user query groups, for near/phrase searches.
    const vector<vector<string> >& m_groups;
    const vector<int>& m_slacks;
    // All the terms found in the groups
    set<string> m_gterms;

    // group/near terms word positions.
    map<string, vector<int> > m_plists;
    // Word position to byte offsets translation for the group terms
    map<int, pair<int, int> > m_gpostobytes;
};
/** Ordering on vector pointers: the shorter vector sorts first */
class VecIntCmpShorter {
public:
    /**
     * @return true if and only if *a has strictly fewer elements
     *   than *b.
     */
    bool operator()(const vector<int> *a, const vector<int> *b) {
        const vector<int>::size_type lena = a->size();
        const vector<int>::size_type lenb = b->size();
        return lenb > lena;
    }
};
#define SETMINMAX(POS, STA, STO)  {if ((POS) < (STA)) (STA) = (POS); \
        if ((POS) > (STO)) (STO) = (POS);}

// Recursive check that a position from each remaining list fits in the
// window, which is adjusted as the successive terms are found.
// window: window width in word positions.
// plists: the position lists, one per term.
// i: index of the next position list to use (initially 1).
// min, max: extent of the positions chosen so far.
// sp, ep: on success, extended to cover the positions chosen at this
//   level and below.
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
                              unsigned int i, int min, int max,
                              int *sp, int *ep)
{
    // Lowest acceptable position for a window ending at max. Clamped at
    // 0 to avoid going negative.
    int lowest = max + 1;
    lowest = (window <= lowest) ? lowest - window : 0;

    vector<int>& cur = *plists[i];
    vector<int>::iterator pit = cur.begin();

    // Skip the positions located before the window start
    for (; pit != cur.end() && *pit < lowest; pit++) {
    }

    const bool lastlist = (i + 1 == plists.size());

    // Try each position inside window in turn for match with other lists
    for (; pit != cur.end(); pit++) {
        const int pos = *pit;
        if (pos > min + window - 1) {
            // Beyond the window end: no need to look further
            return false;
        }
        if (lastlist) {
            SETMINMAX(pos, *sp, *ep);
            return true;
        }
        if (pos < min) {
            min = pos;
        } else if (pos > max) {
            max = pos;
        }
        if (do_proximity_test(window, plists, i + 1, min, max, sp, ep)) {
            SETMINMAX(pos, *sp, *ep);
            return true;
        }
    }
    return false;
}
// Check if there is a NEAR match for the group of terms, and record
// the byte offsets of each match in tboffs.
//
// @param terms the terms of the group
// @param window the maximum width, in word positions, of a match
// @return false if fewer than 2 terms of the group were seen in the
//   text (or on internal inconsistency), else true, whether matches
//   were found or not.
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
{
    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
             vecStringToString(terms).c_str()));

    // The position lists we are going to work with. We extract them from the
    // (string->plist) map
    vector<vector<int>* > plists;
    // A revert plist->term map. This is so that we can find who is who after
    // sorting the plists by length.
    map<vector<int>*, string> plistToTerm;
    // For traces
    vector<string> realgroup;

    // Find the position list for each term in the group. Not all
    // necessarily exist (esp for NEAR where terms have been
    // stem-expanded: we don't know which matched)
    for (vector<string>::const_iterator it = terms.begin();
         it != terms.end(); it++) {
        map<string, vector<int> >::iterator pl = m_plists.find(*it);
        if (pl == m_plists.end()) {
            LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
                     (*it).c_str()));
            continue;
        }
        plists.push_back(&(pl->second));
        plistToTerm[&(pl->second)] = *it;
        realgroup.push_back(*it);
    }
    LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n",
             window, vecStringToString(realgroup).c_str()));
    if (plists.size() < 2) {
        LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
        return false;
    }

    // Sort the positions lists so that the shorter is first
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());

    { // Debug
        map<vector<int>*, string>::iterator it;
        it = plistToTerm.find(plists[0]);
        if (it == plistToTerm.end()) {
            // SuperWeird
            LOGERR(("matchGroup: term for first list not found !?!\n"));
            return false;
        }
        // size() is a size_t: convert it for the %d format
        LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
                 it->second.c_str(), (int)plists[0]->size()));
    }

    // Walk the shortest plist and look for matches
    for (vector<int>::iterator it = plists[0]->begin();
         it != plists[0]->end(); it++) {
        int pos = *it;
        // The match window always includes the first term position, so
        // we start from it: do_proximity_test() will only extend the
        // window with the positions of the other terms. (The start used
        // to be initialized with int(10E9), which does not fit in an
        // int, and the conversion was undefined behaviour.)
        int sta = pos, sto = pos;
        LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
        if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
            LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
                     sta, sto));
            // Translate the position window into a byte offset window
            map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
            map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
            if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
                LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
                         i1->second.first, i2->second.second));
                tboffs.push_back(pair<int, int>(i1->second.first,
                                                i2->second.second));
            } else {
                LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
            }
        }
    }

    return true;
}
/**
 * Ordering on integer pairs: increasing first value, and, for equal
 * first values, decreasing second value (so that the wider region comes
 * first when the pairs are [begin, end] offsets).
 */
class PairIntCmpFirst {
public:
    bool operator()(pair<int,int> a, pair<int, int>b) {
        if (a.first == b.first) {
            return b.second < a.second;
        }
        return a.first < b.first;
    }
};
// Do the phrase match thing, then merge the highlight lists
bool TextSplitPTR::matchGroups()
{
vector<vector<string> >::const_iterator vit = m_groups.begin();
vector<int>::const_iterator sit = m_slacks.begin();
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
matchGroup(*vit, *sit + (*vit).size());
}
// Sort by start and end offsets. The merging of overlapping entries
// will be handled during output.
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
return true;
}
// Fix result text for display inside the gui text window.
//
// To compute the term character positions in the output text, we used
// to emulate how qt's textedit counts chars (ignoring tags and
// duplicate whitespace etc...). This was tricky business, dependent
// on qtextedit internals, and we don't do it any more, so we finally
// don't know the term par/car positions in the editor text.
// Instead, we now mark the search term positions with html anchors
//
// We output the result in chunks, arranging not to cut in the middle of
// a tag, which would confuse qtextedit.
// Produce the rich text version of 'in', with the query terms and groups
// from hdata marked by the tags returned by the startMatch()/endMatch()
// and startAnchor()/endAnchor() methods.
// in: input text (plain text, or html if m_inputhtml is set).
// out: output, cleared first, then filled with one or several chunks.
// hdata: terms, groups and group slacks to be highlighted.
// chunksize: size above which a new output chunk is started (only for
//   plain text input, and only at a line break).
// Returns true (this is the only return statement).
// NOTE(review): CancelCheck::checkCancel() presumably throws when a
// cancel was requested -- its implementation is not visible here.
bool PlainToRich::plaintorich(const string& in,
list<string>& out, // Output chunk list
const HiliteData& hdata,
int chunksize)
{
Chrono chron;
const vector<string>& terms(hdata.terms);
const vector<vector<string> >& groups(hdata.groups);
const vector<int>& slacks(hdata.gslks);
// Debug traces: permanently disabled by the leading 0 in the condition
if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
LOGDEB0(("plaintorich: terms: \n"));
string sterms = vecStringToString(terms);
LOGDEB0((" %s\n", sterms.c_str()));
sterms = "\n";
LOGDEB0(("plaintorich: groups: \n"));
for (vector<vector<string> >::const_iterator vit = groups.begin();
vit != groups.end(); vit++) {
sterms += "GROUP: ";
sterms += vecStringToString(*vit);
sterms += "\n";
}
LOGDEB0((" %s", sterms.c_str()));
LOGDEB2((" TEXT:[%s]\n", in.c_str()));
}
// Compute the positions for the query terms. We use the text
// splitter to break the text into words, and compare the words to
// the search terms,
TextSplitPTR splitter(terms, groups, slacks);
// Note: the splitter returns the term locations in byte, not
// character, offsets.
splitter.text_to_words(in);
LOGDEB2(("plaintorich: split done %d mS\n", chron.millis()));
// Compute the positions for NEAR and PHRASE groups.
splitter.matchGroups();
out.clear();
out.push_back("");
// olit always designates the chunk currently being filled
list<string>::iterator olit = out.begin();
// Rich text output
*olit = header();
// Iterator for the list of input term positions. We use it to
// output highlight tags and to compute term positions in the
// output text
vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
#if 0
for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
it != splitter.tboffs.end(); it++) {
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
}
#endif
// Input character iterator
Utf8Iter chariter(in);
// State variables used to limit the number of consecutive empty lines,
// convert all eol to '\n', and preserve some indentation
int eol = 0;
int hadcr = 0;
int inindent = 1;
// Value for numbered anchors at each term match
int anchoridx = 1;
// HTML state
bool intag = false, inparamvalue = false;
// My tag state
int inrcltag = 0;
// For html input: byte offset just after the closing head tag (7 is
// the length of "</head>"), or npos if there is none. No highlight
// tags are output at or before this offset.
string::size_type headend = 0;
if (m_inputhtml) {
headend = in.find("</head>");
if (headend == string::npos)
headend = in.find("</HEAD>");
if (headend != string::npos)
headend += 7;
}
for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
// Check from time to time if we need to stop
if ((pos & 0xfff) == 0) {
CancelCheck::instance().checkCancel();
}
// If we still have terms positions, check (byte) position. If
// we are at or after a term match, mark.
if (tPosIt != tPosEnd) {
int ibyteidx = chariter.getBpos();
if (ibyteidx == tPosIt->first) {
if (!intag && ibyteidx > (int)headend) {
*olit += startAnchor(anchoridx);
*olit += startMatch();
}
// The anchor index is incremented even when no tag was output
anchoridx++;
inrcltag = 1;
} else if (ibyteidx == tPosIt->second) {
// Output end of match region tags
if (!intag && ibyteidx > (int)headend) {
*olit += endMatch();
*olit += endAnchor();
}
// Skip all highlight areas that would overlap this one
int crend = tPosIt->second;
while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
tPosIt++;
inrcltag = 0;
}
}
unsigned int car = *chariter;
// End of line characters are only counted here. The actual line
// breaks are output when the next other character is seen.
if (car == '\n') {
if (!hadcr)
eol++;
hadcr = 0;
continue;
} else if (car == '\r') {
hadcr++;
eol++;
continue;
} else if (eol) {
// Got non eol char in line break state. Do line break;
inindent = 1;
hadcr = 0;
// Output at most 2 consecutive line breaks
if (eol > 2)
eol = 2;
while (eol) {
if (!m_inputhtml && m_eolbr)
*olit += "<br>";
*olit += "\n";
eol--;
}
// Maybe end this chunk, begin next. Don't do it on html
// there is just no way to do it right (qtextedit cant grok
// chunks cut in the middle of <a></a> for example).
if (!m_inputhtml && !inrcltag &&
olit->size() > (unsigned int)chunksize) {
out.push_back(string(startChunk()));
olit++;
}
}
// Copy the current character to the output. For plain text input,
// '<' and '&' are escaped (and leading white space is converted to
// non-breaking spaces when m_eolbr is set). For html input, the
// characters are copied as is and we just track the tag state.
switch (car) {
case '<':
inindent = 0;
if (m_inputhtml) {
if (!inparamvalue)
intag = true;
chariter.appendchartostring(*olit);
} else {
*olit += "&lt;";
}
break;
case '>':
inindent = 0;
if (m_inputhtml) {
if (!inparamvalue)
intag = false;
}
chariter.appendchartostring(*olit);
break;
case '&':
inindent = 0;
if (m_inputhtml) {
chariter.appendchartostring(*olit);
} else {
*olit += "&amp;";
}
break;
case '"':
inindent = 0;
// Inside an html tag, a double quote toggles the "inside a
// parameter value" state, in which '<' and '>' are not tag
// delimiters
if (m_inputhtml && intag) {
inparamvalue = !inparamvalue;
}
chariter.appendchartostring(*olit);
break;
case ' ':
if (m_eolbr && inindent) {
*olit += "&nbsp;";
} else {
chariter.appendchartostring(*olit);
}
break;
case '\t':
if (m_eolbr && inindent) {
*olit += "&nbsp;&nbsp;&nbsp;&nbsp;";
} else {
chariter.appendchartostring(*olit);
}
break;
default:
inindent = 0;
chariter.appendchartostring(*olit);
}
} // End chariter loop
#if 0
{
FILE *fp = fopen("/tmp/debugplaintorich", "a");
fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");
for (list<string>::iterator it = out.begin();
it != out.end(); it++) {
fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n");
fprintf(fp, "%s", it->c_str());
fprintf(fp, "ENDOFPLAINTORICHCHUNK\n");
}
fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
fclose(fp);
}
#endif
LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
return true;
}

95
src/query/plaintorich.h Normal file
View file

@ -0,0 +1,95 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _PLAINTORICH_H_INCLUDED_
#define _PLAINTORICH_H_INCLUDED_
#include <string>
#include <list>
using std::list;
using std::string;
/// Input data for plaintorich(): the single words and the groups of
/// words which should be highlighted.
struct HiliteData {
    /// Single terms
    vector<string> terms;
    /// NEAR and PHRASE elements
    vector<vector<string> > groups;
    /// Group slacks (number of permitted non-matched words).
    /// Parallel vector to the above 'groups'
    vector<int> gslks;

    /// Empty the three vectors
    void reset()
    {
        gslks.clear();
        groups.clear();
        terms.clear();
    }
};
/**
 * A class for highlighting search results. Overridable methods allow
 * for different styles. We can handle plain text or html input. In the latter
 * case, we may fail to highlight term groups if they are mixed with html tags.
 */
class PlainToRich {
public:
    /**
     * Both flags start as false. m_eolbr used to be left uninitialized,
     * while plaintorich() reads it to decide about outputting <br> and
     * &nbsp;, so that the output depended on an indeterminate value
     * unless a derived class set the flag.
     */
    PlainToRich() : m_inputhtml(false), m_eolbr(false) {}

    virtual ~PlainToRich() {}

    /** Tell us that the input is html instead of plain text */
    void set_inputhtml(bool v) {m_inputhtml = v;}

    /**
     * Transform plain text for highlighting search terms, ie in the
     * preview window or result list entries.
     *
     * The actual tags used for highlighting and anchoring are
     * determined by deriving from this class which handles the searching for
     * terms and groups, but there is an assumption that the output will be
     * html-like: we escape characters like < or &
     *
     * Finding the search terms is relatively complicated because of
     * phrase/near searches, which need group highlights. As a matter
     * of simplification, we handle "phrase" as "near", not filtering
     * on word order.
     *
     * @param in    raw text out of internfile.
     * @param out   rich text output, divided in chunks (to help our caller
     *   avoid inserting half tags into textedit which doesnt like it)
     * @param hdata terms and groups to be highlighted. These are
     *   lowercase and unaccented.
     * @param chunksize max size of chunks in output list
     */
    virtual bool plaintorich(const string &in, list<string> &out,
                             const HiliteData& hdata,
                             int chunksize = 50000
        );

    /* Methods to output headers, highlighting and marking tags. The
     * default implementations all return an empty string. */
    virtual string header() {return snull;}
    virtual string startMatch() {return snull;}
    virtual string endMatch() {return snull;}
    virtual string startAnchor(int) {return snull;}
    virtual string endAnchor() {return snull;}
    virtual string startChunk() {return snull;}

protected:
    // Empty string, the default value for all the tags
    const string snull;
    // Input is html
    bool m_inputhtml;
    // Use <br> to break plain text lines (else caller has used a <pre> tag)
    bool m_eolbr;
};
#endif /* _PLAINTORICH_H_INCLUDED_ */

411
src/query/recollq.cpp Normal file
View file

@ -0,0 +1,411 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
// Takes a query and run it, no gui, results to stdout
#ifndef TEST_RECOLLQ
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
#include <limits.h>
#include <iostream>
#include <list>
#include <string>
using namespace std;
#include "rcldb.h"
#include "rclquery.h"
#include "rclconfig.h"
#include "pathut.h"
#include "rclinit.h"
#include "debuglog.h"
#include "wasastringtoquery.h"
#include "wasatorcl.h"
#include "internfile.h"
#include "wipedir.h"
#include "transcode.h"
#include "textsplit.h"
#include "smallut.h"
#include "base64.h"
// Convert the document designated by idoc to text and print it to
// stdout. If the conversion fails, a message with the document url and
// ipath is printed instead. Always returns true.
bool dump_contents(RclConfig *rclconfig, TempDir& tmpdir, Rcl::Doc& idoc)
{
    FileInterner interner(idoc, rclconfig, tmpdir,
                          FileInterner::FIF_forPreview);
    Rcl::Doc fdoc;
    string ipath = idoc.ipath;
    const bool converted = interner.internfile(fdoc, ipath);
    if (!converted) {
        cout << "Cant turn to text:" << idoc.url << " | " << idoc.ipath << endl;
        return true;
    }
    cout << fdoc.text << endl;
    return true;
}
// Print the values of the requested fields for one result, on a single
// line. Each value is base64-encoded and followed by a space character.
// The "abstract" field is not read from the document metadata: it is
// built by the index object for the current query.
void output_fields(const vector<string>fields, Rcl::Doc& doc,
                   Rcl::Query& query, Rcl::Db& rcldb)
{
    for (vector<string>::size_type n = 0; n < fields.size(); n++) {
        const string& fldname = fields[n];
        string encoded;
        if (fldname == "abstract") {
            string abstract;
            rcldb.makeDocAbstract(doc, &query, abstract);
            base64_encode(abstract, encoded);
        } else {
            base64_encode(doc.meta[fldname], encoded);
        }
        cout << encoded << " ";
    }
    cout << endl;
}
// Program name for the usage message. Set from argv[0] by recollq()
static char *thisprog;
// Help text printed by Usage() after the program name.
// NOTE(review): the -s option (stemming language) is accepted by the
// option parsing in recollq() but is not described here.
static char usage [] =
" -P: Show the date span for all the documents present in the index\n"
" [-o|-a|-f] [-q] <query string>\n"
" Runs a recoll query and displays result lines. \n"
" Default: will interpret the argument(s) as a xesam query string\n"
" query may be like: \n"
" implicit AND, Exclusion, field spec: t1 -t2 title:t3\n"
" OR has priority: t1 OR t2 t3 OR t4 means (t1 OR t2) AND (t3 OR t4)\n"
" Phrase: \"t1 t2\" (needs additional quoting on cmd line)\n"
" -o Emulate the GUI simple search in ANY TERM mode\n"
" -a Emulate the GUI simple search in ALL TERMS mode\n"
" -f Emulate the GUI simple search in filename mode\n"
" -q is just ignored (compatibility with the recoll GUI command line)\n"
"Common options:\n"
" -c <configdir> : specify config directory, overriding $RECOLL_CONFDIR\n"
" -d also dump file contents\n"
" -n [first-]<cnt> define the result slice. The default value for [first]\n"
" is 0. Without the option, the default max count is 2000.\n"
" Use n=0 for no limit\n"
" -b : basic. Just output urls, no mime types or titles\n"
" -Q : no result lines, just the processed query and result count\n"
" -m : dump the whole document meta[] array for each result\n"
" -A : output the document abstracts\n"
" -S fld : sort by field <fld>\n"
" -D : sort descending\n"
" -i <dbdir> : additional index, several can be given\n"
" -e use url encoding (%xx) for urls\n"
" -F <field name list> : output exactly these fields for each result.\n"
" The field values are encoded in base64, output in one line and \n"
" separated by one space character. This is the recommended format \n"
" for use by other programs. Use a normal query with option -m to \n"
" see the field names.\n"
;
// Print the program name and the help text to stderr, then exit with
// status 1. Does not return.
static void
Usage(void)
{
    cerr << thisprog << ": usage:" << endl;
    cerr << usage;
    exit(1);
}
// CAUTION: keep these compatible with the options of the recoll command
// -q, -t and -l are accepted and ignored
// -a/f/o -c have the same meaning
// -h is not used
// Bit mask of the options seen on the command line. Each OPT_x value
// below is the bit for the -x option.
static int op_flags;
#define OPT_A 0x1
#define OPT_a 0x2
#define OPT_b 0x4
#define OPT_c 0x8
#define OPT_D 0x10
#define OPT_d 0x20
#define OPT_f 0x40
#define OPT_i 0x80
#define OPT_l 0x100
#define OPT_m 0x200
#define OPT_n 0x400
#define OPT_o 0x800
#define OPT_P 0x1000
#define OPT_Q 0x2000
#define OPT_q 0x4000
#define OPT_S 0x8000
#define OPT_s 0x10000
#define OPT_t 0x20000
#define OPT_e 0x40000
#define OPT_F 0x80000
// Run a query from the command line and print the results to stdout.
//
// @param cfp output: set to the configuration object created here
// @param argc, argv the command line, as received by main()
// @return 0 for success, 1 if the query string could not be interpreted
//   or the temporary directory could not be created. Other errors
//   (bad usage, init or index open failure) call exit(1) directly.
int recollq(RclConfig **cfp, int argc, char **argv)
{
    string a_config;
    string sortfield;
    string stemlang("english");
    list<string> extra_dbs;
    string sf;
    vector<string> fields;

    int firstres = 0;
    int maxcount = 2000;
    thisprog = argv[0];
    argc--; argv++;

    while (argc > 0 && **argv == '-') {
        (*argv)++;
        if (!(**argv))
            /* Case of a lone "-" argument (as in "adb - core") */
            Usage();
        while (**argv)
            switch (*(*argv)++) {
            case 'A':   op_flags |= OPT_A; break;
            case 'a':   op_flags |= OPT_a; break;
            case 'b':   op_flags |= OPT_b; break;
            case 'c':   op_flags |= OPT_c; if (argc < 2)  Usage();
                a_config = *(++argv);
                argc--; goto b1;
            case 'd':   op_flags |= OPT_d; break;
            case 'D':   op_flags |= OPT_D; break;
            case 'e':   op_flags |= OPT_e; break;
            case 'f':   op_flags |= OPT_f; break;
            case 'F':   op_flags |= OPT_F; if (argc < 2)  Usage();
                sf = *(++argv);
                argc--; goto b1;
            case 'i':   op_flags |= OPT_i; if (argc < 2)  Usage();
                extra_dbs.push_back(*(++argv));
                argc--; goto b1;
            case 'l':   op_flags |= OPT_l; break;
            case 'm':   op_flags |= OPT_m; break;
            case 'n':   op_flags |= OPT_n; if (argc < 2)  Usage();
            {
                // Value is [first-]cnt
                string rescnt = *(++argv);
                string::size_type dash = rescnt.find("-");
                if (dash != string::npos) {
                    firstres = atoi(rescnt.substr(0, dash).c_str());
                    if (dash < rescnt.size()-1) {
                        maxcount = atoi(rescnt.substr(dash+1).c_str());
                    }
                } else {
                    maxcount = atoi(rescnt.c_str());
                }
                // 0 (or garbage) means no limit
                if (maxcount <= 0) maxcount = INT_MAX;
            }
            argc--; goto b1;
            case 'o':   op_flags |= OPT_o; break;
            case 'P':   op_flags |= OPT_P; break;
            case 'q':   op_flags |= OPT_q; break;
            case 'Q':   op_flags |= OPT_Q; break;
            case 'S':   op_flags |= OPT_S; if (argc < 2)  Usage();
                sortfield = *(++argv);
                argc--; goto b1;
            case 's':   op_flags |= OPT_s; if (argc < 2)  Usage();
                stemlang = *(++argv);
                argc--; goto b1;
            case 't':   op_flags |= OPT_t; break;
            default: Usage();   break;
            }
    b1: argc--; argv++;
    }

    string reason;
    *cfp = recollinit(0, 0, reason, &a_config);
    RclConfig *rclconfig = *cfp;
    if (!rclconfig || !rclconfig->ok()) {
        fprintf(stderr, "Recoll init failed: %s\n", reason.c_str());
        exit(1);
    }

    // A query string is needed, except for -P
    if (argc < 1 && !(op_flags & OPT_P)) {
        Usage();
    }

    // -F is not compatible with the other output format options
    if (op_flags & OPT_F) {
        if (op_flags & (OPT_b|OPT_d|OPT_Q|OPT_m|OPT_A))
            Usage();
        stringToStrings(sf, fields);
    }

    Rcl::Db rcldb(rclconfig);
    for (list<string>::iterator it = extra_dbs.begin();
         it != extra_dbs.end(); it++) {
        if (!rcldb.addQueryDb(*it)) {
            cerr << "Can't add index: " << *it << endl;
            exit(1);
        }
    }

    if (!rcldb.open(Rcl::Db::DbRO)) {
        cerr << "Cant open database in " << rclconfig->getDbDir() <<
            " reason: " << rcldb.getReason() << endl;
        exit(1);
    }

    // -P: just print the date span and exit. Nothing below runs in this
    // case, so that argc is known to be at least 1 after this block.
    if (op_flags & OPT_P) {
        int minyear, maxyear;
        if (!rcldb.maxYearSpan(&minyear, &maxyear)) {
            cerr << "maxYearSpan failed: " << rcldb.getReason() << endl;
            exit(1);
        }
        cout << "Min year " << minyear << " Max year " << maxyear << endl;
        exit(0);
    }

    // The query string is made of all the remaining arguments, separated
    // by spaces
    string qs = *argv++;argc--;
    while (argc > 0) {
        qs += string(" ") + *argv++;argc--;
    }

    // Convert the query string from the local character set to UTF-8
    {
        string uq;
        string charset = rclconfig->getDefCharset(true);
        int ercnt;
        if (!transcode(qs, uq, charset, "UTF-8", &ercnt)) {
            fprintf(stderr, "Can't convert command line args to utf-8\n");
            exit(1);
        } else if (ercnt) {
            fprintf(stderr, "%d errors while converting arguments from %s "
                    "to utf-8\n", ercnt, charset.c_str());
        }
        qs = uq;
    }

    Rcl::SearchData *sd = 0;

    if (op_flags & (OPT_a|OPT_o|OPT_f)) {
        // Emulation of the GUI simple search modes
        sd = new Rcl::SearchData(Rcl::SCLT_OR);
        Rcl::SearchDataClause *clp = 0;
        if (op_flags & OPT_f) {
            clp = new Rcl::SearchDataClauseFilename(qs);
        } else {
            // If there is no white space inside the query, then the user
            // certainly means it as a phrase.
            bool isreallyaphrase = false;
            if (!TextSplit::hasVisibleWhite(qs))
                isreallyaphrase = true;
            clp = isreallyaphrase ?
                new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :
                new Rcl::SearchDataClauseSimple((op_flags & OPT_o)?
                                                Rcl::SCLT_OR : Rcl::SCLT_AND,
                                                qs);
        }
        sd->addClause(clp);
    } else {
        // Default: query language string
        sd = wasaStringToRcl(rclconfig, qs, reason);
    }

    if (!sd) {
        cerr << "Query string interpretation failed: " << reason << endl;
        return 1;
    }
    sd->setStemlang(stemlang);

    RefCntr<Rcl::SearchData> rq(sd);
    Rcl::Query query(&rcldb);
    if (op_flags & OPT_S) {
        query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true);
    }
    Chrono chron;
    query.setQuery(rq);
    int cnt = query.getResCnt();
    if (!(op_flags & OPT_b)) {
        cout << "Recoll query: " << rq->getDescription() << endl;
        if (firstres == 0) {
            if (cnt <= maxcount)
                cout << cnt << " results" << endl;
            else
                cout << cnt << " results (printing  " << maxcount << " max):"
                     << endl;
        } else {
            // Number of results available from firstres, limited to
            // maxcount. This used to print cnt - (firstres+maxcount),
            // which is not the size of the slice, and overflows when
            // maxcount is INT_MAX.
            int avail = cnt - firstres;
            if (avail < 0)
                avail = 0;
            int toprint = avail < maxcount ? avail : maxcount;
            cout << "Printing at most " << toprint <<
                " results from first " << firstres << endl;
        }
    }
    if (op_flags & OPT_Q) {
        cout << "Query setup took " << chron.millis() << " mS" << endl;
        return(0);
    }

    // Print at most maxcount results, beginning at firstres. We count
    // the results instead of comparing the index with firstres+maxcount:
    // the sum overflows when maxcount is INT_MAX ("-n first-0").
    for (int n = 0; n < maxcount; n++) {
        if (n > INT_MAX - firstres)
            break;
        Rcl::Doc doc;
        if (!query.getDoc(firstres + n, doc))
            break;

        if (op_flags & OPT_F) {
            output_fields(fields, doc, query, rcldb);
            continue;
        }

        if (op_flags & OPT_e)
            doc.url = url_encode(doc.url);

        if (op_flags & OPT_b) {
            cout << doc.url << endl;
        } else {
            // Use the file name if there is no title
            string titleorfn = doc.meta[Rcl::Doc::keytt];
            if (titleorfn.empty())
                titleorfn = doc.meta[Rcl::Doc::keyfn];

            cout
                << doc.mimetype << "\t"
                << "[" << doc.url << "]" << "\t"
                << "[" << titleorfn << "]" << "\t"
                << doc.fbytes << "\tbytes" << "\t"
                << endl;
            if (op_flags & OPT_m) {
                for (map<string,string>::const_iterator it = doc.meta.begin();
                     it != doc.meta.end(); it++) {
                    cout << it->first << " = " << it->second << endl;
                }
            }
            if (op_flags & OPT_A) {
                string abstract;
                if (rcldb.makeDocAbstract(doc, &query, abstract)) {
                    cout << "ABSTRACT" << endl;
                    cout << abstract << endl;
                    cout << "/ABSTRACT" << endl;
                }
            }
        }
        if (op_flags & OPT_d) {
            // The temporary directory is created once, when first needed
            static TempDir tmpdir;
            if (!tmpdir.ok()) {
                cerr << "Can't create temporary directory: " <<
                    tmpdir.getreason() << endl;
                return(1);
            }
            dump_contents(rclconfig, tmpdir, doc);
        }
    }

    return 0;
}
#else // TEST_RECOLLQ The test driver is actually the useful program...
#include <stdlib.h>
#include "rclconfig.h"
#include "recollq.h"
static RclConfig *rclconfig;
// Stand-alone entry point: pass the command line to recollq() and use its
// return value as the process exit status.
int main(int argc, char **argv)
{
    const int status = recollq(&rclconfig, argc, argv);
    return status;
}
#endif // TEST_RECOLLQ

26
src/query/recollq.h Normal file
View file

@ -0,0 +1,26 @@
/* Copyright (C) 2007 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _recollq_h_included_
#define _recollq_h_included_
/// Execute query, print results to stdout. This is just an API to the
/// recollq command line program.
///
/// @param cfp  address of the caller's RclConfig pointer (the stand-alone
///             driver passes the address of a static pointer).
///             NOTE(review): presumably set up by recollq() itself - confirm
///             in recollq.cpp.
/// @param argc command-line style argument count.
/// @param argv command-line style argument vector.
/// @return a value usable as a process exit status (the stand-alone driver
///         returns it directly from main()).
class RclConfig;
extern int recollq(RclConfig **cfp, int argc, char **argv);
#endif /* _recollq_h_included_ */

438
src/query/reslistpager.cpp Normal file
View file

@ -0,0 +1,438 @@
/* Copyright (C) 2007 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifdef HAVE_CONFIG_H
#include "autoconfig.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sstream>
using std::ostringstream;
using std::endl;
#include "cstr.h"
#include "reslistpager.h"
#include "debuglog.h"
#include "rclconfig.h"
#include "smallut.h"
#include "plaintorich.h"
#include "mimehandler.h"
// Default highlighter: wraps matched terms in a blue <font> element.
// No need for locking, this is query-only.
static const string cstr_hlfontcolor("<font color=\"blue\">");
static const string cstr_hlendfont("</font>");

class PlainToRichHtReslist : public PlainToRich {
public:
    virtual ~PlainToRichHtReslist()
    {
    }

    // Markup emitted before a matched term.
    virtual string startMatch()
    {
        return cstr_hlfontcolor;
    }

    // Markup emitted after a matched term.
    virtual string endMatch()
    {
        return cstr_hlendfont;
    }
};

// Shared instance, used by every pager until setHighLighter() replaces it.
static PlainToRichHtReslist g_hiliter;
// Build a pager with no document source and no current page.
// m_winfirst == -1 means that no page has been fetched yet; the default
// highlighter is the file-level g_hiliter instance.
ResListPager::ResListPager(int pagesize)
    : m_pagesize(pagesize), m_newpagesize(pagesize),
      m_winfirst(-1), m_hasNext(false),
      m_hiliter(&g_hiliter)
{}
void ResListPager::resultPageNext()
{
if (m_docSource.isNull()) {
LOGDEB(("ResListPager::resultPageNext: null source\n"));
return;
}
int resCnt = m_docSource->getResCnt();
LOGDEB(("ResListPager::resultPageNext: rescnt %d, winfirst %d\n",
resCnt, m_winfirst));
if (m_winfirst < 0) {
m_winfirst = 0;
} else {
m_winfirst += m_respage.size();
}
// Get the next page of results.
vector<ResListEntry> npage;
int pagelen = m_docSource->getSeqSlice(m_winfirst, m_pagesize, npage);
// If page was truncated, there is no next
m_hasNext = (pagelen == m_pagesize);
if (pagelen <= 0) {
// No results ? This can only happen on the first page or if the
// actual result list size is a multiple of the page pref (else
// there would have been no Next on the last page)
if (m_winfirst > 0) {
// Have already results. Let them show, just disable the
// Next button. We'd need to remove the Next link from the page
// too.
// Restore the m_winfirst value, let the current result vector alone
m_winfirst -= m_respage.size();
} else {
// No results at all (on first page)
m_winfirst = -1;
}
return;
}
m_respage = npage;
}
// Fetch the page which contains document number docnum.
// The window start is aligned on a multiple of the page size. If the source
// returns nothing for that window, m_winfirst is reset to -1 and the
// previously held page is left untouched.
void ResListPager::resultPageFor(int docnum)
{
    if (m_docSource.isNull()) {
        LOGDEB(("ResListPager::resultPageFor: null source\n"));
        return;
    }
    // The page size comes from setPageSize(), which accepts any int. Refuse
    // to compute a window with a null or negative size instead of dividing
    // by zero below (pageNumber() applies the same check).
    if (m_pagesize <= 0) {
        LOGDEB(("ResListPager::resultPageFor: bad page size %d\n",
                m_pagesize));
        return;
    }

    int resCnt = m_docSource->getResCnt();
    LOGDEB(("ResListPager::resultPageFor(%d): rescnt %d, winfirst %d\n",
            docnum, resCnt, m_winfirst));

    // Start of the page holding docnum
    m_winfirst = (docnum / m_pagesize) * m_pagesize;

    // Get the page of results.
    vector<ResListEntry> npage;
    int pagelen = m_docSource->getSeqSlice(m_winfirst, m_pagesize, npage);

    // If page was truncated, there is no next
    m_hasNext = (pagelen == m_pagesize);

    if (pagelen <= 0) {
        m_winfirst = -1;
        return;
    }
    m_respage = npage;
}
// Format one result list entry as an HTML paragraph and hand it to append().
//
// @param config  used for the default charset, icon names, and to find out
//                whether the document can be previewed / opened.
// @param i       index of the entry inside the current page; the number
//                used in the text and links is m_winfirst + 1 + i.
// @param doc     the document. Note that it may be modified: a placeholder
//                abstract is stored for unavailable documents, and reading
//                some metadata with operator[] can create empty entries.
// @param hdata   highlighting data passed to the highlighter for abstracts.
// @param sh      optional sub-header emitted before the entry (used by
//                history).
void ResListPager::displayDoc(RclConfig *config,
                              int i, Rcl::Doc& doc, const HiliteData& hdata,
                              const string& sh)
{
    ostringstream chunk;

    if (doc.pc == -1) {
        // Document not available, maybe other further, will go on.
        doc.meta[Rcl::Doc::keyabs] = string(trans("Unavailable document"));
    }

    // Determine icon to display if any
    string iconurl = iconUrl(config, doc);

    // Printable url: either utf-8 if transcoding succeeds, or url-encoded
    string url;
    printableUrl(config->getDefCharset(), doc.url, url);

    // Make title out of file name if none yet
    string titleOrFilename;
    string utf8fn;
    doc.getmeta(Rcl::Doc::keytt, &titleOrFilename);
    doc.getmeta(Rcl::Doc::keyfn, &utf8fn);
    if (utf8fn.empty()) {
        utf8fn = path_getsimple(url);
    }
    if (titleOrFilename.empty()) {
        titleOrFilename = utf8fn;
    }

    // Result number
    char numbuf[20];
    int docnumforlinks = m_winfirst + 1 + i;
    sprintf(numbuf, "%d", docnumforlinks);

    // Document date: either doc or file modification time
    char datebuf[100];
    datebuf[0] = 0;
    if (!doc.dmtime.empty() || !doc.fmtime.empty()) {
        time_t mtime = doc.dmtime.empty() ?
            atol(doc.fmtime.c_str()) : atol(doc.dmtime.c_str());
        struct tm *tm = localtime(&mtime);
        // localtime() can fail and strftime() leaves the buffer undefined
        // when the result does not fit: show no date in both cases.
        if (tm == 0 ||
            strftime(datebuf, sizeof(datebuf), dateFormat().c_str(), tm) == 0) {
            datebuf[0] = 0;
        }
    }

    // Size information. We print both doc and file if they differ a lot
    off_t fsize = -1, dsize = -1;
    if (!doc.dbytes.empty())
        dsize = atol(doc.dbytes.c_str());
    if (!doc.fbytes.empty())
        fsize = atol(doc.fbytes.c_str());
    string sizebuf;
    if (dsize > 0) {
        sizebuf = displayableBytes(dsize);
        if (fsize > 10 * dsize && fsize - dsize > 1000)
            sizebuf += string(" / ") + displayableBytes(fsize);
    } else if (fsize >= 0) {
        sizebuf = displayableBytes(fsize);
    }

    // The abstract is only computed if the paragraph format uses it
    string richabst;
    bool needabstract = parFormat().find("%A") != string::npos;
    if (needabstract && m_docSource.isNotNull()) {
        vector<string> vabs;
        m_docSource->getAbstract(doc, vabs);
        for (vector<string>::const_iterator it = vabs.begin();
             it != vabs.end(); it++) {
            if (it->empty())
                continue;
            // No need to call escapeHtml(), plaintorich handles it
            list<string> lr;
            m_hiliter->set_inputhtml(false);
            m_hiliter->plaintorich(*it, lr, hdata);
            // Do not call front() on an empty list if the highlighter
            // produced no output for this fragment.
            if (lr.empty())
                continue;
            richabst += lr.front();
            richabst += absSep();
        }
    }

    // Links;
    ostringstream linksbuf;
    if (canIntern(doc.mimetype, config)) {
        linksbuf << "<a href=\"P" << docnumforlinks << "\">"
                 << trans("Preview") << "</a>&nbsp;&nbsp;";
    }
    string apptag;
    doc.getmeta(Rcl::Doc::keyapptg, &apptag);
    if (!config->getMimeViewerDef(doc.mimetype, apptag).empty()) {
        linksbuf << "<a href=\"E" << docnumforlinks << "\">"
                 << trans("Open") << "</a>";
    }

    // Build the result list paragraph:

    // Subheader: this is used by history. The bold element is closed
    // explicitly so that it cannot leak into the entry paragraph.
    if (!sh.empty())
        chunk << "<p style='clear: both;'><b>" << sh << "</b></p>\n<p>";
    else
        chunk << "<p style='margin: 0px;padding: 0px;clear: both;'>";

    // Configurable stuff: values for the %X codes of the paragraph format
    map<string,string> subs;
    subs["A"] = richabst;
    subs["D"] = datebuf;
    subs["I"] = iconurl;
    subs["i"] = doc.ipath;
    subs["K"] = !doc.meta[Rcl::Doc::keykw].empty() ?
        string("[") + escapeHtml(doc.meta[Rcl::Doc::keykw]) + "]" : "";
    subs["L"] = linksbuf.str();
    subs["N"] = numbuf;
    subs["M"] = doc.mimetype;
    subs["R"] = doc.meta[Rcl::Doc::keyrr];
    subs["S"] = sizebuf;
    subs["T"] = escapeHtml(titleOrFilename);
    subs["t"] = escapeHtml(doc.meta[Rcl::Doc::keytt]);
    subs["U"] = url;

    // Let %(xx) access all metadata. insert() does not replace the
    // entries set above.
    subs.insert(doc.meta.begin(), doc.meta.end());

    string formatted;
    pcSubst(parFormat(), formatted, subs);
    chunk << formatted;

    chunk << "</p>" << endl;
    // This was to force qt 4.x to clear the margins (which it should do
    // anyway because of the paragraph's style), but we finally took
    // the table approach for 1.15 for now (in guiutils.cpp)
    // chunk << "<br style='clear:both;height:0;line-height:0;'>" << endl;

    LOGDEB2(("Chunk: [%s]\n", chunk.str().c_str()));
    append(chunk.str(), i, doc);
}
// Emit the current page as HTML through append(): page header (title,
// result range, query details link, Previous/Next links), then one
// paragraph per result (see displayDoc()), then a footer repeating the
// navigation links. For an empty page only the header is emitted, with a
// "no results" message and possible spelling suggestions.
void ResListPager::displayPage(RclConfig *config)
{
    LOGDEB(("ResListPager::displayPage\n"));
    if (m_docSource.isNull()) {
        LOGDEB(("ResListPager::displayPage: null source\n"));
        return;
    }
    if (m_winfirst < 0 && !pageEmpty()) {
        LOGDEB(("ResListPager::displayPage: sequence error: winfirst < 0\n"));
        return;
    }

    // Text is accumulated and appended in pieces which make sense
    // html-wise: if things are broken up too much the editor widget gets
    // confused. A <title> is not used because the textedit does not
    // display it prominently.
    // Also note that there can be results beyond the estimated resCnt.
    ostringstream chunk;
    chunk << "<html><head>" << endl
          << "<meta http-equiv=\"content-type\""
          << " content=\"text/html; charset=utf-8\">" << endl
          << headerContent()
          << "</head><body>" << endl
          << pageTop()
          << "<p><font size=+1><b>"
          << m_docSource->title()
          << "</b></font>&nbsp;&nbsp;&nbsp;";

    if (!pageEmpty()) {
        // Range of documents shown, with the total when it is larger
        unsigned int resCnt = m_docSource->getResCnt();
        if (m_winfirst + m_respage.size() < resCnt) {
            chunk << trans("Documents") << " <b>" << m_winfirst + 1
                  << "-" << m_winfirst + m_respage.size() << "</b> "
                  << trans("out of at least") << " "
                  << resCnt << " " << trans("for") << " " ;
        } else {
            chunk << trans("Documents") << " <b>"
                  << m_winfirst + 1 << "-" << m_winfirst + m_respage.size()
                  << "</b> " << trans("for") << " ";
        }
    } else {
        chunk << trans("<p><b>No results found</b><br>");

        // Offer alternate spellings for the user terms if we get any
        vector<string>uterms;
        m_docSource->getUTerms(uterms);
        if (!uterms.empty()) {
            map<string, vector<string> > spellings;
            suggest(uterms, spellings);
            if (!spellings.empty()) {
                chunk <<
                    trans("<p><i>Alternate spellings (accents suppressed): </i>")
                      << "<br /><blockquote>";
                map<string, vector<string> >::const_iterator term;
                for (term = spellings.begin(); term != spellings.end();
                     term++) {
                    chunk << "<b>" << term->first << "</b> : ";
                    vector<string>::const_iterator alt;
                    for (alt = term->second.begin();
                         alt != term->second.end(); alt++) {
                        chunk << *alt << " ";
                    }
                    chunk << "<br />";
                }
                chunk << "</blockquote></p>";
            }
        }
    }

    chunk << detailsLink();

    // Header navigation links
    if (hasPrev() || hasNext()) {
        chunk << "&nbsp;&nbsp;";
        if (hasPrev()) {
            chunk << "<a href=\"" + prevUrl() + "\"><b>"
                  << trans("Previous")
                  << "</b></a>&nbsp;&nbsp;&nbsp;";
        }
        if (hasNext()) {
            chunk << "<a href=\""+ nextUrl() + "\"><b>"
                  << trans("Next")
                  << "</b></a>";
        }
    }
    chunk << "</p>" << endl;

    append(chunk.str());
    chunk.str("");
    if (pageEmpty())
        return;

    // Emit data for the result entry paragraphs, one chunk per entry.
    HiliteData hdata;
    m_docSource->getTerms(hdata.terms, hdata.groups, hdata.gslks);
    const int nentries = (int)m_respage.size();
    for (int idx = 0; idx < nentries; idx++) {
        displayDoc(config, idx, m_respage[idx].doc, hdata,
                   m_respage[idx].subHeader);
    }

    // Footer: navigation links again
    chunk << "<p align=\"center\">";
    if (hasPrev()) {
        chunk << "<a href=\"" + prevUrl() + "\"><b>"
              << trans("Previous")
              << "</b></a>&nbsp;&nbsp;&nbsp;";
    }
    if (hasNext()) {
        chunk << "<a href=\""+ nextUrl() + "\"><b>"
              << trans("Next")
              << "</b></a>";
    }
    chunk << "</p>" << endl;
    chunk << "</body></html>" << endl;
    append(chunk.str());
}
// Default implementations for things that should be implemented by
// specializations
// Default target for the "Next" link emitted by displayPage().
string ResListPager::nextUrl()
{
    return string("n-1");
}
// Default target for the "Previous" link emitted by displayPage().
string ResListPager::prevUrl()
{
    return string("p-1");
}
// Default icon location: the icon name which the configuration associates
// with the document MIME type, prefixed with cstr_fileu.
string ResListPager::iconUrl(RclConfig *config, Rcl::Doc& doc)
{
    string iconname;
    config->getMimeIconName(doc.mimetype, &iconname);
    return cstr_fileu + iconname;
}
// Default output sink: write the data to stderr. Always returns true.
bool ResListPager::append(const string& data)
{
    fputs(data.c_str(), stderr);
    return true;
}
// Default translation: none, the input text is returned as is.
string ResListPager::trans(const string& in)
{
    string untranslated(in);
    return untranslated;
}
// Default "show query" link, inserted in the page header by displayPage().
string ResListPager::detailsLink()
{
    return string("<a href=\"H-1\">") + trans("(show query)") + "</a>";
}
// Default format for a result paragraph. The %X codes are replaced by
// displayDoc(): %I icon url, %R the document's Rcl::Doc::keyrr metadata
// value, %S size, %L preview/open links, %T title or file name (escaped),
// %M MIME type, %D date, %U printable url, %A abstract, %K keywords.
const string &ResListPager::parFormat()
{
static const string cstr_format("<img src=\"%I\" align=\"left\">"
"%R %S %L &nbsp;&nbsp;<b>%T</b><br>"
"%M&nbsp;%D&nbsp;&nbsp;&nbsp;<i>%U</i><br>"
"%A %K");
return cstr_format;
}
// Default date format: a strftime() format string (displayDoc() passes it
// to strftime() to build the %D value).
const string &ResListPager::dateFormat()
{
static const string cstr_format("&nbsp;%Y-%m-%d&nbsp;%H:%M:%S&nbsp;%z");
return cstr_format;
}

131
src/query/reslistpager.h Normal file
View file

@ -0,0 +1,131 @@
/* Copyright (C) 2007 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _reslistpager_h_included_
#define _reslistpager_h_included_
#include <vector>
using std::vector;
#include "refcntr.h"
#include "docseq.h"
class RclConfig;
class PlainToRich;
class HiliteData;
/**
 * Manage a paged HTML result list.
 *
 * The pager holds one page worth of entries from a document sequence
 * (m_respage), remembers where the page starts in the sequence
 * (m_winfirst, -1 when no page is held) and formats the page as HTML.
 * The output and several pieces of text are produced through virtual
 * methods so that a subclass (ie: the GUI result list) can customize them.
 */
class ResListPager {
public:
    ResListPager(int pagesize=10);
    virtual ~ResListPager() {}

    /** Replace the object used to highlight terms inside abstracts */
    void setHighLighter(PlainToRich *ptr)
    {
        m_hiliter = ptr;
    }

    /** Set a new document source. This drops the current page and applies
     *  a page size change previously requested through setPageSize() */
    void setDocSource(RefCntr<DocSequence> src, int winfirst = -1)
    {
        m_pagesize = m_newpagesize;
        m_winfirst = winfirst;
        m_hasNext = false;
        m_docSource = src;
        m_respage.clear();
    }

    /** Request a page size change. The new value only becomes active on
     *  the next setDocSource() or resultPageFirst() call */
    void setPageSize(int ps)
    {
        m_newpagesize = ps;
    }

    /** 0-based number of the current page, or -1 if there is none */
    int pageNumber()
    {
        if (m_winfirst < 0 || m_pagesize <= 0)
            return -1;
        return m_winfirst / m_pagesize;
    }
    /** Sequence index of the first document in the page (-1 if no page) */
    int pageFirstDocNum() {
        return m_winfirst;
    }
    /** Sequence index of the last document in the page (-1 if no page) */
    int pageLastDocNum() {
        if (m_winfirst < 0 || m_respage.size() == 0)
            return -1;
        return m_winfirst + m_respage.size() - 1;
    }
    virtual int pageSize() const {return m_pagesize;}
    // NOTE(review): no definition of pageNext() is visible next to the
    // other methods in reslistpager.cpp - confirm that it is still used.
    void pageNext();
    bool hasNext() {return m_hasNext;}
    bool hasPrev() {return m_winfirst > 0;}
    bool atBot() {return m_winfirst <= 0;}

    /** Fetch the first page, applying a pending page size change */
    void resultPageFirst() {
        m_winfirst = -1;
        m_pagesize = m_newpagesize;
        resultPageNext();
    }
    /** Fetch the page before the current one. No-op on the first page */
    void resultPageBack() {
        if (m_winfirst <= 0) return;
        // resultPageNext() moves the window forward by the number of
        // entries in the current page, which is less than the page size
        // when we are on a short last page. Compensate for exactly that
        // amount, so that we end up one page size before the current
        // start (a negative value is reset to 0 by resultPageNext()).
        m_winfirst -= m_pagesize + int(m_respage.size());
        resultPageNext();
    }
    void resultPageNext();
    /** Fetch the page which contains document number docnum */
    void resultPageFor(int docnum);
    /** Format the current page and send it to append() */
    void displayPage(RclConfig *);
    /** Format one entry and send it to append() */
    void displayDoc(RclConfig *, int idx, Rcl::Doc& doc,
                    const HiliteData& hdata, const string& sh = "");
    bool pageEmpty() {return m_respage.size() == 0;}

    string queryDescription() {return m_docSource.isNull() ? "" :
            m_docSource->getDescription();}

    // Things that need to be reimplemented in the subclass:
    virtual bool append(const string& data);
    virtual bool append(const string& data, int, const Rcl::Doc&)
    {
        return append(data);
    }
    // Translation function. This is reimplemented in the qt reslist
    // object. For this to work, the strings must be duplicated inside
    // reslist.cpp (see the QT_TR_NOOP in there). Very very unwieldy.
    // To repeat: any change to a string used with trans() inside
    // reslistpager.cpp must be reflected in the string table inside
    // reslist.cpp for translation to work.
    virtual string trans(const string& in);
    virtual string detailsLink();
    virtual const string &parFormat();
    virtual const string &dateFormat();
    virtual string nextUrl();
    virtual string prevUrl();
    virtual string pageTop() {return string();}
    virtual string headerContent() {return string();}
    virtual string iconUrl(RclConfig *, Rcl::Doc& doc);
    // Spelling suggestions for the user terms. The default has none.
    // The by-value first parameter is part of the virtual signature and
    // is kept as is so that existing overrides still match.
    virtual void suggest(const vector<string>,
                         map<string, vector<string> >& sugg) {
        sugg.clear();
    }
    // Separator appended after each abstract fragment
    virtual string absSep() {return "&hellip;";}

private:
    // Active page size
    int m_pagesize;
    // Requested page size, copied to m_pagesize by setDocSource() and
    // resultPageFirst()
    int m_newpagesize;
    // First docnum (from docseq) in current page
    int m_winfirst;
    // True if the last fetch returned a full page
    bool m_hasNext;
    PlainToRich *m_hiliter;
    RefCntr<DocSequence> m_docSource;
    // Entries of the current page
    vector<ResListEntry> m_respage;
};
#endif /* _reslistpager_h_included_ */

76
src/query/sortseq.cpp Normal file
View file

@ -0,0 +1,76 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <algorithm>
#include "debuglog.h"
#include "sortseq.h"
using std::string;
class CompareDocs {
DocSeqSortSpec ss;
public:
CompareDocs(const DocSeqSortSpec &sortspec) : ss(sortspec) {}
// It's not too clear in the std::sort doc what this should do. This
// behaves as operator<
int operator()(const Rcl::Doc *x, const Rcl::Doc *y)
{
LOGDEB1(("Comparing .. \n"));
map<string,string>::const_iterator xit, yit;
xit = x->meta.find(ss.field);
yit = y->meta.find(ss.field);
if (xit == x->meta.end() || yit == y->meta.end())
return 0;
return ss.desc ? yit->second < xit->second : xit->second < yit->second;
}
};
// Set the sort criteria: read all the documents from the input sequence,
// then sort pointers to them with CompareDocs.
// If a document can't be read, only the ones before it are kept.
// @return always true.
bool DocSeqSorted::setSortSpec(DocSeqSortSpec &sortspec)
{
    LOGDEB(("DocSeqSorted::setSortSpec\n"));
    m_spec = sortspec;
    int count = m_seq->getResCnt();
    LOGDEB(("DocSeqSorted:: count %d\n", count));
    // A negative count must not reach vector::resize(), where it would be
    // converted to a huge unsigned size.
    if (count < 0)
        count = 0;

    // Drop the results of a previous call: the pointers would not stay
    // valid across the resize, and the documents are refilled starting
    // from default-constructed values.
    m_docsp.clear();
    m_docs.clear();
    m_docs.resize(count);
    int i;
    for (i = 0; i < count; i++) {
        if (!m_seq->getDoc(i, m_docs[i])) {
            LOGERR(("DocSeqSorted: getDoc failed for doc %d\n", i));
            count = i;
            break;
        }
    }
    m_docs.resize(count);

    // Sort pointers rather than the documents themselves. m_docs must not
    // change size after this point.
    m_docsp.resize(count);
    for (i = 0; i < count; i++)
        m_docsp[i] = &m_docs[i];

    CompareDocs cmp(sortspec);
    std::sort(m_docsp.begin(), m_docsp.end(), cmp);
    return true;
}
// Copy the document at rank num in the sorted order into doc.
// @return false (doc untouched) if num is out of range.
bool DocSeqSorted::getDoc(int num, Rcl::Doc &doc, string *)
{
    LOGDEB(("DocSeqSorted::getDoc(%d)\n", num));
    const bool inrange = num >= 0 && num < int(m_docsp.size());
    if (inrange) {
        doc = *m_docsp[num];
    }
    return inrange;
}

48
src/query/sortseq.h Normal file
View file

@ -0,0 +1,48 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _SORTSEQ_H_INCLUDED_
#define _SORTSEQ_H_INCLUDED_
#include <vector>
#include <string>
#include "refcntr.h"
#include "docseq.h"
/**
 * A sorted sequence is built from the documents of another sequence, which
 * it reads entirely and orders according to the given criteria.
 */
class DocSeqSorted : public DocSeqModifier {
public:
    /** Read and sort the input sequence right away */
    DocSeqSorted(RefCntr<DocSequence> iseq, DocSeqSortSpec &sortspec)
        : DocSeqModifier(iseq)
    {
        setSortSpec(sortspec);
    }
    virtual ~DocSeqSorted()
    {
    }

    virtual bool canSort()
    {
        return true;
    }
    /** Change the sort criteria: reads and sorts the input again */
    virtual bool setSortSpec(DocSeqSortSpec &sortspec);
    /** Get the document at rank num in sorted order */
    virtual bool getDoc(int num, Rcl::Doc &doc, string *sh = 0);
    /** Number of documents which could be read and sorted */
    virtual int getResCnt()
    {
        return m_docsp.size();
    }

private:
    // Current sort criteria
    DocSeqSortSpec m_spec;
    // Documents, in input sequence order
    std::vector<Rcl::Doc> m_docs;
    // Pointers into m_docs, in sorted order
    std::vector<Rcl::Doc *> m_docsp;
};
#endif /* _SORTSEQ_H_INCLUDED_ */

View file

@ -0,0 +1,510 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef TEST_WASASTRINGTOQUERY
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>
#include "smallut.h"
#include "wasastringtoquery.h"
#undef DEB_WASASTRINGTOQ
#ifdef DEB_WASASTRINGTOQ
#define DPRINT(X) fprintf X
#define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
#else
#define DPRINT(X)
#define DUMPQ(Q)
#endif
// A query owns its subqueries: delete them (recursively, through their own
// destructors), first to last.
WasaQuery::~WasaQuery()
{
    for (vector<WasaQuery*>::size_type idx = 0; idx < m_subs.size(); ++idx) {
        delete m_subs[idx];
    }
    m_subs.clear();
}
// Textual form of a field relation, "?" for an unknown value. Used for
// building query descriptions.
static const char* reltosrel(WasaQuery::Rel rel)
{
    if (rel == WasaQuery::REL_EQUALS)
        return "=";
    if (rel == WasaQuery::REL_CONTAINS)
        return ":";
    if (rel == WasaQuery::REL_LT)
        return "<";
    if (rel == WasaQuery::REL_LTE)
        return "<=";
    if (rel == WasaQuery::REL_GT)
        return ">";
    if (rel == WasaQuery::REL_GTE)
        return ">=";
    return "?";
}
// Append a printable description of the query to desc (which is not
// cleared first). The format is "(content)MODIFIER|MODIFIER " where the
// content is either a leaf, possibly prefixed by its field and relation,
// or the descriptions of the subqueries joined by OR / AND.
void WasaQuery::describe(string &desc) const
{
    desc += "(";

    // "field<relation>" prefix, only if a field was specified
    string prefix;
    if (!m_fieldspec.empty())
        prefix = m_fieldspec + reltosrel(m_rel);

    switch (m_op) {
    case OP_NULL:
        desc += "NULL";
        break;
    case OP_LEAF:
        desc += prefix + m_value;
        break;
    case OP_EXCL:
        desc += string("NOT (" ) + prefix + m_value + ") ";
        break;
    case OP_OR:
    case OP_AND:
    {
        const char *joiner = (m_op == OP_OR) ? "OR " : "AND ";
        const vector<WasaQuery *>::size_type nsubs = m_subs.size();
        for (vector<WasaQuery *>::size_type idx = 0; idx < nsubs; idx++) {
            m_subs[idx]->describe(desc);
            // The operator goes between subqueries, not after the last
            if (idx + 1 < nsubs)
                desc += joiner;
        }
    }
    break;
    }

    // Drop a trailing space before closing the parenthesis
    const string::size_type last = desc.length() - 1;
    if (desc[last] == ' ')
        desc.erase(last);
    desc += ")";

    if (m_modifiers != 0) {
        if (m_modifiers & WQM_BOOST)
            desc += "BOOST|";
        if (m_modifiers & WQM_CASESENS)
            desc += "CASESENS|";
        if (m_modifiers & WQM_DIACSENS)
            desc += "DIACSENS|";
        if (m_modifiers & WQM_FUZZY)
            desc += "FUZZY|";
        if (m_modifiers & WQM_NOSTEM)
            desc += "NOSTEM|";
        if (m_modifiers & WQM_PHRASESLACK) {
            char slackbuf[100];
            sprintf(slackbuf, "%d", m_slack);
            desc += "PHRASESLACK(" + string(slackbuf) + string(")|");
        }
        if (m_modifiers & WQM_PROX)
            desc += "PROX|";
        if (m_modifiers & WQM_REGEX)
            desc += "REGEX|";
        if (m_modifiers & WQM_SLOPPY)
            desc += "SLOPPY|";
        if (m_modifiers & WQM_WORDS)
            desc += "WORDS|";

        // Remove the separator left by the last modifier
        if (desc.length() > 0 && desc[desc.length()-1] == '|')
            desc.erase(desc.length()-1);
    }
    desc += " ";
}
// The string query parser code:
/* Shamelessly lifted from Beagle:
* This is our regular Expression Pattern:
* we expect something like this:
* -key:"Value String"modifiers
* key:Value
* or
* Value
*/
/* The master regular expression used to parse a query string.
 * Sub-expressions in parentheses are numbered from 1. Each opening
 * parenthesis increases the index, but we are not interested in all
 * of them: see the SbMatchIdx enum for the ones which are used.
 * Each match of the expression consumes one element of the query: an
 * OR, an AND, or a clause (optional +/-, optional field and relation,
 * then a quoted or unquoted term).
 *
 * Deviations from standard:
 * Relation: the standard-conformant line read as (release<1.16):
 *     "(:|=|<|>|<=|>=)" //7 Relation
 * but we are not actually making use of the relation type
 * (interpreting all as ":"), and this can produce unexpected results,
 * as a (ie pasted) search for nonexfield=value will silently drop
 * the nonexfield part, while the user probably was not aware of
 * triggering a field search (expecting just ':' to do this).
 * Note that with the current expression, a relation match is always a
 * single character.
 */
static const char * parserExpr =
"(OR|\\|\\|)[[:space:]]*" //1 OR,||
"|"
"(AND|&&)[[:space:]]*" // 2 AND,&& (ignored, default)
"|"
"(" //3
"([+-])?" //4 Force or exclude indicator
"(" //5
"([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre"
"[[:space:]]*"
"(:|=|>|<)" //7 Relation
"[[:space:]]*)?"
"(" //8
"(\"" //9
"([^\"]+)" //10 "A quoted term"
"\")"
"([bcCdDeflLoprsw.0-9]*)" //11 modifiers
"|"
"([^[:space:]\"]+)" //12 ANormalTerm
")"
")[[:space:]]*"
;
// For debugging the parser. But see also NMATCH
// There must be exactly one entry per sub-expression of parserExpr, plus
// entry 0 for the whole match, because the array size defines NMATCH.
// Sub-expressions which are only used for grouping have an empty name.
static const char *matchNames[] = {
/* 0*/ "",
/* 1*/ "OR",
/* 2*/ "AND",
/* 3*/ "",
/* 4*/ "+-",
/* 5*/ "",
/* 6*/ "FIELD",
/* 7*/ "RELATION",
/* 8*/ "",
/* 9*/ "",
/*10*/ "QUOTEDTERM",
/*11*/ "MODIFIERS",
/*12*/ "TERM",
};
// Number of match slots: size of the regmatch_t array and value passed to
// regexec()
#define NMATCH (sizeof(matchNames) / sizeof(char *))
// Symbolic names for the interesting submatch indices
enum SbMatchIdx {SMI_OR=1, SMI_AND=2, SMI_PM=4, SMI_FIELD=6, SMI_REL=7,
SMI_QUOTED=10, SMI_MODIF=11, SMI_TERM=12};
// Size (without the final 0) of the buffers which receive submatch copies
static const int maxmatchlen = 1024;
// Size (without the final 0) of the buffer used for regerror() messages
static const int errbuflen = 300;
// Private implementation of StringToWasaQuery: holds the compiled regular
// expression and the state of the current match.
class StringToWasaQuery::Internal {
public:
    Internal()
        : m_cp(0), m_rxneedsfree(false)
    {}
    ~Internal()
    {
        if (m_rxneedsfree)
            regfree(&m_rx);
    }

    // Copy submatch i of the last regexec() into match, as a
    // 0-terminated string.
    // @param i      submatch index (see SbMatchIdx).
    // @param match  output buffer, must have room for maxmatchlen+1 bytes.
    // @param reason receives an explanation for some failures.
    // @return false if the index is bad, if the sub-expression did not
    //   take part in the match, or if the matched text is empty or does
    //   not fit in the buffer. match is not modified in these cases.
    bool checkSubMatch(int i, char *match, string& reason)
    {
        if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) {
            //DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n",
            //i, m_pmatch[i].rm_so));
            return false;
        }
        const regoff_t len = m_pmatch[i].rm_eo - m_pmatch[i].rm_so;
        if (len <= 0) {
            // weird and fatal
            reason = "Internal regular expression handling error";
            return false;
        }
        if (len > maxmatchlen) {
            // The text comes straight from user input: never write past
            // the end of the caller's buffer.
            reason = "Query element too long";
            return false;
        }
        //DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so,
        //m_pmatch[i].rm_eo));
        memcpy(match, m_cp + m_pmatch[i].rm_so, len);
        match[len] = 0;
        return true;
    }

    WasaQuery *stringToQuery(const string& str, string& reason);

    friend class StringToWasaQuery;
private:
    // Current position inside the string being parsed. The submatch
    // offsets in m_pmatch are relative to it.
    const char *m_cp;
    regex_t m_rx;
    // True while m_rx holds a compiled expression
    bool m_rxneedsfree;
    // Submatch offsets from the last regexec()
    regmatch_t m_pmatch[NMATCH];
};
// Allocate the private implementation object (released by the destructor).
StringToWasaQuery::StringToWasaQuery()
    : internal(new Internal())
{}
// Release the private implementation object.
StringToWasaQuery::~StringToWasaQuery()
{
    Internal *impl = internal;
    delete impl;
}
// Parse a query string: forwards to the implementation object.
// @return the query, or 0 if there is no implementation object or if the
//   implementation returned 0.
WasaQuery *
StringToWasaQuery::stringToQuery(const string& str, string& reason)
{
    WasaQuery *parsed = 0;
    if (internal != 0) {
        parsed = internal->stringToQuery(str, reason);
        DUMPQ(parsed);
    }
    return parsed;
}
// Release a partially built query after a parse error. The subqueries are
// deleted by the WasaQuery destructor. Always returns 0, so that the error
// paths can directly return the result.
static WasaQuery *discardParse(WasaQuery *query, WasaQuery *orChain)
{
    delete query;
    delete orChain;
    return 0;
}

// Translate a query string into a WasaQuery tree.
//
// The string is consumed by matching parserExpr repeatedly. The result is
// an AND query which has one subquery per clause, except that clauses
// separated by OR are grouped inside an OR subquery.
//
// @param str     the query text.
// @param reason  receives an error message on failure (it may also be set
//   by non-fatal sub-expression checks).
// @return the query, which the caller owns, or 0 on error (regular
//   expression compilation or matching failure, consecutive ORs, query
//   element longer than maxmatchlen). Nothing is leaked on error.
WasaQuery *
StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
{
    if (m_rxneedsfree) {
        regfree(&m_rx);
        // Reset right away: if the compilation below fails, the flag must
        // not trigger a second regfree() later.
        m_rxneedsfree = false;
    }

    char errbuf[errbuflen+1];
    int errcode;
    if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) {
        regerror(errcode, &m_rx, errbuf, errbuflen);
        reason = errbuf;
        return 0;
    }
    m_rxneedsfree = true;

    const char *cpe;
    m_cp = str.c_str();
    cpe = str.c_str() + str.length();

    // new throws on allocation failure, there is no need to test the
    // pointers which it returns.
    WasaQuery *query = new WasaQuery;
    query->m_op = WasaQuery::OP_AND;
    WasaQuery *orChain = 0;
    bool prev_or = false;

    // Loop on repeated regexp matches on the main string.
    for (;;) {
        if ((errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0))) {
            regerror(errcode, &m_rx, errbuf, errbuflen);
            reason = errbuf;
            return discardParse(query, orChain);
        }
        if (m_pmatch[0].rm_eo <= 0) {
            // weird and fatal
            reason = "Internal regular expression handling error";
            return discardParse(query, orChain);
        }

        // The submatches which get copied to the fixed size buffer are
        // exactly the named ones. Refuse elements which would not fit,
        // instead of writing past the end of the buffer.
        for (unsigned int i = 0; i < NMATCH; i++) {
            if (matchNames[i][0] == 0 || m_pmatch[i].rm_so == -1)
                continue;
            if (m_pmatch[i].rm_eo - m_pmatch[i].rm_so > maxmatchlen) {
                reason = "Query element too long";
                return discardParse(query, orChain);
            }
        }

#ifdef DEB_WASASTRINGTOQ
        DPRINT((stderr, "Next part:\n"));
        for (unsigned int i = 0; i < NMATCH; i++) {
            // Only the named submatches are printed (and length-checked)
            if (m_pmatch[i].rm_so == -1 || matchNames[i][0] == 0)
                continue;
            char match[maxmatchlen+1];
            memcpy(match, m_cp + m_pmatch[i].rm_so,
                   m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
            match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
            DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
                    (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
        }
#endif

        char match[maxmatchlen+1];
        if (checkSubMatch(SMI_OR, match, reason)) {
            if (prev_or) {
                // Bad syntax
                reason = "Bad syntax: consecutive OR";
                return discardParse(query, orChain);
            }

            if (orChain == 0) {
                // First OR seen: start OR subclause.
                orChain = new WasaQuery();
                orChain->m_op = WasaQuery::OP_OR;
            }

            // For the first OR, we need to transfer the previous
            // query from the main vector to the OR subquery
            if (orChain->m_subs.empty() && !query->m_subs.empty()) {
                orChain->m_subs.push_back(query->m_subs.back());
                query->m_subs.pop_back();
            }
            prev_or = true;

        } else if (checkSubMatch(SMI_AND, match, reason)) {
            // Do nothing, AND is the default. We might want to check for
            // errors like consecutive ANDs, or OR AND

        } else {
            WasaQuery *nclause = new WasaQuery;

            // Check for quoted or unquoted value
            unsigned int mods = 0;
            if (checkSubMatch(SMI_QUOTED, match, reason)) {
                nclause->m_value = match;
            } else if (checkSubMatch(SMI_TERM, match, reason)) {
                nclause->m_value = match;
            }

            if (nclause->m_value.empty()) {
                // Isolated +- or fieldname: without a value. Ignore until
                // told otherwise.
                DPRINT((stderr, "Clause with empty value, skipping\n"));
                delete nclause;
                goto nextfield;
            }

            if (checkSubMatch(SMI_MODIF, match, reason)) {
                DPRINT((stderr, "Got modifiers: [%s]\n", match));
                for (unsigned int i = 0; i < strlen(match); i++) {
                    switch (match[i]) {
                    case 'b':
                        mods |= WasaQuery::WQM_BOOST;
                        nclause->m_weight = 10.0;
                        break;
                    case 'c': break;
                    case 'C': mods |= WasaQuery::WQM_CASESENS; break;
                    case 'd': break;
                    case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
                    case 'e': mods |= WasaQuery::WQM_CASESENS |
                            WasaQuery::WQM_DIACSENS |
                            WasaQuery::WQM_NOSTEM;
                        break;
                    case 'f': mods |= WasaQuery::WQM_FUZZY; break;
                    case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
                    case 'L': break;
                    case 'o':
                        mods |= WasaQuery::WQM_PHRASESLACK;
                        // Default slack if specified only by 'o' is 10.
                        nclause->m_slack = 10;
                        if (i < strlen(match) - 1) {
                            // An explicit slack value may follow the 'o'.
                            // Skip the digits which were used.
                            char *endptr;
                            int slack = strtol(match+i+1, &endptr, 10);
                            if (endptr != match+i+1) {
                                i += endptr - (match+i+1);
                                nclause->m_slack = slack;
                            }
                        }
                        break;
                    case 'p':
                        mods |= WasaQuery::WQM_PROX;
                        nclause->m_slack = 10;
                        break;
                    case 'r': mods |= WasaQuery::WQM_REGEX; break;
                    case 's': mods |= WasaQuery::WQM_SLOPPY; break;
                    case 'w': mods |= WasaQuery::WQM_WORDS; break;
                    case '.':case '0':case '1':case '2':case '3':case '4':
                    case '5':case '6':case '7':case '8':case '9':
                    {
                        // Weight factor. n and factor are only set by
                        // sscanf() if the conversion succeeds (ie: not for
                        // an isolated '.'), so give them defined values
                        // and only use the factor on success.
                        int n = 0;
                        float factor = 0.0;
                        if (sscanf(match+i, "%f %n", &factor, &n) == 1) {
                            nclause->m_weight = factor;
                            DPRINT((stderr, "Got factor %.2f len %d\n",
                                    factor, n));
                        }
                        // Skip the characters used by the number
                        if (n > 0)
                            i += n-1;
                    }
                    break;
                    }
                }
            }
            nclause->m_modifiers = WasaQuery::Modifier(mods);

            // Field indicator ?
            if (checkSubMatch(SMI_FIELD, match, reason)) {
                // We used Check for special fields indicating sorting
                // etc. here but this went away from the spec. See 1.4
                // if it comes back
                nclause->m_fieldspec = match;
                if (checkSubMatch(SMI_REL, match, reason)) {
                    // The tests on match[1] only matter if the expression
                    // accepts two-character relations (<=, >=).
                    switch (match[0]) {
                    case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break;
                    case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break;
                    case '<':
                        if (match[1] == '=')
                            nclause->m_rel = WasaQuery::REL_LTE;
                        else
                            nclause->m_rel = WasaQuery::REL_LT;
                        break;
                    case '>':
                        if (match[1] == '=')
                            nclause->m_rel = WasaQuery::REL_GTE;
                        else
                            nclause->m_rel = WasaQuery::REL_GT;
                        break;
                    default:
                        nclause->m_rel = WasaQuery::REL_CONTAINS;
                    }
                } else {
                    // ?? If field matched we should have a relation
                    nclause->m_rel = WasaQuery::REL_CONTAINS;
                }
            }

            // +- indicator ?
            if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') {
                nclause->m_op = WasaQuery::OP_EXCL;
            } else {
                nclause->m_op = WasaQuery::OP_LEAF;
            }

            if (prev_or) {
                // The precedent token was an OR, add new clause to or chain
                //DPRINT((stderr, "Adding to OR chain\n"));
                orChain->m_subs.push_back(nclause);
            } else {
                if (orChain) {
                    // Getting out of OR. Add the OR subquery to the main one
                    //DPRINT((stderr, "Adding OR chain to main\n"));
                    query->m_subs.push_back(orChain);
                    orChain = 0;
                }
                //DPRINT((stderr, "Adding to main chain\n"));
                // Add new clause to main query
                query->m_subs.push_back(nclause);
            }

            prev_or = false;
        }

    nextfield:
        // Advance current string position. We checked earlier that
        // the increment is strictly positive, so we won't loop
        // forever
        m_cp += m_pmatch[0].rm_eo;
        if (m_cp >= cpe)
            break;
    }

    if (orChain) {
        // Getting out of OR. Add the OR subquery to the main one
        DPRINT((stderr, "Adding OR chain to main.Before: \n"));
        DUMPQ(query);
        DUMPQ(orChain);
        query->m_subs.push_back(orChain);
    }

    regfree(&m_rx);
    m_rxneedsfree = false;
    return query;
}
#else // TEST
#include <stdio.h>
#include "wasastringtoquery.h"
static char *thisprog;
int main(int argc, char **argv)
{
thisprog = argv[0];
argc--; argv++;
if (argc != 1) {
fprintf(stderr, "need one arg\n");
exit(1);
}
const string str = *argv++;argc--;
string reason;
StringToWasaQuery qparser;
WasaQuery *q = qparser.stringToQuery(str, reason);
if (q == 0) {
fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str());
exit(1);
}
string desc;
q->describe(desc);
fprintf(stderr, "Finally: %s\n", desc.c_str());
exit(0);
}
#endif // TEST_WASASTRINGTOQUERY

View file

@ -0,0 +1,108 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
#define _WASASTRINGTOQUERY_H_INCLUDED_
#include <string>
#include <vector>
using std::string;
using std::vector;
/* Note: Xesam used to be named wasabi. We changed the references to wasabi in
the comments, but not the code */
/**
* A simple class to represent a parsed Xesam user language element.
* Can hold one leaf element or an array of subqueries to be joined by AND/OR
*
* The complete query is represented by a top WasaQuery holding a
* chain of ANDed subclauses. Some of the subclauses may be themselves
* OR'ed lists (it doesn't go deeper). Entries in the AND list may be
* negated (AND NOT).
*
* For LEAF elements, the value can hold one or several words. In the
* latter case, it should be interpreted as a phrase (comes from a
* user-entered "quoted string"), except if the modifier flags say otherwise.
*
* Some fields only make sense either for compound or LEAF queries. This
* is commented for each. We should subclass really.
*
* Note that wasaStringToQuery supposedly parses the whole Xesam
* User Search Language v 0.95, but that some elements are dropped or
* ignored during the translation to a native Recoll query in wasaToRcl
*/
class WasaQuery {
public:
    /** Type of this element: leaf or AND/OR chain */
    enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND};

    /** Relation to be searched between field and value. Recoll actually only
        supports "contain" except for a size field */
    enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE,
              REL_GT, REL_GTE};

    /** Modifiers for term handling: case/diacritics handling,
        stemming control. Values are distinct bits so that they can be
        OR'ed together in m_modifiers */
    enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
                   WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
                   WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
                   WQM_FUZZY = 0x200};

    typedef vector<WasaQuery*> subqlist_t;

    /** Build an empty element: no operation, no relation, no modifiers,
        zero slack and a weight of 1.0. m_rel is explicitly set to REL_NULL
        so that it never holds an indeterminate value. */
    WasaQuery()
        : m_op(OP_NULL), m_rel(REL_NULL), m_modifiers(0), m_slack(0),
          m_weight(1.0)
    {}

    ~WasaQuery();

    /** Get string describing the query tree from this point */
    void describe(string &desc) const;

    /** Op to be performed on either value (may be LEAF or EXCL, or subqs */
    WasaQuery::Op m_op;

    /** Field specification if any (ie: title, author ...) Only OP_LEAF */
    string m_fieldspec;
    /** Relation between field and value: =, :, <,>,<=, >= */
    WasaQuery::Rel m_rel;

    /* String value. Valid for op == OP_LEAF or EXCL */
    string m_value;

    /** Subqueries. Valid for conjunctions */
    vector<WasaQuery*> m_subs;

    /** Bitwise OR of Modifier values */
    unsigned int m_modifiers;
    /** Slack value associated with the phrase/proximity modifiers */
    int m_slack;
    /** Weight of this element, 1.0 unless changed by a modifier */
    float m_weight;
};
/**
* Wasabi query string parser class. Could be a simple function
* really, but there might be some parser initialization work done in
* the constructor.
*
* Typical use: construct a parser, call stringToQuery() and compare the
* result against 0 to detect failure.
*/
class StringToWasaQuery {
public:
StringToWasaQuery();
~StringToWasaQuery();
/**
* Parse a user query string into a WasaQuery tree.
* @param str the query text to parse.
* @param reason receives an explanatory message when parsing fails.
* @return the top element of the parsed tree, or 0 on failure.
* NOTE(review): the returned tree looks like it is owned by the caller;
* confirm against the implementation.
*/
WasaQuery *stringToQuery(const string& str, string& reason);
/** Implementation class, only forward-declared in this header. */
class Internal;
private:
/** Pointer to the implementation object. */
Internal *internal;
};
#endif /* _WASASTRINGTOQUERY_H_INCLUDED_ */

293
src/query/wasatorcl.cpp Normal file
View file

@ -0,0 +1,293 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <cstdio>
#include <string>
#include <list>
#include <algorithm>
using std::string;
using std::list;
#include "rclconfig.h"
#include "wasastringtoquery.h"
#include "rcldb.h"
#include "searchdata.h"
#include "wasatorcl.h"
#include "debuglog.h"
#include "smallut.h"
#include "rclconfig.h"
#include "refcntr.h"
#include "textsplit.h"
static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
const string& autosuffs, string& reason)
{
if (wasa == 0) {
reason = "NULL query";
return 0;
}
if (wasa->m_op != WasaQuery::OP_AND && wasa->m_op != WasaQuery::OP_OR) {
reason = "Top query neither AND nor OR ?";
LOGERR(("wasaQueryToRcl: top query neither AND nor OR!\n"));
return 0;
}
Rcl::SearchData *sdata = new
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
Rcl::SCLT_OR);
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
"AND" : "OR"));
WasaQuery::subqlist_t::iterator it;
Rcl::SearchDataClause *nclause;
// Walk the list of clauses. Some pseudo-field types need special
// processing, which results in setting data in the top struct
// instead of adding a clause. We check for these first
for (it = wasa->m_subs.begin(); it != wasa->m_subs.end(); it++) {
if (!stringicmp("mime", (*it)->m_fieldspec) ||
!stringicmp("format", (*it)->m_fieldspec)) {
if ((*it)->m_op == WasaQuery::OP_LEAF) {
sdata->addFiletype((*it)->m_value);
} else if ((*it)->m_op == WasaQuery::OP_EXCL) {
sdata->remFiletype((*it)->m_value);
} else {
reason = "internal error: mime clause neither leaf not excl??";
return 0;
}
continue;
}
// Xesam uses "type", we also support "rclcat", for broad
// categories like "audio", "presentation", etc.
if (!stringicmp("rclcat", (*it)->m_fieldspec) ||
!stringicmp("type", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF &&
(*it)->m_op != WasaQuery::OP_EXCL) {
reason = "internal error: rclcat/type clause neither leaf"
"nor excl??";
return 0;
}
list<string> mtypes;
if (config && config->getMimeCatTypes((*it)->m_value, mtypes)
&& !mtypes.empty()) {
for (list<string>::iterator mit = mtypes.begin();
mit != mtypes.end(); mit++) {
if ((*it)->m_op == WasaQuery::OP_LEAF)
sdata->addFiletype(*mit);
else
sdata->remFiletype(*mit);
}
} else {
reason = "Unknown rclcat/type value: no mime types found";
return 0;
}
continue;
}
// Filtering on location
if (!stringicmp("dir", (*it)->m_fieldspec)) {
string dir = path_tildexpand((*it)->m_value);
sdata->setTopdir(dir, (*it)->m_op == WasaQuery::OP_EXCL,
(*it)->m_weight);
continue;
}
// Handle "date" spec
if (!stringicmp("date", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF) {
reason = "Negative date filtering not supported";
return 0;
}
DateInterval di;
if (!parsedateinterval((*it)->m_value, &di)) {
LOGERR(("wasaQueryToRcl: bad date interval format\n"));
reason = "Bad date interval format";
return 0;
}
LOGDEB(("wasaQueryToRcl:: date span: %d-%d-%d/%d-%d-%d\n",
di.y1,di.m1,di.d1, di.y2,di.m2,di.d2));
sdata->setDateSpan(&di);
continue;
}
// Handle "size" spec
if (!stringicmp("size", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF) {
reason = "Negative size filtering not supported";
return 0;
}
char *cp;
size_t size = strtoll((*it)->m_value.c_str(), &cp, 10);
if (*cp != 0) {
switch (*cp) {
case 'k': case 'K': size *= 1E3;break;
case 'm': case 'M': size *= 1E6;break;
case 'g': case 'G': size *= 1E9;break;
case 't': case 'T': size *= 1E12;break;
default:
reason = string("Bad multiplier suffix: ") + *cp;
return 0;
}
}
switch ((*it)->m_rel) {
case WasaQuery::REL_EQUALS:
sdata->setMaxSize(size);
sdata->setMinSize(size);
break;
case WasaQuery::REL_LT:
case WasaQuery::REL_LTE:
sdata->setMaxSize(size);
break;
case WasaQuery::REL_GT:
case WasaQuery::REL_GTE:
sdata->setMinSize(size);
break;
default:
reason = "Bad relation operator with size query. Use > < or =";
return 0;
}
continue;
}
// "Regular" processing follows:
switch ((*it)->m_op) {
case WasaQuery::OP_NULL:
case WasaQuery::OP_AND:
default:
reason = "Found bad NULL or AND query type in list";
LOGERR(("wasaQueryToRcl: found bad NULL or AND q type in list\n"));
continue;
case WasaQuery::OP_LEAF: {
LOGDEB(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
(*it)->m_slack));
// Change terms found in the "autosuffs" list into "ext"
// field queries
if ((*it)->m_fieldspec.empty() && !autosuffs.empty()) {
vector<string> asfv;
if (stringToStrings(autosuffs, asfv)) {
if (find_if(asfv.begin(), asfv.end(),
StringIcmpPred((*it)->m_value)) != asfv.end()) {
(*it)->m_fieldspec = "ext";
(*it)->m_modifiers |= WasaQuery::WQM_NOSTEM;
}
}
}
unsigned int mods = (unsigned int)(*it)->m_modifiers;
// I'm not sure I understand the phrase/near detection
// thereafter anymore, maybe it would be better to have an
// explicit flag. Mods can only be set after a double
// quote.
if (TextSplit::hasVisibleWhite((*it)->m_value) || mods) {
Rcl::SClType tp = Rcl::SCLT_PHRASE;
if (mods & WasaQuery::WQM_PROX) {
tp = Rcl::SCLT_NEAR;
}
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
(*it)->m_slack,
(*it)->m_fieldspec);
} else {
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
(*it)->m_value,
(*it)->m_fieldspec);
}
if (nclause == 0) {
reason = "Out of memory";
LOGERR(("wasaQueryToRcl: out of memory\n"));
return 0;
}
if (mods & WasaQuery::WQM_NOSTEM) {
nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
}
if ((*it)->m_weight != 1.0)
nclause->setWeight((*it)->m_weight);
sdata->addClause(nclause);
}
break;
case WasaQuery::OP_EXCL:
LOGDEB2(("wasaQueryToRcl: excl clause [%s]:[%s]\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
if (wasa->m_op != WasaQuery::OP_AND) {
LOGERR(("wasaQueryToRcl: negative clause inside OR list!\n"));
continue;
}
// Note: have to add dquotes which will be translated to
// phrase if there are several words in there. Not pretty
// but should work. If there is actually a single
// word, it will not be taken as a phrase, and
// stem-expansion will work normally
// Have to do this because searchdata has nothing like and_not
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
string("\"") +
(*it)->m_value + "\"",
(*it)->m_fieldspec);
if (nclause == 0) {
reason = "Out of memory";
LOGERR(("wasaQueryToRcl: out of memory\n"));
return 0;
}
if ((*it)->m_modifiers & WasaQuery::WQM_NOSTEM)
nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
if ((*it)->m_weight != 1.0)
nclause->setWeight((*it)->m_weight);
sdata->addClause(nclause);
break;
case WasaQuery::OP_OR:
LOGDEB2(("wasaQueryToRcl: OR clause [%s]:[%s]\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
// Create a subquery.
Rcl::SearchData *sub =
wasaQueryToRcl(config, *it, autosuffs, reason);
if (sub == 0) {
continue;
}
nclause =
new Rcl::SearchDataClauseSub(Rcl::SCLT_SUB,
RefCntr<Rcl::SearchData>(sub));
if (nclause == 0) {
LOGERR(("wasaQueryToRcl: out of memory\n"));
reason = "Out of memory";
return 0;
}
if ((*it)->m_modifiers & WasaQuery::WQM_NOSTEM)
nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
sdata->addClause(nclause);
}
}
return sdata;
}
// Parse a query language string and translate it into Recoll search data.
//  config    passed through to the translation (category expansion).
//  qs        the query text.
//  reason    receives an explanatory message on failure.
//  autosuffs list of terms to be turned into "ext" field queries.
// Returns the search data, or 0 if either parsing or translation failed.
Rcl::SearchData *wasaStringToRcl(RclConfig *config,
                                 const string &qs, string &reason,
                                 const string& autosuffs)
{
    StringToWasaQuery parser;
    WasaQuery *wq = parser.stringToQuery(qs, reason);
    if (wq == 0)
        return 0;

    Rcl::SearchData *sdata = wasaQueryToRcl(config, wq, autosuffs, reason);

    // The intermediate parse tree is not used past this point: release it
    // instead of leaking it on every query.
    // NOTE(review): assumes the caller owns the tree returned by
    // stringToQuery() and that the search data keeps its own copies of the
    // strings -- confirm against StringToWasaQuery and Rcl::SearchData.
    delete wq;
    return sdata;
}

31
src/query/wasatorcl.h Normal file
View file

@ -0,0 +1,31 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _WASATORCL_H_INCLUDED_
#define _WASATORCL_H_INCLUDED_
#include <string>
using std::string;
#include "rcldb.h"
#include "searchdata.h"
class RclConfig;
/**
* Parse a query language string and translate it into Recoll search data.
* @param query the query text.
* @param reason receives an explanatory message when the call fails.
* @param autosuffs optional list of terms which, when found in the query
*   without a field specification, are turned into "ext" field queries.
* @return search data allocated with new, or 0 on failure.
*/
extern Rcl::SearchData *wasaStringToRcl(RclConfig *,
const string& query, string &reason,
const string& autosuffs = string());
#endif /* _WASATORCL_H_INCLUDED_ */

319
src/query/xadump.cpp Normal file
View file

@ -0,0 +1,319 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <strings.h>
#include <iostream>
#include <string>
#include <vector>
#include "pathut.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */
#include "utf8iter.h"
#include "xapian.h"
// Name the program was started as, set from argv[0] by main().
static string thisprog;

// Option summary printed by Usage().
static string usage =
    " -d <dbdir> -e <output encoding>\n"
    " -i docid -D : get document data for docid\n"
    " -i docid -X : delete document docid\n"
    " -i docid -b : 'rebuild' document from term positions\n"
    " -i docid -T : term list for doc docid\n"
    " -t term -E : term existence test\n"
    " -t term -F : retrieve term frequency data for given term\n"
    " -t term -P : retrieve postings for term\n"
    " -T : list all terms\n"
    " -f : precede each term in the list with its occurrence counts\n"
    " -n : raw data (no [])\n"
    " -l : don't list prefixed terms\n"
    " -x : separate each output char with a space\n"
    " -s : special mode to dump recoll stem db\n"
    " -q term [term ...] : perform AND query\n"
    " \n\n"
    ;

// Write the program name and the option summary to the error stream, then
// terminate the process with status 1.
static void
Usage(void)
{
    cerr << thisprog << ": usage:\n";
    cerr << usage;
    exit(1);
}
// Set of command line options seen so far: one OPT_* bit per option letter.
static int op_flags;
#define OPT_D (1 << 0)
#define OPT_E (1 << 1)
#define OPT_F (1 << 2)
#define OPT_P (1 << 3)
#define OPT_T (1 << 4)
#define OPT_X (1 << 5)
#define OPT_b (1 << 6)
#define OPT_d (1 << 7)
#define OPT_e (1 << 8)
#define OPT_f (1 << 9)
#define OPT_i (1 << 10)
#define OPT_n (1 << 11)
#define OPT_q (1 << 12)
#define OPT_s (1 << 13)
#define OPT_t (1 << 14)
#define OPT_x (1 << 15)
#define OPT_l (1 << 16)
// Return the string to be printed for a term. Without the -x option this is
// the input itself. With it, every character produced by the UTF-8 iterator
// is followed by a space, except the last one.
// (no character combining possible)
static string detailstring(const string& in)
{
    if ((op_flags & OPT_x) == 0) {
        return in;
    }

    string spaced;
    Utf8Iter walker(in);
    while (!walker.eof()) {
        walker.appendchartostring(spaced);
        spaced += ' ';
        walker++;
    }

    // Drop the separator added after the final character
    if (!spaced.empty()) {
        spaced.resize(spaced.size() - 1);
    }
    return spaced;
}
// Database handle opened by main() and released by cleanup().
Xapian::Database *db;

// Release the database handle. Safe to call more than once.
static void cleanup()
{
    delete db;
    // Reset the pointer: the signal handler calls cleanup() and then exit(),
    // which runs this function a second time because main() registers it
    // with atexit(). Without the reset the handle would be deleted twice.
    db = 0;
}
// Signal handler installed by main(): prints a trace on stderr, releases the
// database through cleanup() and terminates the process with status 1.
// NOTE(review): stdio, delete and exit() are not async-signal-safe; confirm
// this is acceptable for this diagnostic tool.
static void sigcleanup(int sig)
{
    fputs("sigcleanup\n", stderr);
    cleanup();
    exit(1);
}
int main(int argc, char **argv)
{
string dbdir = path_cat(path_home(), ".recoll/xapiandb");
string outencoding = "ISO8859-1";
int docid = 1;
string aterm;
thisprog = argv[0];
argc--; argv++;
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* Cas du "adb - core" */
Usage();
while (**argv)
switch (*(*argv)++) {
case 'b': op_flags |= OPT_b; break;
case 'D': op_flags |= OPT_D; break;
case 'd': op_flags |= OPT_d; if (argc < 2) Usage();
dbdir = *(++argv);
argc--;
goto b1;
case 'E': op_flags |= OPT_E; break;
case 'e': op_flags |= OPT_d; if (argc < 2) Usage();
outencoding = *(++argv);
argc--;
goto b1;
case 'F': op_flags |= OPT_F; break;
case 'f': op_flags |= OPT_f; break;
case 'i': op_flags |= OPT_i; if (argc < 2) Usage();
if (sscanf(*(++argv), "%d", &docid) != 1) Usage();
argc--;
goto b1;
case 'l': op_flags |= OPT_l; break;
case 'n': op_flags |= OPT_n; break;
case 'P': op_flags |= OPT_P; break;
case 'q': op_flags |= OPT_q; break;
case 's': op_flags |= OPT_s; break;
case 'T': op_flags |= OPT_T; break;
case 't': op_flags |= OPT_t; if (argc < 2) Usage();
aterm = *(++argv);
argc--;
goto b1;
case 'X': op_flags |= OPT_X; break;
case 'x': op_flags |= OPT_x; break;
default: Usage(); break;
}
b1: argc--; argv++;
}
vector<string> qterms;
if (op_flags & OPT_q) {
fprintf(stderr, "q argc %d\n", argc);
if (argc < 1)
Usage();
while (argc > 0) {
qterms.push_back(*argv++); argc--;
}
}
if (argc != 0)
Usage();
atexit(cleanup);
if (signal(SIGHUP, SIG_IGN) != SIG_IGN)
signal(SIGHUP, sigcleanup);
if (signal(SIGINT, SIG_IGN) != SIG_IGN)
signal(SIGINT, sigcleanup);
if (signal(SIGQUIT, SIG_IGN) != SIG_IGN)
signal(SIGQUIT, sigcleanup);
if (signal(SIGTERM, SIG_IGN) != SIG_IGN)
signal(SIGTERM, sigcleanup);
try {
db = new Xapian::Database(dbdir);
cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
if (op_flags & OPT_T) {
Xapian::TermIterator term;
string printable;
string op = (op_flags & OPT_n) ? string(): "[";
string cl = (op_flags & OPT_n) ? string(): "]";
if (op_flags & OPT_i) {
for (term = db->termlist_begin(docid);
term != db->termlist_end(docid);term++) {
const string& s = *term;
if ((op_flags&OPT_l) &&
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
continue;
cout << op << detailstring(s) << cl << endl;
}
} else {
for (term = db->allterms_begin();
term != db->allterms_end();term++) {
const string& s = *term;
if ((op_flags&OPT_l) &&
!s.empty() && s[0] >= 'A' && s[0] <= 'Z')
continue;
if (op_flags & OPT_f)
cout << db->get_collection_freq(*term) << " "
<< term.get_termfreq() << " ";
cout << op << detailstring(s) << cl << endl;
}
}
} else if (op_flags & OPT_s) {
for (unsigned int docid = 1;
docid < db->get_lastdocid(); docid++) {
// cout << docid << ": ";
Xapian::TermIterator term;
for (term = db->termlist_begin(docid);
term != db->termlist_end(docid);term++) {
cout << detailstring(*term) << " ";
Xapian::Document doc = db->get_document(docid);
string data = doc.get_data();
cout << data;
}
}
} else if (op_flags & OPT_D) {
Xapian::Document doc = db->get_document(docid);
string data = doc.get_data();
cout << data << endl;
} else if (op_flags & OPT_X) {
Xapian::Document doc = db->get_document(docid);
string data = doc.get_data();
cout << data << endl;
cout << "Really delete xapian document ?" << endl;
string rep;
cin >> rep;
if (!rep.empty() && (rep[0] == 'y' || rep[0] == 'Y')) {
Xapian::WritableDatabase wdb(dbdir, Xapian::DB_OPEN);
cout << "Deleting" << endl;
wdb.delete_document(docid);
}
} else if (op_flags & OPT_b) {
if (!(op_flags & OPT_i))
Usage();
vector<string> buf;
Xapian::TermIterator term;
for (term = db->termlist_begin(docid);
term != db->termlist_end(docid); term++) {
Xapian::PositionIterator pos;
for (pos = db->positionlist_begin(docid, *term);
pos != db->positionlist_end(docid, *term); pos++) {
if (buf.size() <= *pos)
buf.resize((*pos)+100);
buf[(*pos)] = detailstring(*term);
}
}
for (vector<string>::iterator it = buf.begin(); it != buf.end();
it++) {
cout << *it << " ";
}
} else if (op_flags & OPT_P) {
Xapian::PostingIterator doc;
for (doc = db->postlist_begin(aterm);
doc != db->postlist_end(aterm); doc++) {
cout << *doc << "(" << doc.get_wdf() << ") : " ;
Xapian::PositionIterator pos;
for (pos = doc.positionlist_begin();
pos != doc.positionlist_end(); pos++) {
cout << *pos << " " ;
}
cout << endl;
}
} else if (op_flags & OPT_F) {
cout << "FreqFor " << aterm << " : " <<
db->get_termfreq(aterm) << endl;
} else if (op_flags & OPT_E) {
cout << "Exists [" << aterm << "] : " <<
db->term_exists(aterm) << endl;
} else if (op_flags & OPT_q) {
Xapian::Enquire enquire(*db);
Xapian::Query query(Xapian::Query::OP_AND, qterms.begin(),
qterms.end());
cout << "Performing query `" <<
query.get_description() << "'" << endl;
enquire.set_query(query);
Xapian::MSet matches = enquire.get_mset(0, 10);
cout << "Estimated results: " <<
matches.get_matches_lower_bound() << endl;
Xapian::MSetIterator i;
for (i = matches.begin(); i != matches.end(); ++i) {
cout << "Document ID " << *i << "\t";
cout << i.get_percent() << "% ";
Xapian::Document doc = i.get_document();
cout << "[" << doc.get_data() << "]" << endl;
}
}
} catch (const Xapian::Error &e) {
cout << "Exception: " << e.get_msg() << endl;
} catch (const string &s) {
cout << "Exception: " << s << endl;
} catch (const char *s) {
cout << "Exception: " << s << endl;
} catch (...) {
cout << "Caught unknown exception" << endl;
}
exit(0);
}