define new searchdataclausepath to replace the old dir: filtering mechanism. ORing dirs now works

This commit is contained in:
Jean-Francois Dockes 2013-01-05 16:21:30 +01:00
parent e219fa016d
commit 3e6a9971c3
10 changed files with 200 additions and 164 deletions

View file

@ -39,7 +39,7 @@ bool subtreelist(RclConfig *config, const string& top,
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, cstr_null);
RefCntr<Rcl::SearchData> rq(sd);
rq->addDirSpec(top);
sd->addClause(new Rcl::SearchDataClausePath(top, false));
Rcl::Query query(&rcldb);
query.setQuery(rq);

View file

@ -384,9 +384,9 @@ void AdvSearch::runSearch()
string cat;
if ((qit = cat_rtranslations.find(qcat)) !=
cat_rtranslations.end()) {
cat = (const char *)qit->second.toUtf8();
cat = qs2utf8s(qit->second);
} else {
cat = (const char *)qcat.toUtf8();
cat = qs2utf8s(qcat);
}
vector<string> types;
theconfig->getMimeCatTypes(cat, types);
@ -395,8 +395,7 @@ void AdvSearch::runSearch()
sdata->addFiletype(*it);
}
} else {
sdata->addFiletype((const char *)
yesFiltypsLB->item(i)->text().toUtf8());
sdata->addFiletype(qs2utf8s(yesFiltypsLB->item(i)->text()));
}
}
}
@ -422,8 +421,9 @@ void AdvSearch::runSearch()
if (!subtreeCMB->currentText().isEmpty()) {
QString current = subtreeCMB->currentText();
sdata->addDirSpec((const char*)subtreeCMB->currentText().toUtf8(),
direxclCB->isChecked());
sdata->addClause(new Rcl::SearchDataClausePath(
(const char*)current.toLocal8Bit(),
direxclCB->isChecked()));
// Keep history clean and sorted. Maybe there would be a
// simpler way to do this
list<QString> entries;
@ -463,12 +463,25 @@ void AdvSearch::fromSearch(RefCntr<SearchData> sdata)
addClause();
}
subtreeCMB->setEditText("");
direxclCB->setChecked(0);
for (unsigned int i = 0; i < sdata->m_query.size(); i++) {
// Set fields from clause
if (sdata->m_query[i]->getTp() == SCLT_SUB) {
LOGERR(("AdvSearch::fromSearch: SUB clause found !\n"));
continue;
}
if (sdata->m_query[i]->getTp() == SCLT_PATH) {
SearchDataClausePath *cs =
dynamic_cast<SearchDataClausePath*>(sdata->m_query[i]);
// We can only use one such clause. There should be only one too
// if this is from saved search data.
QString qdir = QString::fromLocal8Bit(cs->gettext().c_str());
subtreeCMB->setEditText(qdir);
direxclCB->setChecked(cs->getexclude());
continue;
}
SearchDataClauseSimple *cs =
dynamic_cast<SearchDataClauseSimple*>(sdata->m_query[i]);
m_clauseWins[i]->setFromClause(cs);
@ -531,16 +544,6 @@ void AdvSearch::fromSearch(RefCntr<SearchData> sdata)
minSizeLE->setText("");
maxSizeLE->setText("");
}
if (!sdata->m_dirspecs.empty()) {
// Can only use one entry
QString qdir = QString::fromLocal8Bit(sdata->m_dirspecs[0].dir.c_str());
subtreeCMB->setEditText(qdir);
direxclCB->setChecked(sdata->m_dirspecs[0].exclude);
} else {
subtreeCMB->setEditText("");
direxclCB->setChecked(0);
}
}
void AdvSearch::slotHistoryNext()

View file

@ -29,8 +29,8 @@ using namespace Rcl;
class SDHXMLHandler : public QXmlDefaultHandler {
public:
SDHXMLHandler()
: slack(0)
{
resetTemps();
}
bool startElement(const QString & /* namespaceURI */,
const QString & /* localName */,
@ -165,11 +165,11 @@ bool SDHXMLHandler::endElement(const QString & /* namespaceURI */,
} else if (qName == "YD") {
string d;
base64_decode((const char*)currentText.trimmed().toAscii(), d);
sd->addDirSpec(d);
sd->addClause(new SearchDataClausePath(d));
} else if (qName == "ND") {
string d;
base64_decode((const char*)currentText.trimmed().toAscii(), d);
sd->addDirSpec(d, true);
sd->addClause(new SearchDataClausePath(d, true));
} else if (qName == "SD") {
// Closing current search descriptor. Finishing touches...
if (hasdates)

View file

@ -64,10 +64,11 @@ void WasaQuery::describe(string &desc) const
desc += "NULL";
break;
case OP_LEAF:
if (m_exclude)
desc += "NOT (";
desc += fieldspec + m_value;
break;
case OP_EXCL:
desc += string("NOT (" ) + fieldspec + m_value + ") ";
if (m_exclude)
desc += ")";
break;
case OP_OR:
case OP_AND:
@ -429,11 +430,12 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
}
}
nclause->m_op = WasaQuery::OP_LEAF;
// +- indicator ?
if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') {
nclause->m_op = WasaQuery::OP_EXCL;
nclause->m_exclude = true;
} else {
nclause->m_op = WasaQuery::OP_LEAF;
nclause->m_exclude = false;
}
if (prev_or) {

View file

@ -48,7 +48,7 @@ using std::vector;
class WasaQuery {
public:
/** Type of this element: leaf or AND/OR chain */
enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND};
enum Op {OP_NULL, OP_LEAF, OP_OR, OP_AND};
/** Relation to be searched between field and value. Recoll actually only
supports "contain" except for a size field */
enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE,
@ -63,7 +63,8 @@ public:
typedef vector<WasaQuery*> subqlist_t;
WasaQuery()
: m_op(OP_NULL), m_modifiers(0), m_slack(0), m_weight(1.0)
: m_op(OP_NULL), m_rel(REL_NULL), m_exclude(false),
m_modifiers(0), m_slack(0), m_weight(1.0)
{}
~WasaQuery();
@ -79,6 +80,9 @@ public:
/** Relation between field and value: =, :, <,>,<=, >= */
WasaQuery::Rel m_rel;
/* Negating flag */
bool m_exclude;
/* String value. Valid for op == OP_LEAF (negation is flagged by m_exclude) */
string m_value;

View file

@ -64,11 +64,13 @@ static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
if (!stringicmp("mime", (*it)->m_fieldspec) ||
!stringicmp("format", (*it)->m_fieldspec)) {
if ((*it)->m_op == WasaQuery::OP_LEAF) {
sdata->addFiletype((*it)->m_value);
} else if ((*it)->m_op == WasaQuery::OP_EXCL) {
sdata->remFiletype((*it)->m_value);
if ((*it)->m_exclude) {
sdata->remFiletype((*it)->m_value);
} else {
sdata->addFiletype((*it)->m_value);
}
} else {
reason = "internal error: mime clause neither leaf not excl??";
reason = "internal error: mime clause not leaf??";
return 0;
}
continue;
@ -78,10 +80,8 @@ static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
// categories like "audio", "presentation", etc.
if (!stringicmp("rclcat", (*it)->m_fieldspec) ||
!stringicmp("type", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF &&
(*it)->m_op != WasaQuery::OP_EXCL) {
reason = "internal error: rclcat/type clause neither leaf"
"nor excl??";
if ((*it)->m_op != WasaQuery::OP_LEAF) {
reason = "internal error: rclcat/type clause not leaf??";
return 0;
}
vector<string> mtypes;
@ -89,10 +89,11 @@ static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
&& !mtypes.empty()) {
for (vector<string>::iterator mit = mtypes.begin();
mit != mtypes.end(); mit++) {
if ((*it)->m_op == WasaQuery::OP_LEAF)
sdata->addFiletype(*mit);
else
if ((*it)->m_exclude) {
sdata->remFiletype(*mit);
} else {
sdata->addFiletype(*mit);
}
}
} else {
reason = "Unknown rclcat/type value: no mime types found";
@ -101,14 +102,6 @@ static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
continue;
}
// Filtering on location
if (!stringicmp("dir", (*it)->m_fieldspec)) {
string dir = path_tildexpand((*it)->m_value);
sdata->addDirSpec(dir, (*it)->m_op == WasaQuery::OP_EXCL,
(*it)->m_weight);
continue;
}
// Handle "date" spec
if (!stringicmp("date", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF) {
@ -181,9 +174,9 @@ static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
continue;
case WasaQuery::OP_LEAF: {
LOGDEB0(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
(*it)->m_slack));
LOGDEB0(("wasaQueryToRcl: leaf clause [%s:%s] slack %d excl %d\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
(*it)->m_slack, (*it)->m_exclude));
// Change terms found in the "autosuffs" list into "ext"
// field queries
@ -198,23 +191,45 @@ static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
}
}
// I'm not sure I understand the phrase/near detection
// thereafter anymore, maybe it would be better to have an
// explicit flag. Mods can only be set after a double
// quote.
if (TextSplit::hasVisibleWhite((*it)->m_value) || mods) {
Rcl::SClType tp = Rcl::SCLT_PHRASE;
if (mods & WasaQuery::WQM_PROX) {
tp = Rcl::SCLT_NEAR;
if (!stringicmp("dir", (*it)->m_fieldspec)) {
// dir filtering special case
nclause = new Rcl::SearchDataClausePath((*it)->m_value,
(*it)->m_exclude);
} else if ((*it)->m_exclude) {
if (wasa->m_op != WasaQuery::OP_AND) {
LOGERR(("wasaQueryToRcl: excl clause inside OR list!\n"));
continue;
}
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
(*it)->m_slack,
(*it)->m_fieldspec);
} else {
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
(*it)->m_value,
// Note: have to add dquotes which will be translated to
// phrase if there are several words in there. Not pretty
// but should work. If there is actually a single
// word, it will not be taken as a phrase, and
// stem-expansion will work normally
// Have to do this because searchdata has nothing like and_not
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
string("\"") +
(*it)->m_value + "\"",
(*it)->m_fieldspec);
} else {
// I'm not sure I understand the phrase/near detection
// thereafter anymore, maybe it would be better to have an
// explicit flag. Mods can only be set after a double
// quote.
if (TextSplit::hasVisibleWhite((*it)->m_value) || mods) {
Rcl::SClType tp = Rcl::SCLT_PHRASE;
if (mods & WasaQuery::WQM_PROX) {
tp = Rcl::SCLT_NEAR;
}
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
(*it)->m_slack,
(*it)->m_fieldspec);
} else {
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
(*it)->m_value,
(*it)->m_fieldspec);
}
}
if (nclause == 0) {
reason = "Out of memory";
LOGERR(("wasaQueryToRcl: out of memory\n"));
@ -223,31 +238,6 @@ static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
}
break;
case WasaQuery::OP_EXCL:
LOGDEB2(("wasaQueryToRcl: excl clause [%s]:[%s]\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
if (wasa->m_op != WasaQuery::OP_AND) {
LOGERR(("wasaQueryToRcl: negative clause inside OR list!\n"));
continue;
}
// Note: have to add dquotes which will be translated to
// phrase if there are several words in there. Not pretty
// but should work. If there is actually a single
// word, it will not be taken as a phrase, and
// stem-expansion will work normally
// Have to do this because searchdata has nothing like and_not
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
string("\"") +
(*it)->m_value + "\"",
(*it)->m_fieldspec);
if (nclause == 0) {
reason = "Out of memory";
LOGERR(("wasaQueryToRcl: out of memory\n"));
return 0;
}
break;
case WasaQuery::OP_OR:
LOGDEB2(("wasaQueryToRcl: OR clause [%s]:[%s]\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));

View file

@ -128,7 +128,7 @@ bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
// addClause())
Xapian::Query::op op;
if (tp == SCLT_AND) {
if ((*it)->m_tp == SCLT_EXCL) {
if ((*it)->m_tp == SCLT_EXCL || (*it)->getexclude()) {
op = Xapian::Query::OP_AND_NOT;
} else {
op = Xapian::Query::OP_AND;
@ -274,36 +274,6 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
}
// Add the directory filtering clauses. Each is a phrase of terms
// prefixed with the pathelt prefix XP
for (vector<DirSpec>::const_iterator dit = m_dirspecs.begin();
dit != m_dirspecs.end(); dit++) {
vector<string> vpath;
stringToTokens(dit->dir, vpath, "/");
vector<string> pvpath;
if (dit->dir[0] == '/')
pvpath.push_back(wrap_prefix(pathelt_prefix));
for (vector<string>::const_iterator pit = vpath.begin();
pit != vpath.end(); pit++){
pvpath.push_back(wrap_prefix(pathelt_prefix) + *pit);
}
Xapian::Query::op tdop;
if (dit->weight == 1.0) {
tdop = dit->exclude ?
Xapian::Query::OP_AND_NOT : Xapian::Query::OP_FILTER;
} else {
tdop = dit->exclude ?
Xapian::Query::OP_AND_NOT : Xapian::Query::OP_AND_MAYBE;
}
Xapian::Query tdq = Xapian::Query(Xapian::Query::OP_PHRASE,
pvpath.begin(), pvpath.end());
if (dit->weight != 1.0)
tdq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT,
tdq, dit->weight);
xq = Xapian::Query(tdop, xq, tdq);
}
*((Xapian::Query *)d) = xq;
return true;
}
@ -418,7 +388,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
// Add clause to current list. OR lists can't have EXCL or otherwise negated clauses.
bool SearchData::addClause(SearchDataClause* cl)
{
if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL)) {
if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL || cl->getexclude())) {
LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
return false;
@ -438,7 +408,6 @@ void SearchData::erase()
delete *it;
m_query.clear();
m_filetypes.clear();
m_dirspecs.clear();
m_description.erase();
m_reason.erase();
m_haveDates = false;
@ -1169,6 +1138,35 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
return true;
}
// Translate a dir: path filtering clause into a Xapian query.
// The path is split on '/' and turned into a phrase of path element terms,
// each one carrying the path element prefix. See comments in .h
//
// db is not used here.
// p  points to the Xapian::Query which receives the result. It is reset to
//    an empty query first, so it stays empty when we fail.
// Returns false for an empty path, else true.
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));

    Xapian::Query *qp = static_cast<Xapian::Query *>(p);
    *qp = Xapian::Query();

    if (m_text.empty()) {
        LOGERR(("SearchDataClausePath: empty path??\n"));
        return false;
    }

    // Individual path elements, separators removed
    vector<string> elements;
    stringToTokens(m_text, elements, "/");

    // Prefixed terms making up the phrase. An absolute path begins with a
    // bare prefix term (no element text after it).
    vector<string> terms;
    if (m_text[0] == '/') {
        terms.push_back(wrap_prefix(pathelt_prefix));
    }
    for (vector<string>::size_type i = 0; i < elements.size(); i++) {
        terms.push_back(wrap_prefix(pathelt_prefix) + elements[i]);
    }

    *qp = Xapian::Query(Xapian::Query::OP_PHRASE, terms.begin(), terms.end());

    // Only wrap the phrase in a weight scaling query for a non-default weight
    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}
// Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
{

View file

@ -41,7 +41,7 @@ namespace Rcl {
/** Search clause types */
enum SClType {
SCLT_AND,
SCLT_OR, SCLT_EXCL, SCLT_FILENAME, SCLT_PHRASE, SCLT_NEAR,
SCLT_OR, SCLT_EXCL, SCLT_FILENAME, SCLT_PHRASE, SCLT_NEAR, SCLT_PATH,
SCLT_SUB
};
@ -84,7 +84,7 @@ public:
commoninit();
}
SearchData()
: m_tp(SCLT_AND), m_stemlang("english")
: m_tp(SCLT_AND)
{
commoninit();
}
@ -118,12 +118,6 @@ public:
*/
bool maybeAddAutoPhrase(Rcl::Db &db, double threshold);
/** Set/get top subdirectory for filtering results */
void addDirSpec(const std::string& t, bool excl = false, float w = 1.0)
{
m_dirspecs.push_back(DirSpec(t, excl, w));
}
const std::string& getStemLang() {return m_stemlang;}
void setMinSize(size_t size) {m_minSize = size;}
@ -182,20 +176,6 @@ private:
// Excluded set of file types if not empty
std::vector<std::string> m_nfiletypes;
// Restrict to subtree or exclude one
class DirSpec {
public:
std::string dir;
bool exclude;
// For positive spec: affect weight instead of filter
float weight;
DirSpec(const std::string&d, bool x, float w)
: dir(d), exclude(x), weight(w)
{
}
};
std::vector<DirSpec> m_dirspecs;
bool m_haveDates;
DateInterval m_dates; // Restrict to date interval
size_t m_maxSize;
@ -240,7 +220,7 @@ public:
SearchDataClause(SClType tp)
: m_tp(tp), m_parentSearch(0), m_haveWildCards(0),
m_modifiers(SDCM_NONE), m_weight(1.0)
m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false)
{}
virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
@ -299,8 +279,12 @@ public:
{
m_weight = w;
}
friend class SearchData;
/** Tell whether this clause is a negated (excluding) one: returns m_exclude */
virtual bool getexclude() const { return m_exclude; }
friend class SearchData;
protected:
std::string m_reason;
SClType m_tp;
@ -308,6 +292,7 @@ protected:
bool m_haveWildCards;
Modifier m_modifiers;
float m_weight;
bool m_exclude;
private:
SearchDataClause(const SearchDataClause&)
{
@ -404,6 +389,54 @@ protected:
std::string m_text;
};
/**
 * Pathname filtering clause (dir: filter).
 *
 * This clause type is special for historical reasons:
 *  - Pathname filtering used to be performed as a post-processing step,
 *    using the url fields of the doc data records.
 *  - It was then done as special phrase searches on path elements prefixed
 *    with XP.
 * Up to that point, dir filtering data was stored as part of the SearchData
 * object, not in the SearchDataClause tree. Only one clause, then a list
 * of clauses were stored, and they were always ANDed together.
 *
 * In order to allow for OR searching, dir clauses are now stored in a
 * specific SearchDataClause. This is still special because the field has
 * non-standard phrase-like processing, reflected in index storage by
 * an empty element representing / (as "XP").
 *
 * A future version should use a standard phrase with an anchor to the
 * start if the path starts with /. As this implies an index format
 * change but is not important enough to warrant one, this has to wait for
 * the next format change.
 */
class SearchDataClausePath : public SearchDataClause {
public:
    /**
     * @param txt  the path to filter on
     * @param excl value for the clause's exclude flag (m_exclude)
     */
    SearchDataClausePath(const std::string& txt, bool excl = false)
        : SearchDataClause(SCLT_PATH), m_text(txt)
    {
        m_haveWildCards = false;
        m_exclude = excl;
    }

    virtual ~SearchDataClausePath() {}

    /** Empty implementation: nothing is added to the highlight data */
    virtual void getTerms(HighlightData&) const {}

    /** Translate the clause to a native query. Defined in the .cpp */
    virtual bool toNativeQuery(Rcl::Db &, void *);

    /** Return the path text this clause was built with */
    virtual const std::string& gettext() const { return m_text; }

protected:
    // The path, as given to the constructor
    std::string m_text;
};
/**
* A clause coming from a NEAR or PHRASE entry field. There is only one
* std::string group, and a specified distance, which applies to it.

View file

@ -15,7 +15,8 @@
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
// Handle translation from rcl's SearchData structures to Xapian Queries
// Handle translation from rcl's SearchData structures to XML. Used for
// complex search history storage in the GUI
#include "autoconfig.h"
@ -64,6 +65,20 @@ string SearchData::asXML()
LOGERR(("SearchData::asXML: can't do subclauses !\n"));
continue;
}
//if (c->getexclude())
// os << "<NEG/>" << endl;
if (c->getTp() == SCLT_PATH) {
// Keep these apart, for compat with the older history format
SearchDataClausePath *cl =
dynamic_cast<SearchDataClausePath*>(c);
if (cl->getexclude()) {
os << "<ND>" << base64_encode(cl->gettext()) << "</ND>" << endl;
} else {
os << "<YD>" << base64_encode(cl->gettext()) << "</YD>" << endl;
}
continue;
}
SearchDataClauseSimple *cl =
dynamic_cast<SearchDataClauseSimple*>(c);
os << "<C>" << endl;
@ -100,7 +115,6 @@ string SearchData::asXML()
}
}
if (m_minSize != size_t(-1)) {
os << "<MIS>" << m_minSize << "</MIS>" << endl;
}
@ -126,14 +140,6 @@ string SearchData::asXML()
os << "</IT>" << endl;
}
for (vector<DirSpec>::const_iterator dit = m_dirspecs.begin();
dit != m_dirspecs.end(); dit++) {
if (dit->exclude) {
os << "<ND>" << base64_encode(dit->dir) << "</ND>" << endl;
} else {
os << "<YD>" << base64_encode(dit->dir) << "</YD>" << endl;
}
}
os << "</SD>";
return os.str();
}

View file

@ -57,18 +57,18 @@ bool StemDb::stemExpand(const std::string& langs, const std::string& term,
}
#ifndef RCL_INDEX_STRIPCHARS
// Expand the unaccented stem
if (!o_index_stripchars) {
string unac;
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
if (term != unac) {
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
SynTermTransStem stemmer(*it);
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
*it, &stemmer);
(void)expander.synExpand(unac, result);
}
// Expand the unaccented stem, using the unaccented stem
// db. Because it's a different db, We need to do it even if
// the input has no accent (unac == term)
for (vector<string>::const_iterator it = llangs.begin();
it != llangs.end(); it++) {
SynTermTransStem stemmer(*it);
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
*it, &stemmer);
(void)expander.synExpand(unac, result);
}
}
#endif