removed list size truncation on filename expansion

This commit is contained in:
Jean-Francois Dockes 2012-10-05 09:19:42 +02:00
parent 8de0e2e8c9
commit 86515ce52a
5 changed files with 37 additions and 30 deletions

View file

@ -38,6 +38,7 @@ src/doc/user/rcl.program.api.html
src/doc/user/rcl.program.fields.html src/doc/user/rcl.program.fields.html
src/doc/user/rcl.program.html src/doc/user/rcl.program.html
src/doc/user/rcl.search.anchorwild.html src/doc/user/rcl.search.anchorwild.html
src/doc/user/rcl.search.casediac.html
src/doc/user/rcl.search.commandline.html src/doc/user/rcl.search.commandline.html
src/doc/user/rcl.search.complex.html src/doc/user/rcl.search.complex.html
src/doc/user/rcl.search.custom.html src/doc/user/rcl.search.custom.html

View file

@ -1449,7 +1449,7 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str())); LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
TermMatchResult result; TermMatchResult result;
if (!termMatch(ET_WILD, string(), pattern, result, 1000, if (!termMatch(ET_WILD, string(), pattern, result, -1,
unsplitFilenameFieldName)) unsplitFilenameFieldName))
return false; return false;
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin(); for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
@ -1506,7 +1506,7 @@ public:
}; };
bool Db::stemExpand(const string &langs, const string &term, bool Db::stemExpand(const string &langs, const string &term,
TermMatchResult& result, int max) TermMatchResult& result)
{ {
if (m_ndb == 0 || m_ndb->m_isopen == false) if (m_ndb == 0 || m_ndb->m_isopen == false)
return false; return false;
@ -1518,7 +1518,9 @@ bool Db::stemExpand(const string &langs, const string &term,
return true; return true;
} }
/** Add prefix to all strings in list */ /** Add prefix to all strings in list.
* @param prefix already wrapped prefix
*/
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix) static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
{ {
if (prefix.empty()) if (prefix.empty())
@ -1579,7 +1581,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
res.prefix = prefix; res.prefix = prefix;
if (typ == ET_STEM) { if (typ == ET_STEM) {
if (!stemExpand(lang, root, res, max)) if (!stemExpand(lang, root, res))
return false; return false;
for (vector<TermMatchEntry>::iterator it = res.entries.begin(); for (vector<TermMatchEntry>::iterator it = res.entries.begin();
it != res.entries.end(); it++) { it != res.entries.end(); it++) {
@ -1623,7 +1625,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
Xapian::TermIterator it = xdb.allterms_begin(); Xapian::TermIterator it = xdb.allterms_begin();
if (!is.empty()) if (!is.empty())
it.skip_to(is.c_str()); it.skip_to(is.c_str());
for (int n = 0; it != xdb.allterms_end(); it++) { for (int rcnt = 0; it != xdb.allterms_end(); it++) {
// If we're beyond the terms matching the initial // If we're beyond the terms matching the initial
// string, end // string, end
if (!is.empty() && (*it).find(is) != 0) if (!is.empty() && (*it).find(is) != 0)
@ -1645,7 +1647,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
res.entries.push_back(TermMatchEntry(*it, res.entries.push_back(TermMatchEntry(*it,
xdb.get_collection_freq(*it), xdb.get_collection_freq(*it),
it.get_termfreq())); it.get_termfreq()));
++n;
// The problem with truncating here is that this is done
// alphabetically and we may not keep the most frequent
// terms. OTOH, not doing it may stall the program if
// we are walking the whole term list. We compromise
// by cutting at 2*max
if (max > 0 && ++rcnt >= 2*max)
break;
} }
m_reason.erase(); m_reason.erase();
break; break;
@ -1676,6 +1685,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
TermMatchCmpByWcf wcmp; TermMatchCmpByWcf wcmp;
sort(res.entries.begin(), res.entries.end(), wcmp); sort(res.entries.begin(), res.entries.end(), wcmp);
if (max > 0) { if (max > 0) {
// Would need a small max and big stem expansion...
res.entries.resize(MIN(res.entries.size(), (unsigned int)max)); res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
} }
return true; return true;

View file

@ -406,7 +406,7 @@ private:
// Reinitialize when adding/removing additional dbs // Reinitialize when adding/removing additional dbs
bool adjustdbs(); bool adjustdbs();
bool stemExpand(const string &lang, const string &s, bool stemExpand(const string &lang, const string &s,
TermMatchResult& result, int max = -1); TermMatchResult& result);
// Flush when idxflushmb is reached // Flush when idxflushmb is reached
bool maybeflush(off_t moretext); bool maybeflush(off_t moretext);

View file

@ -19,11 +19,6 @@
#include <string> #include <string>
#include <vector> #include <vector>
#ifndef NO_NAMESPACES
using std::string;
using std::vector;
#endif
#include "refcntr.h" #include "refcntr.h"
#include "searchdata.h" #include "searchdata.h"
@ -43,18 +38,18 @@ enum abstract_result {
// Snippet entry for makeDocAbstract // Snippet entry for makeDocAbstract
class Snippet { class Snippet {
public: public:
Snippet(int page, const string& snip) Snippet(int page, const std::string& snip)
: page(page), snippet(snip) : page(page), snippet(snip)
{ {
} }
Snippet& setTerm(const string& trm) Snippet& setTerm(const std::string& trm)
{ {
term = trm; term = trm;
return *this; return *this;
} }
int page; int page;
string term; std::string term;
string snippet; std::string snippet;
}; };
@ -71,11 +66,11 @@ class Query {
~Query(); ~Query();
/** Get explanation about last error */ /** Get explanation about last error */
string getReason() const; std::string getReason() const;
/** Choose sort order. Must be called before setQuery */ /** Choose sort order. Must be called before setQuery */
void setSortBy(const string& fld, bool ascending = true); void setSortBy(const std::string& fld, bool ascending = true);
const string& getSortBy() const {return m_sortField;} const std::string& getSortBy() const {return m_sortField;}
bool getSortAscending() const {return m_sortAscending;} bool getSortAscending() const {return m_sortAscending;}
/** Return or filter results with identical content checksum */ /** Return or filter results with identical content checksum */
@ -94,26 +89,26 @@ class Query {
bool getDoc(int i, Doc &doc); bool getDoc(int i, Doc &doc);
/** Get possibly expanded list of query terms */ /** Get possibly expanded list of query terms */
bool getQueryTerms(vector<string>& terms); bool getQueryTerms(std::vector<std::string>& terms);
/** Return a list of terms which matched for a specific result document */ /** Return a list of terms which matched for a specific result document */
bool getMatchTerms(const Doc& doc, vector<string>& terms); bool getMatchTerms(const Doc& doc, std::vector<std::string>& terms);
bool getMatchTerms(unsigned long xdocid, vector<string>& terms); bool getMatchTerms(unsigned long xdocid, std::vector<std::string>& terms);
/** Build synthetic abstract for document, extracting chunks relevant for /** Build synthetic abstract for document, extracting chunks relevant for
* the input query. This uses index data only (no access to the file) */ * the input query. This uses index data only (no access to the file) */
// Abstract return as one string // Abstract return as one string
bool makeDocAbstract(Doc &doc, string& abstract); bool makeDocAbstract(Doc &doc, std::string& abstract);
// Returned as a snippets vector // Returned as a snippets vector
bool makeDocAbstract(Doc &doc, vector<string>& abstract); bool makeDocAbstract(Doc &doc, std::vector<std::string>& abstract);
// Returned as a vector of pair<page,snippet> page is 0 if unknown // Returned as a vector of pair<page,snippet> page is 0 if unknown
abstract_result makeDocAbstract(Doc &doc, vector<Snippet>& abst, abstract_result makeDocAbstract(Doc &doc, std::vector<Snippet>& abst,
int maxoccs= -1, int ctxwords = -1); int maxoccs= -1, int ctxwords = -1);
/** Retrieve detected page breaks positions */ /** Retrieve detected page breaks positions */
int getFirstMatchPage(Doc &doc, std::string& term); int getFirstMatchPage(Doc &doc, std::string& term);
/** Expand query to look for documents like the one passed in */ /** Expand query to look for documents like the one passed in */
vector<string> expand(const Doc &doc); std::vector<std::string> expand(const Doc &doc);
/** Return the Db we're set for */ /** Return the Db we're set for */
Db *whatDb(); Db *whatDb();
@ -123,10 +118,10 @@ class Query {
Native *m_nq; Native *m_nq;
private: private:
string m_reason; // Error explanation std::string m_reason; // Error explanation
Db *m_db; Db *m_db;
void *m_sorter; void *m_sorter;
string m_sortField; std::string m_sortField;
bool m_sortAscending; bool m_sortAscending;
bool m_collapseDuplicates; bool m_collapseDuplicates;
int m_resCnt; int m_resCnt;

View file

@ -1095,8 +1095,9 @@ bool StringToXapianQ::processUserString(const string &iq,
bool useNear bool useNear
) )
{ {
LOGDEB(("StringToXapianQ:: qstr [%s] mods 0x%x slack %d near %d\n", LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
iq.c_str(), mods, slack, useNear)); "slack %d near %d\n",
iq.c_str(), m_field.c_str(), mods, slack, useNear));
ermsg.erase(); ermsg.erase();
const StopList stops = m_db.getStopList(); const StopList stops = m_db.getStopList();