removed list size truncation on filename expansion
This commit is contained in:
parent
8de0e2e8c9
commit
86515ce52a
5 changed files with 37 additions and 30 deletions
|
@ -38,6 +38,7 @@ src/doc/user/rcl.program.api.html
|
|||
src/doc/user/rcl.program.fields.html
|
||||
src/doc/user/rcl.program.html
|
||||
src/doc/user/rcl.search.anchorwild.html
|
||||
src/doc/user/rcl.search.casediac.html
|
||||
src/doc/user/rcl.search.commandline.html
|
||||
src/doc/user/rcl.search.complex.html
|
||||
src/doc/user/rcl.search.custom.html
|
||||
|
|
|
@ -1449,7 +1449,7 @@ bool Db::filenameWildExp(const string& fnexp, vector<string>& names)
|
|||
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
TermMatchResult result;
|
||||
if (!termMatch(ET_WILD, string(), pattern, result, 1000,
|
||||
if (!termMatch(ET_WILD, string(), pattern, result, -1,
|
||||
unsplitFilenameFieldName))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
||||
|
@ -1506,7 +1506,7 @@ public:
|
|||
};
|
||||
|
||||
bool Db::stemExpand(const string &langs, const string &term,
|
||||
TermMatchResult& result, int max)
|
||||
TermMatchResult& result)
|
||||
{
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
return false;
|
||||
|
@ -1518,7 +1518,9 @@ bool Db::stemExpand(const string &langs, const string &term,
|
|||
return true;
|
||||
}
|
||||
|
||||
/** Add prefix to all strings in list */
|
||||
/** Add prefix to all strings in list.
|
||||
* @param prefix already wrapped prefix
|
||||
*/
|
||||
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
|
||||
{
|
||||
if (prefix.empty())
|
||||
|
@ -1579,7 +1581,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
res.prefix = prefix;
|
||||
|
||||
if (typ == ET_STEM) {
|
||||
if (!stemExpand(lang, root, res, max))
|
||||
if (!stemExpand(lang, root, res))
|
||||
return false;
|
||||
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
||||
it != res.entries.end(); it++) {
|
||||
|
@ -1623,7 +1625,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
Xapian::TermIterator it = xdb.allterms_begin();
|
||||
if (!is.empty())
|
||||
it.skip_to(is.c_str());
|
||||
for (int n = 0; it != xdb.allterms_end(); it++) {
|
||||
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
||||
// If we're beyond the terms matching the initial
|
||||
// string, end
|
||||
if (!is.empty() && (*it).find(is) != 0)
|
||||
|
@ -1645,7 +1647,14 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
res.entries.push_back(TermMatchEntry(*it,
|
||||
xdb.get_collection_freq(*it),
|
||||
it.get_termfreq()));
|
||||
++n;
|
||||
|
||||
// The problem with truncating here is that this is done
|
||||
// alphabetically and we may not keep the most frequent
|
||||
// terms. OTOH, not doing it may stall the program if
|
||||
// we are walking the whole term list. We compromise
|
||||
// by cutting at 2*max
|
||||
if (max > 0 && ++rcnt >= 2*max)
|
||||
break;
|
||||
}
|
||||
m_reason.erase();
|
||||
break;
|
||||
|
@ -1676,6 +1685,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||
TermMatchCmpByWcf wcmp;
|
||||
sort(res.entries.begin(), res.entries.end(), wcmp);
|
||||
if (max > 0) {
|
||||
// Would need a small max and big stem expansion...
|
||||
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
||||
}
|
||||
return true;
|
||||
|
|
|
@ -406,7 +406,7 @@ private:
|
|||
// Reinitialize when adding/removing additional dbs
|
||||
bool adjustdbs();
|
||||
bool stemExpand(const string &lang, const string &s,
|
||||
TermMatchResult& result, int max = -1);
|
||||
TermMatchResult& result);
|
||||
|
||||
// Flush when idxflushmb is reached
|
||||
bool maybeflush(off_t moretext);
|
||||
|
|
|
@ -19,11 +19,6 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::vector;
|
||||
#endif
|
||||
|
||||
#include "refcntr.h"
|
||||
#include "searchdata.h"
|
||||
|
||||
|
@ -43,18 +38,18 @@ enum abstract_result {
|
|||
// Snippet entry for makeDocAbstract
|
||||
class Snippet {
|
||||
public:
|
||||
Snippet(int page, const string& snip)
|
||||
Snippet(int page, const std::string& snip)
|
||||
: page(page), snippet(snip)
|
||||
{
|
||||
}
|
||||
Snippet& setTerm(const string& trm)
|
||||
Snippet& setTerm(const std::string& trm)
|
||||
{
|
||||
term = trm;
|
||||
return *this;
|
||||
}
|
||||
int page;
|
||||
string term;
|
||||
string snippet;
|
||||
std::string term;
|
||||
std::string snippet;
|
||||
};
|
||||
|
||||
|
||||
|
@ -71,11 +66,11 @@ class Query {
|
|||
~Query();
|
||||
|
||||
/** Get explanation about last error */
|
||||
string getReason() const;
|
||||
std::string getReason() const;
|
||||
|
||||
/** Choose sort order. Must be called before setQuery */
|
||||
void setSortBy(const string& fld, bool ascending = true);
|
||||
const string& getSortBy() const {return m_sortField;}
|
||||
void setSortBy(const std::string& fld, bool ascending = true);
|
||||
const std::string& getSortBy() const {return m_sortField;}
|
||||
bool getSortAscending() const {return m_sortAscending;}
|
||||
|
||||
/** Return or filter results with identical content checksum */
|
||||
|
@ -94,26 +89,26 @@ class Query {
|
|||
bool getDoc(int i, Doc &doc);
|
||||
|
||||
/** Get possibly expanded list of query terms */
|
||||
bool getQueryTerms(vector<string>& terms);
|
||||
bool getQueryTerms(std::vector<std::string>& terms);
|
||||
|
||||
/** Return a list of terms which matched for a specific result document */
|
||||
bool getMatchTerms(const Doc& doc, vector<string>& terms);
|
||||
bool getMatchTerms(unsigned long xdocid, vector<string>& terms);
|
||||
bool getMatchTerms(const Doc& doc, std::vector<std::string>& terms);
|
||||
bool getMatchTerms(unsigned long xdocid, std::vector<std::string>& terms);
|
||||
|
||||
/** Build synthetic abstract for document, extracting chunks relevant for
|
||||
* the input query. This uses index data only (no access to the file) */
|
||||
// Abstract return as one string
|
||||
bool makeDocAbstract(Doc &doc, string& abstract);
|
||||
bool makeDocAbstract(Doc &doc, std::string& abstract);
|
||||
// Returned as a snippets vector
|
||||
bool makeDocAbstract(Doc &doc, vector<string>& abstract);
|
||||
bool makeDocAbstract(Doc &doc, std::vector<std::string>& abstract);
|
||||
// Returned as a vector of pair<page,snippet> page is 0 if unknown
|
||||
abstract_result makeDocAbstract(Doc &doc, vector<Snippet>& abst,
|
||||
abstract_result makeDocAbstract(Doc &doc, std::vector<Snippet>& abst,
|
||||
int maxoccs= -1, int ctxwords = -1);
|
||||
/** Retrieve detected page breaks positions */
|
||||
int getFirstMatchPage(Doc &doc, std::string& term);
|
||||
|
||||
/** Expand query to look for documents like the one passed in */
|
||||
vector<string> expand(const Doc &doc);
|
||||
std::vector<std::string> expand(const Doc &doc);
|
||||
|
||||
/** Return the Db we're set for */
|
||||
Db *whatDb();
|
||||
|
@ -123,10 +118,10 @@ class Query {
|
|||
Native *m_nq;
|
||||
|
||||
private:
|
||||
string m_reason; // Error explanation
|
||||
std::string m_reason; // Error explanation
|
||||
Db *m_db;
|
||||
void *m_sorter;
|
||||
string m_sortField;
|
||||
std::string m_sortField;
|
||||
bool m_sortAscending;
|
||||
bool m_collapseDuplicates;
|
||||
int m_resCnt;
|
||||
|
|
|
@ -1095,8 +1095,9 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||
bool useNear
|
||||
)
|
||||
{
|
||||
LOGDEB(("StringToXapianQ:: qstr [%s] mods 0x%x slack %d near %d\n",
|
||||
iq.c_str(), mods, slack, useNear));
|
||||
LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
|
||||
"slack %d near %d\n",
|
||||
iq.c_str(), m_field.c_str(), mods, slack, useNear));
|
||||
ermsg.erase();
|
||||
|
||||
const StopList stops = m_db.getStopList();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue