diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index b1a28b3f..690d1372 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -42,7 +42,7 @@ static const string cstr_ellipsis("..."); // This is used to mark positions overlapped by a multi-word match term static const string occupiedmarker("?"); -#define DEBUGABSTRACT +#undef DEBUGABSTRACT #ifdef DEBUGABSTRACT #define LOGABS LOGDEB static void listList(const string& what, const vector&l) @@ -60,16 +60,16 @@ static void listList(const string&, const vector&) } #endif -// Keep only non-prefixed terms. We use to remove prefixes and keep -// the terms instead, but field terms are normally also indexed -// un-prefixed, so this is simpler and better. +// Unprefix terms. static void noPrefixList(const vector& in, vector& out) { for (vector::const_iterator qit = in.begin(); qit != in.end(); qit++) { - if (!has_prefix(*qit)) - out.push_back(*qit); + out.push_back(strip_prefix(*qit)); } + sort(out.begin(), out.end()); + vector::iterator it = unique(out.begin(), out.end()); + out.resize(it - out.begin()); } // Retrieve db-wide frequencies for the query terms and store them in @@ -132,26 +132,44 @@ double Query::Native::qualityTerms(Xapian::docid docid, m_q->m_sd->getTerms(hld); } +#ifdef DEBUGABSTRACT + { + string deb; + hld.toString(deb); + LOGABS(("qualityTerms: hld: %s\n", deb.c_str())); + } +#endif + // Group the input terms by the user term they were possibly expanded from map > byRoot; for (vector::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { bool found = false; - for (unsigned int gidx = 0; gidx < hld.groups.size(); gidx++) { - if (hld.groups[gidx].size() == 1 && hld.groups[gidx][0] == *qit) { - string us = hld.ugroups[hld.grpsugidx[gidx]][0]; - LOGABS(("qualityTerms: [%s] found, comes from [%s]\n", - (*qit).c_str(), us.c_str())); - byRoot[us].push_back(*qit); - found = true; - } - } - if (!found) { + map::const_iterator eit = hld.terms.find(*qit); + if (eit != 
hld.terms.end()) { + byRoot[eit->second].push_back(*qit); + } else { LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str())); byRoot[*qit].push_back(*qit); } } +#ifdef DEBUGABSTRACT + { + string byRootstr; + for (map >::const_iterator debit = + byRoot.begin(); debit != byRoot.end(); debit++) { + byRootstr.append("[").append(debit->first).append("]->"); + for (vector::const_iterator it = debit->second.begin(); + it != debit->second.end(); it++) { + byRootstr.append("[").append(*it).append("] "); + } + byRootstr.append("\n"); + } + LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str())); + } +#endif + // Compute in-document and global frequencies for the groups. map grpwdfs; map grptfreqs; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 283033a0..1fce7eaa 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -142,6 +142,29 @@ inline bool has_prefix(const string& trm) #endif } +inline string strip_prefix(const string& trm) +{ + if (trm.empty()) + return trm; + string::size_type st = 0; +#ifndef RCL_INDEX_STRIPCHARS + if (o_index_stripchars) { +#endif + st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ"); + if (st == string::npos) + return string(); +#ifndef RCL_INDEX_STRIPCHARS + } else { + if (has_prefix(trm)) { + st = trm.find_last_of(":") + 1; + } else { + return trm; + } + } +#endif + return trm.substr(st); +} + inline string wrap_prefix(const string& pfx) { #ifndef RCL_INDEX_STRIPCHARS diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index a7e7eb81..08e84738 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -745,6 +745,7 @@ void StringToXapianQ::expandTerm(int mods, if (noexpansion) { sterm = term; oexp.push_back(prefix + term); + m_hld.terms[term] = term; /* no expansion: the query term is its own user term */ LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); return; } @@ -790,9 +791,9 @@ void StringToXapianQ::expandTerm(int mods, // result: if (diac_sensitive && case_sensitive) { - // No expansion 
whatsoever - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field); - goto termmatchtoresult; + // No expansion whatsoever. + lexp.push_back(term); + goto exptotermatch; } else if (diac_sensitive) { // Expand for accents and case, filtering for same accents, SynTermTransUnac foldtrans(UNACOP_FOLD); @@ -842,13 +843,12 @@ void StringToXapianQ::expandTerm(int mods, lexp.resize(uit - lexp.begin()); } - // Bogus wildcard expand to generate the result + // Bogus wildcard expand to generate the result (possibly add prefixes) exptotermatch: LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str())); for (vector::const_iterator it = lexp.begin(); it != lexp.end(); it++) { - m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, - res, -1, m_field); + m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field); } #endif @@ -864,6 +864,11 @@ termmatchtoresult: if (oexp.empty()) oexp.push_back(prefix + term); + // Remember the uterm-to-expansion links + for (vector::const_iterator it = oexp.begin(); + it != oexp.end(); it++) { + m_hld.terms[strip_prefix(*it)] = term; + } LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); } diff --git a/src/utils/hldata.h b/src/utils/hldata.h index 3e0050ee..94c6ca27 100644 --- a/src/utils/hldata.h +++ b/src/utils/hldata.h @@ -17,6 +17,12 @@ struct HighlightData { */ std::set uterms; + /** The db query terms linked to the uterms entry they were expanded from. + * This is used for aggregating term stats when generating snippets (for + * choosing the best terms, allocating slots, etc. ) + */ + std::map terms; + /** The original user terms-or-groups. This is for display * purposes: ie when creating a menu to look for a specific * matched group inside a preview window. 
We want to show the diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index a8a784c1..72e4ee09 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -1050,7 +1050,12 @@ void HighlightData::toString(std::string& out) it != uterms.end(); it++) { out.append(" [").append(*it).append("]"); } - + out.append("\nUser terms to Query terms:"); + for (map::const_iterator it = terms.begin(); + it != terms.end(); it++) { + out.append("[").append(it->first).append("]->["); + out.append(it->second).append("] "); + } out.append("\nGroups: "); char cbuf[200]; sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d", @@ -1075,13 +1080,12 @@ void HighlightData::toString(std::string& out) out.append("}").append(cbuf); } out.append("\n"); - fprintf(stderr, "toString ok\n"); } void HighlightData::append(const HighlightData& hl) { uterms.insert(hl.uterms.begin(), hl.uterms.end()); - + terms.insert(hl.terms.begin(), hl.terms.end()); size_t ugsz0 = ugroups.size(); ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());