/* Copyright (C) 2006 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ // Handle translation from rcl's SearchData structures to Xapian Queries #include "autoconfig.h" #include #include #include #include #include #include using namespace std; #include "xapian.h" #include "cstr.h" #include "rcldb.h" #include "rcldb_p.h" #include "searchdata.h" #include "debuglog.h" #include "smallut.h" #include "textsplit.h" #include "unacpp.h" #include "utf8iter.h" #include "stoplist.h" #include "rclconfig.h" #include "termproc.h" #include "synfamily.h" #include "stemdb.h" #include "expansiondbs.h" #include "base64.h" #include "daterange.h" namespace Rcl { typedef vector::iterator qlist_it_t; typedef vector::const_iterator qlist_cit_t; void SearchData::commoninit() { m_haveDates = false; m_maxSize = size_t(-1); m_minSize = size_t(-1); m_haveWildCards = false; m_autodiacsens = false; m_autocasesens = true; m_maxexp = 10000; m_maxcl = 100000; m_softmaxexpand = -1; } SearchData::~SearchData() { LOGDEB0(("SearchData::~SearchData\n")); for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) delete *it; } // This is called by the GUI simple search if the option is set: add // (OR) phrase to a query (if it is simple enough) so that results // where the search terms are close and in order will come up on top. // We remove very common terms from the query to avoid performance issues. bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold) { LOGDEB0(("SearchData::maybeAddAutoPhrase()\n")); // cerr << "BEFORE SIMPLIFY\n"; dump(cerr); simplify(); // cerr << "AFTER SIMPLIFY\n"; dump(cerr); if (!m_query.size()) { LOGDEB2(("SearchData::maybeAddAutoPhrase: empty query\n")); return false; } string field; vector words; // Walk the clause list. If this is not an AND list, we find any // non simple clause or different field names, bail out. for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) { SClType tp = (*it)->m_tp; if (tp != SCLT_AND) { LOGDEB2(("SearchData::maybeAddAutoPhrase: wrong tp %d\n", tp)); return false; } SearchDataClauseSimple *clp = dynamic_cast(*it); if (clp == 0) { LOGDEB2(("SearchData::maybeAddAutoPhrase: dyncast failed\n")); return false; } if (it == m_query.begin()) { field = clp->getfield(); } else { if (clp->getfield().compare(field)) { LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n")); return false; } } // If there are wildcards or quotes in there, bail out if (clp->gettext().find_first_of("\"*[?") != string::npos) { LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n")); return false; } // Do a simple word-split here, not the full-blown // textsplit. Spans of stopwords should not be trimmed later // in this function, they will be properly split when the // phrase gets processed by toNativeQuery() later on. vector wl; stringToStrings(clp->gettext(), wl); words.insert(words.end(), wl.begin(), wl.end()); } // Trim the word list by eliminating very frequent terms // (increasing the slack as we do it): int slack = 0; int doccnt = db.docCnt(); if (!doccnt) doccnt = 1; string swords; for (vector::iterator it = words.begin(); it != words.end(); it++) { double freq = double(db.termDocCnt(*it)) / doccnt; if (freq < freqThreshold) { if (!swords.empty()) swords.append(1, ' '); swords += *it; } else { LOGDEB0(("SearchData::Autophrase: [%s] too frequent (%.2f %%)\n", it->c_str(), 100 * freq)); slack++; } } // We can't make a phrase with a single word :) int nwords = TextSplit::countWords(swords); if (nwords <= 1) { LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n")); return false; } // Increase the slack: we want to be a little more laxist than for // an actual user-entered phrase slack += 1 + nwords / 3; m_autophrase = RefCntr( new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field)); return true; } // Add clause to current list. OR lists cant have EXCL clauses. bool SearchData::addClause(SearchDataClause* cl) { if (m_tp == SCLT_OR && cl->getexclude()) { LOGERR(("SearchData::addClause: cant add EXCL to OR list\n")); m_reason = "No Negative (AND_NOT) clauses allowed in OR queries"; return false; } cl->setParent(this); m_haveWildCards = m_haveWildCards || cl->m_haveWildCards; m_query.push_back(cl); return true; } // Am I a file name only search ? This is to turn off term highlighting bool SearchData::fileNameOnly() { for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) if (!(*it)->isFileName()) return false; return true; } void SearchData::simplify() { for (unsigned int i = 0; i < m_query.size(); i++) { if (m_query[i]->m_tp != SCLT_SUB) continue; //C[est ce dyncast qui crashe?? SearchDataClauseSub *clsubp = dynamic_cast(m_query[i]); if (clsubp == 0) { // ?? continue; } if (clsubp->getSub()->m_tp != m_tp) continue; clsubp->getSub()->simplify(); // If this subquery has special attributes, it's not a // candidate for collapsing if (!clsubp->getSub()->m_filetypes.empty() || !clsubp->getSub()->m_nfiletypes.empty() || clsubp->getSub()->m_haveDates || clsubp->getSub()->m_maxSize != size_t(-1) || clsubp->getSub()->m_minSize != size_t(-1) || clsubp->getSub()->m_haveWildCards) continue; bool allsametp = true; for (qlist_it_t it1 = clsubp->getSub()->m_query.begin(); it1 != clsubp->getSub()->m_query.end(); it1++) { // We want all AND or OR clause, and same as our conjunction if (((*it1)->getTp() != SCLT_AND && (*it1)->getTp() != SCLT_OR) || (*it1)->getTp() != m_tp) { allsametp = false; break; } } if (!allsametp) continue; // All ok: delete the clause_sub, and insert the queries from // its searchdata in its place m_query.erase(m_query.begin() + i); m_query.insert(m_query.begin() + i, clsubp->getSub()->m_query.begin(), clsubp->getSub()->m_query.end()); for (unsigned int j = i; j < i + clsubp->getSub()->m_query.size(); j++) { m_query[j]->setParent(this); } i += clsubp->getSub()->m_query.size() - 1; // We don't want the clauses to be deleted when the parent is, as we // know own them. clsubp->getSub()->m_query.clear(); delete clsubp; } } bool SearchData::singleSimple() { if (m_query.size() != 1 || !m_filetypes.empty() || !m_nfiletypes.empty() || m_haveDates || m_maxSize != size_t(-1) || m_minSize != size_t(-1) || m_haveWildCards) return false; SearchDataClause *clp = *m_query.begin(); if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR) { return false; } return true; } // Extract all term data void SearchData::getTerms(HighlightData &hld) const { for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) (*it)->getTerms(hld); return; } void SearchData::dump(ostream& o) const { o << "SearchData: " << " qs " << int(m_query.size()) << " ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() << " hd " << m_haveDates << " maxs " << int(m_maxSize) << " mins " << int(m_minSize) << " wc " << m_haveWildCards << "\n"; for (std::vector::const_iterator it = m_query.begin(); it != m_query.end(); it++) { (*it)->dump(o); o << "\n"; } o << "\n"; } void SearchDataClause::dump(ostream& o) const { o << "SearchDataClause??"; } void SearchDataClauseSimple::dump(ostream& o) const { o << "ClauseSimple: "; if (m_exclude) o << "- "; o << "[" ; if (!m_field.empty()) o << m_field << " : "; o << m_text << "]"; } void SearchDataClauseFilename::dump(ostream& o) const { o << "ClauseFN: "; if (m_exclude) o << " - "; o << "[" << m_text << "]"; } void SearchDataClausePath::dump(ostream& o) const { o << "ClausePath: "; if (m_exclude) o << " - "; o << "[" << m_text << "]"; } void SearchDataClauseDist::dump(ostream& o) const { if (m_tp == SCLT_NEAR) o << "ClauseDist: NEAR: "; else o << "ClauseDist: PHRA: "; if (m_exclude) o << " - "; o << "["; if (!m_field.empty()) o << m_field << " : "; o << m_text << "]"; } void SearchDataClauseSub::dump(ostream& o) const { o << "ClauseSub {\n"; m_sub.getconstptr()->dump(o); o << "}"; } } // Namespace Rcl