recoll/src/rcldb/rclterms.cpp

/* Copyright (C) 2004 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

////////////////////////////////////////////////////////////////////
/** Things dealing with walking the terms lists and expansion dbs */

#include "autoconfig.h"

#include <string>

#include "log.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "stemdb.h"
#include "expansiondbs.h"
#include "strmatcher.h"

using namespace std;

namespace Rcl {

// File name wild card expansion. This is a specialisation of termMatch
bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
{
    string pattern = fnexp;
    names.clear();

    // If pattern is not capitalized, not quoted (quoted pattern can't
    // get here currently anyway), and has no wildcards, we add * at
    // each end: match any substring
    if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
        pattern = pattern.substr(1, pattern.size() -2);
    } else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
               !unaciscapital(pattern)) {
        pattern = "*" + pattern + "*";
    } // else let it be

    LOGDEB("Rcl::Db::filenameWildExp: pattern: ["  << (pattern) << "]\n" );

    // We inconditionnally lowercase and strip the pattern, as is done
    // during indexing. This seems to be the only sane possible
    // approach with file names and wild cards. termMatch does
    // stripping conditionally on indexstripchars.
    string pat1;
    if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
        pattern.swap(pat1);
    }

    TermMatchResult result;
    if (!idxTermMatch(ET_WILD, string(), pattern, result, max,
                      unsplitFilenameFieldName))
        return false;
    for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
         it != result.entries.end(); it++)
        names.push_back(it->term);

    if (names.empty()) {
        // Build an impossible query: we know its impossible because we
        // control the prefixes!
        names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
    }
    return true;
}

// Walk the Y terms and return min/max
bool Db::maxYearSpan(int *minyear, int *maxyear)
{
    LOGDEB("Rcl::Db:maxYearSpan\n" );
    *minyear = 1000000;
    *maxyear = -1000000;
    TermMatchResult result;
    if (!idxTermMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
        LOGINFO("Rcl::Db:maxYearSpan: termMatch failed\n" );
        return false;
    }
    for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
         it != result.entries.end(); it++) {
        if (!it->term.empty()) {
            int year = atoi(strip_prefix(it->term).c_str());
            if (year < *minyear)
                *minyear = year;
            if (year > *maxyear)
                *maxyear = year;
        }
    }
    return true;
}

bool Db::getAllDbMimeTypes(std::vector<std::string>& exp)
{
    Rcl::TermMatchResult res;
    if (!idxTermMatch(Rcl::Db::ET_WILD, "", "*", res, -1, "mtype")) {
        return false;
    }
    for (vector<Rcl::TermMatchEntry>::const_iterator rit = res.entries.begin();
         rit != res.entries.end(); rit++) {
        exp.push_back(Rcl::strip_prefix(rit->term));
    }
    return true;
}

// Ordering predicate for sorting term match entries by decreasing wcf
// value: the entry with the bigger count sorts first.
class TermMatchCmpByWcf {
public:
    bool operator()(const TermMatchEntry& l, const TermMatchEntry& r) const {
        // Compare the values directly rather than testing the sign of
        // their difference: the subtraction could overflow with a signed
        // type, and could never be negative with an unsigned one.
        return l.wcf > r.wcf;
    }
};
// Ordering predicate for sorting term match entries on the term text, in
// decreasing order (l goes first if its term compares greater).
class TermMatchCmpByTerm {
public:
    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
        return r.term < l.term;
    }
};
// Equality predicate on the term text only: the other entry fields are
// not compared.
class TermMatchTermEqual {
public:
    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
        return l.term == r.term;
    }
};

// Translate a match type value into a printable name, for log messages.
// Db::ET_NONE and any unknown value yield "none".
static const char *tmtptostr(int typ)
{
    if (typ == Db::ET_WILD) {
        return "wildcard";
    }
    if (typ == Db::ET_REGEXP) {
        return "regexp";
    }
    if (typ == Db::ET_STEM) {
        return "stem";
    }
    return "none";
}

// Find all index terms that match an input along different expansion modes:
// wildcard, regular expression, or stemming. Depending on flags we perform
// case and/or diacritics expansion (this can be the only thing requested).
// If the "field" parameter is set, we return a list of appropriately
// prefixed terms (which are going to be used to build a Xapian
// query).
// This routine performs case/diacritics/stemming expansion against
// the auxiliary tables, and possibly calls idxTermMatch() for work
// using the main index terms (filtering, retrieving stats, expansion
// in some cases).
// Parameters:
//  typ_sens: match type (extracted with matchTypeTp()) combined with
//     the ET_DIACSENS, ET_CASESENS and ET_SYNEXP flag bits.
//  lang: language, used for stem expansion and passed to idxTermMatch().
//  _term: the input expression.
//  res: the matches are appended to res.entries. The whole list is then
//     deduplicated on the term value, sorted with TermMatchCmpByWcf and
//     possibly truncated (see max).
//  max: if positive, maximum count of entries left in res on return. Also
//     passed to idxTermMatch().
//  field: field name, passed to idxTermMatch(). Can be empty.
//  multiwords: if not null, receives the synonyms which contain a space
//     character (these are not added to the expansion list).
// Returns false if the index is not open, or if the index is stripped and
// the input conversion fails. Returns true in all other cases: the
// idxTermMatch() return values are not checked.
bool Db::termMatch(int typ_sens, const string &lang, const string &_term,
                   TermMatchResult& res, int max,  const string& field,
                   vector<string>* multiwords)
{
    int matchtyp = matchTypeTp(typ_sens);
    if (!m_ndb || !m_ndb->m_isopen)
        return false;
    Xapian::Database xrdb = m_ndb->xrdb;

    bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
    bool case_sensitive = (typ_sens & ET_CASESENS) != 0;

    LOGDEB0("Db::TermMatch: typ "  << (tmtptostr(matchtyp)) << " diacsens "  << (diac_sensitive) << " casesens "  << (case_sensitive) << " lang ["  << (lang) << "] term ["  << (_term) << "] max "  << (max) << " field ["  << (field) << "] stripped "  << (o_index_stripchars) << " init res.size "  << (res.entries.size()) << "\n" );

    // If index is stripped, no case or diac expansion can be needed:
    // for the processing inside this routine, everything looks like
    // we're all-sensitive: no use of expansion db.
    // Also, convert input to lowercase and strip its accents.
    string term = _term;
    if (o_index_stripchars) {
        diac_sensitive = case_sensitive = true;
        if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
            LOGERR("Db::termMatch: unac failed for ["  << (_term) << "]\n" );
            return false;
        }
    }

    // The case/diac expansion db
    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
    XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);

    if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
        // The matcher built here is only used for the expansion db key
        // expansion below. idxTermMatch() builds its own from the term.
        std::shared_ptr<StrMatcher> matcher;
        if (matchtyp == ET_WILD) {
            matcher = std::shared_ptr<StrMatcher>(new StrWildMatcher(term));
        } else {
            matcher = std::shared_ptr<StrMatcher>(new StrRegexpMatcher(term));
        }
        if (!diac_sensitive || !case_sensitive) {
            // Perform case/diac expansion on the exp as appropriate and
            // expand the result.
            vector<string> exp;
            if (diac_sensitive) {
                // Expand for diacritics and case, filtering for same diacritics
                SynTermTransUnac foldtrans(UNACOP_FOLD);
                synac.synKeyExpand(matcher.get(), exp, &foldtrans);
            } else if (case_sensitive) {
                // Expand for diacritics and case, filtering for same case
                SynTermTransUnac unactrans(UNACOP_UNAC);
                synac.synKeyExpand(matcher.get(), exp, &unactrans);
            } else {
                // Expand for diacritics and case, no filtering
                synac.synKeyExpand(matcher.get(), exp);
            }
            // Retrieve additional info and filter against the index itself
            // (ET_NONE: each expanded term is looked up as is)
            for (vector<string>::const_iterator it = exp.begin();
                 it != exp.end(); it++) {
                idxTermMatch(ET_NONE, "", *it, res, max, field);
            }
            // And also expand the original expression against the
            // main index: for the common case where the expression
            // had no case/diac expansion (no entry in the exp db if
            // the original term is lowercase and without accents).
            idxTermMatch(typ_sens, lang, term, res, max, field);
        } else {
            // All-sensitive: only the main index terms are of interest
            idxTermMatch(typ_sens, lang, term, res, max, field);
        }

    } else {
        // Expansion is STEM or NONE (which may still need synonyms
        // and case/diac exp)

        // lexp is the working list of candidate terms, successively
        // replaced by the result of each expansion stage.
        vector<string> lexp;
        if (diac_sensitive && case_sensitive) {
            // No case/diac expansion
            lexp.push_back(term);
        } else if (diac_sensitive) {
            // Expand for accents and case, filtering for same accents,
            SynTermTransUnac foldtrans(UNACOP_FOLD);
            synac.synExpand(term, lexp, &foldtrans);
        } else if (case_sensitive) {
            // Expand for accents and case, filtering for same case
            SynTermTransUnac unactrans(UNACOP_UNAC);
            synac.synExpand(term, lexp, &unactrans);
        } else {
            // We are neither accent- nor case- sensitive and may need stem
            // expansion or not. Expand for accents and case
            synac.synExpand(term, lexp);
        }

        if (matchtyp == ET_STEM || (typ_sens & ET_SYNEXP)) {
            // Note: if any of the above conds is true, we are insensitive to
            // diacs and case (enforced in searchdatatox:termexpand).
            // Need stem expansion. Lowercase the result of accent and case
            // expansion for input to stemdb.
            for (unsigned int i = 0; i < lexp.size(); i++) {
                string lower;
                unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
                lexp[i] = lower;
            }
            // Lowercasing may have produced duplicates: eliminate them
            sort(lexp.begin(), lexp.end());
            lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());

            if (matchtyp == ET_STEM) {
                // Replace the list with the stem expansions of its elements
                StemDb sdb(xrdb);
                vector<string> exp1;
                for (vector<string>::const_iterator it = lexp.begin();
                     it != lexp.end(); it++) {
                    sdb.stemExpand(lang, *it, exp1);
                }
                exp1.swap(lexp);
                sort(lexp.begin(), lexp.end());
                lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
                LOGDEB("ExpTerm: stemexp: "  << (stringsToString(lexp)) << "\n" );
            }

            if (m_syngroups.ok() && (typ_sens & ET_SYNEXP)) {
                LOGDEB("ExpTerm: got syngroups\n" );
                // Synonyms are added to a copy of the current list, so
                // that the original terms are kept.
                vector<string> exp1(lexp);
                for (vector<string>::const_iterator it = lexp.begin();
                     it != lexp.end(); it++) {
                    vector<string> sg = m_syngroups.getgroup(*it);
                    if (!sg.empty()) {
                        LOGDEB("ExpTerm: syns: "  << *it << " -> "  << (stringsToString(sg)) << "\n" );
                        for (vector<string>::const_iterator it1 = sg.begin();
                             it1 != sg.end(); it1++) {
                            // Synonyms containing a space go to the
                            // caller's multiwords list, if it was
                            // supplied, else they are dropped.
                            if (it1->find_first_of(" ") != string::npos) {
                                if (multiwords) {
                                    multiwords->push_back(*it1);
                                }
                            } else {
                                exp1.push_back(*it1);
                            }
                        }
                    }
                }
                lexp.swap(exp1);
                sort(lexp.begin(), lexp.end());
                lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
            }

            // Expand the resulting list for case (all stemdb content
            // is lowercase)
            vector<string> exp1;
            for (vector<string>::const_iterator it = lexp.begin();
                 it != lexp.end(); it++) {
                synac.synExpand(*it, exp1);
            }
            exp1.swap(lexp);
            sort(lexp.begin(), lexp.end());
            lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
        }

        // Filter the result and get the stats, possibly add prefixes.
        // Note that the terms are matched as ET_WILD expressions here.
        LOGDEB("ExpandTerm:TM: lexp: "  << (stringsToString(lexp)) << "\n" );
        for (vector<string>::const_iterator it = lexp.begin();
             it != lexp.end(); it++) {
            idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
        }
    }

    // The successive idxTermMatch() calls may have stored the same term
    // several times. Sort on the term value so that unique() can
    // eliminate the duplicates, then sort again with TermMatchCmpByWcf
    // before the possible truncation.
    TermMatchCmpByTerm tcmp;
    sort(res.entries.begin(), res.entries.end(), tcmp);
    TermMatchTermEqual teq;
    vector<TermMatchEntry>::iterator uit =
        unique(res.entries.begin(), res.entries.end(), teq);
    res.entries.resize(uit - res.entries.begin());
    TermMatchCmpByWcf wcmp;
    sort(res.entries.begin(), res.entries.end(), wcmp);
    if (max > 0) {
        // Would need a small max and big stem expansion...
        res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
    }
    return true;
}

// Second phase of wildcard/regexp term expansion after case/diac
// expansion: expand against main index terms
bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
                      TermMatchResult& res, int max,  const string& field)
{
    int typ = matchTypeTp(typ_sens);
    LOGDEB1("Db::idxTermMatch: typ "  << (tmtptostr(typ)) << " lang ["  << (lang) << "] term ["  << (root) << "] max "  << (max) << " field ["  << (field) << "] init res.size "  << (res.entries.size()) << "\n" );

    if (typ == ET_STEM) {
        LOGFATAL("RCLDB: internal error: idxTermMatch called with ET_STEM\n" );
        abort();
    }

    Xapian::Database xdb = m_ndb->xrdb;

    string prefix;
    if (!field.empty()) {
        const FieldTraits *ftp = 0;
        if (!fieldToTraits(field, &ftp, true) || ftp->pfx.empty()) {
            LOGDEB("Db::termMatch: field is not indexed (no prefix): ["  << (field) << "]\n" );
        } else {
            prefix = wrap_prefix(ftp->pfx);
        }
    }
    res.prefix = prefix;

    std::shared_ptr<StrMatcher> matcher;
    if (typ == ET_REGEXP) {
        matcher = std::shared_ptr<StrMatcher>(new StrRegexpMatcher(root));
        if (!matcher->ok()) {
            LOGERR("termMatch: regcomp failed: "  << (matcher->getreason()));
                return false;
        }
    } else if (typ == ET_WILD) {
        matcher = std::shared_ptr<StrMatcher>(new StrWildMatcher(root));
    }

    // Find the initial section before any special char
    string::size_type es = string::npos;
    if (matcher) {
        es = matcher->baseprefixlen();
    }

    // Initial section: the part of the prefix+expr before the
    // first wildcard character. We only scan the part of the
    // index where this matches
    string is;
    if (es == string::npos) {
        is = prefix + root;
    } else if (es == 0) {
        is = prefix;
    } else {
        is = prefix + root.substr(0, es);
    }
    LOGDEB2("termMatch: initsec: ["  << (is) << "]\n" );

    for (int tries = 0; tries < 2; tries++) {
        try {
            Xapian::TermIterator it = xdb.allterms_begin();
            if (!is.empty())
                it.skip_to(is.c_str());
            for (int rcnt = 0; it != xdb.allterms_end(); it++) {
                // If we're beyond the terms matching the initial
                // section, end
                if (!is.empty() && (*it).find(is) != 0)
                    break;

                // Else try to match the term. The matcher content
                // is without prefix, so we remove this if any. We
                // just checked that the index term did begin with
                // the prefix.
                string term;
                if (!prefix.empty()) {
                    term = (*it).substr(prefix.length());
                } else {
                    if (has_prefix(*it)) {
                        continue;
                    }
                    term = *it;
                }

                if (matcher && !matcher->match(term))
                    continue;

                res.entries.push_back(
                    TermMatchEntry(*it, xdb.get_collection_freq(*it),
                                   it.get_termfreq()));

                // The problem with truncating here is that this is done
                // alphabetically and we may not keep the most frequent
                // terms. OTOH, not doing it may stall the program if
                // we are walking the whole term list. We compromise
                // by cutting at 2*max
                if (max > 0 && ++rcnt >= 2*max)
                    break;
            }
            m_reason.erase();
            break;
        } catch (const Xapian::DatabaseModifiedError &e) {
            m_reason = e.get_msg();
            xdb.reopen();
            continue;
        } XCATCHERROR(m_reason);
        break;
    }
    if (!m_reason.empty()) {
        LOGERR("termMatch: "  << (m_reason) << "\n" );
        return false;
    }

    return true;
}

/** Term list walking. */
// State for a walk of the index term list: created by
// Db::termWalkOpen(), advanced by Db::termWalkNext(), deleted by
// Db::termWalkClose().
class TermIter {
public:
    // Current position in the term list of db
    Xapian::TermIterator it;
    // The database object the iterator was obtained from. Also used to
    // test for the end of the list.
    Xapian::Database db;
};
// Begin a walk of the index term list.
// Returns a walk state object to be used with termWalkNext() and freed
// with termWalkClose(), or 0 if the index is not open or if the initial
// Xapian call failed (the error message is then in m_reason).
TermIter *Db::termWalkOpen()
{
    if (!m_ndb || !m_ndb->m_isopen)
        return 0;
    // No need to test the result: new throws on allocation failure
    TermIter *tit = new TermIter;
    tit->db = m_ndb->xrdb;
    XAPTRY(tit->it = tit->db.allterms_begin(), tit->db, m_reason);
    if (!m_reason.empty()) {
        LOGERR("Db::termWalkOpen: xapian error: "  << (m_reason) << "\n" );
        // Don't leak the walk state on the error path
        termWalkClose(tit);
        return 0;
    }
    return tit;
}
// Retrieve the next term of a walk and advance the position.
//
// @param tit walk state, as returned by termWalkOpen(). A null value
//        yields a false return.
// @param term output: the term value. Only set when returning true.
// @return true if a term was stored, false at the end of the list or if
//         a Xapian error occurred (the error is then logged).
bool Db::termWalkNext(TermIter *tit, string &term)
{
    XAPTRY(
        if (tit && tit->it != tit->db.allterms_end()) {
            term = *(tit->it)++;
            return true;
        }
        , tit->db, m_reason);

    if (!m_reason.empty()) {
        LOGERR("Db::termWalkNext: xapian error: "  << (m_reason) << "\n" );
    }
    return false;
}
// End a term list walk: free the walk state object. A null pointer is
// accepted and does nothing.
void Db::termWalkClose(TermIter *tit)
{
    if (!tit) {
        return;
    }
    try {
        delete tit;
    } catch (...) {
        // Best effort: an error while freeing the walk state is ignored
    }
}

bool Db::termExists(const string& word)
{
    if (!m_ndb || !m_ndb->m_isopen)
        return 0;

    XAPTRY(if (!m_ndb->xrdb.term_exists(word)) return false,
           m_ndb->xrdb, m_reason);

    if (!m_reason.empty()) {
        LOGERR("Db::termWalkOpen: xapian error: "  << (m_reason) << "\n" );
        return false;
    }
    return true;
}

bool Db::stemDiffers(const string& lang, const string& word,
                     const string& base)
{
    Xapian::Stem stemmer(lang);
    if (!stemmer(word).compare(stemmer(base))) {
        LOGDEB2("Rcl::Db::stemDiffers: same for "  << (word) << " and "  << (base) << "\n" );
        return false;
    }
    return true;
}

} // End namespace Rcl