/* Copyright (C) 2004 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ #include "autoconfig.h" #include #include #include "cstr.h" #include "refcntr.h" #include "rcldoc.h" #include "stoplist.h" #include "rclconfig.h" #include "utf8iter.h" #include "textsplit.h" using std::string; using std::vector; // rcldb defines an interface for a 'real' text database. The current // implementation uses xapian only, and xapian-related code is in rcldb.cpp // If support was added for other backend, the xapian code would be moved in // rclxapian.cpp, another file would be created for the new backend, and the // configuration/compile/link code would be adjusted to allow choosing. There // is no plan for supporting multiple different backends. // // In no case does this try to implement a useful virtualized text-db interface // The main goal is simplicity and good matching to usage inside the recoll // user interface. In other words, this is not exhaustive or well-designed or // reusable. // // Unique Document Identifier: uniquely identifies a document in its // source storage (file system or other). Used for up to date checks // etc. "udi". Our user is responsible for making sure it's not too // big, cause it's stored as a Xapian term (< 150 bytes would be // reasonable) class RclConfig; #ifndef NO_NAMESPACES namespace Rcl { #endif // Omega compatible values. We leave a hole for future omega values. Not sure // it makes any sense to keep any level of omega compat given that the index // is incompatible anyway. enum value_slot { // Omega-compatible values: VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970. VALUE_MD5 = 1, // 16 byte MD5 checksum of original document. VALUE_SIZE = 2, // sortable_serialise() // Recoll only: VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size }; class SearchData; class TermIter; class Query; /** Used for returning result lists for index terms matching some criteria */ class TermMatchEntry { public: TermMatchEntry() : wcf(0) { } TermMatchEntry(const string& t, int f, int d) : term(t), wcf(f), docs(d) { } TermMatchEntry(const string& t) : term(t), wcf(0) { } bool operator==(const TermMatchEntry &o) const { return term == o.term; } bool operator<(const TermMatchEntry &o) const { return term < o.term; } string term; int wcf; // Total count of occurrences within collection. int docs; // Number of documents countaining term. }; /** Term match result list header: statistics and global info */ class TermMatchResult { public: TermMatchResult() { clear(); } void clear() { entries.clear(); dbdoccount = 0; dbavgdoclen = 0; } // Term expansion vector entries; // If a field was specified, this is the corresponding index prefix string prefix; // Index-wide stats unsigned int dbdoccount; double dbavgdoclen; }; #ifdef IDX_THREADS extern void *DbUpdWorker(void*); #endif // IDX_THREADS inline bool has_prefix(const string& trm) { #ifndef RCL_INDEX_STRIPCHARS if (o_index_stripchars) { #endif return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z'; #ifndef RCL_INDEX_STRIPCHARS } else { return !trm.empty() && trm[0] == ':'; } #endif } inline string wrap_prefix(const string& pfx) { #ifndef RCL_INDEX_STRIPCHARS if (o_index_stripchars) { #endif return pfx; #ifndef RCL_INDEX_STRIPCHARS } else { return cstr_colon + pfx + cstr_colon; } #endif } /** * Wrapper class for the native database. */ class Db { public: // A place for things we don't want visible here. class Native; friend class Native; #ifdef IDX_THREADS friend void *DbUpdWorker(void*); #endif // IDX_THREADS /* General stuff (valid for query or update) ****************************/ Db(RclConfig *cfp); ~Db(); enum OpenMode {DbRO, DbUpd, DbTrunc}; enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb}; bool open(OpenMode mode, OpenError *error = 0); bool close(); bool isopen(); /** Get explanation about last error */ string getReason() const {return m_reason;} /** Return all possible stemmer names */ static vector getStemmerNames(); /** Return existing stemming databases */ vector getStemLangs(); /** Test word for spelling correction candidate: not too long, no special chars... */ static bool isSpellingCandidate(const string& term) { if (term.empty() || term.length() > 50) return false; if (has_prefix(term)) return false; Utf8Iter u8i(term); if (TextSplit::isCJK(*u8i)) return false; if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") != string::npos) return false; return true; } #ifdef TESTING_XAPIAN_SPELL /** Return spelling suggestion */ string getSpellingSuggestion(const string& word); #endif /* The next two, only for searchdata, should be somehow hidden */ /* Return configured stop words */ const StopList& getStopList() const {return m_stops;} /* Field name to prefix translation (ie: author -> 'A') */ bool fieldToTraits(const string& fldname, const FieldTraits **ftpp); /* Update-related methods ******************************************/ /** Test if the db entry for the given udi is up to date (by * comparing the input and stored sigs). * Side-effect: set the existence flag for the file document * and all subdocs if any (for later use by 'purge()') */ bool needUpdate(const string &udi, const string& sig); /** Add or update document. The Doc class should have been filled as much as * possible depending on the document type. parent_udi is only * use for subdocs, else set it to empty */ bool addOrUpdate(const string &udi, const string &parent_udi, Doc &doc); /** Delete document(s) for given UDI, including subdocs */ bool purgeFile(const string &udi, bool *existed = 0); /** Remove documents that no longer exist in the file system. This * depends on the update map, which is built during * indexing (needUpdate()). * * This should only be called after a full walk of * the file system, else the update map will not be complete, and * many documents will be deleted that shouldn't, which is why this * has to be called externally, rcldb can't know if the indexing * pass was complete or partial. */ bool purge(); /** Create stem expansion database for given language. */ bool createStemDbs(const std::vector &langs); /** Delete stem expansion database for given language. */ bool deleteStemDb(const string &lang); /* Query-related methods ************************************/ /** Return total docs in db */ int docCnt(); /** Return count of docs which have an occurrence of term */ int termDocCnt(const string& term); /** Add extra database for querying */ bool addQueryDb(const string &dir); /** Remove extra database. if dir == "", remove all. */ bool rmQueryDb(const string &dir); /** Look where the doc result comes from. * @return: 0 main index, (size_t)-1 don't know, * other: order of database in add_database() sequence. */ size_t whatDbIdx(const Doc& doc); /** Tell if directory seems to hold xapian db */ static bool testDbDir(const string &dir); /** Return the index terms that match the input string * Expansion is performed either with either wildcard or regexp processing * Stem expansion is performed if lang is not empty * * @param typ defines the kind of expansion: wildcard, regexp or stemming * @param lang sets the stemming language(s). Can be a space-separated list * @param term is the term to expand * @param result is the main output * @param max defines the maximum result count * @param field if set, defines the field within with the expansion should * be performed. Only used for wildcards and regexps, stemming is * always global. If this is set, the resulting output terms * will be appropriately prefix and the prefix value will be set * in the TermMatchResult header */ enum MatchType {ET_WILD, ET_REGEXP, ET_STEM}; bool termMatch(MatchType typ, const string &lang, const string &term, TermMatchResult& result, int max = -1, const string& field = cstr_null ); /** Return min and max years for doc mod times in db */ bool maxYearSpan(int *minyear, int *maxyear); /** Wildcard expansion specific to file names. Internal/sdata use only */ bool filenameWildExp(const string& exp, vector& names); /** Set parameters for synthetic abstract generation */ void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); int getAbsCtxLen() const { return m_synthAbsWordCtxLen; } int getAbsLen() const { return m_synthAbsLen; } /** Get document for given udi * * Used by the 'history' feature (and nothing else?) */ bool getDoc(const string &udi, Doc &doc); /* The following are mainly for the aspell module */ /** Whole term list walking. */ TermIter *termWalkOpen(); bool termWalkNext(TermIter *, string &term); void termWalkClose(TermIter *); /** Test term existence */ bool termExists(const string& term); /** Test if terms stem to different roots. */ bool stemDiffers(const string& lang, const string& term, const string& base); RclConfig *getConf() {return m_config;} /** Activate the "in place reset" mode where all documents are considered as needing update. This is a global/per-process option, and can't be reset. It should be set at the start of the indexing pass */ static void setInPlaceReset() {o_inPlaceReset = true;} /* This has to be public for access by embedded Query::Native */ Native *m_ndb; private: // Internal form of close, can be called during destruction bool i_close(bool final); RclConfig *m_config; string m_reason; // Error explanation /* Parameters cached out of the configuration files */ // This is how long an abstract we keep or build from beginning of // text when indexing. It only has an influence on the size of the // db as we are free to shorten it again when displaying int m_idxAbsTruncLen; // This is the size of the abstract that we synthetize out of query // term contexts at *query time* int m_synthAbsLen; // This is how many words (context size) we keep around query terms // when building the abstract int m_synthAbsWordCtxLen; // Flush threshold. Megabytes of text indexed before we flush. int m_flushMb; // Text bytes indexed since beginning long long m_curtxtsz; // Text bytes at last flush long long m_flushtxtsz; // Text bytes at last fsoccup check long long m_occtxtsz; // First fs occup check ? int m_occFirstCheck; // Maximum file system occupation percentage int m_maxFsOccupPc; // Database directory string m_basedir; // Xapian directories for additional databases to query vector m_extraDbs; OpenMode m_mode; // File existence vector: this is filled during the indexing pass. Any // document whose bit is not set at the end is purged vector updated; // Stop terms: those don't get indexed. StopList m_stops; // When this is set, all documents are considered as needing a reindex. // This implements an alternative to just erasing the index before // beginning, with the advantage that, for small index formats updates, // between releases the index remains available while being recreated. static bool o_inPlaceReset; // Reinitialize when adding/removing additional dbs bool adjustdbs(); bool stemExpand(const string &lang, const string &s, TermMatchResult& result, int max = -1); // Flush when idxflushmb is reached bool maybeflush(off_t moretext); /* Copyconst and assignement private and forbidden */ Db(const Db &) {} Db& operator=(const Db &) {return *this;}; }; // This has to go somewhere, and as it needs the Xapian version, this is // the most reasonable place. string version_string(); extern const string pathelt_prefix; extern const string udi_prefix; extern const string parent_prefix; #ifdef RCL_INDEX_STRIPCHARS extern const string start_of_field_term; extern const string end_of_field_term; #else extern string start_of_field_term; extern string end_of_field_term; #endif } #endif /* _DB_H_INCLUDED_ */