Make Recoll optionally sensitive to case and diacritics

2012-09-14 14:34:27 +02:00 · 2012-09-14 14:34:27 +02:00 · 166624f7f2
commit 166624f7f2
parent 7fcfe27952
30 changed files with 849 additions and 487 deletions
--- a/src/rcldb/stemdb.cpp
+++ b/src/rcldb/stemdb.cpp
@@ -19,6 +19,9 @@
 * Management of the auxiliary databases listing stems and their expansion 
 * terms
 */
+
+#include "autoconfig.h"
+
 #include <unistd.h>

 #include <algorithm>
@@ -27,13 +30,8 @@
 #include <xapian.h>

 #include "stemdb.h"
-#include "pathut.h"
 #include "debuglog.h"
 #include "smallut.h"
-#include "utf8iter.h"
-#include "textsplit.h"
-#include "rcldb.h"
-#include "rcldb_p.h"
 #include "synfamily.h"
 #include "unacpp.h"

@@ -43,140 +41,6 @@ using namespace std;

 namespace Rcl {

-// Fast raw detection of non-natural-language words: look for ascii
-// chars which are not lowercase letters. Not too sure what islower()
-// would do with 8 bit values, so not using it here. If we want to be
-// more complete we'd need to go full utf-8
-inline static bool p_notlowerascii(unsigned int c)
-{
-    if (c < 'a' || (c > 'z' && c < 128))
-	return true;
-    return false;
-}
-
-/**
- * Create database of stem to parents associations for a given language.
- */
-bool createExpansionDbs(Xapian::WritableDatabase& wdb, 
-			const vector<string>& langs)
-{
-    LOGDEB(("StemDb::createExpansionDbs\n"));
-    Chrono cron;
-
-    vector<XapWritableSynFamily> stemdbs;
-    for (unsigned int i = 0; i < langs.size(); i++) {
-	stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
-	stemdbs[i].deleteMember(langs[i]);
-	stemdbs[i].createMember(langs[i]);
-	stemdbs[i].setCurrentMemberName(langs[i]);
-    }
-
-    // We walk the list of all terms, and stem each. We skip terms which
-    // don't look like natural language.
-    // If the stem is not identical to the term, we add a synonym entry.
-    // Statistics
-    int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
-    int stemconst = 0; // Stem == term
-    int allsyns = 0; // Total number of entries created
-
-    string ermsg;
-    try {
-	vector<Xapian::Stem> stemmers;
-	for (unsigned int i = 0; i < langs.size(); i++) {
-	    stemmers.push_back(Xapian::Stem(langs[i]));
-	}
-
-        for (Xapian::TermIterator it = wdb.allterms_begin(); 
-	     it != wdb.allterms_end(); it++) {
-	    // If the term has any non-lowercase 7bit char (that is,
-            // numbers, capitals and punctuation) dont stem.
-            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
-            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
-                ++nostem;
-                LOGDEB1(("stemskipped: [%s], because of 0x%x\n", 
-                         (*it).c_str(), *sit));
-                continue;
-            }
-
-	    // Detect and skip CJK terms.
-	    // We're still sending all other multibyte utf-8 chars to
-            // the stemmer, which is not too well defined for
-            // xapian<1.0 (very obsolete now), but seems to work
-            // anyway. There shouldn't be too many in any case because
-            // accents are stripped at this point. 
-	    // The effect of stripping accents on stemming is not good, 
-            // (e.g: in french partimes -> partim, parti^mes -> part)
-	    // but fixing the issue would be complicated.
-	    Utf8Iter utfit(*it);
-	    if (TextSplit::isCJK(*utfit)) {
-		// LOGDEB(("stemskipped: Skipping CJK\n"));
-		continue;
-	    }
-
-	    // Create stemming synonym for every lang
-	    for (unsigned int i = 0; i < langs.size(); i++) {
-		string stem = stemmers[i](*it);
-		if (stem == *it) {
-		    ++stemconst;
-		} else {
-		    stemdbs[i].addSynonym(stem, *it);
-		    LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n", 
-			     (*it).c_str(), langs[i].c_str(), stem.c_str()));
-		    ++allsyns;
-		}
-	    }
-
-        }
-    } XCATCHERROR(ermsg);
-    if (!ermsg.empty()) {
-        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
-        return false;
-    }
-
-    LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
-    LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n", 
-	    nostem, stemconst, allsyns));
-    return true;
-}
-
-/**
- * Expand term to list of all terms which stem to the same term, for one
- * expansion language
- */
-bool StemDb::expandOne(const std::string& lang,
-		       const std::string& term,
-		       vector<string>& result)
-{
-    try {
-	Xapian::Stem stemmer(lang);
-	string stem = stemmer(term);
-	LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n", 
-                lang.c_str(), term.c_str(), stem.c_str()));
-
-	if (!synExpand(lang, stem, result)) {
-	    // ?
-	}
-
-	// If the user term or stem are not in the list, add them
-	if (find(result.begin(), result.end(), term) == result.end()) {
-	    result.push_back(term);
-	}
-	if (find(result.begin(), result.end(), stem) == result.end()) {
-	    result.push_back(stem);
-	}
-	LOGDEB0(("stemExpand:%s: %s ->  %s\n", lang.c_str(), stem.c_str(),
-		 stringsToString(result).c_str()));
-
-    } catch (...) {
-	LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
-		lang.c_str()));
-	result.push_back(term);
-	return false;
-    }
-
-    return true;
-}
-    
 /**
 * Expand for one or several languages
 */
@@ -186,14 +50,34 @@ bool StemDb::stemExpand(const std::string& langs,
 {
    vector<string> llangs;
    stringToStrings(langs, llangs);
+
    for (vector<string>::const_iterator it = llangs.begin();
 	 it != llangs.end(); it++) {
-	vector<string> oneexp;
-	expandOne(*it, term, oneexp);
-	result.insert(result.end(), oneexp.begin(), oneexp.end());
+	SynTermTransStem stemmer(*it);
+	XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
+	(void)expander.synExpand(term, result);
    }
+
+#ifndef RCL_INDEX_STRIPCHARS
+    for (vector<string>::const_iterator it = llangs.begin();
+	 it != llangs.end(); it++) {
+	SynTermTransStem stemmer(*it);
+	XapComputableSynFamMember expander(getdb(), synFamStemUnac, 
+					   *it, &stemmer);
+	string unac;
+	unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
+	(void)expander.synExpand(unac, result);
+    }
+#endif 
+
+    if (result.empty())
+	result.push_back(term);
+
    sort(result.begin(), result.end());
-    unique(result.begin(), result.end());
+    vector<string>::iterator uit = unique(result.begin(), result.end());
+    result.resize(uit - result.begin());
+    LOGDEB0(("stemExpand:%s: %s ->  %s\n", langs.c_str(), term.c_str(),
+	     stringsToString(result).c_str()));
    return true;
 }