none

2012-09-12 10:33:04 +02:00 · 2012-09-12 10:33:04 +02:00 · eaaa7aa419
commit eaaa7aa419
parent 80dd073b08
2 changed files with 196 additions and 0 deletions
--- a/src/rcldb/expansiondbs.cpp
+++ b/src/rcldb/expansiondbs.cpp
@ -0,0 +1,145 @@
+/* Copyright (C) 2005 J.F.Dockes 
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+
+#include "autoconfig.h"
+
+#include "debuglog.h"
+#include "utf8iter.h"
+#include "smallut.h"
+#include "refcntr.h"
+#include "textsplit.h"
+#include "xmacros.h"
+#include "rcldb.h"
+#include "stemdb.h"
+#include "expansiondbs.h"
+
+using namespace std;
+
+namespace Rcl {
+
+/**
+ * Create all the expansion dbs used to transform user input terms in order
+ * to widen a query. Xapian synonyms subsets are used to store the expansions.
+ *
+ * All the expansion groups are first erased and recreated, then the index
+ * term list is walked once and each suitable term is added to every group:
+ *  - one stem db per language in langs,
+ *  - unless RCL_INDEX_STRIPCHARS is defined, one stem db per language for
+ *    the unaccented terms, and one case/diacritics db.
+ *
+ * @param wdb the writable Xapian index: both the source of the terms and
+ *            the place where the synonyms data is stored.
+ * @param langs the stemming languages.
+ * @return false if walking the term list ended with an error message set
+ *         by XCATCHERROR (the message is logged), else true.
+ */
+bool createExpansionDbs(Xapian::WritableDatabase& wdb,
+                        const vector<string>& langs)
+{
+    LOGDEB(("StemDb::createExpansionDbs: languages: %s\n", 
+            stringsToString(langs).c_str()));
+    Chrono cron;
+
+    // Erase and recreate all the expansion groups
+
+    // Stem dbs, one per language. The stemmer objects are heap-allocated
+    // and owned by 'stemmers'; the family members are only handed raw
+    // pointers to them.
+    vector<XapWritableComputableSynFamMember> stemdbs;
+    // Note: tried to make this to work with stack-allocated objects, couldn't.
+    // Looks like a bug in copy constructors somewhere, can't guess where
+    vector<RefCntr<SynTermTransStem> > stemmers;
+    for (unsigned int i = 0; i < langs.size(); i++) {
+        stemmers.push_back(RefCntr<SynTermTransStem>
+                           (new SynTermTransStem(langs[i])));
+        stemdbs.push_back(
+            XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
+                                              stemmers.back().getptr()));
+        stemdbs.back().recreate();
+    }
+
+#ifndef RCL_INDEX_STRIPCHARS
+    // Unaccented stem dbs
+    vector<XapWritableComputableSynFamMember> unacstemdbs;
+    // We can reuse the same stemmer pointers, the objects are stateless.
+    // Each db must get the stemmer built for its own language, so index
+    // with i: back() would give the last language's stemmer to all of them.
+    for (unsigned int i = 0; i < langs.size(); i++) {
+        unacstemdbs.push_back(
+            XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
+                                              stemmers[i].getptr()));
+        unacstemdbs.back().recreate();
+    }
+
+    // Single case/diacritics expansion db, shared by all languages.
+    SynTermTransUnac transunac(UNACOP_UNACFOLD);
+    XapWritableComputableSynFamMember 
+        diacasedb(wdb, synFamDiac, "all", &transunac);
+    diacasedb.recreate();
+#endif
+
+    // Walk the list of all terms, and stem/unac each.
+    string ermsg;
+    try {
+        for (Xapian::TermIterator it = wdb.allterms_begin(); 
+             it != wdb.allterms_end(); ++it) {
+
+            // Copy the term once: dereferencing the iterator yields a
+            // string by value, and we need a named object which outlives
+            // the Utf8Iter built on it below.
+            const string term(*it);
+
+            // Skip terms which don't look like natural language words.
+            if (!Db::isSpellingCandidate(term)) {
+                LOGDEB1(("createExpansionDbs: skipped: [%s]\n", term.c_str()));
+                continue;
+            }
+
+            // Detect and skip CJK terms (only the first character is tested).
+            // We're still sending all other multibyte utf-8 chars to
+            // the stemmer, which is not too well defined for
+            // xapian<1.0 (very obsolete now), but seems to work
+            // anyway. There shouldn't be too many in any case because
+            // accents are stripped at this point. 
+            // The effect of stripping accents on stemming is not good, 
+            // (e.g: in french partimes -> partim, parti^mes -> part)
+            // but fixing the issue would be complicated.
+            Utf8Iter utfit(term);
+            if (TextSplit::isCJK(*utfit)) {
+                // LOGDEB(("stemskipped: Skipping CJK\n"));
+                continue;
+            }
+
+            string lower = term;
+#ifndef RCL_INDEX_STRIPCHARS
+            // If the index is raw, compute the case-folded term which
+            // is the input to the stem db, and add a synonym from the
+            // stripped term to the cased and accented one, for accent
+            // and case expansion at query time
+            unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
+            diacasedb.addSynonym(term);
+#endif
+
+            // Create stemming synonym for every language. The input is the 
+            // lowercase accented term
+            for (unsigned int i = 0; i < langs.size(); i++) {
+                stemdbs[i].addSynonym(lower);
+            }
+
+#ifndef RCL_INDEX_STRIPCHARS
+            // For a raw index, also maybe create a stem expansion for
+            // the unaccented term. While this may be incorrect, it is
+            // also necessary for searching in a diacritic-unsensitive
+            // way on a raw index
+            string unac;
+            unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
+            // Nothing to add if the term had no diacritics to begin with.
+            if (unac != lower) {
+                for (unsigned int i = 0; i < langs.size(); i++) {
+                    unacstemdbs[i].addSynonym(unac);
+                }
+            }
+#endif
+        }
+    } XCATCHERROR(ermsg);
+    if (!ermsg.empty()) {
+        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
+        return false;
+    }
+
+    LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
+    return true;
+}
+
+}    
--- a/src/rcldb/expansiondbs.h
+++ b/src/rcldb/expansiondbs.h
@ -0,0 +1,51 @@
+/* Copyright (C) 2012 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#ifndef _EXPANSIONDBS_H_INCLUDED_
+#define _EXPANSIONDBS_H_INCLUDED_
+
+#include <string>
+#include <vector>
+
+#include <xapian.h>
+
+namespace Rcl {
+
+/* A Capitals/Diacritics removal functor for using with
+   XapComputableSynFamMember. The actual transformation is delegated to
+   unacmaybefold(), with the operation chosen at construction time. */
+class SynTermTransUnac : public SynTermTrans {
+public:
+    /* @param op the operation which will be passed to unacmaybefold() for
+       every term */
+    SynTermTransUnac(UnacOp op)
+        : m_op(op)
+    {
+    }
+    /* Transform one term.
+       @param in the input term, passed to unacmaybefold() as "UTF-8".
+       @return the string produced by unacmaybefold(). The call's own
+               return value is not checked. */
+    virtual std::string operator()(const std::string& in)
+    {
+        // Fully qualified: this header has no using-directive of its own
+        // and must not depend on the including file having one.
+        std::string out;
+        unacmaybefold(in, out, "UTF-8", m_op);
+        LOGDEB2(("SynTermTransUnac(%d): in [%s] out [%s]\n", int(m_op),
+                 in.c_str(), out.c_str()));
+        return out;
+    }
+    /* Operation applied by operator() */
+    UnacOp m_op;
+};
+
+/** Walk the Xapian term list and create all the expansion dbs in one go.
+ *
+ * @param wdb the writable Xapian database: its terms are read and the
+ *            expansion data is stored into it.
+ * @param langs the languages for which stem expansion dbs are created.
+ * @return false if an error was reported while walking the term list,
+ *         else true.
+ */
+extern bool createExpansionDbs(Xapian::WritableDatabase& wdb, 
+			       const std::vector<std::string>& langs);
+}
+
+#endif /* _EXPANSIONDBS_H_INCLUDED_ */