more utf-8 err checking prevents bogus terms in index

2013-03-30 10:24:10 +01:00 · 2013-03-30 10:24:10 +01:00 · 5341b5575b
commit 5341b5575b
parent 1bdf2b67e5
6 changed files with 54 additions and 23 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -350,6 +350,9 @@ bool TextSplit::text_to_words(const string &in)
 	     m_flags & TXTS_KEEPWILD ? " keepwild" : "",
 	     in.substr(0,50).c_str()));
    if (in.empty())
 	return true;
    m_span.erase();
    m_inNumber = false;
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
--- a/src/doc/user/00README.txt
+++ b/src/doc/user/00README.txt
@ -31,3 +31,7 @@ making progress.
 The current conclusion would seem to be that the SGML version should stay
 operational to give an easy way to make the PDF one on FreeBSD.
 But see also notes about dblatex on the asciidoc page. Actually asciidoc would 
 be a candidate replacement for the source format. 
 http://www.methods.co.nz/asciidoc/userguide.html
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -540,8 +540,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	!stringlowercmp("x-user-defined", charset) || 
 	!stringlowercmp("x-unknown", charset) || 
 	!stringlowercmp("unknown", charset) ) {
-        m_config->getConfParam("maildefcharset", charset);
+        if (!m_config->getConfParam("maildefcharset", charset))
        if (charset.empty())
            charset = "CP1252";
    }
--- a/src/utils/pathut.cpp
+++ b/src/utils/pathut.cpp
@ -462,7 +462,7 @@ string url_encode(const string& url, string::size_type offs)
    string out = url.substr(0, offs);
    const char *cp = url.c_str();
    for (string::size_type i = offs; i < url.size(); i++) {
-	int c;
+	unsigned int c;
 	const char *h = "0123456789ABCDEF";
 	c = cp[i];
 	if (c <= 0x20 || 
--- a/src/utils/strmatcher.cpp
+++ b/src/utils/strmatcher.cpp
@ -28,6 +28,7 @@ using std::string;
 #include "cstr.h"
 #include "debuglog.h"
 #include "strmatcher.h"
 #include "pathut.h"
 bool StrWildMatcher::match(const string& val) const
 {
@ -38,8 +39,9 @@ bool StrWildMatcher::match(const string& val) const
    case 0: return true;
    case FNM_NOMATCH: return false;
    default:
-	LOGDEB0(("StrWildMatcher::match error: [%s] against [%s]\n", 
+	LOGINFO(("StrWildMatcher::match:err: e [%s] s [%s] (%s) ret %d\n", 
-		 m_sexp.c_str(), val.c_str()));
+		 m_sexp.c_str(), val.c_str(), 
 		 url_encode(val).c_str(), ret));
 	return false;
    }
 }
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -32,7 +32,7 @@
 class Utf8Iter {
 public:
    Utf8Iter(const std::string &in) 
-	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
+	: m_s(in), m_cl(0), m_pos(0), m_charpos(0)
    {
 	update_cl();
    }
@ -44,7 +44,6 @@ public:
 	m_cl = 0; 
 	m_pos = 0; 
 	m_charpos = 0; 
 	m_error = false;
 	update_cl();
    }
@ -62,15 +61,15 @@ public:
 	int l;
 	while (mypos < m_s.length() && mycp != charpos) {
 	    l = get_cl(mypos);
-	    if (l <= 0)
+	    if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
 		return (unsigned int)-1;
 	    mypos += l;
 	    ++mycp;
 	}
 	if (mypos < m_s.length() && mycp == charpos) {
 	    l = get_cl(mypos);
-	    if (poslok(mypos, l))
+	    if (poslok(mypos, l) && checkvalidat(mypos, l))
-		return getvalueat(mypos, get_cl(mypos));
+		return getvalueat(mypos, l);
 	}
 	return (unsigned int)-1;
    }
@ -83,7 +82,7 @@ public:
 #ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
-	if (m_cl <= 0) 
+	if (m_cl == 0)
 	    return std::string::npos;
 	m_pos += m_cl;
@ -96,9 +95,9 @@ public:
    unsigned int operator*() 
    {
 #ifdef UTF8ITER_CHECK
-	assert(m_cl != 0);
+	assert(m_cl > 0);
 #endif
-	return getvalueat(m_pos, m_cl);
+	return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
    }
    /** Append current utf-8 possibly multi-byte character to string param.
@ -116,15 +115,15 @@ public:
 #ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
-	return m_s.substr(m_pos, m_cl);
+	return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
    }
-    bool eof() {
+    bool eof() const {
 	return m_pos == m_s.length();
    }
-    bool error() {
+    bool error() const {
-	return m_error;
+	return m_cl == 0;
    }
    /** Return current byte offset in input string */
@ -152,8 +151,6 @@ private:
    std::string::size_type m_pos; 
    // Current character position
    unsigned int      m_charpos; 
    // Am I ok ?
    mutable bool      m_error;
    // Check position and cl against string length
    bool poslok(std::string::size_type p, int l) const {
@ -163,7 +160,7 @@ private:
 	return p != std::string::npos && l > 0 && p + l <= m_s.length();
    }
-    // Update current char length in object state, minimum checking
+    // Update current char length in object state, check
    // for errors
    inline void update_cl() 
    {
@ -176,7 +173,34 @@ private:
 	    // basically prevents the caller to discriminate error and eof.
 	    //	    m_pos = m_s.length();
 	    m_cl = 0;
-	    m_error = true;
+	    return;
 	}
 	if (!checkvalidat(m_pos, m_cl)) {
 	    m_cl = 0;
 	}
    }
    /** Structural check of the utf-8 sequence of length l starting at
     *  byte offset p: the lead byte must carry the bit pattern announced
     *  by a sequence of that length, and every following byte must look
     *  like a continuation byte (10xxxxxx).
     *  No bounds checking is done here: bytes p .. p+l-1 are read
     *  directly from m_s.
     *  @param p byte offset of the lead byte in m_s
     *  @param l sequence length in bytes; anything outside 1..4 is rejected
     *  @return true if the l bytes have the expected bit layout
     */
    inline bool checkvalidat(std::string::size_type p, int l) const
    {
 	// Lead byte mask and expected value, indexed by sequence length.
 	// Index 0 is unused. For length 1 this reduces to "high bit clear".
 	static const unsigned char leadmask[] = {0, 128, 224, 240, 248};
 	static const unsigned char leadbits[] = {0,   0, 192, 224, 240};

 	if (l < 1 || l > 4)
 	    return false;

 	if ((((unsigned char)m_s[p]) & leadmask[l]) != leadbits[l])
 	    return false;

 	// Trailing bytes, checked in order, stopping at the first bad one
 	for (int i = 1; i < l; i++) {
 	    if ((((unsigned char)m_s[p + i]) & 192) != 128)
 		return false;
 	}
 	return true;
    }
@ -249,7 +273,6 @@ private:
 #ifdef UTF8ITER_CHECK
 	    assert(l <= 4);
 #endif
 	    m_error = true;
 	    return (unsigned int)-1;
 	}
    }