More UTF-8 error checking, to prevent bogus terms from entering the index

This commit is contained in:
Jean-Francois Dockes 2013-03-30 10:24:10 +01:00
parent 1bdf2b67e5
commit 5341b5575b
6 changed files with 54 additions and 23 deletions

View file

@ -350,6 +350,9 @@ bool TextSplit::text_to_words(const string &in)
m_flags & TXTS_KEEPWILD ? " keepwild" : "", m_flags & TXTS_KEEPWILD ? " keepwild" : "",
in.substr(0,50).c_str())); in.substr(0,50).c_str()));
if (in.empty())
return true;
m_span.erase(); m_span.erase();
m_inNumber = false; m_inNumber = false;
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0; m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;

View file

@ -31,3 +31,7 @@ making progress.
The current conclusion would seem to be that the SGML version should stay The current conclusion would seem to be that the SGML version should stay
operational to give an easy way to make the PDF one on FreeBSD. operational to give an easy way to make the PDF one on FreeBSD.
But see also the notes about dblatex on the AsciiDoc page. AsciiDoc would
in fact be a candidate replacement for the source format.
http://www.methods.co.nz/asciidoc/userguide.html

View file

@ -540,8 +540,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
!stringlowercmp("x-user-defined", charset) || !stringlowercmp("x-user-defined", charset) ||
!stringlowercmp("x-unknown", charset) || !stringlowercmp("x-unknown", charset) ||
!stringlowercmp("unknown", charset) ) { !stringlowercmp("unknown", charset) ) {
m_config->getConfParam("maildefcharset", charset); if (!m_config->getConfParam("maildefcharset", charset))
if (charset.empty())
charset = "CP1252"; charset = "CP1252";
} }

View file

@ -462,7 +462,7 @@ string url_encode(const string& url, string::size_type offs)
string out = url.substr(0, offs); string out = url.substr(0, offs);
const char *cp = url.c_str(); const char *cp = url.c_str();
for (string::size_type i = offs; i < url.size(); i++) { for (string::size_type i = offs; i < url.size(); i++) {
int c; unsigned int c;
const char *h = "0123456789ABCDEF"; const char *h = "0123456789ABCDEF";
c = cp[i]; c = cp[i];
if (c <= 0x20 || if (c <= 0x20 ||

View file

@ -28,6 +28,7 @@ using std::string;
#include "cstr.h" #include "cstr.h"
#include "debuglog.h" #include "debuglog.h"
#include "strmatcher.h" #include "strmatcher.h"
#include "pathut.h"
bool StrWildMatcher::match(const string& val) const bool StrWildMatcher::match(const string& val) const
{ {
@ -38,8 +39,9 @@ bool StrWildMatcher::match(const string& val) const
case 0: return true; case 0: return true;
case FNM_NOMATCH: return false; case FNM_NOMATCH: return false;
default: default:
LOGDEB0(("StrWildMatcher::match error: [%s] against [%s]\n", LOGINFO(("StrWildMatcher::match:err: e [%s] s [%s] (%s) ret %d\n",
m_sexp.c_str(), val.c_str())); m_sexp.c_str(), val.c_str(),
url_encode(val).c_str(), ret));
return false; return false;
} }
} }

View file

@ -32,7 +32,7 @@
class Utf8Iter { class Utf8Iter {
public: public:
Utf8Iter(const std::string &in) Utf8Iter(const std::string &in)
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false) : m_s(in), m_cl(0), m_pos(0), m_charpos(0)
{ {
update_cl(); update_cl();
} }
@ -44,7 +44,6 @@ public:
m_cl = 0; m_cl = 0;
m_pos = 0; m_pos = 0;
m_charpos = 0; m_charpos = 0;
m_error = false;
update_cl(); update_cl();
} }
@ -62,15 +61,15 @@ public:
int l; int l;
while (mypos < m_s.length() && mycp != charpos) { while (mypos < m_s.length() && mycp != charpos) {
l = get_cl(mypos); l = get_cl(mypos);
if (l <= 0) if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
return (unsigned int)-1; return (unsigned int)-1;
mypos += l; mypos += l;
++mycp; ++mycp;
} }
if (mypos < m_s.length() && mycp == charpos) { if (mypos < m_s.length() && mycp == charpos) {
l = get_cl(mypos); l = get_cl(mypos);
if (poslok(mypos, l)) if (poslok(mypos, l) && checkvalidat(mypos, l))
return getvalueat(mypos, get_cl(mypos)); return getvalueat(mypos, l);
} }
return (unsigned int)-1; return (unsigned int)-1;
} }
@ -83,7 +82,7 @@ public:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(m_cl != 0); assert(m_cl != 0);
#endif #endif
if (m_cl <= 0) if (m_cl == 0)
return std::string::npos; return std::string::npos;
m_pos += m_cl; m_pos += m_cl;
@ -96,9 +95,9 @@ public:
unsigned int operator*() unsigned int operator*()
{ {
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(m_cl != 0); assert(m_cl > 0);
#endif #endif
return getvalueat(m_pos, m_cl); return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
} }
/** Append current utf-8 possibly multi-byte character to string param. /** Append current utf-8 possibly multi-byte character to string param.
@ -116,15 +115,15 @@ public:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(m_cl != 0); assert(m_cl != 0);
#endif #endif
return m_s.substr(m_pos, m_cl); return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
} }
bool eof() { bool eof() const {
return m_pos == m_s.length(); return m_pos == m_s.length();
} }
bool error() { bool error() const {
return m_error; return m_cl == 0;
} }
/** Return current byte offset in input string */ /** Return current byte offset in input string */
@ -152,8 +151,6 @@ private:
std::string::size_type m_pos; std::string::size_type m_pos;
// Current character position // Current character position
unsigned int m_charpos; unsigned int m_charpos;
// Am I ok ?
mutable bool m_error;
// Check position and cl against string length // Check position and cl against string length
bool poslok(std::string::size_type p, int l) const { bool poslok(std::string::size_type p, int l) const {
@ -163,7 +160,7 @@ private:
return p != std::string::npos && l > 0 && p + l <= m_s.length(); return p != std::string::npos && l > 0 && p + l <= m_s.length();
} }
// Update current char length in object state, minimum checking // Update current char length in object state, check
// for errors // for errors
inline void update_cl() inline void update_cl()
{ {
@ -176,7 +173,34 @@ private:
// basically prevents the caller to discriminate error and eof. // basically prevents the caller to discriminate error and eof.
// m_pos = m_s.length(); // m_pos = m_s.length();
m_cl = 0; m_cl = 0;
m_error = true; return;
}
if (!checkvalidat(m_pos, m_cl)) {
m_cl = 0;
}
}
// Tell whether the l bytes of m_s starting at offset p have the bit
// layout of a single UTF-8 sequence: a lead byte matching the length
// (0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx) followed by l-1
// continuation bytes (10xxxxxx). Any length outside 1..4 is rejected.
// Only bit patterns are examined: the decoded value is not range-checked,
// and no bounds test on p + l is performed here.
inline bool checkvalidat(std::string::size_type p, int l) const
{
    if (l < 1 || l > 4)
        return false;

    // Lead byte mask and expected bits, indexed by (sequence length - 1).
    static const unsigned char leadmask[] = {0x80, 0xE0, 0xF0, 0xF8};
    static const unsigned char leadbits[] = {0x00, 0xC0, 0xE0, 0xF0};

    const unsigned char first = (unsigned char)m_s[p];
    if ((first & leadmask[l - 1]) != leadbits[l - 1])
        return false;

    // Trailing bytes are tested in order, stopping at the first bad one.
    for (int k = 1; k < l; ++k) {
        const unsigned char trail = (unsigned char)m_s[p + k];
        if ((trail & 0xC0) != 0x80)
            return false;
    }
    return true;
}
@ -249,7 +273,6 @@ private:
#ifdef UTF8ITER_CHECK #ifdef UTF8ITER_CHECK
assert(l <= 4); assert(l <= 4);
#endif #endif
m_error = true;
return (unsigned int)-1; return (unsigned int)-1;
} }
} }