More UTF-8 error checking, to prevent bogus terms from entering the index
This commit is contained in:
parent
1bdf2b67e5
commit
5341b5575b
6 changed files with 54 additions and 23 deletions
|
@ -350,6 +350,9 @@ bool TextSplit::text_to_words(const string &in)
|
|||
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
||||
in.substr(0,50).c_str()));
|
||||
|
||||
if (in.empty())
|
||||
return true;
|
||||
|
||||
m_span.erase();
|
||||
m_inNumber = false;
|
||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
||||
|
|
|
@ -31,3 +31,7 @@ making progress.
|
|||
|
||||
The current conclusion would seem to be that the SGML version should stay
|
||||
operational to give an easy way to build the PDF version on FreeBSD.
|
||||
|
||||
But see also notes about dblatex on the asciidoc page. Actually asciidoc would
|
||||
be a candidate replacement for the source format.
|
||||
http://www.methods.co.nz/asciidoc/userguide.html
|
||||
|
|
|
@ -540,8 +540,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
|||
!stringlowercmp("x-user-defined", charset) ||
|
||||
!stringlowercmp("x-unknown", charset) ||
|
||||
!stringlowercmp("unknown", charset) ) {
|
||||
m_config->getConfParam("maildefcharset", charset);
|
||||
if (charset.empty())
|
||||
if (!m_config->getConfParam("maildefcharset", charset))
|
||||
charset = "CP1252";
|
||||
}
|
||||
|
||||
|
|
|
@ -462,7 +462,7 @@ string url_encode(const string& url, string::size_type offs)
|
|||
string out = url.substr(0, offs);
|
||||
const char *cp = url.c_str();
|
||||
for (string::size_type i = offs; i < url.size(); i++) {
|
||||
int c;
|
||||
unsigned int c;
|
||||
const char *h = "0123456789ABCDEF";
|
||||
c = cp[i];
|
||||
if (c <= 0x20 ||
|
||||
|
|
|
@ -28,6 +28,7 @@ using std::string;
|
|||
#include "cstr.h"
|
||||
#include "debuglog.h"
|
||||
#include "strmatcher.h"
|
||||
#include "pathut.h"
|
||||
|
||||
bool StrWildMatcher::match(const string& val) const
|
||||
{
|
||||
|
@ -38,8 +39,9 @@ bool StrWildMatcher::match(const string& val) const
|
|||
case 0: return true;
|
||||
case FNM_NOMATCH: return false;
|
||||
default:
|
||||
LOGDEB0(("StrWildMatcher::match error: [%s] against [%s]\n",
|
||||
m_sexp.c_str(), val.c_str()));
|
||||
LOGINFO(("StrWildMatcher::match:err: e [%s] s [%s] (%s) ret %d\n",
|
||||
m_sexp.c_str(), val.c_str(),
|
||||
url_encode(val).c_str(), ret));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
class Utf8Iter {
|
||||
public:
|
||||
Utf8Iter(const std::string &in)
|
||||
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
|
||||
: m_s(in), m_cl(0), m_pos(0), m_charpos(0)
|
||||
{
|
||||
update_cl();
|
||||
}
|
||||
|
@ -44,7 +44,6 @@ public:
|
|||
m_cl = 0;
|
||||
m_pos = 0;
|
||||
m_charpos = 0;
|
||||
m_error = false;
|
||||
update_cl();
|
||||
}
|
||||
|
||||
|
@ -62,15 +61,15 @@ public:
|
|||
int l;
|
||||
while (mypos < m_s.length() && mycp != charpos) {
|
||||
l = get_cl(mypos);
|
||||
if (l <= 0)
|
||||
if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
|
||||
return (unsigned int)-1;
|
||||
mypos += l;
|
||||
++mycp;
|
||||
}
|
||||
if (mypos < m_s.length() && mycp == charpos) {
|
||||
l = get_cl(mypos);
|
||||
if (poslok(mypos, l))
|
||||
return getvalueat(mypos, get_cl(mypos));
|
||||
if (poslok(mypos, l) && checkvalidat(mypos, l))
|
||||
return getvalueat(mypos, l);
|
||||
}
|
||||
return (unsigned int)-1;
|
||||
}
|
||||
|
@ -83,7 +82,7 @@ public:
|
|||
#ifdef UTF8ITER_CHECK
|
||||
assert(m_cl != 0);
|
||||
#endif
|
||||
if (m_cl <= 0)
|
||||
if (m_cl == 0)
|
||||
return std::string::npos;
|
||||
|
||||
m_pos += m_cl;
|
||||
|
@ -96,9 +95,9 @@ public:
|
|||
unsigned int operator*()
|
||||
{
|
||||
#ifdef UTF8ITER_CHECK
|
||||
assert(m_cl != 0);
|
||||
assert(m_cl > 0);
|
||||
#endif
|
||||
return getvalueat(m_pos, m_cl);
|
||||
return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
|
||||
}
|
||||
|
||||
/** Append current utf-8 possibly multi-byte character to string param.
|
||||
|
@ -116,15 +115,15 @@ public:
|
|||
#ifdef UTF8ITER_CHECK
|
||||
assert(m_cl != 0);
|
||||
#endif
|
||||
return m_s.substr(m_pos, m_cl);
|
||||
return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
|
||||
}
|
||||
|
||||
bool eof() {
|
||||
bool eof() const {
|
||||
return m_pos == m_s.length();
|
||||
}
|
||||
|
||||
bool error() {
|
||||
return m_error;
|
||||
bool error() const {
|
||||
return m_cl == 0;
|
||||
}
|
||||
|
||||
/** Return current byte offset in input string */
|
||||
|
@ -152,8 +151,6 @@ private:
|
|||
std::string::size_type m_pos;
|
||||
// Current character position
|
||||
unsigned int m_charpos;
|
||||
// Am I ok ?
|
||||
mutable bool m_error;
|
||||
|
||||
// Check position and cl against string length
|
||||
bool poslok(std::string::size_type p, int l) const {
|
||||
|
@ -163,7 +160,7 @@ private:
|
|||
return p != std::string::npos && l > 0 && p + l <= m_s.length();
|
||||
}
|
||||
|
||||
// Update current char length in object state, minimum checking
|
||||
// Update current char length in object state, check
|
||||
// for errors
|
||||
inline void update_cl()
|
||||
{
|
||||
|
@ -176,7 +173,34 @@ private:
|
|||
// basically prevents the caller to discriminate error and eof.
|
||||
// m_pos = m_s.length();
|
||||
m_cl = 0;
|
||||
m_error = true;
|
||||
return;
|
||||
}
|
||||
if (!checkvalidat(m_pos, m_cl)) {
|
||||
m_cl = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Check that the l bytes of m_s starting at byte offset p have the bit
// layout of a UTF-8 sequence of length l.
//
// @param p byte offset of the first (lead) byte in m_s
// @param l expected sequence length in bytes (1 to 4)
// @return true if the lead byte and every following byte match the
//         patterns tested below, false otherwise (including any l
//         outside 1..4)
//
// Only the high-order marker bits are tested: overlong encodings,
// surrogate code points and 4-byte lead bytes above 0xF4 are not rejected.
// This function does not itself verify that p + l stays within m_s.
// NOTE(review): callers appear to be expected to check bounds first
// (e.g. with poslok()) - confirm for every call site.
inline bool checkvalidat(std::string::size_type p, int l) const
|
||||
{
|
||||
switch (l) {
|
||||
// Single byte: must be plain 7-bit ASCII (high bit clear).
case 1:
|
||||
return (unsigned char)m_s[p] < 128;
|
||||
// Two bytes: lead is 110xxxxx (mask 0xE0 == 0xC0), then one 10xxxxxx
// continuation byte (mask 0xC0 == 0x80).
case 2:
|
||||
return (((unsigned char)m_s[p]) & 224) == 192
|
||||
&& (((unsigned char)m_s[p+1]) & 192) == 128;
|
||||
// Three bytes: lead is 1110xxxx (mask 0xF0 == 0xE0), then two
// continuation bytes.
case 3:
|
||||
return (((unsigned char)m_s[p]) & 240) == 224
|
||||
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
||||
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
||||
;
|
||||
// Four bytes: lead is 11110xxx (mask 0xF8 == 0xF0), then three
// continuation bytes.
case 4:
|
||||
return (((unsigned char)m_s[p]) & 248) == 240
|
||||
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
||||
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
||||
&& (((unsigned char)m_s[p+3]) & 192) == 128
|
||||
;
|
||||
// Any other length (0, negative, or more than 4) is never valid.
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -249,7 +273,6 @@ private:
|
|||
#ifdef UTF8ITER_CHECK
|
||||
assert(l <= 4);
|
||||
#endif
|
||||
m_error = true;
|
||||
return (unsigned int)-1;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue