More UTF-8 error checking prevents bogus terms in the index
This commit is contained in:
parent
1bdf2b67e5
commit
5341b5575b
6 changed files with 54 additions and 23 deletions
|
@ -350,6 +350,9 @@ bool TextSplit::text_to_words(const string &in)
|
||||||
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
||||||
in.substr(0,50).c_str()));
|
in.substr(0,50).c_str()));
|
||||||
|
|
||||||
|
if (in.empty())
|
||||||
|
return true;
|
||||||
|
|
||||||
m_span.erase();
|
m_span.erase();
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
||||||
|
|
|
@ -31,3 +31,7 @@ making progress.
|
||||||
|
|
||||||
The current conclusion would seem to be that the SGML version should stay
|
The current conclusion would seem to be that the SGML version should stay
|
||||||
operational to give an easy way to make the PDF one on FreeBSD.
|
operational to give an easy way to make the PDF one on FreeBSD.
|
||||||
|
|
||||||
|
But see also the notes about dblatex on the asciidoc page. Actually, asciidoc would
|
||||||
|
be a candidate replacement for the source format.
|
||||||
|
http://www.methods.co.nz/asciidoc/userguide.html
|
||||||
|
|
|
@ -540,8 +540,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||||
!stringlowercmp("x-user-defined", charset) ||
|
!stringlowercmp("x-user-defined", charset) ||
|
||||||
!stringlowercmp("x-unknown", charset) ||
|
!stringlowercmp("x-unknown", charset) ||
|
||||||
!stringlowercmp("unknown", charset) ) {
|
!stringlowercmp("unknown", charset) ) {
|
||||||
m_config->getConfParam("maildefcharset", charset);
|
if (!m_config->getConfParam("maildefcharset", charset))
|
||||||
if (charset.empty())
|
|
||||||
charset = "CP1252";
|
charset = "CP1252";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -462,7 +462,7 @@ string url_encode(const string& url, string::size_type offs)
|
||||||
string out = url.substr(0, offs);
|
string out = url.substr(0, offs);
|
||||||
const char *cp = url.c_str();
|
const char *cp = url.c_str();
|
||||||
for (string::size_type i = offs; i < url.size(); i++) {
|
for (string::size_type i = offs; i < url.size(); i++) {
|
||||||
int c;
|
unsigned int c;
|
||||||
const char *h = "0123456789ABCDEF";
|
const char *h = "0123456789ABCDEF";
|
||||||
c = cp[i];
|
c = cp[i];
|
||||||
if (c <= 0x20 ||
|
if (c <= 0x20 ||
|
||||||
|
|
|
@ -28,6 +28,7 @@ using std::string;
|
||||||
#include "cstr.h"
|
#include "cstr.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "strmatcher.h"
|
#include "strmatcher.h"
|
||||||
|
#include "pathut.h"
|
||||||
|
|
||||||
bool StrWildMatcher::match(const string& val) const
|
bool StrWildMatcher::match(const string& val) const
|
||||||
{
|
{
|
||||||
|
@ -38,8 +39,9 @@ bool StrWildMatcher::match(const string& val) const
|
||||||
case 0: return true;
|
case 0: return true;
|
||||||
case FNM_NOMATCH: return false;
|
case FNM_NOMATCH: return false;
|
||||||
default:
|
default:
|
||||||
LOGDEB0(("StrWildMatcher::match error: [%s] against [%s]\n",
|
LOGINFO(("StrWildMatcher::match:err: e [%s] s [%s] (%s) ret %d\n",
|
||||||
m_sexp.c_str(), val.c_str()));
|
m_sexp.c_str(), val.c_str(),
|
||||||
|
url_encode(val).c_str(), ret));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,7 +32,7 @@
|
||||||
class Utf8Iter {
|
class Utf8Iter {
|
||||||
public:
|
public:
|
||||||
Utf8Iter(const std::string &in)
|
Utf8Iter(const std::string &in)
|
||||||
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
|
: m_s(in), m_cl(0), m_pos(0), m_charpos(0)
|
||||||
{
|
{
|
||||||
update_cl();
|
update_cl();
|
||||||
}
|
}
|
||||||
|
@ -44,7 +44,6 @@ public:
|
||||||
m_cl = 0;
|
m_cl = 0;
|
||||||
m_pos = 0;
|
m_pos = 0;
|
||||||
m_charpos = 0;
|
m_charpos = 0;
|
||||||
m_error = false;
|
|
||||||
update_cl();
|
update_cl();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,15 +61,15 @@ public:
|
||||||
int l;
|
int l;
|
||||||
while (mypos < m_s.length() && mycp != charpos) {
|
while (mypos < m_s.length() && mycp != charpos) {
|
||||||
l = get_cl(mypos);
|
l = get_cl(mypos);
|
||||||
if (l <= 0)
|
if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
|
||||||
return (unsigned int)-1;
|
return (unsigned int)-1;
|
||||||
mypos += l;
|
mypos += l;
|
||||||
++mycp;
|
++mycp;
|
||||||
}
|
}
|
||||||
if (mypos < m_s.length() && mycp == charpos) {
|
if (mypos < m_s.length() && mycp == charpos) {
|
||||||
l = get_cl(mypos);
|
l = get_cl(mypos);
|
||||||
if (poslok(mypos, l))
|
if (poslok(mypos, l) && checkvalidat(mypos, l))
|
||||||
return getvalueat(mypos, get_cl(mypos));
|
return getvalueat(mypos, l);
|
||||||
}
|
}
|
||||||
return (unsigned int)-1;
|
return (unsigned int)-1;
|
||||||
}
|
}
|
||||||
|
@ -83,7 +82,7 @@ public:
|
||||||
#ifdef UTF8ITER_CHECK
|
#ifdef UTF8ITER_CHECK
|
||||||
assert(m_cl != 0);
|
assert(m_cl != 0);
|
||||||
#endif
|
#endif
|
||||||
if (m_cl <= 0)
|
if (m_cl == 0)
|
||||||
return std::string::npos;
|
return std::string::npos;
|
||||||
|
|
||||||
m_pos += m_cl;
|
m_pos += m_cl;
|
||||||
|
@ -96,9 +95,9 @@ public:
|
||||||
unsigned int operator*()
|
unsigned int operator*()
|
||||||
{
|
{
|
||||||
#ifdef UTF8ITER_CHECK
|
#ifdef UTF8ITER_CHECK
|
||||||
assert(m_cl != 0);
|
assert(m_cl > 0);
|
||||||
#endif
|
#endif
|
||||||
return getvalueat(m_pos, m_cl);
|
return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Append current utf-8 possibly multi-byte character to string param.
|
/** Append current utf-8 possibly multi-byte character to string param.
|
||||||
|
@ -116,15 +115,15 @@ public:
|
||||||
#ifdef UTF8ITER_CHECK
|
#ifdef UTF8ITER_CHECK
|
||||||
assert(m_cl != 0);
|
assert(m_cl != 0);
|
||||||
#endif
|
#endif
|
||||||
return m_s.substr(m_pos, m_cl);
|
return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool eof() {
|
bool eof() const {
|
||||||
return m_pos == m_s.length();
|
return m_pos == m_s.length();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool error() {
|
bool error() const {
|
||||||
return m_error;
|
return m_cl == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return current byte offset in input string */
|
/** Return current byte offset in input string */
|
||||||
|
@ -152,8 +151,6 @@ private:
|
||||||
std::string::size_type m_pos;
|
std::string::size_type m_pos;
|
||||||
// Current character position
|
// Current character position
|
||||||
unsigned int m_charpos;
|
unsigned int m_charpos;
|
||||||
// Am I ok ?
|
|
||||||
mutable bool m_error;
|
|
||||||
|
|
||||||
// Check position and cl against string length
|
// Check position and cl against string length
|
||||||
bool poslok(std::string::size_type p, int l) const {
|
bool poslok(std::string::size_type p, int l) const {
|
||||||
|
@ -163,7 +160,7 @@ private:
|
||||||
return p != std::string::npos && l > 0 && p + l <= m_s.length();
|
return p != std::string::npos && l > 0 && p + l <= m_s.length();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update current char length in object state, minimum checking
|
// Update current char length in object state, check
|
||||||
// for errors
|
// for errors
|
||||||
inline void update_cl()
|
inline void update_cl()
|
||||||
{
|
{
|
||||||
|
@ -176,7 +173,34 @@ private:
|
||||||
// basically prevents the caller from distinguishing error from eof.
|
// basically prevents the caller from distinguishing error from eof.
|
||||||
// m_pos = m_s.length();
|
// m_pos = m_s.length();
|
||||||
m_cl = 0;
|
m_cl = 0;
|
||||||
m_error = true;
|
return;
|
||||||
|
}
|
||||||
|
if (!checkvalidat(m_pos, m_cl)) {
|
||||||
|
m_cl = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool checkvalidat(std::string::size_type p, int l) const
|
||||||
|
{
|
||||||
|
switch (l) {
|
||||||
|
case 1:
|
||||||
|
return (unsigned char)m_s[p] < 128;
|
||||||
|
case 2:
|
||||||
|
return (((unsigned char)m_s[p]) & 224) == 192
|
||||||
|
&& (((unsigned char)m_s[p+1]) & 192) == 128;
|
||||||
|
case 3:
|
||||||
|
return (((unsigned char)m_s[p]) & 240) == 224
|
||||||
|
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
||||||
|
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
||||||
|
;
|
||||||
|
case 4:
|
||||||
|
return (((unsigned char)m_s[p]) & 248) == 240
|
||||||
|
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
||||||
|
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
||||||
|
&& (((unsigned char)m_s[p+3]) & 192) == 128
|
||||||
|
;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -249,7 +273,6 @@ private:
|
||||||
#ifdef UTF8ITER_CHECK
|
#ifdef UTF8ITER_CHECK
|
||||||
assert(l <= 4);
|
assert(l <= 4);
|
||||||
#endif
|
#endif
|
||||||
m_error = true;
|
|
||||||
return (unsigned int)-1;
|
return (unsigned int)-1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue