Html: Just ignore opening and closing <body> and <html> tags. Current browsers show text before or after the body and ignore multiple body tags. Not pushed to 1.17 maint because of possible disruption. Closes issue #92
This commit is contained in:
parent
80075b9b26
commit
289e88c1bc
5 changed files with 64 additions and 30 deletions
|
@ -176,7 +176,6 @@ static NamedEntsInitializer namedEntsInitializerInstance;
|
||||||
MyHtmlParser::MyHtmlParser()
|
MyHtmlParser::MyHtmlParser()
|
||||||
: in_script_tag(false),
|
: in_script_tag(false),
|
||||||
in_style_tag(false),
|
in_style_tag(false),
|
||||||
in_body_tag(false),
|
|
||||||
in_pre_tag(false),
|
in_pre_tag(false),
|
||||||
pending_space(false),
|
pending_space(false),
|
||||||
indexing_allowed(true)
|
indexing_allowed(true)
|
||||||
|
@ -308,11 +307,10 @@ MyHtmlParser::opening_tag(const string &tag)
|
||||||
if (tag == "address") pending_space = true;
|
if (tag == "address") pending_space = true;
|
||||||
break;
|
break;
|
||||||
case 'b':
|
case 'b':
|
||||||
if (tag == "body") {
|
// body: some bad docs have several opening body tags and
|
||||||
dump.resize(0);
|
// even text before the body is displayed by Opera and
|
||||||
in_body_tag = true;
|
// Firefox. We used to reset the dump each time we saw a
|
||||||
break;
|
// body tag, but I can't see any reason to do so.
|
||||||
}
|
|
||||||
if (tag == "blockquote" || tag == "br") {
|
if (tag == "blockquote" || tag == "br") {
|
||||||
dump += '\n';
|
dump += '\n';
|
||||||
pending_space = true;
|
pending_space = true;
|
||||||
|
@ -475,8 +473,11 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||||
case 'b':
|
case 'b':
|
||||||
if (tag == "body") {
|
if (tag == "body") {
|
||||||
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
||||||
in_body_tag = false;
|
// We used to signal an end of doc here by returning
|
||||||
return false;
|
// false but the browsers just ignore body and html
|
||||||
|
// closing tags if there is further text, so it seems right
|
||||||
|
// to do the same
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||||
break;
|
break;
|
||||||
|
@ -562,6 +563,4 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||||
void
|
void
|
||||||
MyHtmlParser::do_eof()
|
MyHtmlParser::do_eof()
|
||||||
{
|
{
|
||||||
// if (!in_body_tag)
|
|
||||||
// throw(false);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,7 +36,6 @@ class MyHtmlParser : public HtmlParser {
|
||||||
public:
|
public:
|
||||||
bool in_script_tag;
|
bool in_script_tag;
|
||||||
bool in_style_tag;
|
bool in_style_tag;
|
||||||
bool in_body_tag;
|
|
||||||
bool in_pre_tag;
|
bool in_pre_tag;
|
||||||
bool pending_space;
|
bool pending_space;
|
||||||
map<string,string> meta;
|
map<string,string> meta;
|
||||||
|
|
|
@ -24,6 +24,11 @@ recollq -q html5charsetaccentue
|
||||||
# Stripping trade mark and copyright signs
|
# Stripping trade mark and copyright signs
|
||||||
recollq -q filename:signs.html Registered Trademark Copyright SoundCopyright
|
recollq -q filename:signs.html Registered Trademark Copyright SoundCopyright
|
||||||
|
|
||||||
|
# Text in malformed html (before or after body) should be indexed anyway...
|
||||||
|
recollq -q BADHTMLTEXTBEFOREBODY
|
||||||
|
recollq -q BADHTMLTEXTINSECONDBODY
|
||||||
|
recollq -q BADHTMLTEXTAFTERBODY
|
||||||
|
|
||||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|
|
@ -22,3 +22,9 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/iso.html] [Some
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/html5charset.html] [html5charset.html] 113 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/html5charset.html] [html5charset.html] 113 bytes
|
||||||
1 results
|
1 results
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/signs.html] [signs.html] 230 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/signs.html] [signs.html] 230 bytes
|
||||||
|
1 results
|
||||||
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
|
||||||
|
1 results
|
||||||
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
|
||||||
|
1 results
|
||||||
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
|
||||||
|
|
|
@ -35,12 +35,55 @@
|
||||||
supposedly fixed in later versions. Bugs listed in the
|
supposedly fixed in later versions. Bugs listed in the
|
||||||
topmost section may also exist in older versions.</i></p>
|
topmost section may also exist in older versions.</i></p>
|
||||||
|
|
||||||
<h2><a name="b_latest">recoll 1.17.1</a></h2>
|
<h2><a name="b_latest">recoll 1.17.2</a></h2>
|
||||||
|
|
||||||
|
<li>It appears that recollindex will sometimes crash while
|
||||||
|
indexing mail files. There are 2 separate reports about
|
||||||
|
this, and no resolution for now. This is not specific to
|
||||||
|
1.17 as one of the reports is for 1.16. Refs:
|
||||||
|
<a href="https://bugzilla.redhat.com/show_bug.cgi?format=multiple&id=819408">Fedora</a>
|
||||||
|
(maildir, processing an attachment),
|
||||||
|
<a href="https://bugs.launchpad.net/ubuntu/+source/recoll/+bug/994228">Ubuntu</a>: apparently (no stack trace):
|
||||||
|
<em>Recoll was indexing files in .thunderbird when the crash
|
||||||
|
occurred. It seemed to be indexing the INBOX file on
|
||||||
|
disk. </em>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li>Text inside malformed HTML files (appearing before a <body>
|
||||||
|
tag, or after a second one, or after a </body> tag) is
|
||||||
|
not indexed. As it would be displayed by current browsers,
|
||||||
|
this is wrong.</li>
|
||||||
|
|
||||||
|
<li>It will sometimes happen that the result list paragraph
|
||||||
|
format stored in the Qt preferences file will get garbled,
|
||||||
|
causing result lists with no displayed paragraphs (the
|
||||||
|
counts and pages are ok, the results can be seen in table
|
||||||
|
mode, but not in list mode). The workaround is to go to
|
||||||
|
<blockquote>
|
||||||
|
Preferences->Query configuration->User interface
|
||||||
|
</blockquote> and erase the result paragraph format string
|
||||||
|
(^A DEL in the text area), this will reset the string to the
|
||||||
|
default value.</li>
|
||||||
|
|
||||||
|
<li>Real time indexer: when running with gamin on FreeBSD, the
|
||||||
|
indexer can deadlock in the gamin dialog in some
|
||||||
|
cases.</li>
|
||||||
|
|
||||||
|
<li>After an upgrade, the recoll GUI sometimes crashes on
|
||||||
|
startup. This is fixed by removing (back it up just in case)
|
||||||
|
~/.config/Recoll.org/recoll.conf, the QSettings storage for
|
||||||
|
recoll.</li>
|
||||||
|
|
||||||
|
<h2><a name="b_1_17_1">recoll 1.17.1</a></h2>
|
||||||
|
|
||||||
<ul>
|
<ul>
|
||||||
<li>The version string is not correctly updated for 1.17.1, the
|
<li>The version string is not correctly updated for 1.17.1, the
|
||||||
help dialog and recollindex -v will print 1.17.0.</li>
|
help dialog and recollindex -v will print 1.17.0.</li>
|
||||||
|
|
||||||
|
<li>You can crash the GUI by starting simultaneous queries,
|
||||||
|
which could be accomplished among others by quickly clicking
|
||||||
|
the sort order buttons.</li>
|
||||||
|
|
||||||
<li>chm filter: url-encoded internal paths are mishandled.</li>
|
<li>chm filter: url-encoded internal paths are mishandled.</li>
|
||||||
|
|
||||||
<li>Does not compile on Solaris (flock() issue).</li>
|
<li>Does not compile on Solaris (flock() issue).</li>
|
||||||
|
@ -57,24 +100,6 @@
|
||||||
|
|
||||||
<li>Html output from Python (rclexecm) filters is not
|
<li>Html output from Python (rclexecm) filters is not
|
||||||
correctly escaped.</li>
|
correctly escaped.</li>
|
||||||
|
|
||||||
<li>It will sometimes happen that the result list paragraph format
|
|
||||||
stored in the Qt preferences file will get garbled,
|
|
||||||
causing result lists with no displayed paragraphs (the
|
|
||||||
counts and pages are ok, the results can be seen in table
|
|
||||||
mode, but not in list mode). The workaround is to go to
|
|
||||||
<blockquote>
|
|
||||||
Preferences->Query configuration->User interface
|
|
||||||
</blockquote> and erase the result paragraph format string
|
|
||||||
(^A DEL in the text area), this will reset the string to the
|
|
||||||
default value.</li>
|
|
||||||
<li>Real time indexer: when running with gamin on FreeBSD, the
|
|
||||||
indexer can deadlock in the gamin dialog in some
|
|
||||||
cases.</li>
|
|
||||||
<li>After an upgrade, the recoll GUI sometimes crashes on
|
|
||||||
startup. This is fixed by removing (back it up just in case)
|
|
||||||
~/.config/Recoll.org/recoll.conf, the QSettings storage for
|
|
||||||
recoll.</li>
|
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue