From 289e88c1bc0f912ae624d9fdd5fc974ea39ebd8f Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Wed, 16 May 2012 10:07:09 +0200
Subject: [PATCH] Html: Just ignore opening and closing <body> and </body> tags.
Current browsers show text before or after the body and ignore multiple body
tags. Not pushed to 1.17 maint because of possible disruption. Closes issue
#92
---
src/internfile/myhtmlparse.cpp | 19 +++++-----
src/internfile/myhtmlparse.h | 1 -
tests/html/html.sh | 5 +++
tests/html/html.txt | 6 ++++
website/BUGS.html | 63 ++++++++++++++++++++++++----------
5 files changed, 64 insertions(+), 30 deletions(-)
diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
index a85a7531..8c963029 100644
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -176,7 +176,6 @@ static NamedEntsInitializer namedEntsInitializerInstance;
MyHtmlParser::MyHtmlParser()
: in_script_tag(false),
in_style_tag(false),
- in_body_tag(false),
in_pre_tag(false),
pending_space(false),
indexing_allowed(true)
@@ -308,11 +307,10 @@ MyHtmlParser::opening_tag(const string &tag)
if (tag == "address") pending_space = true;
break;
case 'b':
- if (tag == "body") {
- dump.resize(0);
- in_body_tag = true;
- break;
- }
+ // body: some bad docs have several opening body tags and
+ // even text before the body is displayed by Opera and
+ // Firefox. We used to reset the dump each time we saw a
+ // body tag, but I can't see any reason to do so.
if (tag == "blockquote" || tag == "br") {
dump += '\n';
pending_space = true;
@@ -475,8 +473,11 @@ MyHtmlParser::closing_tag(const string &tag)
case 'b':
if (tag == "body") {
LOGDEB1(("Myhtmlparse: body close tag found\n"));
- in_body_tag = false;
- return false;
+ // We used to signal an end of doc here by returning
+ // false but the browsers just ignore body and html
+ // closing tags if there is further text, so it seems right
+ // to do the same
+ break;
}
if (tag == "blockquote" || tag == "br") pending_space = true;
break;
@@ -562,6 +563,4 @@ MyHtmlParser::closing_tag(const string &tag)
void
MyHtmlParser::do_eof()
{
- // if (!in_body_tag)
- // throw(false);
}
diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h
index 6bad0637..38b3e8f6 100644
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -36,7 +36,6 @@ class MyHtmlParser : public HtmlParser {
public:
bool in_script_tag;
bool in_style_tag;
- bool in_body_tag;
bool in_pre_tag;
bool pending_space;
map meta;
diff --git a/tests/html/html.sh b/tests/html/html.sh
index b3fe2840..181b2630 100755
--- a/tests/html/html.sh
+++ b/tests/html/html.sh
@@ -24,6 +24,11 @@ recollq -q html5charsetaccentue
# Stripping trade mark and copyright signs
recollq -q filename:signs.html Registered Trademark Copyright SoundCopyright
+# Text in malformed html (before or after body) should be indexed anyway...
+recollq -q BADHTMLTEXTBEFOREBODY
+recollq -q BADHTMLTEXTINSECONDBODY
+recollq -q BADHTMLTEXTAFTERBODY
+
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
diff --git a/tests/html/html.txt b/tests/html/html.txt
index f9d7e3eb..cdf4c9f4 100644
--- a/tests/html/html.txt
+++ b/tests/html/html.txt
@@ -22,3 +22,9 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/iso.html] [Some
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/html5charset.html] [html5charset.html] 113 bytes
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/signs.html] [signs.html] 230 bytes
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
diff --git a/website/BUGS.html b/website/BUGS.html
index 569b5f40..85c3c7f0 100644
--- a/website/BUGS.html
+++ b/website/BUGS.html
@@ -35,12 +35,55 @@
supposedly fixed in later versions. Bugs listed in the
topmost section may also exist in older versions.
-
+
+
+ It appears that recollindex will sometimes crash while
+ indexing mail files. There are 2 separate reports about
+ this, and no resolution for now. This is not specific to
+ 1.17 as one of the reports is for 1.16. Refs:
+ Fedora
+ (maildir, processing an attachment),
+ Ubuntu: apparently (no stack trace):
+ Recoll was indexing files in .thunderbird when the crash
+ occurred. It seemed to be indexing the INBOX file on
+ disk.
+
+
+ Text inside malformed HTML files (appearing before a <body>
+ tag, or after a second one, or after a </body> tag) is
+ not indexed. As it would be displayed by current browsers,
+ this is wrong.
+
+ It will sometimes happen that the result list paragraph
+ format stored in the Qt preferences file will get garbled,
+ causing result lists with no displayed paragraphs (the
+ counts and pages are ok, the results can be seen in table
+ mode, but not in list mode). The workaround is to go to
+
+ Preferences->Query configuration->User interface
+
and erase the result paragraph format string
+ (^A DEL in the text area), this will reset the string to the
+ default value.
+
+ Real time indexer: when running with gamin on FreeBSD, the
+ indexer can deadlock in the gamin dialog in some
+ cases.
+
+ After an upgrade, the recoll GUI sometimes crashes on
+ startup. This is fixed by removing (back it up just in case)
+ ~/.config/Recoll.org/recoll.conf, the QSettings storage for
+ recoll.
+
+
- The version string is not correctly updated for 1.17.1, the
help dialog and recollindex -v will print 1.17.0.
+ - You can crash the GUI by starting simultaneous queries,
+ which could be accomplished among others by quickly clicking
+ the sort order buttons.
+
- chm filter: url-encoded internal paths are mishandled.
- Does not compile on Solaris (flock() issue).
@@ -57,24 +100,6 @@
- Html output from Python (rclexecm) filters is not
correctly escaped.
-
- - It will sometimes happen that the result list paragraph format
- stored in the Qt preferences file will get garbled,
- causing result lists with no displayed paragraphs (the
- counts and pages are ok, the results can be seen in table
- mode, but not in list mode). The workaround is to go to
-
- Preferences->Query configuration->User interface
-
and erase the result paragraph format string
- (^A DEL in the text area), this will reset the string to the
- default value.
- - Real time indexer: when running with gamin on FreeBSD, the
- indexer can deadlock in the gamin dialog in some
- cases.
- - After an upgrade, the recoll GUI sometimes crashes on
- startup. This is fixed by removing (back it up just in case)
- ~/.config/Recoll.org/recoll.conf, the QSettings storage for
- recoll.