From 289e88c1bc0f912ae624d9fdd5fc974ea39ebd8f Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Wed, 16 May 2012 10:07:09 +0200
Subject: [PATCH] Html: Just ignore opening and closing <body> and <html> tags.
 Current browsers show text before or after the body and ignore multiple body
 tags. Not pushed to 1.17 maint because of possible disruption. Closes issue
 #92

---
 src/internfile/myhtmlparse.cpp | 19 +++++-----
 src/internfile/myhtmlparse.h   |  1 -
 tests/html/html.sh             |  5 +++
 tests/html/html.txt            |  6 ++++
 website/BUGS.html              | 63 ++++++++++++++++++++++++----------
 5 files changed, 64 insertions(+), 30 deletions(-)
diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
index a85a7531..8c963029 100644
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -176,7 +176,6 @@ static NamedEntsInitializer namedEntsInitializerInstance;
 MyHtmlParser::MyHtmlParser()
     : in_script_tag(false),
       in_style_tag(false),
-      in_body_tag(false),
       in_pre_tag(false),
       pending_space(false),
       indexing_allowed(true)
@@ -308,11 +307,10 @@ MyHtmlParser::opening_tag(const string &tag)
 	    if (tag == "address") pending_space = true;
 	    break;
 	case 'b':
-	    if (tag == "body") {
-		dump.resize(0);
-		in_body_tag = true;
-		break;
-	    }
+	    // body: some bad docs have several opening body tags and
+	    // even text before the body is displayed by Opera and
+	    // Firefox.  We used to reset the dump each time we saw a
+	    // body tag, but I can't see any reason to do so.
 	    if (tag == "blockquote" || tag == "br") {
 		dump += '\n';
 		pending_space = true;
@@ -475,8 +473,11 @@ MyHtmlParser::closing_tag(const string &tag)
 	case 'b':
 	    if (tag == "body") {
 		LOGDEB1(("Myhtmlparse: body close tag found\n"));
-		in_body_tag = false;
-		return false;
+		// We used to signal an end of doc here by returning
+		// false but the browsers just ignore body and html
+		// closing tags if there is further text, so it seems right
+		// to do the same
+		break;
 	    }
 	    if (tag == "blockquote" || tag == "br") pending_space = true;
 	    break;
@@ -562,6 +563,4 @@ MyHtmlParser::closing_tag(const string &tag)
 void
 MyHtmlParser::do_eof()
 {
-    //    if (!in_body_tag)
-    //	throw(false);
 }
diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h
index 6bad0637..38b3e8f6 100644
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -36,7 +36,6 @@ class MyHtmlParser : public HtmlParser {
  public:
     bool in_script_tag;
     bool in_style_tag;
-    bool in_body_tag; 
     bool in_pre_tag;
     bool pending_space;
     map<string,string> meta;
diff --git a/tests/html/html.sh b/tests/html/html.sh
index b3fe2840..181b2630 100755
--- a/tests/html/html.sh
+++ b/tests/html/html.sh
@@ -24,6 +24,11 @@ recollq -q html5charsetaccentue
 # Stripping trade mark and copyright signs
 recollq -q filename:signs.html Registered Trademark Copyright SoundCopyright
 
+# Text in malformed html (before or after body) should be indexed anyway...
+recollq -q BADHTMLTEXTBEFOREBODY
+recollq -q BADHTMLTEXTINSECONDBODY
+recollq -q BADHTMLTEXTAFTERBODY
+
 ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
diff --git a/tests/html/html.txt b/tests/html/html.txt
index f9d7e3eb..cdf4c9f4 100644
--- a/tests/html/html.txt
+++ b/tests/html/html.txt
@@ -22,3 +22,9 @@ text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/iso.html]	[Some
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/html5charset.html]	[html5charset.html]	113	bytes	
 1 results
 text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/signs.html]	[signs.html]	230	bytes	
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html]	[Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)]	646	bytes	
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html]	[Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)]	646	bytes	
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html]	[Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)]	646	bytes	
diff --git a/website/BUGS.html b/website/BUGS.html
index 569b5f40..85c3c7f0 100644
--- a/website/BUGS.html
+++ b/website/BUGS.html
@@ -35,12 +35,55 @@
           supposedly fixed in later versions. Bugs listed in the
           topmost section may also exist in older versions.</i></p>
 
-      <h2><a name="b_latest">recoll 1.17.1</a></h2> 
+      <h2><a name="b_latest">recoll 1.17.2</a></h2> 
+
+        <li>It appears that recollindex will sometimes crash while
+          indexing mail files. There are 2 separate reports about
+          this, and no resolution for now. This is not specific to
+          1.17 as one of the reports is for 1.16. Refs:
+          <a href="https://bugzilla.redhat.com/show_bug.cgi?format=multiple&id=819408">Fedora</a>
+          (maildir, processing an attachment), 
+          <a href="https://bugs.launchpad.net/ubuntu/+source/recoll/+bug/994228">Ubuntu</a>: apparently (no stack trace): 
+          <em>Recoll was indexing files in .thunderbird when the crash
+            occurred. It seemed to be indexing the INBOX file on
+            disk. </em> 
+        </li>
+
+        <li>Text inside malformed HTML files (appearing before a &lt;body&gt;
+          tag, or after a second one, or after a &lt;/body&gt; tag) is
+          not indexed. As it would be displayed by current browsers,
+          this is wrong.</li>
+
+        <li>It will sometimes happen that the result list paragraph
+          format stored in the Qt preferences file will get garbled,
+          causing result lists with no displayed paragraphs (the
+          counts and pages are ok, the results can be seen in table
+          mode, but not in list mode). The workaround is to go to
+          <blockquote>
+           Preferences->Query configuration->User interface
+          </blockquote> and erase the result paragraph format string
+          (^A DEL in the text area), this will reset the string to the
+          default value.</li>
+
+        <li>Real time indexer: when running with gamin on FreeBSD, the
+          indexer can deadlock in the gamin dialog in some
+          cases.</li>
+
+        <li>After an upgrade, the recoll GUI sometimes crashes on
+          startup. This is fixed by removing (back it up just in case)
+          ~/.config/Recoll.org/recoll.conf, the QSettings storage for
+          recoll.</li>
+
+      <h2><a name="b_1_17_1">recoll 1.17.1</a></h2> 
 
       <ul>
         <li>The version string is not correctly updated for 1.17.1, the
           help dialog and recollindex -v will print 1.17.0.</li>
 
+        <li>You can crash the GUI by starting simultaneous queries,
+          which could be accomplished among others by quickly clicking
+          the sort order buttons.</li>
+
         <li>chm filter: url-encoded internal paths are mishandled.</li>
 
         <li>Does not compile on Solaris (flock() issue).</li>
@@ -57,24 +100,6 @@
         
         <li>Html output from Python (rclexecm) filters is not
           correctly escaped.</li>
-
-        <li>It will sometimes happen that the result list paragraph format
-          stored in the Qt preferences file will get garbled,
-          causing result lists with no displayed paragraphs (the
-          counts and pages are ok, the results can be seen in table
-          mode, but not in list mode). The workaround is to go to
-          <blockquote>
-           Preferences->Query configuration->User interface
-          </blockquote> and erase the result paragraph format string
-          (^A DEL in the text area), this will reset the string to the
-          default value.</li>
-        <li>Real time indexer: when running with gamin on FreeBSD, the
-          indexer can deadlock in the gamin dialog in some
-          cases.</li>
-        <li>After an upgrade, the recoll GUI sometimes crashes on
-          startup. This is fixed by removing (back it up just in case)
-          ~/.config/Recoll.org/recoll.conf, the QSettings storage for
-          recoll.</li>
       </ul>