diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
index a85a7531..8c963029 100644
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -176,7 +176,6 @@ static NamedEntsInitializer namedEntsInitializerInstance;
MyHtmlParser::MyHtmlParser()
: in_script_tag(false),
in_style_tag(false),
- in_body_tag(false),
in_pre_tag(false),
pending_space(false),
indexing_allowed(true)
@@ -308,11 +307,10 @@ MyHtmlParser::opening_tag(const string &tag)
if (tag == "address") pending_space = true;
break;
case 'b':
- if (tag == "body") {
- dump.resize(0);
- in_body_tag = true;
- break;
- }
+ // body: some bad docs have several opening body tags and
+ // even text before the body is displayed by Opera and
+ // Firefox. We used to reset the dump each time we saw a
+ // body tag, but I can't see any reason to do so.
if (tag == "blockquote" || tag == "br") {
dump += '\n';
pending_space = true;
@@ -475,8 +473,11 @@ MyHtmlParser::closing_tag(const string &tag)
case 'b':
if (tag == "body") {
LOGDEB1(("Myhtmlparse: body close tag found\n"));
- in_body_tag = false;
- return false;
+ // We used to signal an end of doc here by returning
+ // false but the browsers just ignore body and html
+ // closing tags if there is further text, so it seems right
+ // to do the same
+ break;
}
if (tag == "blockquote" || tag == "br") pending_space = true;
break;
@@ -562,6 +563,4 @@ MyHtmlParser::closing_tag(const string &tag)
void
MyHtmlParser::do_eof()
{
- // if (!in_body_tag)
- // throw(false);
}
diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h
index 6bad0637..38b3e8f6 100644
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -36,7 +36,6 @@ class MyHtmlParser : public HtmlParser {
public:
bool in_script_tag;
bool in_style_tag;
- bool in_body_tag;
bool in_pre_tag;
bool pending_space;
map meta;
diff --git a/tests/html/html.sh b/tests/html/html.sh
index b3fe2840..181b2630 100755
--- a/tests/html/html.sh
+++ b/tests/html/html.sh
@@ -24,6 +24,11 @@ recollq -q html5charsetaccentue
# Stripping trade mark and copyright signs
recollq -q filename:signs.html Registered Trademark Copyright SoundCopyright
+# Text in malformed html (before or after body) should be indexed anyway...
+recollq -q BADHTMLTEXTBEFOREBODY
+recollq -q BADHTMLTEXTINSECONDBODY
+recollq -q BADHTMLTEXTAFTERBODY
+
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
diff --git a/tests/html/html.txt b/tests/html/html.txt
index f9d7e3eb..cdf4c9f4 100644
--- a/tests/html/html.txt
+++ b/tests/html/html.txt
@@ -22,3 +22,9 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/iso.html] [Some
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/html5charset.html] [html5charset.html] 113 bytes
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/html/signs.html] [signs.html] 230 bytes
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes
diff --git a/website/BUGS.html b/website/BUGS.html
index 569b5f40..85c3c7f0 100644
--- a/website/BUGS.html
+++ b/website/BUGS.html
@@ -35,12 +35,55 @@
supposedly fixed in later versions. Bugs listed in the
topmost section may also exist in older versions.
-
+
+
+ It appears that recollindex will sometimes crash while
+ indexing mail files. There are 2 separate reports about
+ this, and no resolution for now. This is not specific to
+ 1.17 as one of the reports is for 1.16. Refs:
+ Fedora
+ (maildir, processing an attachment),
+ Ubuntu: apparently (no stack trace):
+ Recoll was indexing files in .thunderbird when the crash
+ occurred. It seemed to be indexing the INBOX file on
+ disk.
+
+
+ Text inside malformed HTML files (appearing before a <body>
+ tag, or after a second one, or after a </body> tag) is
+ not indexed. As it would be displayed by current browsers,
+ this is wrong.
+
+ It will sometimes happen that the result list paragraph
+ format stored in the Qt preferences file will get garbled,
+ causing result lists with no displayed paragraphs (the
+ counts and pages are ok, the results can be seen in table
+ mode, but not in list mode). The workaround is to go to
+
+ Preferences->Query configuration->User interface
+
and erase the result paragraph format string
+ (^A DEL in the text area), this will reset the string to the
+ default value.
+
+ Real time indexer: when running with gamin on FreeBSD, the
+ indexer can deadlock in the gamin dialog in some
+ cases.
+
+ After an upgrade, the recoll GUI sometimes crashes on
+ startup. This is fixed by removing (back it up just in case)
+ ~/.config/Recoll.org/recoll.conf, the QSettings storage for
+ recoll.
+
+
- The version string is not correctly updated for 1.17.1, the
help dialog and recollindex -v will print 1.17.0.
+ - You can crash the GUI by starting simultaneous queries,
+ which could be accomplished among others by quickly clicking
+ the sort order buttons.
+
- chm filter: url-encoded internal paths are mishandled.
- Does not compile on Solaris (flock() issue).
@@ -57,24 +100,6 @@
- Html output from Python (rclexecm) filters is not
correctly escaped.
-
- - It will sometimes happen that the result list paragraph format
- stored in the Qt preferences file will get garbled,
- causing result lists with no displayed paragraphs (the
- counts and pages are ok, the results can be seen in table
- mode, but not in list mode). The workaround is to go to
-
- Preferences->Query configuration->User interface
-
and erase the result paragraph format string
- (^A DEL in the text area), this will reset the string to the
- default value.
- - Real time indexer: when running with gamin on FreeBSD, the
- indexer can deadlock in the gamin dialog in some
- cases.
- - After an upgrade, the recoll GUI sometimes crashes on
- startup. This is fixed by removing (back it up just in case)
- ~/.config/Recoll.org/recoll.conf, the QSettings storage for
- recoll.