diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index a85a7531..8c963029 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -176,7 +176,6 @@ static NamedEntsInitializer namedEntsInitializerInstance; MyHtmlParser::MyHtmlParser() : in_script_tag(false), in_style_tag(false), - in_body_tag(false), in_pre_tag(false), pending_space(false), indexing_allowed(true) @@ -308,11 +307,10 @@ MyHtmlParser::opening_tag(const string &tag) if (tag == "address") pending_space = true; break; case 'b': - if (tag == "body") { - dump.resize(0); - in_body_tag = true; - break; - } + // body: some bad docs have several opening body tags and + // even text before the body is displayed by Opera and + // Firefox. We used to reset the dump each time we saw a + // body tag, but I can't see any reason to do so. if (tag == "blockquote" || tag == "br") { dump += '\n'; pending_space = true; @@ -475,8 +473,11 @@ MyHtmlParser::closing_tag(const string &tag) case 'b': if (tag == "body") { LOGDEB1(("Myhtmlparse: body close tag found\n")); - in_body_tag = false; - return false; + // We used to signal and end of doc here by returning + // false but the browsers just ignore body and html + // closing tags if there is further text, so it seems right + // to do the same + break; } if (tag == "blockquote" || tag == "br") pending_space = true; break; @@ -562,6 +563,4 @@ MyHtmlParser::closing_tag(const string &tag) void MyHtmlParser::do_eof() { - // if (!in_body_tag) - // throw(false); } diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 6bad0637..38b3e8f6 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -36,7 +36,6 @@ class MyHtmlParser : public HtmlParser { public: bool in_script_tag; bool in_style_tag; - bool in_body_tag; bool in_pre_tag; bool pending_space; map meta; diff --git a/tests/html/html.sh b/tests/html/html.sh index b3fe2840..181b2630 100755 --- a/tests/html/html.sh +++ 
b/tests/html/html.sh @@ -24,6 +24,11 @@ recollq -q html5charsetaccentue # Stripping trade mark and copyright signs recollq -q filename:signs.html Registered Trademark Copyright SoundCopyright +# Text in malformed html (before or after body) should be indexed anyway... +recollq -q BADHTMLTEXTBEFOREBODY +recollq -q BADHTMLTEXTINSECONDBODY +recollq -q BADHTMLTEXTAFTERBODY + ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff --git a/tests/html/html.txt b/tests/html/html.txt index f9d7e3eb..cdf4c9f4 100644 --- a/tests/html/html.txt +++ b/tests/html/html.txt @@ -22,3 +22,9 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/iso.html] [Some text/html [file:///home/dockes/projets/fulltext/testrecoll/html/html5charset.html] [html5charset.html] 113 bytes 1 results text/html [file:///home/dockes/projets/fulltext/testrecoll/html/signs.html] [signs.html] 230 bytes +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes diff --git a/website/BUGS.html b/website/BUGS.html index 569b5f40..85c3c7f0 100644 --- a/website/BUGS.html +++ b/website/BUGS.html @@ -35,12 +35,55 @@ supposedly fixed in later versions. Bugs listed in the topmost section may also exist in older versions.

-

recoll 1.17.1

+

recoll 1.17.2

+ +
  • It appears that recollindex will sometimes crash while + indexing mail files. There are 2 separate reports about + this, and no resolution for now. This is not specific to + 1.17 as one of the reports is for 1.16. Refs: + Fedora + (maildir, processing an attachment), + Ubuntu: apparently (no stack trace): + Recoll was indexing files in .thunderbird when the crash + occurred. It seemed to be indexing the INBOX file on + disk. +
  • + +
  • Text inside malformed HTML files (appearing before a <body> + tag, or after a second one, or after a </body> tag) is + not indexed. As it would be displayed by current browsers, + this is wrong.
  • + +
  • It will sometimes happen that the result list paragraph + format stored in the Qt preferences file will get garbled, + causing result lists with no displayed paragraphs (the + counts and pages are ok, the results can be seen in table + mode, but not in list mode). The workaround is to go to +
    + Preferences->Query configuration->User interface +
    and erase the result paragraph format string + (^A DEL in the text area), this will reset the string to the + default value.
  • + +
  • Real time indexer: when running with gamin on FreeBSD, the + indexer can deadlock in the gamin dialog in some + cases.
  • + +
  • After an upgrade, the recoll GUI sometimes crashes on + startup. This is fixed by removing (back it up just in case) + ~/.config/Recoll.org/recoll.conf, the QSettings storage for + recoll.
  • + +

    recoll 1.17.1