From 289e88c1bc0f912ae624d9fdd5fc974ea39ebd8f Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 16 May 2012 10:07:09 +0200 Subject: [PATCH] Html: Just ignore opening and closing and tags. Current browsers show text before or after the body and ignore multiple body tags. Not pushed to 1.17 maint because of possible disruption. Closes issue #92 --- src/internfile/myhtmlparse.cpp | 19 +++++----- src/internfile/myhtmlparse.h | 1 - tests/html/html.sh | 5 +++ tests/html/html.txt | 6 ++++ website/BUGS.html | 63 ++++++++++++++++++++++++---------- 5 files changed, 64 insertions(+), 30 deletions(-) diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index a85a7531..8c963029 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -176,7 +176,6 @@ static NamedEntsInitializer namedEntsInitializerInstance; MyHtmlParser::MyHtmlParser() : in_script_tag(false), in_style_tag(false), - in_body_tag(false), in_pre_tag(false), pending_space(false), indexing_allowed(true) @@ -308,11 +307,10 @@ MyHtmlParser::opening_tag(const string &tag) if (tag == "address") pending_space = true; break; case 'b': - if (tag == "body") { - dump.resize(0); - in_body_tag = true; - break; - } + // body: some bad docs have several opening body tags and + // even text before the body is displayed by Opera and + // Firefox. We used to reset the dump each time we saw a + // body tag, but I can't see any reason to do so. 
if (tag == "blockquote" || tag == "br") { dump += '\n'; pending_space = true; @@ -475,8 +473,11 @@ MyHtmlParser::closing_tag(const string &tag) case 'b': if (tag == "body") { LOGDEB1(("Myhtmlparse: body close tag found\n")); - in_body_tag = false; - return false; + // We used to signal and end of doc here by returning + // false but the browsers just ignore body and html + // closing tags if there is further text, so it seems right + // to do the same + break; } if (tag == "blockquote" || tag == "br") pending_space = true; break; @@ -562,6 +563,4 @@ MyHtmlParser::closing_tag(const string &tag) void MyHtmlParser::do_eof() { - // if (!in_body_tag) - // throw(false); } diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 6bad0637..38b3e8f6 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -36,7 +36,6 @@ class MyHtmlParser : public HtmlParser { public: bool in_script_tag; bool in_style_tag; - bool in_body_tag; bool in_pre_tag; bool pending_space; map meta; diff --git a/tests/html/html.sh b/tests/html/html.sh index b3fe2840..181b2630 100755 --- a/tests/html/html.sh +++ b/tests/html/html.sh @@ -24,6 +24,11 @@ recollq -q html5charsetaccentue # Stripping trade mark and copyright signs recollq -q filename:signs.html Registered Trademark Copyright SoundCopyright +# Text in malformed html (before or after body) should be indexed anyway... 
+recollq -q BADHTMLTEXTBEFOREBODY +recollq -q BADHTMLTEXTINSECONDBODY +recollq -q BADHTMLTEXTAFTERBODY + ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff --git a/tests/html/html.txt b/tests/html/html.txt index f9d7e3eb..cdf4c9f4 100644 --- a/tests/html/html.txt +++ b/tests/html/html.txt @@ -22,3 +22,9 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/iso.html] [Some text/html [file:///home/dockes/projets/fulltext/testrecoll/html/html5charset.html] [html5charset.html] 113 bytes 1 results text/html [file:///home/dockes/projets/fulltext/testrecoll/html/signs.html] [signs.html] 230 bytes +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/verybadhtml.html] [Conversation with xxx at Sun 13 May 2012 10:42:23 PM PDT on xxx (xxx)] 646 bytes diff --git a/website/BUGS.html b/website/BUGS.html index 569b5f40..85c3c7f0 100644 --- a/website/BUGS.html +++ b/website/BUGS.html @@ -35,12 +35,55 @@ supposedly fixed in later versions. Bugs listed in the topmost section may also exist in older versions.

-

recoll 1.17.1

+

recoll 1.17.2

+ +
  • It appears that recollindex will sometimes crash while + indexing mail files. There are 2 separate reports about + this, and no resolution for now. This is not specific to + 1.17 as one of the reports is for 1.16. Refs: + Fedora + (maildir, processing an attachment), + Ubuntu: apparently (no stack trace): + Recoll was indexing files in .thunderbird when the crash + occurred. It seemed to be indexing the INBOX file on + disk. +
  • + +
  • Text inside malformed HTML files (appearing before a <body> + tag, or after a second one, or after a </body> tag) is + not indexed. As it would be displayed by current browsers, + this is wrong.
  • + +
  • It will sometimes happen that the result list paragraph + format stored in the Qt preferences file will get garbled, + causing result lists with no displayed paragraphs (the + counts and pages are ok, the results can be seen in table + mode, but not in list mode). The workaround is to go to +
    + Preferences->Query configuration->User interface +
    and erase the result paragraph format string + (^A DEL in the text area), this will reset the string to the + default value.
  • + +
  • Real time indexer: when running with gamin on FreeBSD, the + indexer can deadlock in the gamin dialog in some + cases.
  • + +
  • After an upgrade, the recoll GUI sometimes crashes on + startup. This is fixed by removing (back it up just in case) + ~/.config/Recoll.org/recoll.conf, the QSettings storage for + recoll.
  • + +

    recoll 1.17.1