mirror of
https://github.com/geometer/FBReaderJ.git
synced 2025-10-04 10:19:33 +02:00
included html format support files (not used at this moment)
This commit is contained in:
parent
0a19a10264
commit
8804e70d28
14 changed files with 1851 additions and 0 deletions
|
@ -98,6 +98,12 @@ LOCAL_SRC_FILES := \
|
||||||
NativeFormats/fbreader/src/formats/fb2/FB2TagManager.cpp \
|
NativeFormats/fbreader/src/formats/fb2/FB2TagManager.cpp \
|
||||||
NativeFormats/fbreader/src/formats/css/StyleSheetParser.cpp \
|
NativeFormats/fbreader/src/formats/css/StyleSheetParser.cpp \
|
||||||
NativeFormats/fbreader/src/formats/css/StyleSheetTable.cpp \
|
NativeFormats/fbreader/src/formats/css/StyleSheetTable.cpp \
|
||||||
|
NativeFormats/fbreader/src/formats/html/HtmlBookReader.cpp \
|
||||||
|
NativeFormats/fbreader/src/formats/html/HtmlDescriptionReader.cpp \
|
||||||
|
NativeFormats/fbreader/src/formats/html/HtmlEntityCollection.cpp \
|
||||||
|
NativeFormats/fbreader/src/formats/html/HtmlPlugin.cpp \
|
||||||
|
NativeFormats/fbreader/src/formats/html/HtmlReader.cpp \
|
||||||
|
NativeFormats/fbreader/src/formats/html/HtmlReaderStream.cpp \
|
||||||
NativeFormats/fbreader/src/formats/oeb/NCXReader.cpp \
|
NativeFormats/fbreader/src/formats/oeb/NCXReader.cpp \
|
||||||
NativeFormats/fbreader/src/formats/oeb/OEBBookReader.cpp \
|
NativeFormats/fbreader/src/formats/oeb/OEBBookReader.cpp \
|
||||||
NativeFormats/fbreader/src/formats/oeb/OEBCoverReader.cpp \
|
NativeFormats/fbreader/src/formats/oeb/OEBCoverReader.cpp \
|
||||||
|
|
583
jni/NativeFormats/fbreader/src/formats/html/HtmlBookReader.cpp
Normal file
583
jni/NativeFormats/fbreader/src/formats/html/HtmlBookReader.cpp
Normal file
|
@ -0,0 +1,583 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <cctype>
|
||||||
|
|
||||||
|
#include <ZLFile.h>
|
||||||
|
#include <ZLFileImage.h>
|
||||||
|
#include <ZLStringUtil.h>
|
||||||
|
|
||||||
|
#include "HtmlBookReader.h"
|
||||||
|
#include "HtmlTagActions.h"
|
||||||
|
#include "../txt/PlainTextFormat.h"
|
||||||
|
#include "../util/MiscUtil.h"
|
||||||
|
#include "../../bookmodel/BookModel.h"
|
||||||
|
#include "../css/StyleSheetParser.h"
|
||||||
|
|
||||||
|
// Base constructor: remembers the reader whose state the action operates on.
HtmlTagAction::HtmlTagAction(HtmlBookReader &reader)
	: myReader(reader) {}
|
||||||
|
|
||||||
|
// Nothing to release in the base class.
HtmlTagAction::~HtmlTagAction() {}
|
||||||
|
|
||||||
|
// Default reset: the base action keeps no per-document state.
// HtmlBookReader::startDocumentHandler() calls reset() on every cached action.
void HtmlTagAction::reset() {}
|
||||||
|
|
||||||
|
// Action returned by HtmlBookReader::createAction() for tags that have no
// dedicated handling.
DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader)
	: HtmlTagAction(reader) {}
|
||||||
|
|
||||||
|
// Deliberately ignores the tag.
void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) {}
|
||||||
|
|
||||||
|
// Action for an inline style tag; `kind` is the text kind toggled by the tag.
HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind)
	: HtmlTagAction(reader),
	  myKind(kind) {}
|
||||||
|
|
||||||
|
// Opens or closes the inline style `myKind`.
//
// The reader's myKindList holds the currently open inline kinds, innermost
// last. An opening tag is ignored when the kind is already open. A closing
// tag closes every kind from the innermost one down to the matching entry,
// re-opens the kinds that were nested inside it, and finally removes the
// matching entry from the list.
void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) {
	std::vector<FBTextKind> &openKinds = myReader.myKindList;

	// Search from the innermost kind outwards; -1 means "not open".
	int position = openKinds.size() - 1;
	while ((position >= 0) && !(openKinds[position] == myKind)) {
		--position;
	}

	if (tag.Start) {
		if (position == -1) {
			bookReader().pushKind(myKind);
			openKinds.push_back(myKind);
			bookReader().addControl(myKind, true);
		}
		return;
	}

	if (position < 0) {
		// Closing tag without a matching open kind: nothing to do.
		return;
	}

	// Close everything down to (and including) the matching entry...
	for (int closing = openKinds.size() - 1; closing >= position; --closing) {
		bookReader().addControl(openKinds[closing], false);
		bookReader().popKind();
	}
	// ...then restore the kinds that were opened after it.
	for (unsigned int reopening = position + 1; reopening < openKinds.size(); ++reopening) {
		bookReader().addControl(openKinds[reopening], true);
		bookReader().pushKind(openKinds[reopening]);
	}
	openKinds.erase(openKinds.begin() + position);
}
|
||||||
|
|
||||||
|
// Action for a heading tag; `kind` is the text kind pushed while the heading
// is open.
HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind)
	: HtmlTagAction(reader),
	  myKind(kind) {}
|
||||||
|
|
||||||
|
// Handles the start/end of a heading tag.
//
// Resets the reader's "text has started" flag, pushes/pops the heading kind
// and, when the reader builds a table of contents and titles are not being
// ignored, opens/closes the matching contents paragraph. A new paragraph is
// always begun afterwards.
void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) {
	myReader.myIsStarted = false;

	const bool feedsContents =
		myReader.myBuildTableOfContent && !myReader.myIgnoreTitles;

	if (tag.Start) {
		// Only open a contents paragraph if one is not open already.
		if (feedsContents && !bookReader().contentsParagraphIsOpen()) {
			bookReader().insertEndOfSectionParagraph();
			bookReader().enterTitle();
			bookReader().beginContentsParagraph();
		}
		bookReader().pushKind(myKind);
	} else {
		bookReader().popKind();
		if (feedsContents) {
			bookReader().endContentsParagraph();
			bookReader().exitTitle();
		}
	}

	bookReader().beginParagraph();
}
|
||||||
|
|
||||||
|
// Action for tags whose character data must be skipped.
HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader)
	: HtmlTagAction(reader) {}
|
||||||
|
|
||||||
|
// Tracks entry into / exit from an ignored tag.
//
// The reader skips character data while myIgnoreDataCounter is non-zero.
// myTagNames remembers which tag names are currently counted, so a repeated
// opening tag (or a stray closing tag) does not change the counter.
void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) {
	const bool alreadyCounted = myTagNames.find(tag.Name) != myTagNames.end();

	if (tag.Start && !alreadyCounted) {
		++myReader.myIgnoreDataCounter;
		myTagNames.insert(tag.Name);
	} else if (!tag.Start && alreadyCounted) {
		--myReader.myIgnoreDataCounter;
		myTagNames.erase(tag.Name);
	}
}
|
||||||
|
|
||||||
|
// Action for the <A> tag.
//
// The hyperlink state is explicitly started as REGULAR ("no hyperlink open").
// Nothing else in this file initializes it before first use: actions are
// created lazily by HtmlBookReader::tagHandler(), i.e. after
// startDocumentHandler() has already called reset() on the actions that
// existed at that time, and run() reads hyperlinkType() immediately.
HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
	setHyperlinkType(REGULAR);
}
|
||||||
|
|
||||||
|
// Handles <A ...> and </A>.
//
// Opening tag: every NAME attribute registers a hyperlink label; an HREF
// attribute opens a hyperlink if none is open yet. A leading copy of the
// reader's own file name is stripped from the HREF value first, so that
// "<this file>#anchor" is treated like "#anchor". Values starting with '#'
// become internal hyperlinks; other values are classified by
// MiscUtil::referenceType() and are skipped when that yields
// INTERNAL_HYPERLINK.
//
// Closing tag: closes the currently open hyperlink, if any.
void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) {
	if (!tag.Start) {
		if (hyperlinkType() != REGULAR) {
			bookReader().addControl(hyperlinkType(), false);
			setHyperlinkType(REGULAR);
		}
		return;
	}

	const std::string &ownFileName = myReader.myFileName;
	for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
		if (tag.Attributes[i].Name == "NAME") {
			bookReader().addHyperlinkLabel(tag.Attributes[i].Value);
			continue;
		}
		if (!((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF"))) {
			continue;
		}

		std::string target = tag.Attributes[i].Value;
		if (!ownFileName.empty() &&
				(target.length() > ownFileName.length()) &&
				(target.compare(0, ownFileName.length(), ownFileName) == 0)) {
			target.erase(0, ownFileName.length());
		}
		if (target.empty()) {
			continue;
		}

		if (target[0] == '#') {
			setHyperlinkType(INTERNAL_HYPERLINK);
			bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, target.substr(1));
		} else {
			const FBTextKind referenceKind = MiscUtil::referenceType(target);
			if (referenceKind != INTERNAL_HYPERLINK) {
				setHyperlinkType(referenceKind);
				bookReader().addHyperlinkControl(referenceKind, target);
			}
		}
	}
}
|
||||||
|
|
||||||
|
// Forgets any hyperlink left open by a previous document.
void HtmlHrefTagAction::reset() {
	setHyperlinkType(REGULAR);
}
|
||||||
|
|
||||||
|
// Kind of the hyperlink that is currently open; run() treats REGULAR as
// "no hyperlink open".
FBTextKind HtmlHrefTagAction::hyperlinkType() const {
	return myHyperlinkType;
}
|
||||||
|
|
||||||
|
// Records the kind of the hyperlink that is currently open.
void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) {
	myHyperlinkType = hyperlinkType;
}
|
||||||
|
|
||||||
|
// Action for the <IMG> tag.
HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader)
	: HtmlTagAction(reader) {}
|
||||||
|
|
||||||
|
// Inserts the image referenced by the first SRC attribute of an opening tag.
//
// The current paragraph is ended before and a new one begun after the image.
// The SRC value is URL-decoded and resolved against the reader's base
// directory; the image is added only if that file exists. Closing tags are
// ignored.
void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) {
	if (!tag.Start) {
		return;
	}

	bookReader().endParagraph();

	for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
		if (!(tag.Attributes[i].Name == "SRC")) {
			continue;
		}
		const std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value);
		const ZLFile file(myReader.myBaseDirPath + fileName);
		if (file.exists()) {
			bookReader().addImageReference(fileName, 0, false);
			bookReader().addImage(fileName, new ZLFileImage(file, "", 0));
		}
		// Only the first SRC attribute is considered.
		break;
	}

	bookReader().beginParagraph();
}
|
||||||
|
|
||||||
|
// Action for tags that break paragraphs; `breakType` selects whether the
// break happens at the opening tag, the closing tag, or both.
HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType)
	: HtmlTagAction(reader),
	  myBreakType(breakType) {}
|
||||||
|
|
||||||
|
// Starts a new paragraph when the tag edge matches the configured break type.
//
// If the reader's myDontBreakParagraph flag is set (HtmlListItemTagAction sets
// it right after emitting a list marker), the flag is consumed and no break
// is made.
void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) {
	if (myReader.myDontBreakParagraph) {
		myReader.myDontBreakParagraph = false;
		return;
	}

	// Which bit of the break type is relevant for this edge of the tag.
	const int relevantEdge = tag.Start ? BREAK_AT_START : BREAK_AT_END;
	if ((myBreakType & relevantEdge) == 0) {
		return;
	}

	bookReader().endParagraph();
	if (bookReader().isKindStackEmpty()) {
		bookReader().pushKind(REGULAR);
	}
	bookReader().beginParagraph();
}
|
||||||
|
|
||||||
|
// Action for the <PRE> tag.
HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader)
	: HtmlTagAction(reader) {}
|
||||||
|
|
||||||
|
// Switches the reader into / out of preformatted mode.
//
// Ends the current paragraph, resets the counters used by
// HtmlBookReader::preformattedCharacterDataHandler(), and begins a new
// paragraph. The PREFORMATTED kind is pushed/popped only when the format's
// break type is exactly BREAK_PARAGRAPH_AT_NEW_LINE.
void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) {
	bookReader().endParagraph();

	myReader.myIsPreformatted = tag.Start;
	myReader.mySpaceCounter = -1;
	myReader.myBreakCounter = 0;

	const bool breaksAtNewLine =
		myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE;
	if (breaksAtNewLine && tag.Start) {
		bookReader().pushKind(PREFORMATTED);
	} else if (breaksAtNewLine) {
		bookReader().popKind();
	}

	bookReader().beginParagraph();
}
|
||||||
|
|
||||||
|
// Action for list container tags. `startIndex` is the first item number;
// HtmlListItemTagAction renders a bullet instead of a number when it is 0.
HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex)
	: HtmlTagAction(reader),
	  myStartIndex(startIndex) {}
|
||||||
|
|
||||||
|
// Pushes a new item counter when a list opens and pops it when the list
// closes; a closing tag with no open list is ignored.
void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) {
	if (tag.Start) {
		myReader.myListNumStack.push(myStartIndex);
		return;
	}
	if (!myReader.myListNumStack.empty()) {
		myReader.myListNumStack.pop();
	}
}
|
||||||
|
|
||||||
|
// Action for the <LI> tag.
HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader)
	: HtmlTagAction(reader) {}
|
||||||
|
|
||||||
|
// Starts a list item.
//
// An opening tag begins a new paragraph. Inside a list it additionally adds
// an indent proportional to the nesting depth and emits the item marker: a
// bullet when the current list counter is 0, otherwise the counter value
// followed by ". " (the counter is then incremented). myDontBreakParagraph is
// set so that the item text stays on the marker's line. A closing tag only
// clears that flag.
void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) {
	if (!tag.Start) {
		myReader.myDontBreakParagraph = false;
		return;
	}

	bookReader().endParagraph();
	bookReader().beginParagraph();

	if (myReader.myListNumStack.empty()) {
		// <LI> outside of any list: no marker, no indent.
		return;
	}

	bookReader().addFixedHSpace(3 * myReader.myListNumStack.size());

	int &itemNumber = myReader.myListNumStack.top();
	if (itemNumber == 0) {
		// UTF-8 encoded bullet (3 bytes) followed by a space.
		myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false);
	} else {
		std::string marker;
		ZLStringUtil::appendNumber(marker, itemNumber++);
		marker += ". ";
		myReader.addConvertedDataToBuffer(marker.data(), marker.length(), false);
	}

	// Set after the marker is written, because addConvertedDataToBuffer()
	// clears the flag.
	myReader.myDontBreakParagraph = true;
}
|
||||||
|
|
||||||
|
// Action for the <TABLE> tag.
HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader)
	: HtmlTagAction(reader) {}
|
||||||
|
|
||||||
|
// Headings inside a table must not produce table-of-contents entries:
// titles are ignored from <TABLE> until </TABLE>.
void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) {
	myReader.myIgnoreTitles = tag.Start ? true : false;
}
|
||||||
|
|
||||||
|
// Action for the <STYLE> tag.
HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader)
	: HtmlTagAction(reader) {}
|
||||||
|
|
||||||
|
// <STYLE> installs a style sheet parser that fills the reader's style sheet
// table; </STYLE> removes it. While a parser is installed,
// HtmlBookReader::characterDataHandler() routes all character data to it.
void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) {
	if (tag.Start) {
		myReader.myStyleSheetParser = new StyleSheetTableParser(myReader.myStyleSheetTable);
	} else {
		myReader.myStyleSheetParser = 0;
		// Debugging aid (disabled): myReader.myStyleSheetTable.dump();
	}
}
|
||||||
|
|
||||||
|
// Creates the action object for an (upper-case) tag name.
//
// Called by tagHandler() the first time a tag name is seen; the result is
// cached there. Tags without dedicated handling get a DummyHtmlTagAction.
shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) {
	// Inline character styles.
	if (tag == "EM") return new HtmlControlTagAction(*this, EMPHASIS);
	if (tag == "STRONG") return new HtmlControlTagAction(*this, STRONG);
	if (tag == "B") return new HtmlControlTagAction(*this, BOLD);
	if (tag == "I") return new HtmlControlTagAction(*this, ITALIC);
	if (tag == "TT" || tag == "CODE") return new HtmlControlTagAction(*this, CODE);
	if (tag == "CITE") return new HtmlControlTagAction(*this, CITE);
	if (tag == "SUB") return new HtmlControlTagAction(*this, SUB);
	if (tag == "SUP") return new HtmlControlTagAction(*this, SUP);

	// Headings.
	if (tag == "H1") return new HtmlHeaderTagAction(*this, H1);
	if (tag == "H2") return new HtmlHeaderTagAction(*this, H2);
	if (tag == "H3") return new HtmlHeaderTagAction(*this, H3);
	if (tag == "H4") return new HtmlHeaderTagAction(*this, H4);
	if (tag == "H5") return new HtmlHeaderTagAction(*this, H5);
	if (tag == "H6") return new HtmlHeaderTagAction(*this, H6);

	// Tags whose character data is skipped or parsed as a style sheet.
	if (tag == "HEAD" || tag == "TITLE") return new HtmlIgnoreTagAction(*this);
	if (tag == "STYLE") return new HtmlStyleTagAction(*this);
	if (tag == "SELECT" || tag == "SCRIPT") return new HtmlIgnoreTagAction(*this);

	if (tag == "A") return new HtmlHrefTagAction(*this);

	// Paragraph breaks. "TD" intentionally has no break action (a
	// BREAK_AT_END action for it is disabled) and gets the dummy action.
	if (tag == "TR" || tag == "DIV") {
		return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
	}
	if (tag == "DT") {
		return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START);
	}
	if (tag == "P" || tag == "BR") {
		return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
	}

	if (tag == "IMG") return new HtmlImageTagAction(*this);

	// Lists: start index 0 renders bullets, 1 renders numbers.
	if (tag == "UL" || tag == "MENU" || tag == "DIR") return new HtmlListTagAction(*this, 0);
	if (tag == "OL") return new HtmlListTagAction(*this, 1);
	if (tag == "LI") return new HtmlListItemTagAction(*this);

	// <PRE> is handled only when enabled via setProcessPreTag().
	if (tag == "PRE" && myProcessPreTag) return new HtmlPreTagAction(*this);

	if (tag == "TABLE") return new HtmlTableTagAction(*this);

	// No dedicated action (yet) for, among others: DD, DL, DFN, SAMP, KBD,
	// VAR, ABBR, ACRONYM, BLOCKQUOTE, Q, INS, DEL, BODY.
	return new DummyHtmlTagAction(*this);
}
|
||||||
|
|
||||||
|
// Enables or disables table-of-contents generation from heading tags
// (see HtmlHeaderTagAction::run()).
void HtmlBookReader::setBuildTableOfContent(bool build) {
	myBuildTableOfContent = build;
}
|
||||||
|
|
||||||
|
// Enables or disables dedicated <PRE> handling. The flag is consulted by
// createAction() when the action for "PRE" is first created.
void HtmlBookReader::setProcessPreTag(bool process) {
	myProcessPreTag = process;
}
|
||||||
|
|
||||||
|
// Constructs a reader that fills `model` from an HTML stream.
//
// baseDirectoryPath - prefix used to resolve relative image paths
// model             - book model that receives the parsed content
// format            - plain text format settings used for <PRE> content
// encoding          - source encoding, forwarded to HtmlReader
//
// Every scalar state member is given a defined value here (the same values
// startDocumentHandler() assigns), so the object never holds indeterminate
// state between construction and the start of the first document. The
// initializers follow the member declaration order.
HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) :
	HtmlReader(encoding),
	myBookReader(model),
	myBaseDirPath(baseDirectoryPath),
	myFormat(format),
	myIgnoreDataCounter(0),
	myIsPreformatted(false),
	myDontBreakParagraph(false),
	myIsStarted(false),
	myBuildTableOfContent(true),
	myProcessPreTag(true),
	myIgnoreTitles(false),
	mySpaceCounter(-1),
	myBreakCounter(0) {
}
|
||||||
|
|
||||||
|
// All members clean up after themselves.
HtmlBookReader::~HtmlBookReader() {}
|
||||||
|
|
||||||
|
void HtmlBookReader::addConvertedDataToBuffer(const char *text, size_t len, bool convert) {
|
||||||
|
if (len > 0) {
|
||||||
|
if (myDontBreakParagraph) {
|
||||||
|
while ((len > 0) && isspace(*text)) {
|
||||||
|
--len;
|
||||||
|
++text;
|
||||||
|
}
|
||||||
|
if (len == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (convert) {
|
||||||
|
myConverter->convert(myConverterBuffer, text, text + len);
|
||||||
|
myBookReader.addData(myConverterBuffer);
|
||||||
|
myBookReader.addContentsData(myConverterBuffer);
|
||||||
|
myConverterBuffer.erase();
|
||||||
|
} else {
|
||||||
|
std::string strText(text, len);
|
||||||
|
myBookReader.addData(strText);
|
||||||
|
myBookReader.addContentsData(strText);
|
||||||
|
}
|
||||||
|
myDontBreakParagraph = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HtmlBookReader::tagHandler(const HtmlTag &tag) {
|
||||||
|
myConverter->reset();
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
|
||||||
|
if (tag.Attributes[i].Name == "ID") {
|
||||||
|
myBookReader.addHyperlinkLabel(tag.Attributes[i].Value);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
shared_ptr<HtmlTagAction> action = myActionMap[tag.Name];
|
||||||
|
if (action.isNull()) {
|
||||||
|
action = createAction(tag.Name);
|
||||||
|
myActionMap[tag.Name] = action;
|
||||||
|
}
|
||||||
|
action->run(tag);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void HtmlBookReader::preformattedCharacterDataHandler(const char *text, size_t len, bool convert) {
|
||||||
|
const char *start = text;
|
||||||
|
const char *end = text + len;
|
||||||
|
|
||||||
|
int breakType = myFormat.breakType();
|
||||||
|
if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
|
||||||
|
for (const char *ptr = text; ptr != end; ++ptr) {
|
||||||
|
if (*ptr == '\n') {
|
||||||
|
mySpaceCounter = 0;
|
||||||
|
if (start < ptr) {
|
||||||
|
addConvertedDataToBuffer(start, ptr - start, convert);
|
||||||
|
} else {
|
||||||
|
static const std::string SPACE = " ";
|
||||||
|
myBookReader.addData(SPACE);
|
||||||
|
}
|
||||||
|
myBookReader.endParagraph();
|
||||||
|
myBookReader.beginParagraph();
|
||||||
|
start = ptr + 1;
|
||||||
|
} else if (mySpaceCounter >= 0) {
|
||||||
|
if (isspace((unsigned char)*ptr)) {
|
||||||
|
++mySpaceCounter;
|
||||||
|
} else {
|
||||||
|
myBookReader.addFixedHSpace(mySpaceCounter);
|
||||||
|
mySpaceCounter = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
addConvertedDataToBuffer(start, end - start, convert);
|
||||||
|
} else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) {
|
||||||
|
for (const char *ptr = text; ptr != end; ++ptr) {
|
||||||
|
if (isspace((unsigned char)*ptr)) {
|
||||||
|
if (*ptr == '\n') {
|
||||||
|
mySpaceCounter = 0;
|
||||||
|
} else if (mySpaceCounter >= 0) {
|
||||||
|
++mySpaceCounter;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (mySpaceCounter > myFormat.ignoredIndent()) {
|
||||||
|
if (ptr - start > mySpaceCounter) {
|
||||||
|
addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert);
|
||||||
|
myBookReader.endParagraph();
|
||||||
|
myBookReader.beginParagraph();
|
||||||
|
}
|
||||||
|
start = ptr;
|
||||||
|
}
|
||||||
|
mySpaceCounter = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mySpaceCounter = std::max(mySpaceCounter, 0);
|
||||||
|
if (end - start > mySpaceCounter) {
|
||||||
|
addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert);
|
||||||
|
}
|
||||||
|
} else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) {
|
||||||
|
for (const char *ptr = start; ptr != end; ++ptr) {
|
||||||
|
if (isspace((unsigned char)*ptr)) {
|
||||||
|
if (*ptr == '\n') {
|
||||||
|
++myBreakCounter;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (myBreakCounter > 1) {
|
||||||
|
addConvertedDataToBuffer(start, ptr - start, convert);
|
||||||
|
myBookReader.endParagraph();
|
||||||
|
myBookReader.beginParagraph();
|
||||||
|
start = ptr;
|
||||||
|
}
|
||||||
|
myBreakCounter = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
addConvertedDataToBuffer(start, end - start, convert);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HtmlBookReader::characterDataHandler(const char *text, size_t len, bool convert) {
|
||||||
|
if (!myStyleSheetParser.isNull()) {
|
||||||
|
myStyleSheetParser->parse(text, len);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (myIgnoreDataCounter != 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (myIsPreformatted) {
|
||||||
|
preformattedCharacterDataHandler(text, len, convert);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *ptr = text;
|
||||||
|
const char *end = text + len;
|
||||||
|
if (!myIsStarted) {
|
||||||
|
for (; ptr != end; ++ptr) {
|
||||||
|
if (!isspace((unsigned char)*ptr)) {
|
||||||
|
myIsStarted = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (myIsStarted) {
|
||||||
|
addConvertedDataToBuffer(ptr, end - ptr, convert);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void HtmlBookReader::startDocumentHandler() {
|
||||||
|
while (!myListNumStack.empty()) {
|
||||||
|
myListNumStack.pop();
|
||||||
|
}
|
||||||
|
myConverterBuffer.erase();
|
||||||
|
myKindList.clear();
|
||||||
|
|
||||||
|
myBookReader.reset();
|
||||||
|
myBookReader.setMainTextModel();
|
||||||
|
myBookReader.pushKind(REGULAR);
|
||||||
|
myBookReader.beginParagraph();
|
||||||
|
myIgnoreDataCounter = 0;
|
||||||
|
myIsPreformatted = false;
|
||||||
|
myDontBreakParagraph = false;
|
||||||
|
for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) {
|
||||||
|
it->second->reset();
|
||||||
|
}
|
||||||
|
myIsStarted = false;
|
||||||
|
myIgnoreTitles = false;
|
||||||
|
|
||||||
|
myStyleSheetParser = 0;
|
||||||
|
|
||||||
|
mySpaceCounter = -1;
|
||||||
|
myBreakCounter = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Closes the paragraph that is still open at the end of the document.
void HtmlBookReader::endDocumentHandler() {
	myBookReader.endParagraph();
}
|
||||||
|
|
||||||
|
// Sets the name of the file being read. HtmlHrefTagAction strips this prefix
// from HREF values so that links into the same file become internal links.
void HtmlBookReader::setFileName(const std::string fileName) {
	myFileName = fileName;
}
|
101
jni/NativeFormats/fbreader/src/formats/html/HtmlBookReader.h
Normal file
101
jni/NativeFormats/fbreader/src/formats/html/HtmlBookReader.h
Normal file
|
@ -0,0 +1,101 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HTMLBOOKREADER_H__
|
||||||
|
#define __HTMLBOOKREADER_H__
|
||||||
|
|
||||||
|
#include <stack>
|
||||||
|
|
||||||
|
#include <shared_ptr.h>
|
||||||
|
|
||||||
|
#include "HtmlReader.h"
|
||||||
|
#include "../../bookmodel/BookReader.h"
|
||||||
|
#include "../css/StyleSheetTable.h"
|
||||||
|
|
||||||
|
class BookModel;
|
||||||
|
class PlainTextFormat;
|
||||||
|
class StyleSheetParser;
|
||||||
|
|
||||||
|
class HtmlTagAction;
|
||||||
|
|
||||||
|
class HtmlBookReader : public HtmlReader {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding);
|
||||||
|
~HtmlBookReader();
|
||||||
|
void setFileName(const std::string fileName);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual shared_ptr<HtmlTagAction> createAction(const std::string &tag);
|
||||||
|
void setBuildTableOfContent(bool build);
|
||||||
|
void setProcessPreTag(bool process);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void startDocumentHandler();
|
||||||
|
void endDocumentHandler();
|
||||||
|
bool tagHandler(const HtmlTag &tag);
|
||||||
|
bool characterDataHandler(const char *text, size_t len, bool convert);
|
||||||
|
|
||||||
|
private:
|
||||||
|
void preformattedCharacterDataHandler(const char *text, size_t len, bool convert);
|
||||||
|
void addConvertedDataToBuffer(const char *text, size_t len, bool convert);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
BookReader myBookReader;
|
||||||
|
std::string myBaseDirPath;
|
||||||
|
|
||||||
|
private:
|
||||||
|
const PlainTextFormat &myFormat;
|
||||||
|
int myIgnoreDataCounter;
|
||||||
|
bool myIsPreformatted;
|
||||||
|
bool myDontBreakParagraph;
|
||||||
|
|
||||||
|
bool myIsStarted;
|
||||||
|
bool myBuildTableOfContent;
|
||||||
|
bool myProcessPreTag;
|
||||||
|
bool myIgnoreTitles;
|
||||||
|
std::stack<int> myListNumStack;
|
||||||
|
|
||||||
|
StyleSheetTable myStyleSheetTable;
|
||||||
|
shared_ptr<StyleSheetParser> myStyleSheetParser;
|
||||||
|
|
||||||
|
int mySpaceCounter;
|
||||||
|
int myBreakCounter;
|
||||||
|
std::string myConverterBuffer;
|
||||||
|
|
||||||
|
std::map<std::string,shared_ptr<HtmlTagAction> > myActionMap;
|
||||||
|
std::vector<FBTextKind> myKindList;
|
||||||
|
|
||||||
|
std::string myFileName;
|
||||||
|
|
||||||
|
friend class HtmlTagAction;
|
||||||
|
friend class HtmlControlTagAction;
|
||||||
|
friend class HtmlHeaderTagAction;
|
||||||
|
friend class HtmlIgnoreTagAction;
|
||||||
|
friend class HtmlHrefTagAction;
|
||||||
|
friend class HtmlImageTagAction;
|
||||||
|
friend class HtmlBreakTagAction;
|
||||||
|
friend class HtmlPreTagAction;
|
||||||
|
friend class HtmlListTagAction;
|
||||||
|
friend class HtmlListItemTagAction;
|
||||||
|
friend class HtmlTableTagAction;
|
||||||
|
friend class HtmlStyleTagAction;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* __HTMLBOOKREADER_H__ */
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "HtmlDescriptionReader.h"
|
||||||
|
|
||||||
|
#include "../../library/Book.h"
|
||||||
|
|
||||||
|
// Creates a metadata reader for `book`, using the book's current encoding.
// The book title is cleared so that it can be detected from <TITLE>.
HtmlDescriptionReader::HtmlDescriptionReader(Book &book)
	: HtmlReader(book.encoding()),
	  myBook(book) {
	myBook.setTitle("");
}
|
||||||
|
|
||||||
|
// No title text is being collected at the start of a document.
void HtmlDescriptionReader::startDocumentHandler() {
	myReadTitle = false;
}
|
||||||
|
|
||||||
|
void HtmlDescriptionReader::endDocumentHandler() {
|
||||||
|
if (!myBook.title().empty()) {
|
||||||
|
const char *titleStart = myBook.title().data();
|
||||||
|
const char *titleEnd = titleStart + myBook.title().length();
|
||||||
|
std::string newTitle;
|
||||||
|
myConverter->convert(newTitle, titleStart, titleEnd);
|
||||||
|
myBook.setTitle(newTitle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HtmlDescriptionReader::tagHandler(const HtmlTag &tag) {
|
||||||
|
if (tag.Name == "TITLE") {
|
||||||
|
if (myReadTitle && !tag.Start) {
|
||||||
|
myBook.setTitle(myBuffer);
|
||||||
|
myBuffer.erase();
|
||||||
|
}
|
||||||
|
myReadTitle = tag.Start && myBook.title().empty();
|
||||||
|
return true;
|
||||||
|
} else if (tag.Start && tag.Name == "META") {
|
||||||
|
std::vector<HtmlAttribute>::const_iterator it = tag.Attributes.begin();
|
||||||
|
for (; it != tag.Attributes.end(); ++it) {
|
||||||
|
if (it->Name == "CONTENT") {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (it != tag.Attributes.end()) {
|
||||||
|
const std::string prefix = "charset=";
|
||||||
|
size_t index = it->Value.find(prefix);
|
||||||
|
if (index != std::string::npos) {
|
||||||
|
std::string charset = it->Value.substr(index + prefix.length());
|
||||||
|
index = charset.find(';');
|
||||||
|
if (index != std::string::npos) {
|
||||||
|
charset = charset.substr(0, index);
|
||||||
|
}
|
||||||
|
index = charset.find(' ');
|
||||||
|
if (index != std::string::npos) {
|
||||||
|
charset = charset.substr(0, index);
|
||||||
|
}
|
||||||
|
myBook.setEncoding(charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tag.Name != "BODY";
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HtmlDescriptionReader::characterDataHandler(const char *text, size_t len, bool) {
|
||||||
|
if (myReadTitle) {
|
||||||
|
myBuffer.append(text, len);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HTMLDESCRIPTIONREADER_H__
|
||||||
|
#define __HTMLDESCRIPTIONREADER_H__
|
||||||
|
|
||||||
|
#include "HtmlReader.h"
|
||||||
|
|
||||||
|
class Book;
|
||||||
|
|
||||||
|
class HtmlDescriptionReader : public HtmlReader {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlDescriptionReader(Book &book);
|
||||||
|
~HtmlDescriptionReader();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void startDocumentHandler();
|
||||||
|
void endDocumentHandler();
|
||||||
|
|
||||||
|
bool tagHandler(const HtmlTag &tag);
|
||||||
|
bool characterDataHandler(const char *text, size_t len, bool convert);
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool myReadTitle;
|
||||||
|
std::string myBuffer;
|
||||||
|
Book &myBook;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Nothing to release.
inline HtmlDescriptionReader::~HtmlDescriptionReader() {
}
|
||||||
|
|
||||||
|
#endif /* __HTMLDESCRIPTIONREADER_H__ */
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cctype>
|
||||||
|
|
||||||
|
#include <ZLibrary.h>
|
||||||
|
#include <ZLFile.h>
|
||||||
|
#include <ZLXMLReader.h>
|
||||||
|
|
||||||
|
#include "HtmlEntityCollection.h"
|
||||||
|
|
||||||
|
// XML reader that fills a name -> number map from <entity> elements.
class CollectionReader : public ZLXMLReader {

public:
	CollectionReader(std::map<std::string,int> &collection);
	void startElementHandler(const char *tag, const char **attributes);

private:
	// destination map; owned by the caller
	std::map<std::string,int> &myCollection;
};
|
||||||
|
|
||||||
|
std::map<std::string,int> HtmlEntityCollection::ourCollection;
|
||||||
|
|
||||||
|
int HtmlEntityCollection::symbolNumber(const std::string &name) {
|
||||||
|
if (ourCollection.empty()) {
|
||||||
|
CollectionReader(ourCollection).readDocument(ZLFile(
|
||||||
|
ZLibrary::ApplicationDirectory() + ZLibrary::FileNameDelimiter +
|
||||||
|
"formats" + ZLibrary::FileNameDelimiter +
|
||||||
|
"html" + ZLibrary::FileNameDelimiter + "html.ent"
|
||||||
|
));
|
||||||
|
}
|
||||||
|
std::map<std::string,int>::const_iterator it = ourCollection.find(name);
|
||||||
|
return (it == ourCollection.end()) ? 0 : it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keeps a reference to the map that startElementHandler() will fill.
CollectionReader::CollectionReader(std::map<std::string,int> &collection)
	: myCollection(collection) {
}
|
||||||
|
|
||||||
|
void CollectionReader::startElementHandler(const char *tag, const char **attributes) {
|
||||||
|
static const std::string ENTITY = "entity";
|
||||||
|
|
||||||
|
if (ENTITY == tag) {
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
if (attributes[i] == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static const std::string _name = "name";
|
||||||
|
static const std::string _number = "number";
|
||||||
|
if ((_name == attributes[0]) && (_number == attributes[2])) {
|
||||||
|
myCollection[attributes[1]] = atoi(attributes[3]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HTMLENTITYCOLLECTION_H__
|
||||||
|
#define __HTMLENTITYCOLLECTION_H__
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
// Static lookup table for named HTML entities. Not instantiable.
class HtmlEntityCollection {

public:
	// Returns the character number for the entity name, or 0 if unknown.
	static int symbolNumber(const std::string &name);

private:
	// entity name -> character number
	static std::map<std::string,int> ourCollection;

private:
	// declared private to forbid instantiation
	HtmlEntityCollection();
};
|
||||||
|
|
||||||
|
#endif /* __HTMLENTITYCOLLECTION_H__ */
|
81
jni/NativeFormats/fbreader/src/formats/html/HtmlPlugin.cpp
Normal file
81
jni/NativeFormats/fbreader/src/formats/html/HtmlPlugin.cpp
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <ZLStringUtil.h>
|
||||||
|
#include <ZLFile.h>
|
||||||
|
#include <ZLInputStream.h>
|
||||||
|
|
||||||
|
#include "HtmlPlugin.h"
|
||||||
|
#include "HtmlDescriptionReader.h"
|
||||||
|
#include "HtmlBookReader.h"
|
||||||
|
#include "HtmlReaderStream.h"
|
||||||
|
#include "../txt/PlainTextFormat.h"
|
||||||
|
#include "../util/MiscUtil.h"
|
||||||
|
#include "../../library/Book.h"
|
||||||
|
#include "../../bookmodel/BookModel.h"
|
||||||
|
|
||||||
|
// Identifier of the file type handled by this plugin.
const std::string HtmlPlugin::supportedFileType() const {
	return std::string("HTML");
}
|
||||||
|
|
||||||
|
// Fills the book's encoding/language and title.
// Returns false when the file has no input stream or when no encoding could
// be determined; true otherwise.
bool HtmlPlugin::readMetaInfo(Book &book) const {
	shared_ptr<ZLInputStream> stream = book.file().inputStream();
	if (stream.isNull()) {
		return false;
	}

	// Detection runs on the character data of the document only (tags
	// removed by HtmlReaderStream), limited to this many bytes.
	const size_t detectionSampleSize = 50000;
	shared_ptr<ZLInputStream> htmlStream = new HtmlReaderStream(stream, detectionSampleSize);
	detectEncodingAndLanguage(book, *htmlStream);
	if (book.encoding().empty()) {
		return false;
	}

	HtmlDescriptionReader descriptionReader(book);
	descriptionReader.readDocument(*stream);

	return true;
}
|
||||||
|
|
||||||
|
// Parses the book's HTML file into the given model.
// Returns false only when the file has no input stream.
bool HtmlPlugin::readModel(BookModel &model) const {
	const Book &book = *model.book();
	const ZLFile &file = book.file();

	shared_ptr<ZLInputStream> stream = file.inputStream();
	if (stream.isNull()) {
		return false;
	}

	// Use the stored plain-text format if there is one, otherwise detect it
	// from the stream.
	PlainTextFormat format(file);
	if (!format.initialized()) {
		PlainTextFormatDetector detector;
		detector.detect(*stream, format);
	}

	// Kept as a named local so it outlives the reader constructed from it.
	std::string directoryPrefix = MiscUtil::htmlDirectoryPrefix(file.path());
	HtmlBookReader reader(directoryPrefix, model, format, book.encoding());
	reader.setFileName(MiscUtil::htmlFileName(file.path()));
	reader.readDocument(*stream);

	return true;
}
|
||||||
|
|
||||||
|
//FormatInfoPage *HtmlPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) {
|
||||||
|
// return new PlainTextInfoPage(dialog, file, ZLResourceKey("<PRE>"), false);
|
||||||
|
//}
|
||||||
|
|
||||||
|
// Does nothing and reports success; the book is left untouched here.
bool HtmlPlugin::readLanguageAndEncoding(Book &book) const {
	(void)book;
	return true;
}
|
42
jni/NativeFormats/fbreader/src/formats/html/HtmlPlugin.h
Normal file
42
jni/NativeFormats/fbreader/src/formats/html/HtmlPlugin.h
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HTMLPLUGIN_H__
|
||||||
|
#define __HTMLPLUGIN_H__
|
||||||
|
|
||||||
|
#include "../FormatPlugin.h"
|
||||||
|
|
||||||
|
class HtmlPlugin : public FormatPlugin {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlPlugin();
|
||||||
|
~HtmlPlugin();
|
||||||
|
bool providesMetaInfo() const;
|
||||||
|
const std::string supportedFileType() const;
|
||||||
|
bool readMetaInfo(Book &book) const;
|
||||||
|
bool readLanguageAndEncoding(Book &book) const;
|
||||||
|
bool readModel(BookModel &model) const;
|
||||||
|
// FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file);
|
||||||
|
};
|
||||||
|
|
||||||
|
inline HtmlPlugin::HtmlPlugin() {}
|
||||||
|
inline HtmlPlugin::~HtmlPlugin() {}
|
||||||
|
inline bool HtmlPlugin::providesMetaInfo() const { return false; }
|
||||||
|
|
||||||
|
#endif /* __HTMLPLUGIN_H__ */
|
373
jni/NativeFormats/fbreader/src/formats/html/HtmlReader.cpp
Normal file
373
jni/NativeFormats/fbreader/src/formats/html/HtmlReader.cpp
Normal file
|
@ -0,0 +1,373 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cctype>
|
||||||
|
|
||||||
|
#include <ZLInputStream.h>
|
||||||
|
#include <ZLXMLReader.h>
|
||||||
|
#include <ZLFile.h>
|
||||||
|
#include <ZLStringUtil.h>
|
||||||
|
#include <ZLUnicodeUtil.h>
|
||||||
|
|
||||||
|
#include "HtmlReader.h"
|
||||||
|
#include "HtmlEntityCollection.h"
|
||||||
|
|
||||||
|
// Forwards the encoding name to the EncodedTextReader base.
HtmlReader::HtmlReader(const std::string &encoding)
	: EncodedTextReader(encoding) {
}
|
||||||
|
|
||||||
|
// Nothing to release.
HtmlReader::~HtmlReader() {
}
|
||||||
|
|
||||||
|
// Re-initializes `tag` from the raw tag name as it appears in the document
// (e.g. "p" or "/p"):
//  - all attributes are removed;
//  - a leading '/' marks a closing tag (Start == false) and is stripped;
//  - the name is converted to upper case.
// For an empty name only Name is reset; Start keeps its previous value.
void HtmlReader::setTag(HtmlTag &tag, const std::string &name) {
	tag.Attributes.clear();

	if (name.empty()) {
		tag.Name = name;
		return;
	}

	tag.Start = name[0] != '/';
	if (tag.Start) {
		tag.Name = name;
	} else {
		tag.Name = name.substr(1);
	}

	const size_t len = tag.Name.length();
	for (size_t i = 0; i < len; ++i) {
		// <cctype> functions require a value representable as unsigned char;
		// a plain (possibly negative) char is undefined behavior
		tag.Name[i] = (char)toupper((unsigned char)tag.Name[i]);
	}
}
|
||||||
|
|
||||||
|
// States of the HTML scanner in HtmlReader::readDocument().
enum ParseState {
	PS_TEXT,                      // plain character data
	PS_TAGSTART,                  // just after '<'
	PS_TAGNAME,                   // inside a tag name
	PS_WAIT_END_OF_TAG,           // after '/' of a self-closing tag, waiting for '>'
	PS_ATTRIBUTENAME,             // inside (or between) attribute names
	PS_ATTRIBUTEVALUE,            // inside an attribute value
	PS_SKIPTAG,                   // tag with an empty name, skipped up to '>'
	PS_COMMENT,                   // after "<!"
	PS_SPECIAL,                   // "&..." reference inside text
	PS_SPECIAL_IN_ATTRIBUTEVALUE  // "&..." reference inside an attribute value
};
|
||||||
|
|
||||||
|
// Kind of "&...;" reference currently being scanned.
enum SpecialType {
	ST_UNKNOWN,  // only '&' seen so far
	ST_NUM,      // "&#" seen, radix not known yet
	ST_NAME,     // named entity, e.g. "&amp;"
	ST_DEC,      // decimal reference, e.g. "&#160;"
	ST_HEX       // hexadecimal reference, e.g. "&#xA0;"
};
|
||||||
|
|
||||||
|
// Tells whether `ch` may continue a reference of the given type:
//  - ST_NAME: letters and digits (entity names such as "frac12" or "sup2"
//    contain digits; the first character is checked by the caller);
//  - ST_DEC:  decimal digits;
//  - ST_HEX:  hexadecimal digits.
// Any other type accepts nothing.
static bool allowSymbol(SpecialType type, char ch) {
	// <cctype> functions require a value representable as unsigned char
	const unsigned char symbol = (unsigned char)ch;
	switch (type) {
		case ST_NAME:
			return isalnum(symbol) != 0;
		case ST_DEC:
			return isdigit(symbol) != 0;
		case ST_HEX:
			return isxdigit(symbol) != 0;
		default:
			return false;
	}
}
|
||||||
|
|
||||||
|
// Converts the text of a reference (without the leading '&' and the
// trailing ';') to a character number:
//  - ST_NAME: looked up in HtmlEntityCollection;
//  - ST_DEC:  txt is "#<digits>";
//  - ST_HEX:  txt is "#x<hex digits>".
// Returns 0 when the reference cannot be resolved. Numeric values outside
// 1..0x10FFFF are rejected as well, so an over-long digit string can no
// longer be silently truncated from long to int.
static int specialSymbolNumber(SpecialType type, const std::string &txt) {
	const long maxCodePoint = 0x10FFFF;
	long number = 0;
	switch (type) {
		case ST_NAME:
			return HtmlEntityCollection::symbolNumber(txt);
		case ST_DEC:
			if (txt.length() > 1) {
				number = strtol(txt.c_str() + 1, 0, 10);
			}
			break;
		case ST_HEX:
			if (txt.length() > 2) {
				number = strtol(txt.c_str() + 2, 0, 16);
			}
			break;
		default:
			return 0;
	}
	return ((number > 0) && (number <= maxCodePoint)) ? (int)number : 0;
}
|
||||||
|
|
||||||
|
// Appends `from` to `to` and then empties `from`. When a converter is
// available the text goes through it (and the converter is reset
// afterwards); otherwise the bytes are appended unchanged.
void HtmlReader::appendString(std::string &to, std::string &from) {
	if (!myConverter.isNull()) {
		myConverter->convert(to, from);
		myConverter->reset();
	} else {
		to += from;
	}
	from.erase();
}
|
||||||
|
|
||||||
|
// Scans the stream as HTML and reports what it finds through the handler
// callbacks:
//   startDocumentHandler()            once, before the first byte is read;
//   characterDataHandler(text, n, c)  for text between tags; c tells whether
//                                     the bytes still have to be converted
//                                     from the document encoding;
//   tagHandler(tag)                   for every opening/closing tag (a
//                                     self-closing tag is reported twice,
//                                     first with Start == true, then false);
//   endDocumentHandler()              once, after scanning ends.
// Scanning stops early as soon as a handler returns false. The stream is
// opened here and closed before returning; nothing happens if it cannot be
// opened.
//
// The input is read in fixed-size chunks; tokens that straddle a chunk
// boundary are continued through currentString / specialString.
//
// Character references ("&name;", "&#NNN;", "&#xHHH;") are resolved both in
// text and in attribute values. Text that starts with '&' but is not a valid
// reference is passed through unchanged, and the character that ended it is
// examined again in the surrounding state.
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;        // tag/attribute token continued from a previous chunk
	std::string attributeValueString; // converted value of the attribute being read
	std::string specialString;        // reference text consumed so far, without '&'
	int quotationCounter = 0;         // number of '"' seen in the current attribute value
	HtmlTag currentTag;
	char endOfComment[2] = { '\0', '\0' }; // last two characters seen inside "<!...>"

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	size_t length;
	size_t offset = 0;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			// set when the state changed and *ptr must be looked at again
			bool reprocess;
			do {
				reprocess = false;
				switch (state) {
					case PS_TEXT:
						if (*ptr == '<') {
							if (!characterDataHandler(start, ptr - start, true)) {
								goto endOfProcessing;
							}
							start = ptr + 1;
							state = PS_TAGSTART;
							currentTag.Offset = offset + (ptr - buffer);
						} else if (*ptr == '&') {
							if (!characterDataHandler(start, ptr - start, true)) {
								goto endOfProcessing;
							}
							start = ptr + 1;
							state = PS_SPECIAL;
							state_special = ST_UNKNOWN;
						}
						break;
					case PS_SPECIAL:
					case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					{
						const bool inText = (state == PS_SPECIAL);
						// <cctype> functions need an unsigned char value
						const unsigned char symbol = (unsigned char)*ptr;
						bool malformed = false;
						if (state_special == ST_UNKNOWN) {
							if (*ptr == '#') {
								state_special = ST_NUM;
							} else if (isalpha(symbol)) {
								state_special = ST_NAME;
							} else {
								malformed = true;
							}
						} else if (state_special == ST_NUM) {
							if (*ptr == 'x') {
								state_special = ST_HEX;
							} else if (isdigit(symbol)) {
								state_special = ST_DEC;
							} else {
								malformed = true;
							}
						} else if (*ptr == ';') {
							specialString.append(start, ptr - start);
							const int number = specialSymbolNumber(state_special, specialString);
							if ((128 <= number) && (number <= 159)) {
								// handed on as a single byte in the document
								// encoding rather than as a Unicode code point
								char ch = number;
								if (inText) {
									if (!characterDataHandler(&ch, 1, true)) {
										goto endOfProcessing;
									}
								} else if (myConverter.isNull()) {
									attributeValueString.append(1, ch);
								} else {
									myConverter->convert(attributeValueString, &ch, &ch + 1);
								}
							} else if (number != 0) {
								char utf8[4];
								const int len = ZLUnicodeUtil::ucs4ToUtf8(utf8, number);
								if (inText) {
									if (!characterDataHandler(utf8, len, false)) {
										goto endOfProcessing;
									}
								} else {
									attributeValueString.append(utf8, len);
								}
							} else {
								// unknown reference: keep it verbatim
								specialString = "&" + specialString + ";";
								if (inText) {
									if (!characterDataHandler(specialString.c_str(), specialString.length(), false)) {
										goto endOfProcessing;
									}
								} else {
									attributeValueString += specialString;
								}
							}
							specialString.erase();
							start = ptr + 1;
							state = inText ? PS_TEXT : PS_ATTRIBUTEVALUE;
						} else if (!allowSymbol(state_special, *ptr)) {
							malformed = true;
						}

						if (malformed) {
							// Not a reference after all: give back '&' and the
							// characters consumed after it (all ASCII), then
							// let the surrounding state examine *ptr itself.
							specialString.append(start, ptr - start);
							specialString.insert(0, "&");
							if (inText) {
								if (!characterDataHandler(specialString.data(), specialString.length(), false)) {
									goto endOfProcessing;
								}
							} else {
								attributeValueString += specialString;
							}
							specialString.erase();
							start = ptr;
							state = inText ? PS_TEXT : PS_ATTRIBUTEVALUE;
							reprocess = true;
						}
						break;
					}
					case PS_TAGSTART:
						state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
						break;
					case PS_COMMENT:
						if ((endOfComment[0] == '\0') && (*ptr != '-')) {
							state = PS_TAGNAME;
						} else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
							start = ptr + 1;
							state = PS_TEXT;
							endOfComment[0] = '\0';
							endOfComment[1] = '\0';
						} else {
							endOfComment[0] = endOfComment[1];
							endOfComment[1] = *ptr;
						}
						break;
					case PS_WAIT_END_OF_TAG:
						if (*ptr == '>') {
							start = ptr + 1;
							state = PS_TEXT;
						}
						break;
					case PS_TAGNAME:
						if ((*ptr == '>') || (*ptr == '/') || isspace((unsigned char)*ptr)) {
							currentString.append(start, ptr - start);
							start = ptr + 1;
							setTag(currentTag, currentString);
							currentString.erase();
							if (currentTag.Name == "") {
								state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
							} else {
								if (*ptr == '>') {
									if (!tagHandler(currentTag)) {
										goto endOfProcessing;
									}
									state = PS_TEXT;
								} else if (*ptr == '/') {
									if (!tagHandler(currentTag)) {
										goto endOfProcessing;
									}
									currentTag.Start = false;
									if (!tagHandler(currentTag)) {
										goto endOfProcessing;
									}
									state = PS_WAIT_END_OF_TAG;
								} else {
									state = PS_ATTRIBUTENAME;
								}
							}
						}
						break;
					case PS_ATTRIBUTENAME:
						if ((*ptr == '>') || (*ptr == '/') || (*ptr == '=') || isspace((unsigned char)*ptr)) {
							if ((ptr != start) || !currentString.empty()) {
								currentString.append(start, ptr - start);
								for (unsigned int i = 0; i < currentString.length(); ++i) {
									currentString[i] = (char)toupper((unsigned char)currentString[i]);
								}
								currentTag.addAttribute(currentString);
								currentString.erase();
							}
							start = ptr + 1;
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
							}
						}
						break;
					case PS_ATTRIBUTEVALUE:
						if (*ptr == '"') {
							if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
								++quotationCounter;
							}
						} else if (*ptr == '&') {
							currentString.append(start, ptr - start);
							start = ptr + 1;
							appendString(attributeValueString, currentString);
							state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
							state_special = ST_UNKNOWN;
						} else if ((quotationCounter != 1) && ((*ptr == '>') || (*ptr == '/') || isspace((unsigned char)*ptr))) {
							// attributeValueString is checked too, so that a
							// value made only of a reference is stored instead
							// of leaking into the next attribute
							if ((ptr != start) || !currentString.empty() || !attributeValueString.empty()) {
								currentString.append(start, ptr - start);
								appendString(attributeValueString, currentString);
								if (!attributeValueString.empty() && (attributeValueString[0] == '"')) {
									attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
								}
								currentTag.setLastAttributeValue(attributeValueString);
								attributeValueString.erase();
								quotationCounter = 0;
							}
							start = ptr + 1;
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
						break;
					case PS_SKIPTAG:
						if (*ptr == '>') {
							start = ptr + 1;
							state = PS_TEXT;
						}
						break;
				}
			} while (reprocess);
		}

		// Hand over / remember whatever is left in this chunk.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					specialString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
				case PS_WAIT_END_OF_TAG:
					break;
			}
		}
		offset += length;
	} while (length == BUFSIZE);

	// The document ended inside an unterminated reference in text: pass the
	// consumed characters through rather than dropping them.
	if (state == PS_SPECIAL) {
		specialString.insert(0, "&");
		characterDataHandler(specialString.data(), specialString.length(), false);
	}

endOfProcessing:
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
|
92
jni/NativeFormats/fbreader/src/formats/html/HtmlReader.h
Normal file
92
jni/NativeFormats/fbreader/src/formats/html/HtmlReader.h
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HTMLREADER_H__
|
||||||
|
#define __HTMLREADER_H__
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <ZLEncodingConverter.h>
|
||||||
|
#include "../EncodedTextReader.h"
|
||||||
|
|
||||||
|
class ZLInputStream;
|
||||||
|
|
||||||
|
class HtmlReader : public EncodedTextReader {
|
||||||
|
|
||||||
|
public:
|
||||||
|
struct HtmlAttribute {
|
||||||
|
std::string Name;
|
||||||
|
std::string Value;
|
||||||
|
bool HasValue;
|
||||||
|
|
||||||
|
HtmlAttribute(const std::string &name);
|
||||||
|
~HtmlAttribute();
|
||||||
|
void setValue(const std::string &value);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct HtmlTag {
|
||||||
|
std::string Name;
|
||||||
|
size_t Offset;
|
||||||
|
bool Start;
|
||||||
|
std::vector<HtmlAttribute> Attributes;
|
||||||
|
|
||||||
|
HtmlTag();
|
||||||
|
~HtmlTag();
|
||||||
|
void addAttribute(const std::string &name);
|
||||||
|
void setLastAttributeValue(const std::string &value);
|
||||||
|
|
||||||
|
private:
|
||||||
|
HtmlTag(const HtmlTag&);
|
||||||
|
const HtmlTag &operator = (const HtmlTag&);
|
||||||
|
};
|
||||||
|
|
||||||
|
private:
|
||||||
|
static void setTag(HtmlTag &tag, const std::string &fullName);
|
||||||
|
|
||||||
|
public:
|
||||||
|
virtual void readDocument(ZLInputStream &stream);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
HtmlReader(const std::string &encoding);
|
||||||
|
virtual ~HtmlReader();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual void startDocumentHandler() = 0;
|
||||||
|
virtual void endDocumentHandler() = 0;
|
||||||
|
|
||||||
|
// returns false iff processing must be stopped
|
||||||
|
virtual bool tagHandler(const HtmlTag &tag) = 0;
|
||||||
|
// returns false iff processing must be stopped
|
||||||
|
virtual bool characterDataHandler(const char *text, size_t len, bool convert) = 0;
|
||||||
|
|
||||||
|
private:
|
||||||
|
void appendString(std::string &to, std::string &from);
|
||||||
|
};
|
||||||
|
|
||||||
|
inline HtmlReader::HtmlAttribute::HtmlAttribute(const std::string &name) : Name(name), HasValue(false) {}
|
||||||
|
inline HtmlReader::HtmlAttribute::~HtmlAttribute() {}
|
||||||
|
inline void HtmlReader::HtmlAttribute::setValue(const std::string &value) { Value = value; HasValue = true; }
|
||||||
|
|
||||||
|
inline HtmlReader::HtmlTag::HtmlTag() : Start(true) {}
|
||||||
|
inline HtmlReader::HtmlTag::~HtmlTag() {}
|
||||||
|
inline void HtmlReader::HtmlTag::addAttribute(const std::string &name) { Attributes.push_back(HtmlAttribute(name)); }
|
||||||
|
inline void HtmlReader::HtmlTag::setLastAttributeValue(const std::string &value) { if (!Attributes.empty()) Attributes.back().setValue(value); }
|
||||||
|
|
||||||
|
#endif /* __HTMLREADER_H__ */
|
128
jni/NativeFormats/fbreader/src/formats/html/HtmlReaderStream.cpp
Normal file
128
jni/NativeFormats/fbreader/src/formats/html/HtmlReaderStream.cpp
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2008-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
#include "HtmlReaderStream.h"
|
||||||
|
#include "HtmlReader.h"
|
||||||
|
|
||||||
|
class HtmlTextOnlyReader : public HtmlReader {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlTextOnlyReader(char *buffer, size_t maxSize);
|
||||||
|
size_t size() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
void startDocumentHandler();
|
||||||
|
void endDocumentHandler();
|
||||||
|
|
||||||
|
bool tagHandler(const HtmlTag &tag);
|
||||||
|
bool characterDataHandler(const char *text, size_t len, bool convert);
|
||||||
|
|
||||||
|
private:
|
||||||
|
char *myBuffer;
|
||||||
|
size_t myMaxSize;
|
||||||
|
size_t myFilledSize;
|
||||||
|
bool myIgnoreText;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Starts with an empty buffer; the base reader gets an empty encoding name.
HtmlTextOnlyReader::HtmlTextOnlyReader(char *buffer, size_t maxSize)
	: HtmlReader(std::string()),
	  myBuffer(buffer),
	  myMaxSize(maxSize),
	  myFilledSize(0),
	  myIgnoreText(false) {
}
|
||||||
|
|
||||||
|
size_t HtmlTextOnlyReader::size() const {
|
||||||
|
return myFilledSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Nothing to prepare.
void HtmlTextOnlyReader::startDocumentHandler() {
	return;
}
|
||||||
|
|
||||||
|
// Nothing to finish.
void HtmlTextOnlyReader::endDocumentHandler() {
	return;
}
|
||||||
|
|
||||||
|
bool HtmlTextOnlyReader::tagHandler(const HtmlTag &tag) {
|
||||||
|
if (tag.Name == "SCRIPT") {
|
||||||
|
myIgnoreText = tag.Start;
|
||||||
|
}
|
||||||
|
if ((myFilledSize < myMaxSize) && (myFilledSize > 0) && (myBuffer[myFilledSize - 1] != '\n')) {
|
||||||
|
myBuffer[myFilledSize++] = '\n';
|
||||||
|
}
|
||||||
|
return myFilledSize < myMaxSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HtmlTextOnlyReader::characterDataHandler(const char *text, size_t len, bool) {
|
||||||
|
if (!myIgnoreText) {
|
||||||
|
len = std::min((size_t)len, myMaxSize - myFilledSize);
|
||||||
|
memcpy(myBuffer + myFilledSize, text, len);
|
||||||
|
myFilledSize += len;
|
||||||
|
}
|
||||||
|
return myFilledSize < myMaxSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wraps `base`; at most `maxSize` bytes of extracted text will be kept once
// the stream is opened. No buffer is allocated until open().
HtmlReaderStream::HtmlReaderStream(shared_ptr<ZLInputStream> base, size_t maxSize) : myBase(base), myBuffer(0), mySize(maxSize) {
	// Assigned here rather than in the initializer list because the member
	// declaration order is not visible from this file. Without it offset(),
	// seek() and read() would use an indeterminate value before open().
	myOffset = 0;
}
|
||||||
|
|
||||||
|
// Releases the text buffer if the stream is still open.
HtmlReaderStream::~HtmlReaderStream() {
	this->close();
}
|
||||||
|
|
||||||
|
bool HtmlReaderStream::open() {
|
||||||
|
if (myBase.isNull() || !myBase->open()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
myBuffer = new char[mySize];
|
||||||
|
HtmlTextOnlyReader reader(myBuffer, mySize);
|
||||||
|
reader.readDocument(*myBase);
|
||||||
|
mySize = reader.size();
|
||||||
|
myOffset = 0;
|
||||||
|
myBase->close();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t HtmlReaderStream::read(char *buffer, size_t maxSize) {
|
||||||
|
maxSize = std::min(maxSize, mySize - myOffset);
|
||||||
|
if (buffer != 0) {
|
||||||
|
memcpy(buffer, myBuffer, maxSize);
|
||||||
|
}
|
||||||
|
myOffset += maxSize;
|
||||||
|
return maxSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
void HtmlReaderStream::close() {
|
||||||
|
if (myBuffer != 0) {
|
||||||
|
delete[] myBuffer;
|
||||||
|
myBuffer = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Moves the read position.  With absoluteOffset the new position is
// `offset`; otherwise `offset` is added to the current position.
// The result is clamped to the range [0, mySize].
void HtmlReaderStream::seek(int offset, bool absoluteOffset) {
	const int target = absoluteOffset ? offset : (int)(offset + myOffset);
	if (target <= 0) {
		myOffset = 0;
	} else {
		myOffset = std::min(mySize, (size_t)target);
	}
}
|
||||||
|
|
||||||
|
size_t HtmlReaderStream::offset() const {
|
||||||
|
return myOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t HtmlReaderStream::sizeOfOpened() {
|
||||||
|
return mySize;
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2008-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HTMLREADERSTREAM_H__
|
||||||
|
#define __HTMLREADERSTREAM_H__
|
||||||
|
|
||||||
|
#include <shared_ptr.h>
|
||||||
|
#include <ZLInputStream.h>
|
||||||
|
|
||||||
|
class HtmlReaderStream : public ZLInputStream {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlReaderStream(shared_ptr<ZLInputStream> base, size_t maxSize);
|
||||||
|
~HtmlReaderStream();
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool open();
|
||||||
|
size_t read(char *buffer, size_t maxSize);
|
||||||
|
void close();
|
||||||
|
|
||||||
|
void seek(int offset, bool absoluteOffset);
|
||||||
|
size_t offset() const;
|
||||||
|
size_t sizeOfOpened();
|
||||||
|
|
||||||
|
private:
|
||||||
|
shared_ptr<ZLInputStream> myBase;
|
||||||
|
char *myBuffer;
|
||||||
|
size_t mySize;
|
||||||
|
size_t myOffset;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* __HTMLREADERSTREAM_H__ */
|
158
jni/NativeFormats/fbreader/src/formats/html/HtmlTagActions.h
Normal file
158
jni/NativeFormats/fbreader/src/formats/html/HtmlTagActions.h
Normal file
|
@ -0,0 +1,158 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HTMLTAGACTIONS_H__
|
||||||
|
#define __HTMLTAGACTIONS_H__
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
#include "HtmlBookReader.h"
|
||||||
|
|
||||||
|
class HtmlTagAction {
|
||||||
|
|
||||||
|
protected:
|
||||||
|
HtmlTagAction(HtmlBookReader &reader);
|
||||||
|
|
||||||
|
public:
|
||||||
|
virtual ~HtmlTagAction();
|
||||||
|
virtual void run(const HtmlReader::HtmlTag &tag) = 0;
|
||||||
|
virtual void reset();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
BookReader &bookReader();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
HtmlBookReader &myReader;
|
||||||
|
};
|
||||||
|
|
||||||
|
class DummyHtmlTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
DummyHtmlTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlControlTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
|
||||||
|
private:
|
||||||
|
FBTextKind myKind;
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlHeaderTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
|
||||||
|
private:
|
||||||
|
FBTextKind myKind;
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlIgnoreTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlIgnoreTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::set<std::string> myTagNames;
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlHrefTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlHrefTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
void reset();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
FBTextKind hyperlinkType() const;
|
||||||
|
void setHyperlinkType(FBTextKind hyperlinkType);
|
||||||
|
|
||||||
|
private:
|
||||||
|
FBTextKind myHyperlinkType;
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlImageTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlImageTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlBreakTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
enum BreakType {
|
||||||
|
BREAK_AT_START = 1,
|
||||||
|
BREAK_AT_END = 2,
|
||||||
|
BREAK_AT_START_AND_AT_END = BREAK_AT_START | BREAK_AT_END
|
||||||
|
};
|
||||||
|
HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
|
||||||
|
private:
|
||||||
|
BreakType myBreakType;
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlPreTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlPreTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlListTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlListTagAction(HtmlBookReader &reader, int startIndex);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
|
||||||
|
private:
|
||||||
|
int myStartIndex;
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlListItemTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlListItemTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlTableTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlTableTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
};
|
||||||
|
|
||||||
|
class HtmlStyleTagAction : public HtmlTagAction {
|
||||||
|
|
||||||
|
public:
|
||||||
|
HtmlStyleTagAction(HtmlBookReader &reader);
|
||||||
|
void run(const HtmlReader::HtmlTag &tag);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns the BookReader held by the associated HtmlBookReader.
inline BookReader &HtmlTagAction::bookReader() {
	return myReader.myBookReader;
}
|
||||||
|
|
||||||
|
#endif /* __HTMLTAGACTIONS_H__ */
|
Loading…
Add table
Add a link
Reference in a new issue