1
0
Fork 0
mirror of https://github.com/geometer/FBReaderJ.git synced 2025-10-03 09:49:19 +02:00
FBReaderJ/jni/NativeFormats/fbreader/src/formats/html/HtmlBookReader.cpp

621 lines
18 KiB
C++

/*
* Copyright (C) 2004-2015 FBReader.ORG Limited <contact@fbreader.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <cctype>
#include <ZLFile.h>
#include <ZLFileImage.h>
#include <ZLStringUtil.h>
#include "HtmlBookReader.h"
#include "HtmlTagActions.h"
#include "../txt/PlainTextFormat.h"
#include "../util/MiscUtil.h"
#include "../../bookmodel/BookModel.h"
#include "../css/StyleSheetParser.h"
HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) {
}
HtmlTagAction::~HtmlTagAction() {
}
void HtmlTagAction::reset() {
}
DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) {
}
HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
}
void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) {
std::vector<FBTextKind> &list = myReader.myKindList;
int index;
for (index = list.size() - 1; index >= 0; --index) {
if (list[index] == myKind) {
break;
}
}
if (tag.Start) {
if (index == -1) {
bookReader().pushKind(myKind);
myReader.myKindList.push_back(myKind);
bookReader().addControl(myKind, true);
}
} else {
if (index >= 0) {
for (int i = list.size() - 1; i >= index; --i) {
bookReader().addControl(list[i], false);
bookReader().popKind();
}
for (unsigned int j = index + 1; j < list.size(); ++j) {
bookReader().addControl(list[j], true);
bookReader().pushKind(list[j]);
}
list.erase(list.begin() + index);
}
}
}
HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
}
void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) {
myReader.myIsStarted = false;
if (tag.Start) {
if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
if (!bookReader().contentsParagraphIsOpen()) {
bookReader().insertEndOfSectionParagraph();
bookReader().enterTitle();
bookReader().beginContentsParagraph();
}
}
bookReader().pushKind(myKind);
} else {
bookReader().popKind();
if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
bookReader().endContentsParagraph();
bookReader().exitTitle();
}
}
bookReader().beginParagraph();
}
HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) {
if (tag.Start) {
if (myTagNames.find(tag.Name) == myTagNames.end()) {
++myReader.myIgnoreDataCounter;
myTagNames.insert(tag.Name);
}
} else {
if (myTagNames.find(tag.Name) != myTagNames.end()) {
--myReader.myIgnoreDataCounter;
myTagNames.erase(tag.Name);
}
}
}
HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) {
if (tag.Start) {
for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
if (tag.Attributes[i].Name == "name") {
bookReader().addHyperlinkLabel(tag.Attributes[i].Value);
} else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "href")) {
std::string value = tag.Attributes[i].Value;
if (!myReader.myFileName.empty() &&
(value.length() > myReader.myFileName.length()) &&
(value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) {
value = value.substr(myReader.myFileName.length());
}
if (!value.empty()) {
if (value[0] == '#') {
setHyperlinkType(INTERNAL_HYPERLINK);
bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1));
} else {
FBTextKind hyperlinkType = MiscUtil::referenceType(value);
if (hyperlinkType != INTERNAL_HYPERLINK) {
setHyperlinkType(hyperlinkType);
bookReader().addHyperlinkControl(hyperlinkType, value);
}
}
}
}
}
} else if (hyperlinkType() != REGULAR) {
bookReader().addControl(hyperlinkType(), false);
setHyperlinkType(REGULAR);
}
}
void HtmlHrefTagAction::reset() {
setHyperlinkType(REGULAR);
}
FBTextKind HtmlHrefTagAction::hyperlinkType() const {
return myHyperlinkType;
}
void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) {
myHyperlinkType = hyperlinkType;
}
HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) {
if (tag.Start) {
bookReader().endParagraph();
const std::string *ptr = tag.find("src");
if (ptr != 0) {
const std::string fileName = MiscUtil::decodeHtmlURL(*ptr);
const ZLFile file(myReader.myBaseDirPath + fileName);
if (file.exists()) {
bookReader().addImageReference(fileName, 0, false);
bookReader().addImage(fileName, new ZLFileImage(file, "", 0));
}
}
bookReader().beginParagraph();
}
}
HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) {
}
void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) {
if (myReader.myDontBreakParagraph) {
myReader.myDontBreakParagraph = false;
return;
}
if ((tag.Start && (myBreakType & BREAK_AT_START)) ||
(!tag.Start && (myBreakType & BREAK_AT_END))) {
bookReader().endParagraph();
if (bookReader().isKindStackEmpty()) {
bookReader().pushKind(REGULAR);
}
bookReader().beginParagraph();
}
}
HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) {
bookReader().endParagraph();
myReader.myIsPreformatted = tag.Start;
myReader.mySpaceCounter = -1;
myReader.myBreakCounter = 0;
if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
if (tag.Start) {
bookReader().pushKind(PREFORMATTED);
} else {
bookReader().popKind();
}
}
bookReader().beginParagraph();
}
HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), myStartIndex(startIndex) {
}
void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) {
if (tag.Start) {
myReader.myListNumStack.push(myStartIndex);
} else if (!myReader.myListNumStack.empty()) {
myReader.myListNumStack.pop();
}
}
HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) {
if (tag.Start) {
bookReader().endParagraph();
bookReader().beginParagraph();
if (!myReader.myListNumStack.empty()) {
bookReader().addFixedHSpace(3 * myReader.myListNumStack.size());
int &index = myReader.myListNumStack.top();
if (index == 0) {
myReader.addConvertedDataToBuffer("\342\200\242", 3, false);
} else {
const std::string number = ZLStringUtil::numberToString(index++) + ".";
myReader.addConvertedDataToBuffer(number.data(), number.length(), false);
}
bookReader().addFixedHSpace(1);
myReader.myDontBreakParagraph = true;
}
} else {
myReader.myDontBreakParagraph = false;
}
}
HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) {
if (tag.Start) {
myReader.myIgnoreTitles = true;
} else {
myReader.myIgnoreTitles = false;
}
}
HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
}
shared_ptr<StyleSheetParser> HtmlBookReader::createCSSParser() {
return new StyleSheetTableParser(myBaseDirPath, myStyleSheetTable, myFontMap, 0);
}
void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) {
myReader.myStyleSheetParser = tag.Start ? myReader.createCSSParser() : 0;
/*
if (!tag.Start) {
myReader.myStyleSheetTable.dump();
}
*/
}
shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) {
if (tag == "em") {
return new HtmlControlTagAction(*this, EMPHASIS);
} else if (tag == "strong") {
return new HtmlControlTagAction(*this, STRONG);
} else if (tag == "b") {
return new HtmlControlTagAction(*this, BOLD);
} else if (tag == "i") {
return new HtmlControlTagAction(*this, ITALIC);
} else if (tag == "tt") {
return new HtmlControlTagAction(*this, CODE);
} else if (tag == "code") {
return new HtmlControlTagAction(*this, CODE);
} else if (tag == "cite") {
return new HtmlControlTagAction(*this, CITE);
} else if (tag == "sub") {
return new HtmlControlTagAction(*this, SUB);
} else if (tag == "sup") {
return new HtmlControlTagAction(*this, SUP);
} else if (tag == "h1") {
return new HtmlHeaderTagAction(*this, H1);
} else if (tag == "h2") {
return new HtmlHeaderTagAction(*this, H2);
} else if (tag == "h3") {
return new HtmlHeaderTagAction(*this, H3);
} else if (tag == "h4") {
return new HtmlHeaderTagAction(*this, H4);
} else if (tag == "h5") {
return new HtmlHeaderTagAction(*this, H5);
} else if (tag == "h6") {
return new HtmlHeaderTagAction(*this, H6);
} else if (tag == "head") {
return new HtmlIgnoreTagAction(*this);
} else if (tag == "title") {
return new HtmlIgnoreTagAction(*this);
} else if (tag == "style") {
return new HtmlStyleTagAction(*this);
} else if (tag == "select") {
return new HtmlIgnoreTagAction(*this);
} else if (tag == "script") {
return new HtmlIgnoreTagAction(*this);
} else if (tag == "a") {
return new HtmlHrefTagAction(*this);
} else if (tag == "td") {
//return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
} else if (tag == "tr") {
return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
} else if (tag == "div") {
return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
} else if (tag == "dt") {
return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START);
} else if (tag == "p") {
return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
} else if (tag == "br") {
return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
} else if (tag == "blockquote") {
return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
} else if (tag == "img") {
return new HtmlImageTagAction(*this);
} else if (tag == "ul") {
return new HtmlListTagAction(*this, 0);
} else if (tag == "menu") {
return new HtmlListTagAction(*this, 0);
} else if (tag == "dir") {
return new HtmlListTagAction(*this, 0);
} else if (tag == "ol") {
return new HtmlListTagAction(*this, 1);
} else if (tag == "li") {
return new HtmlListItemTagAction(*this);
} else if (tag == "pre") {
if (myProcessPreTag) {
return new HtmlPreTagAction(*this);
}
} else if (tag == "table") {
return new HtmlTableTagAction(*this);
}
/*
} else if (tag == "dd") {
return 0;
} else if (tag == "dl") {
return 0;
} else if (tag == "dfn") {
return 0;
} else if (tag == "samp") {
return 0;
} else if (tag == "kbd") {
return 0;
} else if (tag == "var") {
return 0;
} else if (tag == "abbr") {
return 0;
} else if (tag == "acronym") {
return 0;
} else if (tag == "q") {
return 0;
} else if (tag == "ins") {
return 0;
} else if (tag == "del") {
return 0;
} else if (tag == "body") {
return 0;
*/
return new DummyHtmlTagAction(*this);
}
void HtmlBookReader::setBuildTableOfContent(bool build) {
myBuildTableOfContent = build;
}
void HtmlBookReader::setProcessPreTag(bool process) {
myProcessPreTag = process;
}
HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) {
myFontMap = new FontMap();
}
HtmlBookReader::~HtmlBookReader() {
}
void HtmlBookReader::addConvertedDataToBuffer(const char *text, std::size_t len, bool convert) {
if (len > 0) {
if (myDontBreakParagraph) {
while (len > 0 && std::isspace(*text)) {
--len;
++text;
}
if (len == 0) {
return;
}
}
if (convert) {
myConverter->convert(myConverterBuffer, text, text + len);
myBookReader.addData(myConverterBuffer);
myBookReader.addContentsData(myConverterBuffer);
myConverterBuffer.erase();
} else {
std::string strText(text, len);
myBookReader.addData(strText);
myBookReader.addContentsData(strText);
}
myDontBreakParagraph = false;
}
}
void HtmlBookReader::TagData::addEntry(shared_ptr<ZLTextStyleEntry> entry) {
if (!entry.isNull()) {
StyleEntries.push_back(entry);
}
}
bool HtmlBookReader::tagHandler(const HtmlTag &tag) {
myConverter->reset();
if (tag.Start) {
shared_ptr<TagData> tagData = new TagData();
tagData->addEntry(myStyleSheetTable.control(tag.Name, ""));
const std::string *cls = tag.find("class");
if (cls != 0) {
tagData->addEntry(myStyleSheetTable.control("", *cls));
tagData->addEntry(myStyleSheetTable.control(tag.Name, *cls));
}
myTagDataStack.push_back(tagData);
} else if (!myTagDataStack.empty()) {
for (int i = myTagDataStack.back()->StyleEntries.size(); i > 0; --i) {
myBookReader.addStyleCloseEntry();
}
myTagDataStack.pop_back();
}
const std::string *id = tag.find("id");
if (id != 0) {
myBookReader.addHyperlinkLabel(*id);
}
shared_ptr<HtmlTagAction> action = myActionMap[tag.Name];
if (action.isNull()) {
action = createAction(tag.Name);
myActionMap[tag.Name] = action;
}
action->run(tag);
if (tag.Start) {
for (std::vector<shared_ptr<TagData> >::const_iterator it = myTagDataStack.begin(); it != myTagDataStack.end(); ++it) {
const unsigned char depth = it - myTagDataStack.begin() + 1;
const std::vector<shared_ptr<ZLTextStyleEntry> > &entries = (*it)->StyleEntries;
const bool inheritedOnly = it + 1 != myTagDataStack.end();
for (std::vector<shared_ptr<ZLTextStyleEntry> >::const_iterator jt = entries.begin(); jt != entries.end(); ++jt) {
shared_ptr<ZLTextStyleEntry> entry = inheritedOnly ? (*jt)->inherited() : *jt;
myBookReader.addStyleEntry(*entry, depth);
}
}
}
return true;
}
void HtmlBookReader::preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert) {
const char *start = text;
const char *end = text + len;
int breakType = myFormat.breakType();
if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
for (const char *ptr = text; ptr != end; ++ptr) {
if (*ptr == '\n') {
mySpaceCounter = 0;
if (start < ptr) {
addConvertedDataToBuffer(start, ptr - start, convert);
} else {
static const std::string SPACE = " ";
myBookReader.addData(SPACE);
}
myBookReader.endParagraph();
myBookReader.beginParagraph();
start = ptr + 1;
} else if (mySpaceCounter >= 0) {
if (std::isspace((unsigned char)*ptr)) {
++mySpaceCounter;
} else {
myBookReader.addFixedHSpace(mySpaceCounter);
mySpaceCounter = -1;
}
}
}
addConvertedDataToBuffer(start, end - start, convert);
} else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) {
for (const char *ptr = text; ptr != end; ++ptr) {
if (std::isspace((unsigned char)*ptr)) {
if (*ptr == '\n') {
mySpaceCounter = 0;
} else if (mySpaceCounter >= 0) {
++mySpaceCounter;
}
} else {
if (mySpaceCounter > myFormat.ignoredIndent()) {
if (ptr - start > mySpaceCounter) {
addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert);
myBookReader.endParagraph();
myBookReader.beginParagraph();
}
start = ptr;
}
mySpaceCounter = -1;
}
}
mySpaceCounter = std::max(mySpaceCounter, 0);
if (end - start > mySpaceCounter) {
addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert);
}
} else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) {
for (const char *ptr = start; ptr != end; ++ptr) {
if (std::isspace((unsigned char)*ptr)) {
if (*ptr == '\n') {
++myBreakCounter;
}
} else {
if (myBreakCounter > 1) {
addConvertedDataToBuffer(start, ptr - start, convert);
myBookReader.endParagraph();
myBookReader.beginParagraph();
start = ptr;
}
myBreakCounter = 0;
}
}
addConvertedDataToBuffer(start, end - start, convert);
}
}
bool HtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) {
if (!myStyleSheetParser.isNull()) {
myStyleSheetParser->parseString(text, len);
return true;
}
if (myIgnoreDataCounter != 0) {
return true;
}
if (myIsPreformatted) {
preformattedCharacterDataHandler(text, len, convert);
return true;
}
const char *ptr = text;
const char *end = text + len;
if (!myIsStarted) {
for (; ptr != end; ++ptr) {
if (!std::isspace((unsigned char)*ptr)) {
myIsStarted = true;
break;
}
}
}
if (myIsStarted) {
addConvertedDataToBuffer(ptr, end - ptr, convert);
}
return true;
}
void HtmlBookReader::startDocumentHandler() {
while (!myListNumStack.empty()) {
myListNumStack.pop();
}
myTagDataStack.clear();
myConverterBuffer.erase();
myKindList.clear();
myBookReader.reset();
myBookReader.setMainTextModel();
myBookReader.pushKind(REGULAR);
myBookReader.beginParagraph();
myIgnoreDataCounter = 0;
myIsPreformatted = false;
myDontBreakParagraph = false;
for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) {
it->second->reset();
}
myIsStarted = false;
myIgnoreTitles = false;
myStyleSheetParser = 0;
mySpaceCounter = -1;
myBreakCounter = 0;
}
void HtmlBookReader::endDocumentHandler() {
myBookReader.endParagraph();
}
void HtmlBookReader::setFileName(const std::string fileName) {
myFileName = fileName;
}
size_t HtmlBookReader::listStackDepth() const {
return myListNumStack.size();
}