1
0
Fork 0
mirror of https://github.com/geometer/FBReaderJ.git synced 2025-10-05 02:39:23 +02:00

first version of ms-word doc plugin has been added

This commit is contained in:
Alexander Turkin 2012-06-15 01:27:06 +04:00
parent 899528c2be
commit 52607ae0f1
19 changed files with 2686 additions and 0 deletions

View file

@ -125,6 +125,14 @@ LOCAL_SRC_FILES := \
NativeFormats/fbreader/src/formats/util/MiscUtil.cpp \
NativeFormats/fbreader/src/formats/util/XMLTextStream.cpp \
NativeFormats/fbreader/src/formats/xhtml/XHTMLReader.cpp \
NativeFormats/fbreader/src/formats/doc/DocBookReader.cpp \
NativeFormats/fbreader/src/formats/doc/DocMetaInfoReader.cpp \
NativeFormats/fbreader/src/formats/doc/DocPlugin.cpp \
NativeFormats/fbreader/src/formats/doc/OleMainStream.cpp \
NativeFormats/fbreader/src/formats/doc/OleStorage.cpp \
NativeFormats/fbreader/src/formats/doc/OleStream.cpp \
NativeFormats/fbreader/src/formats/doc/OleStreamReader.cpp \
NativeFormats/fbreader/src/formats/doc/OleUtil.cpp \
NativeFormats/fbreader/src/library/Author.cpp \
NativeFormats/fbreader/src/library/Book.cpp \
NativeFormats/fbreader/src/library/Comparators.cpp \

View file

@ -36,6 +36,7 @@
//#include "chm/CHMPlugin.h"
#include "rtf/RtfPlugin.h"
//#include "openreader/OpenReaderPlugin.h"
#include "doc/DocPlugin.h"
PluginCollection *PluginCollection::ourInstance = 0;
@ -54,6 +55,7 @@ PluginCollection &PluginCollection::Instance() {
// ourInstance->myPlugins.push_back(new CHMPlugin());
ourInstance->myPlugins.push_back(new OEBPlugin());
ourInstance->myPlugins.push_back(new RtfPlugin());
ourInstance->myPlugins.push_back(new DocPlugin());
// ourInstance->myPlugins.push_back(new OpenReaderPlugin());
}
return *ourInstance;

View file

@ -0,0 +1,357 @@
/*
* Copyright (C) 2004-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <iostream>
#include <vector>
#include <string>
#include <ZLInputStream.h>
#include <ZLLogger.h>
#include <ZLFile.h>
#include <ZLStringUtil.h>
#include "DocBookReader.h"
#include "../../bookmodel/BookModel.h"
#include "../../library/Book.h"
#include "OleStorage.h"
#include "OleMainStream.h"
/**
 * Creates a reader that feeds the decoded Word document into `model`.
 *
 * `encoding` is forwarded unchanged to the OleStreamReader base class.
 * All three state-machine members are given a defined start value; the
 * original left myReadFieldState and myHyperlinkTypeState uninitialized
 * until the first field-begin mark was seen.
 */
DocBookReader::DocBookReader(BookModel &model, const std::string &encoding) :
	OleStreamReader(encoding),
	myModelReader(model),
	myReadState(READ_TEXT),
	myReadFieldState(READ_FIELD_TEXT),
	myHyperlinkTypeState(NO_HYPERLINK) {
}
bool DocBookReader::readBook() {
const ZLFile &file = myModelReader.model().book()->file();
shared_ptr<ZLInputStream> stream = file.inputStream();
if (stream.isNull()) {
return false;
}
return readDocument(stream, file.size());
}
/**
 * Parses an OLE compound file and streams its "WordDocument" entry into the
 * book model.
 *
 * @param inputStream stream over the whole .doc file; it is opened here
 * @param streamSize  size passed on to OleStorage::init()
 * @return true when the main stream was read completely; false if the
 *         stream cannot be opened, the OLE storage is broken, the
 *         "WordDocument" entry is missing, or readStream() fails
 */
bool DocBookReader::readDocument(shared_ptr<ZLInputStream> inputStream, size_t streamSize) {
	static const std::string WORD_DOCUMENT = "WordDocument";

	if (inputStream.isNull()) {
		return false;
	}
	if (!inputStream->open()) {
		return false;
	}

	// The model is prepared before the storage is validated, so a paragraph
	// is already open when one of the checks below fails.
	myModelReader.setMainTextModel();
	myModelReader.pushKind(REGULAR);
	myModelReader.beginParagraph();

	shared_ptr<OleStorage> oleStorage = new OleStorage;
	if (!oleStorage->init(inputStream, streamSize)) {
		ZLLogger::Instance().println("DocBookReader", "Broken OLE file!");
		return false;
	}

	OleEntry mainEntry;
	if (!oleStorage->getEntryByName(WORD_DOCUMENT, mainEntry)) {
		return false;
	}

	OleMainStream mainStream(oleStorage, mainEntry, inputStream);
	if (!readStream(mainStream)) {
		return false;
	}

	myModelReader.insertEndOfTextParagraph();
	return true;
}
/**
 * Consumes one UCS-2 character of document text.
 *
 * While a field is being read the character is either collected as field
 * instruction text, dropped, or (for a tab inside field result text) used
 * as the signal to stop reading the rest of the field. Otherwise the
 * character is converted to UTF-8 and appended to the current paragraph,
 * opening one first if necessary.
 */
void DocBookReader::handleChar(ZLUnicodeUtil::Ucs2Char ucs2char) {
	if (myReadState == READ_FIELD) {
		switch (myReadFieldState) {
			case READ_FIELD_INFO:
				myFieldInfoBuffer.push_back(ucs2char);
				return;
			case DONT_READ_FIELD_TEXT:
				return;
			case READ_FIELD_TEXT:
				if (ucs2char == WORD_HORIZONTAL_TAB) {
					//to remove pagination from TOC (from doc saved in OpenOffice)
					myReadFieldState = DONT_READ_FIELD_TEXT;
					return;
				}
				break;
		}
	}

	ZLUnicodeUtil::Ucs2String singleChar;
	singleChar.push_back(ucs2char);
	std::string encoded;
	ZLUnicodeUtil::ucs2ToUtf8(encoded, singleChar);

	if (!myModelReader.paragraphIsOpen()) {
		myModelReader.beginParagraph();
	}
	myModelReader.addData(encoded);
}
void DocBookReader::handleHardLinebreak() {
if (myModelReader.paragraphIsOpen()) {
myModelReader.endParagraph();
}
myModelReader.beginParagraph();
if (!myCurStyleEntry.isNull()) {
myModelReader.addStyleEntry(*myCurStyleEntry);
}
for (size_t i = 0; i < myKindStack.size(); ++i) {
myModelReader.addControl(myKindStack.at(i), true);
}
}
/**
 * Closes the current paragraph (if one is open), opens a fresh one and
 * forgets the paragraph style entry, so the new paragraph starts unstyled.
 */
void DocBookReader::handleParagraphEnd() {
	const bool hasOpenParagraph = myModelReader.paragraphIsOpen();
	if (hasOpenParagraph) {
		myModelReader.endParagraph();
	}
	myModelReader.beginParagraph();
	myCurStyleEntry = 0;
}
/**
 * Handles a page break: ends the open paragraph, drops the current
 * paragraph style entry, inserts an end-of-section paragraph into the
 * model and opens a new paragraph after it.
 */
void DocBookReader::handlePageBreak() {
	const bool hasOpenParagraph = myModelReader.paragraphIsOpen();
	if (hasOpenParagraph) {
		myModelReader.endParagraph();
	}
	myCurStyleEntry = 0;

	myModelReader.insertEndOfSectionParagraph();
	myModelReader.beginParagraph();
}
void DocBookReader::handleTableSeparator() {
handleChar(SPACE);
handleChar(VERTICAL_LINE);
handleChar(SPACE);
}
void DocBookReader::handleTableEndRow() {
handleParagraphEnd();
}
// Footnote-mark handler. Not implemented yet: the body is empty, so a call
// to it has no effect on the model.
void DocBookReader::handleFootNoteMark() {
//TODO implement
}
/**
 * Enters field-reading mode at a field-begin mark.
 *
 * Fields are not nested by this reader: if a field is already being read,
 * it is closed via handleEndField() before the new one starts. The new
 * field begins in the "collect instruction text" state with no hyperlink.
 */
void DocBookReader::handleStartField() {
	const bool alreadyInsideField = (myReadState == READ_FIELD);
	if (alreadyInsideField) { //for nested fields
		handleEndField();
	}

	myHyperlinkTypeState = NO_HYPERLINK;
	myReadFieldState = READ_FIELD_INFO;
	myReadState = READ_FIELD;
}
/**
 * Handles the separator between a field's instruction text and its result
 * text.
 *
 * The instruction collected in myFieldInfoBuffer is taken (and the buffer
 * cleared), converted to UTF-8 and split on spaces. Only HYPERLINK fields
 * are interpreted:
 *  - `HYPERLINK \l "name"` opens an INTERNAL_HYPERLINK control,
 *  - any other `HYPERLINK ...` opens an EXTERNAL_HYPERLINK control whose
 *    target is URL-encoded by parseLink().
 * For every other non-empty instruction the field result text is
 * suppressed (DONT_READ_FIELD_TEXT). An empty instruction leaves the state
 * at READ_FIELD_TEXT with no hyperlink.
 *
 * Compared with the previous version, the never-read static string QUOTE
 * and the commented-out constants were removed; behavior is unchanged.
 */
void DocBookReader::handleSeparatorField() {
	static const std::string HYPERLINK = "HYPERLINK";
	static const std::string SPACE_DELIMETER = " ";
	static const std::string LOCAL_LINK = "\\l";

	myReadFieldState = READ_FIELD_TEXT;
	myHyperlinkTypeState = NO_HYPERLINK;

	// Take the collected instruction text and reset the buffer for the next field.
	ZLUnicodeUtil::Ucs2String buffer = myFieldInfoBuffer;
	myFieldInfoBuffer.clear();

	std::string utf8String;
	ZLUnicodeUtil::ucs2ToUtf8(utf8String, buffer);
	ZLStringUtil::stripWhiteSpaces(utf8String);
	if (utf8String.empty()) {
		return;
	}

	// split() can return empty strings (e.g. for consecutive spaces), so
	// keep only the non-empty tokens.
	const std::vector<std::string> result = ZLStringUtil::split(utf8String, SPACE_DELIMETER);
	std::vector<std::string> splitted;
	for (std::vector<std::string>::const_iterator it = result.begin(); it != result.end(); ++it) {
		if (!it->empty()) {
			splitted.push_back(*it);
		}
	}

	if (splitted.size() < 2 || splitted.at(0) != HYPERLINK) {
		//to remove pagination from TOC and not hyperlink fields
		myReadFieldState = DONT_READ_FIELD_TEXT;
		return;
	}

	// The link target is parsed from the original UCS-2 buffer, not from the tokens.
	if (splitted.at(1) == LOCAL_LINK) {
		const std::string link = parseLink(buffer);
		if (!link.empty()) {
			myModelReader.addHyperlinkControl(INTERNAL_HYPERLINK, link);
			myHyperlinkTypeState = INT_HYPERLINK_INSERTED;
		}
	} else {
		const std::string link = parseLink(buffer, true);
		if (!link.empty()) {
			myModelReader.addHyperlinkControl(EXTERNAL_HYPERLINK, link);
			myHyperlinkTypeState = EXT_HYPERLINK_INSERTED;
		}
	}
}
/**
 * Leaves field-reading mode at a field-end mark.
 *
 * The instruction buffer is always cleared. If a field was being read, the
 * hyperlink control opened by handleSeparatorField() (if any) is closed and
 * the reader returns to plain-text mode.
 */
void DocBookReader::handleEndField() {
	myFieldInfoBuffer.clear();
	if (myReadState == READ_TEXT) {
		return;
	}

	switch (myHyperlinkTypeState) {
		case EXT_HYPERLINK_INSERTED:
			myModelReader.addControl(EXTERNAL_HYPERLINK, false);
			break;
		case INT_HYPERLINK_INSERTED:
			myModelReader.addControl(INTERNAL_HYPERLINK, false);
			break;
		default:
			break;
	}

	myHyperlinkTypeState = NO_HYPERLINK;
	myReadState = READ_TEXT;
}
// Start-of-heading handler. Not implemented yet: the body is empty, so a
// call to it has no effect on the model.
void DocBookReader::handleStartOfHeading() {
//heading can be, for example, a picture
//TODO implement
}
/**
 * Handles control characters that have no dedicated handler.
 *
 * WORD_SHORT_DEFIS is replaced by SHORT_DEFIS, a horizontal tab is passed
 * through as ordinary text, and a soft hyphen as well as every other
 * control character is dropped.
 */
void DocBookReader::handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char) {
	if (ucs2char == WORD_SHORT_DEFIS) {
		handleChar(SHORT_DEFIS);
		return;
	}
	if (ucs2char == WORD_SOFT_HYPHEN) {
		// soft hyphens are skipped
		return;
	}
	if (ucs2char == WORD_HORIZONTAL_TAB) {
		handleChar(ucs2char);
	}
	// anything else is ignored
}
void DocBookReader::handleFontStyle(unsigned int fontStyle) {
if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_TEXT && myHyperlinkTypeState != NO_HYPERLINK) {
//to fix bug with hyperlink, that's only bold and doesn't looks like hyperlink
return;
}
while (!myKindStack.empty()) {
myModelReader.addControl(myKindStack.back(), false);
myKindStack.pop_back();
}
if (fontStyle & OleMainStream::CharInfo::BOLD) {
myKindStack.push_back(BOLD);
}
if (fontStyle & OleMainStream::CharInfo::ITALIC) {
myKindStack.push_back(ITALIC);
}
for (size_t i = 0; i < myKindStack.size(); ++i) {
myModelReader.addControl(myKindStack.at(i), true);
}
}
/**
 * Applies the style of a new paragraph.
 *
 * Steps, in order:
 *  1. a page break is emitted when the style requests one;
 *  2. a ZLTextStyleEntry is built: LEFT and JUSTIFY both map to
 *     ALIGN_JUSTIFY (left alignment is deliberately forced to justify),
 *     CENTER/RIGHT map to their counterparts, and the heading styles
 *     H1/H2/H3 get a font size of 140/120/110 percent;
 *  3. the entry becomes the current one and is added to the model;
 *  4. if the previous paragraph had the same (valid) istd, the kinds on
 *     myKindStack are re-opened; otherwise the stack is reset and refilled
 *     from the style's own font style;
 *  5. styleInfo is remembered for the next call.
 */
void DocBookReader::handleParagraphStyle(const OleMainStream::Style &styleInfo) {
	if (styleInfo.hasPageBreakBefore) {
		handlePageBreak();
	}

	shared_ptr<ZLTextStyleEntry> styleEntry = new ZLTextStyleEntry();

	if (styleInfo.alignment == OleMainStream::Style::LEFT) {
		styleEntry->setAlignmentType(ALIGN_JUSTIFY); //force justify align
	} else if (styleInfo.alignment == OleMainStream::Style::CENTER) {
		styleEntry->setAlignmentType(ALIGN_CENTER);
	} else if (styleInfo.alignment == OleMainStream::Style::RIGHT) {
		styleEntry->setAlignmentType(ALIGN_RIGHT);
	} else if (styleInfo.alignment == OleMainStream::Style::JUSTIFY) {
		styleEntry->setAlignmentType(ALIGN_JUSTIFY);
	}

	//TODO in case, where style is heading, but size is small it works wrong
	const ZLTextStyleEntry::SizeUnit percentUnit = ZLTextStyleEntry::SIZE_UNIT_PERCENT;
	if (styleInfo.istd == OleMainStream::H1) {
		styleEntry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 140, percentUnit);
	} else if (styleInfo.istd == OleMainStream::H2) {
		styleEntry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 120, percentUnit);
	} else if (styleInfo.istd == OleMainStream::H3) {
		styleEntry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 110, percentUnit);
	}

	myCurStyleEntry = styleEntry;
	myModelReader.addStyleEntry(*myCurStyleEntry);

	//we should have the same font style, as for the previous paragraph, if it has the same istd
	const bool sameStyleAsPrevious =
		myCurStyleInfo.istd != OleMainStream::ISTD_INVALID &&
		myCurStyleInfo.istd == styleInfo.istd;
	if (!sameStyleAsPrevious) {
		myKindStack.clear();
		handleFontStyle(styleInfo.charInfo.fontStyle); //fill by the fontstyle, that was got from Stylesheet
	} else {
		typedef std::vector<FBTextKind>::const_iterator KindIterator;
		for (KindIterator kind = myKindStack.begin(); kind != myKindStack.end(); ++kind) {
			myModelReader.addControl(*kind, true);
		}
	}

	myCurStyleInfo = styleInfo;
}
/**
 * Registers a bookmark at the current position by adding a hyperlink label
 * called `name` to the model (the target used by internal hyperlinks).
 */
void DocBookReader::handleBookmark(const std::string &name) {
	const std::string &label = name;
	myModelReader.addHyperlinkLabel(label);
}
/**
 * Extracts the link target from a HYPERLINK field instruction.
 *
 * The target is everything between the FIRST and the LAST double quote in
 * `s`. Searching for the last quote keeps targets that themselves contain
 * quotes intact, e.g.
 *   [0x13] HYPERLINK "http://yandex.ru/yandsearch?text='some text' и "some text2"" [0x14] link text [0x15]
 * The price is that trailing switches are swallowed into the link:
 *   [0x13] HYPERLINK "http://site.ru/some text" \t "_blank" [0x14] text [0x15]
 * yields `\t` and `_blank` as part of the target (TODO).
 *
 * @param s         field instruction in UCS-2
 * @param urlencode when true, whitespace becomes "%20" and inner quotes
 *                  become "%22"
 * @return the target converted to UTF-8, or an empty string when `s` holds
 *         fewer than two quotes
 */
std::string DocBookReader::parseLink(ZLUnicodeUtil::Ucs2String s, bool urlencode) {
	static const ZLUnicodeUtil::Ucs2Char QUOTE = 0x22;
	//TODO maybe functions findFirstOf and findLastOf should be in ZLUnicodeUtil class

	const size_t length = s.size();

	// position of the opening quote
	size_t openQuote = 0;
	while (openQuote < length && s.at(openQuote) != QUOTE) {
		++openQuote;
	}
	if (openQuote == length) {
		return std::string();
	}

	// position of the closing quote, scanning backwards but never past the opening one
	size_t closeQuote = length - 1;
	while (closeQuote > openQuote && s.at(closeQuote) != QUOTE) {
		--closeQuote;
	}
	if (closeQuote == openQuote) {
		return std::string();
	}

	ZLUnicodeUtil::Ucs2String target;
	for (size_t pos = openQuote + 1; pos < closeQuote; ++pos) {
		const ZLUnicodeUtil::Ucs2Char current = s.at(pos);
		if (!urlencode) {
			target.push_back(current);
			continue;
		}
		//TODO maybe implement function for encoding all signs in url, not only spaces and quotes
		//TODO maybe add backslash support
		if (ZLUnicodeUtil::isSpace(current)) {
			target.push_back('%');
			target.push_back('2');
			target.push_back('0');
		} else if (current == QUOTE) {
			target.push_back('%');
			target.push_back('2');
			target.push_back('2');
		} else {
			target.push_back(current);
		}
	}

	std::string utf8Target;
	ZLUnicodeUtil::ucs2ToUtf8(utf8Target, target);
	return utf8Target;
}

View file

@ -0,0 +1,96 @@
/*
* Copyright (C) 2004-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __DOCBOOKREADER_H__
#define __DOCBOOKREADER_H__
#include <vector>
#include <shared_ptr.h>
#include <ZLFile.h>
#include <ZLTextStyleEntry.h>
#include "../../bookmodel/BookReader.h"
#include "OleMainStream.h"
#include "OleStreamReader.h"
// Reads a MS Word .doc file into a BookModel.
// Derives from OleStreamReader and implements its handle*() callbacks,
// translating characters, fields, styles and bookmarks into BookReader calls.
class DocBookReader : public OleStreamReader {
public:
// `encoding` is passed on to the OleStreamReader base class.
DocBookReader(BookModel &model, const std::string &encoding);
~DocBookReader();
// Reads the file of the book attached to the model; false on failure.
bool readBook();
private:
// Opens `stream` as an OLE storage and reads its "WordDocument" entry.
bool readDocument(shared_ptr<ZLInputStream> stream, size_t streamSize);
// Text and structure callbacks.
void handleChar(ZLUnicodeUtil::Ucs2Char ucs2char);
void handleHardLinebreak();
void handleParagraphEnd();
void handlePageBreak();
void handleTableSeparator();
void handleTableEndRow();
// Not implemented yet (empty body).
void handleFootNoteMark();
// Field callbacks: begin mark, instruction/result separator, end mark.
void handleStartField();
void handleSeparatorField();
void handleEndField();
// Not implemented yet (empty body).
void handleStartOfHeading();
void handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char);
//formatting:
void handleFontStyle(unsigned int fontStyle);
void handleParagraphStyle(const OleMainStream::Style &styleInfo);
void handleBookmark(const std::string &name);
private:
// Returns the text between the first and the last double quote of `s` as
// UTF-8; with `urlencode` spaces and inner quotes are percent-encoded.
static std::string parseLink(ZLUnicodeUtil::Ucs2String s, bool urlencode = false);
private:
// Sink for everything that is read.
BookReader myModelReader;
// Instruction text of the field currently being read.
ZLUnicodeUtil::Ucs2String myFieldInfoBuffer;
// Whether the reader is inside a field or in ordinary text.
enum {
READ_FIELD,
READ_TEXT
} myReadState;
// Sub-state while inside a field; only consulted when myReadState == READ_FIELD.
enum {
READ_FIELD_TEXT,
DONT_READ_FIELD_TEXT,
READ_FIELD_INFO
} myReadFieldState;
// Which hyperlink control (if any) was opened for the current field and
// therefore has to be closed in handleEndField().
//maybe it should be flag?
enum {
NO_HYPERLINK,
EXT_HYPERLINK_INSERTED,
INT_HYPERLINK_INSERTED
} myHyperlinkTypeState;
//formatting
// Font kinds (BOLD/ITALIC) currently open in the model, in opening order.
std::vector<FBTextKind> myKindStack;
// Style entry of the current paragraph; null when none is set.
shared_ptr<ZLTextStyleEntry> myCurStyleEntry;
// Style passed to the most recent handleParagraphStyle() call.
OleMainStream::Style myCurStyleInfo;
};
// Nothing to release explicitly; the body is empty.
inline DocBookReader::~DocBookReader() {}
#endif /* __DOCBOOKREADER_H__ */

View file

@ -0,0 +1,50 @@
/*
* Copyright (C) 2004-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <ZLInputStream.h>
#include "../../library/Book.h"
#include "DocMetaInfoReader.h"
/**
 * Binds the reader to `book` and immediately wipes the book's authors,
 * title, language and tags. Note that merely constructing the reader
 * therefore already modifies the book.
 */
DocMetaInfoReader::DocMetaInfoReader(Book &book) : myBook(book) {
	const std::string blank;
	myBook.removeAllAuthors();
	myBook.setTitle(blank);
	myBook.setLanguage(blank);
	myBook.removeAllTags();
}
/*
void DocMetaInfoReader::characterDataHandler(const char *text, size_t len) {
}
void DocMetaInfoReader::startElementHandler(int tag, const char **) {
}
void DocMetaInfoReader::endElementHandler(int tag) {
}
*/
/**
 * Fills in the book's meta information.
 *
 * No metadata is read from the document itself: the title is taken from
 * the file name (`name(true)`), authors and tags are cleared and the
 * encoding is hard-coded to "windows-1251".
 *
 * @return always true
 */
bool DocMetaInfoReader::readMetaInfo() {
	myBook.removeAllAuthors();

	const std::string title = myBook.file().name(true);
	myBook.setTitle(title);

	//TODO implement encoding retrieving
	const std::string defaultEncoding("windows-1251");
	myBook.setEncoding(defaultEncoding);

	myBook.removeAllTags();
	return true;
}

View file

@ -0,0 +1,46 @@
/*
* Copyright (C) 2004-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __DOCMETAINFOREADER_H__
#define __DOCMETAINFOREADER_H__
#include <string>
class Book;
// Fills the meta information (title, authors, tags, encoding) of a Book
// that was created from a .doc file.
class DocMetaInfoReader {
public:
// Stores a reference to `book`; the book must outlive the reader.
DocMetaInfoReader(Book &book);
~DocMetaInfoReader();
// Writes the meta information into the book; see the definition for what
// is actually set.
bool readMetaInfo();
/*
void startElementHandler(int tag, const char **attributes);
void endElementHandler(int tag);
void characterDataHandler(const char *text, size_t len);
*/
private:
// The book being filled in (not owned).
Book &myBook;
};
// Nothing to release; the referenced book is not owned.
inline DocMetaInfoReader::~DocMetaInfoReader() {}
#endif /* __DOCMETAINFOREADER_H__ */

View file

@ -0,0 +1,61 @@
/*
* Copyright (C) 2004-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <iostream>
#include <ZLFile.h>
#include <ZLInputStream.h>
#include <ZLLogger.h>
#include <ZLImage.h>
#include "DocPlugin.h"
#include "DocMetaInfoReader.h"
#include "DocBookReader.h"
#include "../../bookmodel/BookModel.h"
#include "../../library/Book.h"
// The plugin keeps no state of its own, so there is nothing to set up.
DocPlugin::DocPlugin() {
}
// Nothing to release; the body is empty.
DocPlugin::~DocPlugin() {
}
/**
 * @return always true: this plugin reports that it provides meta
 *         information (see readMetaInfo()).
 */
bool DocPlugin::providesMetaInfo() const {
	const bool hasMetaInfo = true;
	return hasMetaInfo;
}
/**
 * @return the file type identifier handled by this plugin, "doc".
 */
const std::string DocPlugin::supportedFileType() const {
	const std::string fileType("doc");
	return fileType;
}
/**
 * Decides by file extension only; the content is not inspected.
 *
 * @return true when the extension reported by `file` equals "doc".
 */
bool DocPlugin::acceptsFile(const ZLFile &file) const {
	const bool hasDocExtension = (file.extension() == "doc");
	return hasDocExtension;
}
/**
 * Fills the meta information of `book` using a DocMetaInfoReader.
 *
 * @return the result of DocMetaInfoReader::readMetaInfo()
 */
bool DocPlugin::readMetaInfo(Book &book) const {
	DocMetaInfoReader metaReader(book);
	return metaReader.readMetaInfo();
}
/**
 * Language/encoding detection is not performed for .doc files; the book is
 * left untouched.
 *
 * @return always true
 */
bool DocPlugin::readLanguageAndEncoding(Book &/*book*/) const {
	const bool detectionSkipped = true;
	return detectionSkipped;
}
/**
 * Reads the text of the model's book with a DocBookReader that uses the
 * encoding stored in the book.
 *
 * @return the result of DocBookReader::readBook()
 */
bool DocPlugin::readModel(BookModel &model) const {
	const std::string &bookEncoding = model.book()->encoding();
	DocBookReader bookReader(model, bookEncoding);
	return bookReader.readBook();
}

View file

@ -0,0 +1,39 @@
/*
* Copyright (C) 2004-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __DOCPLUGIN_H__
#define __DOCPLUGIN_H__
#include "../FormatPlugin.h"
// FormatPlugin implementation for MS Word .doc files.
class DocPlugin : public FormatPlugin {
public:
DocPlugin();
~DocPlugin();
// Always true.
bool providesMetaInfo() const;
// Returns "doc".
const std::string supportedFileType() const;
// True when the file's extension is "doc".
bool acceptsFile(const ZLFile &file) const;
// Delegates to DocMetaInfoReader.
bool readMetaInfo(Book &book) const;
// Does nothing and returns true.
bool readLanguageAndEncoding(Book &book) const;
// Delegates to DocBookReader.
bool readModel(BookModel &model) const;
};
#endif /* __DOCPLUGIN_H__ */

View file

@ -0,0 +1,889 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <cstring> //for memset
#include <sstream>
#include <string>
#include <ZLLogger.h>
#include <ZLUnicodeUtil.h>
#include "OleUtil.h"
#include "OleStorage.h"
#include "OleMainStream.h"
/**
 * Builds a default style: every field zeroed, both style indexes invalid,
 * no page break, font size 20.
 *
 * The memset also wipes the charInfo member after its own constructor has
 * already run, which is why the font size has to be restored to 20 here.
 */
OleMainStream::Style::Style() {
	std::memset(this, 0, sizeof(Style));
	charInfo.fontSize = 20;
	hasPageBreakBefore = false;
	istdNext = ISTD_INVALID;
	istd = ISTD_INVALID;
}
// Default character formatting: no font style bits set, font size 20.
OleMainStream::CharInfo::CharInfo():
fontStyle(0),
fontSize(20) {
}
// Default section info: character position 0 and newPage set to true.
OleMainStream::SectionInfo::SectionInfo() :
charPos(0),
newPage(true) {
}
// Forwards all three arguments to the OleStream base class; no parsing
// happens until open() is called.
OleMainStream::OleMainStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream) :
OleStream(storage, oleEntry, stream) {
}
/**
 * Opens the main ("WordDocument") stream and loads the tables needed to
 * read its text.
 *
 * The first 768 bytes are read as the header and checked by readFIB().
 * Bit 0x0200 of the 16-bit flags at header offset 0xA selects the table
 * stream ("1Table" if set, "0Table" otherwise). When that stream does not
 * exist, a single ANSI text piece spanning [myStartOfText, myEndOfText) is
 * used instead. Otherwise the piece table, bookmarks, stylesheet,
 * paragraph style table and char info table are read in this order.
 *
 * @return false as soon as any step fails
 */
bool OleMainStream::open() {
	if (!OleStream::open()) {
		return false;
	}

	static const size_t HEADER_SIZE = 768; //size of data in header of main stream
	char headerBuffer[HEADER_SIZE];
	seek(0, true);
	const bool headerLoaded = read(headerBuffer, HEADER_SIZE) == HEADER_SIZE;
	if (!headerLoaded || !readFIB(headerBuffer)) {
		return false;
	}

	//determining table stream number
	const bool useTable1 = (OleUtil::getU2Bytes(headerBuffer, 0xA) & 0x0200) != 0;
	std::string tableName(useTable1 ? "1" : "0");
	tableName.append("Table");

	OleEntry tableEntry;
	if (!myStorage->getEntryByName(tableName, tableEntry)) {
		//cant't find table stream (that can be only in case if file format is below Word 7/8), so building simple table stream
		Piece piece = {myStartOfText, myEndOfText - myStartOfText, true, Piece::TEXT, 0}; //CHECK may be not all old documents have ANSI
		myPieces.push_back(piece);
		return true;
	}

	if (!readPieceTable(headerBuffer, tableEntry)) {
		return false;
	}
	if (!readBookmarks(headerBuffer, tableEntry)) {
		return false;
	}
	if (!readStylesheet(headerBuffer, tableEntry)) {
		return false;
	}
	//readSectionsInfoTable(headerBuffer, tableEntry) is not used now
	if (!readParagraphStyleTable(headerBuffer, tableEntry)) {
		return false;
	}
	return readCharInfoTable(headerBuffer, tableEntry);
}
// Returns the pieces collected by open(); the reference stays owned by this object.
const OleMainStream::Pieces &OleMainStream::getPieces() const {
return myPieces;
}
// Returns the stored character-info list; the reference stays owned by this object.
const OleMainStream::CharInfoList &OleMainStream::getCharInfoList() const {
return myCharInfoList;
}
// Returns the stored style-info list; the reference stays owned by this object.
const OleMainStream::StyleInfoList &OleMainStream::getStyleInfoList() const {
return myStyleInfoList;
}
// Returns the bookmarks collected by readBookmarks(); the reference stays owned by this object.
const OleMainStream::Bookmarks &OleMainStream::getBookmarks() const {
return myBookmarks;
}
/**
 * Interprets the file information block at the start of the main stream.
 *
 * Logs what the 16-bit flags at offset 0xA say about the file, rejects
 * encrypted documents, logs the character set number found at offset 0x14
 * and stores the text boundaries read from offsets 0x18 and 0x1c in
 * myStartOfText / myEndOfText.
 *
 * The "not default character set" message used to contain a literal "%d"
 * that was never substituted (println() takes a plain string); the charset
 * number is now appended to the message.
 *
 * @param headerBuffer header bytes of the main stream; must cover at least
 *                     offsets 0x0 .. 0x1f
 * @return false if the file is encrypted, true otherwise
 */
bool OleMainStream::readFIB(const char *headerBuffer) {
	const unsigned int flags = OleUtil::getU2Bytes(headerBuffer, 0xA); //offset for flags

	if (flags & 0x0004) { //flag for complex format
		ZLLogger::Instance().println("OleMainStream", "This was fast-saved. Some information is lost");
		//lostInfo = (flags & 0xF0) >> 4);
	}

	if (flags & 0x1000) { //flag for using extending charset
		ZLLogger::Instance().println("OleMainStream", "File uses extended character set (get_word8_char)");
	} else {
		ZLLogger::Instance().println("OleMainStream", "File uses get_8bit_char character set");
	}

	if (flags & 0x100) { //flag for encrypted files
		ZLLogger::Instance().println("OleMainStream", "File is encrypted");
		// Encryption key = %08lx ; NumUtil::get4Bytes(header, 14)
		return false;
	}

	const unsigned int charset = OleUtil::getU2Bytes(headerBuffer, 0x14); //offset for charset number
	if (charset && charset != 0x100) { //0x100 = default charset
		std::ostringstream message;
		message << "Using not default character set " << charset;
		ZLLogger::Instance().println("OleMainStream", message.str());
	} else {
		ZLLogger::Instance().println("OleMainStream", "Using default character set");
	}

	myStartOfText = OleUtil::get4Bytes(headerBuffer, 0x18); //offset for start of text value
	myEndOfText = OleUtil::get4Bytes(headerBuffer, 0x1c); //offset for end of text value
	return true;
}
void OleMainStream::splitPieces(const Pieces &s, Pieces &dest1, Pieces &dest2, Piece::PieceType type1, Piece::PieceType type2, int boundary) {
Pieces source = s;
dest1.clear();
dest2.clear();
int sumLength = 0;
size_t i = 0;
for (i = 0; i < source.size(); ++i) {
Piece piece = source.at(i);
if (piece.length + sumLength >= boundary) {
Piece piece2 = piece;
piece.length = boundary - sumLength;
piece.type = type1;
piece2.type = type2;
piece2.offset += piece.length * 2;
piece2.length -= piece.length;
if (piece.length > 0) {
dest1.push_back(piece);
}
if (piece2.length > 0) {
dest2.push_back(piece2);
}
++i;
break;
}
sumLength += piece.length;
piece.type = type1;
dest1.push_back(piece);
}
for (; i < source.size(); ++i) {
Piece piece = source.at(i);
piece.type = type2;
dest2.push_back(piece);
}
}
/**
 * Loads the CLX structure from the table stream and extracts the raw piece
 * table from it.
 *
 * The CLX location and length come from header offsets 0x01A2 and 0x01A6.
 * Inside the CLX a piece table candidate is a 0x02 marker byte followed by
 * a 4-byte length and the data; a candidate is accepted only when its
 * length field equals the number of bytes remaining after it.
 *
 * Fixes over the previous version:
 *  - the buffer allocated with new[] is released with delete[];
 *  - only the bytes actually read from the stream are searched;
 *  - the length field is never read past the end of the CLX and no
 *    std::out_of_range can be thrown for a marker near the end;
 *  - when no candidate matches, an empty string is returned instead of the
 *    last rejected candidate.
 *
 * @return the piece table bytes, or an empty string if none was found
 */
std::string OleMainStream::getPiecesTableBuffer(const char *headerBuffer, OleStream &tableStream) {
	const unsigned int clxOffset = OleUtil::getU4Bytes(headerBuffer, 0x01A2); //offset for CLX structure
	const unsigned int clxLength = OleUtil::getU4Bytes(headerBuffer, 0x01A6); //offset for value of CLX structure length

	//1 step : loading CLX table from table stream
	std::string clx;
	{
		char *clxBuffer = new char[clxLength];
		tableStream.seek(clxOffset, true);
		const size_t bytesRead = tableStream.read(clxBuffer, clxLength);
		clx.assign(clxBuffer, bytesRead < clxLength ? bytesRead : clxLength);
		delete[] clxBuffer;
	}

	//2 step: searching for pieces table buffer at CLX
	//(determines it by 0x02 as start symbol)
	static const size_t LENGTH_FIELD_SIZE = 4;
	size_t from = 0;
	size_t marker;
	while ((marker = clx.find('\x02', from)) != std::string::npos) {
		const size_t dataStart = marker + 1 + LENGTH_FIELD_SIZE;
		if (dataStart > clx.size()) {
			// not enough room for the length field; later markers cannot fit either
			break;
		}
		const unsigned int pieceTableLength = OleUtil::getU4Bytes(clx.data(), marker + 1);
		if (clx.size() - dataStart == pieceTableLength) {
			return clx.substr(dataStart);
		}
		from = marker + 1;
	}
	return std::string();
}
bool OleMainStream::readPieceTable(const char *headerBuffer, const OleEntry &tableEntry) {
OleStream tableStream(myStorage, tableEntry, myBaseStream);
std::string piecesTableBuffer = getPiecesTableBuffer(headerBuffer, tableStream);
//getting count of Character Positions for different types of subdocuments in Main Stream
int ccpText = OleUtil::get4Bytes(headerBuffer, 0x004C); //text
int ccpFtn = OleUtil::get4Bytes(headerBuffer, 0x0050); //footnote subdocument
int ccpHdd = OleUtil::get4Bytes(headerBuffer, 0x0054); //header subdocument
int ccpMcr = OleUtil::get4Bytes(headerBuffer, 0x0058); //macro subdocument
int ccpAtn = OleUtil::get4Bytes(headerBuffer, 0x005C); //comment subdocument
int ccpEdn = OleUtil::get4Bytes(headerBuffer, 0x0060); //endnote subdocument
int ccpTxbx = OleUtil::get4Bytes(headerBuffer, 0x0064); //textbox subdocument
int ccpHdrTxbx = OleUtil::get4Bytes(headerBuffer, 0x0068); //textbox subdocument of the header
int lastCP = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx;
if (lastCP != 0) {
++lastCP;
}
lastCP += ccpText;
//getting the CP (character positions) and CP descriptors
std::vector<int> cp; //array of character positions for pieces
unsigned int j = 0;
for (j = 0; ; j += 4) {
int curCP = OleUtil::get4Bytes(piecesTableBuffer.c_str(), j);
cp.push_back(curCP);
if (curCP == lastCP) {
break;
}
}
std::vector<std::string> descriptors;
for (size_t k = 0; k < cp.size() - 1; ++k) {
//j + 4, because it should be taken after CP in PiecesTable Buffer
//k * 8, because it should be taken 8 byte for each descriptor
descriptors.push_back(piecesTableBuffer.substr(j + 4 + k * 8, 8));
}
//filling the Pieces vector
for (size_t i = 0; i < descriptors.size(); ++i) {
//4byte integer with offset and ANSI flag
int fcValue = OleUtil::get4Bytes(descriptors.at(i).c_str(), 0x2); //offset for piece structure
Piece piece;
piece.isANSI = (fcValue & 0x40000000) == 0x40000000; //ansi flag
piece.offset = fcValue & 0x3FFFFFFF; //gettting offset for current piece
piece.length = cp.at(i + 1) - cp.at(i);
myPieces.push_back(piece);
}
//split pieces into different types
Pieces piecesText, piecesFootnote, piecesOther;
splitPieces(myPieces, piecesText, piecesFootnote, Piece::TEXT, Piece::FOOTNOTE, ccpText);
splitPieces(piecesFootnote, piecesFootnote, piecesOther, Piece::FOOTNOTE, Piece::OTHER, ccpFtn);
myPieces.clear();
for (size_t i = 0; i < piecesText.size(); ++i) {
myPieces.push_back(piecesText.at(i));
}
for (size_t i = 0; i < piecesFootnote.size(); ++i) {
myPieces.push_back(piecesFootnote.at(i));
}
for (size_t i = 0; i < piecesOther.size(); ++i) {
myPieces.push_back(piecesOther.at(i));
}
//converting length and offset depending on isANSI
for (size_t i = 0; i < myPieces.size(); ++i) {
Piece &piece = myPieces.at(i);
if (!piece.isANSI) {
piece.length *= 2;
} else {
piece.offset /= 2;
}
}
//filling startCP field
unsigned int curStartCP = 0;
for (size_t i = 0; i < myPieces.size(); ++i) {
Piece &piece = myPieces.at(i);
piece.startCP = curStartCP;
if (piece.isANSI) {
curStartCP += piece.length;
} else {
curStartCP += piece.length / 2;
}
}
return true;
}
/**
 * Reads bookmark names and their starting character positions from the
 * table stream and appends them to myBookmarks.
 *
 * Two structures are used, both located through the header:
 *  - SttbfBkmk (address 0x142, length 0x146): table of bookmark names,
 *    stored as little-endian UCS-2 strings prefixed by a 16-bit character
 *    count; the record count is at offset 0x2, records start at 0x6;
 *  - plcfBkmkf (address 0x14A, length 0x14E): 4-byte character positions.
 * The i-th name is paired with the i-th position; surplus names are
 * ignored.
 *
 * Fixes over the previous version:
 *  - name bytes are read as unsigned; a plain (signed) char with the high
 *    bit set used to be sign-extended and corrupted the character;
 *  - every read is checked against the real buffer size, so malformed
 *    tables end the scan (keeping what was read) instead of throwing from
 *    at() or reading past the buffer;
 *  - a plcfBkmkf length below 4 no longer underflows the entry count.
 *
 * @return false only when one of the structures cannot be loaded by
 *         readToBuffer(); true otherwise (including "no bookmarks")
 */
bool OleMainStream::readBookmarks(const char *headerBuffer, const OleEntry &tableEntry) {
	//SttbfBkmk structure is a table of bookmark name strings
	const unsigned int beginNamesInfo = OleUtil::getU4Bytes(headerBuffer, 0x142); // address of SttbfBkmk structure
	const size_t namesInfoLength = (size_t)OleUtil::getU4Bytes(headerBuffer, 0x146); // length of SttbfBkmk structure
	if (namesInfoLength == 0) {
		return true; //there's no bookmarks
	}

	OleStream tableStream(myStorage, tableEntry, myBaseStream);
	std::string buffer;
	if (!readToBuffer(buffer, beginNamesInfo, namesInfoLength, tableStream)) {
		return false;
	}

	static const size_t NAMES_HEADER_SIZE = 0x6; //initial offset of the first record
	std::vector<std::string> names;
	if (buffer.size() >= NAMES_HEADER_SIZE) {
		const unsigned int recordsNumber = OleUtil::getU2Bytes(buffer.c_str(), 0x2); //count of records
		size_t offset = NAMES_HEADER_SIZE;
		for (unsigned int i = 0; i < recordsNumber; ++i) {
			if (offset + 2 > buffer.size()) {
				break; //length prefix is outside the buffer
			}
			const size_t length = (size_t)OleUtil::getU2Bytes(buffer.c_str(), offset) * 2; //length of string in bytes
			if (offset + 2 + length > buffer.size()) {
				break; //string body is outside the buffer
			}
			ZLUnicodeUtil::Ucs2String name;
			for (size_t j = 0; j < length; j += 2) {
				const unsigned char low = (unsigned char)buffer[offset + 2 + j];
				const unsigned char high = (unsigned char)buffer[offset + 2 + j + 1];
				name.push_back((ZLUnicodeUtil::Ucs2Char)(low | (high << 8)));
			}
			std::string utf8Name;
			ZLUnicodeUtil::ucs2ToUtf8(utf8Name, name);
			names.push_back(utf8Name);
			offset += length + 2;
		}
	}

	//plcfBkmkf structure is table recording beginning CPs of bookmarks
	const unsigned int beginCharPosInfo = OleUtil::getU4Bytes(headerBuffer, 0x14A); // address of plcfBkmkf structure
	const size_t charPosInfoLen = (size_t)OleUtil::getU4Bytes(headerBuffer, 0x14E); // length of plcfBkmkf structure
	if (charPosInfoLen == 0) {
		return true; //there's no bookmarks
	}
	if (!readToBuffer(buffer, beginCharPosInfo, charPosInfoLen, tableStream)) {
		return false;
	}

	//same count formula as before, (len / 4 - 1) / 2, but guarded against len < 4
	const size_t entryCount = charPosInfoLen / 4;
	const size_t size = entryCount > 0 ? (entryCount - 1) / 2 : 0;
	std::vector<unsigned int> charPage;
	for (size_t index = 0, pos = 0; index < size && pos + 4 <= buffer.size(); ++index, pos += 4) {
		charPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), pos));
	}

	//pair names with positions; stop at the shorter list to not to lose all bookmarks if the structures disagree
	for (size_t i = 0; i < names.size() && i < charPage.size(); ++i) {
		Bookmark bookmark;
		bookmark.charPos = charPage.at(i);
		bookmark.name = names.at(i);
		myBookmarks.push_back(bookmark);
	}
	return true;
}
/*
 * Reads the STSH (stylesheet) structure from the table stream and fills myStyleSheet.
 *
 * headerBuffer -- FIB header; offsets 0xa2/0xa6 hold the position and the length of STSH
 * tableEntry   -- OLE entry of the table stream that contains STSH
 *
 * Returns false if the structure is too short or cannot be read completely, true otherwise.
 * Records that do not fit into the loaded data are skipped instead of being read out of bounds.
 */
bool OleMainStream::readStylesheet(const char *headerBuffer, const OleEntry &tableEntry) {
    //STSH structure is a stylesheet
    unsigned int beginStshInfo = OleUtil::getU4Bytes(headerBuffer, 0xa2); // address of STSH structure
    size_t stshInfoLength = (size_t)OleUtil::getU4Bytes(headerBuffer, 0xa6); // length of STSH structure
    if (stshInfoLength < 6) {
        //three 2-byte header fields (offsets 0, 2 and 4) are read below
        return false;
    }

    OleStream tableStream(myStorage, tableEntry, myBaseStream);
    //the vector owns the memory, so it is released on every return path
    std::vector<char> storage(stshInfoLength);
    char *buffer = &storage[0];
    tableStream.seek(beginStshInfo, true);
    if (tableStream.read(buffer, stshInfoLength) != stshInfoLength) {
        return false;
    }

    size_t stdCount = (size_t)OleUtil::getU2Bytes(buffer, 2);
    size_t stdBaseInFile = (size_t)OleUtil::getU2Bytes(buffer, 4);
    const size_t firstStdOffset = 2 + (size_t)OleUtil::getU2Bytes(buffer, 0);
    myStyleSheet.resize(stdCount);

    std::vector<bool> isFilled;
    isFilled.resize(stdCount, false);

    size_t stdLen = 0;
    bool styleSheetWasChanged = false;
    do { //make it in while loop, because some base style can be after their successors
        styleSheetWasChanged = false;
        for (size_t index = 0, offset = firstStdOffset; index < stdCount; index++, offset += 2 + stdLen) {
            if (offset + 2 > stshInfoLength) {
                //the record length field itself lies outside of the loaded data
                break;
            }
            stdLen = (size_t)OleUtil::getU2Bytes(buffer, offset);
            if (isFilled.at(index)) {
                continue;
            }

            if (stdLen == 0) {
                //if record is empty, leave it default
                isFilled[index] = true;
                continue;
            }

            if (offset + 8 > stshInfoLength || offset + 2 + stdLen > stshInfoLength) {
                //record is truncated: neither it nor its successors can be parsed safely
                break;
            }

            Style styleInfo = myStyleSheet.at(index);

            unsigned int styleAndBaseType = OleUtil::getU2Bytes(buffer, offset + 4);
            unsigned int styleType = styleAndBaseType % 16;
            unsigned int baseStyle = styleAndBaseType / 16;
            if (baseStyle == STI_NIL || baseStyle == STI_USER) {
                //if based on nil or user style, leave default
            } else {
                int baseStyleIndex = getStyleIndex(baseStyle, isFilled, myStyleSheet);
                if (baseStyleIndex < 0) {
                    //this base style is not filled yet, so process it on one of the next passes
                    continue;
                }
                styleInfo = myStyleSheet.at(baseStyleIndex);
                styleInfo.istd = ISTD_INVALID;
            }

            // parse STD structure
            unsigned int tmp = OleUtil::getU2Bytes(buffer, offset + 6);
            unsigned int upxCount = tmp % 16;
            styleInfo.istdNext = tmp / 16;

            //adding current style
            myStyleSheet[index] = styleInfo;
            isFilled[index] = true;
            styleSheetWasChanged = true;

            size_t pos = 2 + stdBaseInFile;
            if (pos > stdLen) {
                //the name length field is outside of the record
                continue;
            }
            size_t nameLen = (size_t)OleUtil::getU2Bytes(buffer, offset + pos);
            nameLen = nameLen * 2 + 2; //from Unicode characters to bytes + Unicode null character length
            pos += 2 + nameLen;
            if (pos % 2 != 0) {
                ++pos;
            }
            if (pos >= stdLen) {
                continue;
            }
            size_t upxLen = (size_t)OleUtil::getU2Bytes(buffer, offset + pos);
            if (pos + upxLen > stdLen) {
                //UPX length too large
                continue;
            }
            //for style info styleType must be equal 1
            if (styleType == 1 && upxCount >= 1) {
                if (upxLen >= 2) {
                    styleInfo.istd = OleUtil::getU2Bytes(buffer, offset + pos + 2);
                    getStyleInfo(0, buffer + offset + pos + 4, upxLen - 2, styleInfo);
                    myStyleSheet[index] = styleInfo;
                }
                pos += 2 + upxLen;
                if (pos % 2 != 0) {
                    ++pos;
                }
                if (pos > stdLen) {
                    //next UPX length field is outside of the record
                    continue;
                }
                upxLen = (size_t)OleUtil::getU2Bytes(buffer, offset + pos);
            }
            if (upxLen == 0 || pos + upxLen > stdLen) {
                //too small/too large
                continue;
            }
            //for char info styleType can be equal 1 or 2
            if ((styleType == 1 && upxCount >= 2) || (styleType == 2 && upxCount >= 1)) {
                CharInfo charInfo;
                getCharInfo(0, ISTD_INVALID, buffer + offset + pos + 2, upxLen, charInfo);
                styleInfo.charInfo = charInfo;
                myStyleSheet[index] = styleInfo;
            }
        }
    } while (styleSheetWasChanged);
    return true;
}
/*
 * Reads the fcPlcfbteChpx table and the character formatting pages it refers to,
 * appending one (character position, CharInfo) pair per text run to myCharInfoList.
 *
 * headerBuffer -- FIB header; offsets 0xfa/0xfe hold the position and the length of the table
 * tableEntry   -- OLE entry of the table stream
 *
 * Returns false if the table or one of the formatting pages cannot be read.
 * Pages and runs with inconsistent sizes are skipped.
 */
bool OleMainStream::readCharInfoTable(const char *headerBuffer, const OleEntry &tableEntry) {
    //fcPlcfbteChpx structure is table with formatting for particular run of text
    unsigned int beginCharInfo = OleUtil::getU4Bytes(headerBuffer, 0xfa); // address of fcPlcfbteChpx structure
    size_t charInfoLength = (size_t)OleUtil::getU4Bytes(headerBuffer, 0xfe); // length of fcPlcfbteChpx structure
    if (charInfoLength < 4) {
        return false;
    }

    OleStream tableStream(myStorage, tableEntry, myBaseStream);
    std::string buffer;
    if (!readToBuffer(buffer, beginCharInfo, charInfoLength, tableStream)) {
        return false;
    }

    size_t size = (charInfoLength / 4 - 1) / 2;
    std::vector<unsigned int> charBlocks;
    for (size_t index = 0, offset = (size + 1) * 4; index < size; ++index, offset += 4) {
        charBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), offset));
    }

    //the vector owns the page memory, so it is released on every return path
    std::vector<char> pageStorage(OleStorage::BBD_BLOCK_SIZE);
    char *formatPageBuffer = &pageStorage[0];
    for (size_t index = 0; index < size; ++index) {
        seek(charBlocks.at(index) * OleStorage::BBD_BLOCK_SIZE, true);
        if (read(formatPageBuffer, OleStorage::BBD_BLOCK_SIZE) != OleStorage::BBD_BLOCK_SIZE) {
            return false;
        }
        unsigned int crun = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with crun (count of 'run of text')
        if ((crun + 1) * 4 + crun > 0x1ff) {
            //(crun + 1) offsets and crun one-byte indexes do not fit before the counter byte
            continue;
        }
        for (unsigned int index2 = 0; index2 < crun; ++index2) {
            unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4);
            unsigned int chpxOffset = 2 * OleUtil::getU1Byte(formatPageBuffer, (crun + 1) * 4 + index2);
            unsigned int len = OleUtil::getU1Byte(formatPageBuffer, chpxOffset);
            unsigned int charPos = 0;
            if (!offsetToCharPos(offset, charPos, myPieces)) {
                continue;
            }
            unsigned int istd = getIstdByCharPos(charPos, myStyleInfoList);

            CharInfo charInfo = getStyleFromStylesheet(istd, myStyleSheet).charInfo;
            //len == 0 would make (len - 1) wrap around to a huge unsigned value;
            //the formatting bytes must also end inside of the page
            if (chpxOffset != 0 && len >= 1 && chpxOffset + len < OleStorage::BBD_BLOCK_SIZE) {
                getCharInfo(chpxOffset, istd, formatPageBuffer + 1, len - 1, charInfo);
            }
            myCharInfoList.push_back(CharPosToCharInfo(charPos, charInfo));
        }
    }
    return true;
}
/*
 * Reads the PlcBtePapx table and the paragraph formatting pages it refers to,
 * appending one (character position, Style) pair per paragraph to myStyleInfoList.
 *
 * headerBuffer -- FIB header; offsets 0x102/0x106 hold the position and the length of the table
 * tableEntry   -- OLE entry of the table stream
 *
 * Returns false if the table or one of the formatting pages cannot be read.
 * Pages and paragraphs with inconsistent sizes are skipped.
 */
bool OleMainStream::readParagraphStyleTable(const char *headerBuffer, const OleEntry &tableEntry) {
    //PlcBtePapx structure is table with formatting for all paragraphs
    unsigned int beginParagraphInfo = OleUtil::getU4Bytes(headerBuffer, 0x102); // address of PlcBtePapx structure
    size_t paragraphInfoLength = (size_t)OleUtil::getU4Bytes(headerBuffer, 0x106); // length of PlcBtePapx structure
    if (paragraphInfoLength < 4) {
        return false;
    }

    OleStream tableStream(myStorage, tableEntry, myBaseStream);
    std::string buffer;
    if (!readToBuffer(buffer, beginParagraphInfo, paragraphInfoLength, tableStream)) {
        return false;
    }

    size_t size = (paragraphInfoLength / 4 - 1) / 2;
    std::vector<unsigned int> paragraphBlocks;
    for (size_t index = 0, tOffset = (size + 1) * 4; index < size; ++index, tOffset += 4) {
        paragraphBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset));
    }

    //the vector owns the page memory, so it is released on every return path
    std::vector<char> pageStorage(OleStorage::BBD_BLOCK_SIZE);
    char *formatPageBuffer = &pageStorage[0];
    for (size_t index = 0; index < size; ++index) {
        seek(paragraphBlocks.at(index) * OleStorage::BBD_BLOCK_SIZE, true);
        if (read(formatPageBuffer, OleStorage::BBD_BLOCK_SIZE) != OleStorage::BBD_BLOCK_SIZE) {
            return false;
        }
        unsigned int cpara = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with cpara (count of paragraphs)
        if ((cpara + 1) * 4 + cpara * 13 > 0x1ff) {
            //(cpara + 1) offsets and cpara 13-byte descriptors do not fit before the counter byte
            continue;
        }
        for (unsigned int index2 = 0; index2 < cpara; ++index2) {
            unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4);
            unsigned int papxOffset = OleUtil::getU1Byte(formatPageBuffer, (cpara + 1) * 4 + index2 * 13) * 2;
            if (papxOffset <= 0) {
                continue;
            }
            unsigned int len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2;
            if (len == 0) {
                ++papxOffset;
                len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2;
            }
            if (papxOffset + 3 > OleStorage::BBD_BLOCK_SIZE) {
                //the 2-byte istd field would be read behind the end of the page
                continue;
            }

            unsigned int istd = OleUtil::getU2Bytes(formatPageBuffer, papxOffset + 1);
            Style styleInfo = getStyleFromStylesheet(istd, myStyleSheet);

            //the property bytes must end inside of the page
            if (len >= 3 && papxOffset + len <= OleStorage::BBD_BLOCK_SIZE) {
                getStyleInfo(papxOffset, formatPageBuffer + 3, len - 3, styleInfo);
            }

            unsigned int charPos = 0;
            if (!offsetToCharPos(offset, charPos, myPieces)) {
                continue;
            }
            myStyleInfoList.push_back(CharPosToStyle(charPos, styleInfo));
        }
    }
    return true;
}
/*
 * Reads the PlcfSed section table and the section properties it refers to,
 * appending one SectionInfo per section descriptor to mySectionInfoList.
 *
 * headerBuffer -- FIB header; offset 0x18 holds the start of the text in the main stream,
 *                 offsets 0xca/0xce hold the position and the length of PlcfSed
 * tableEntry   -- OLE entry of the table stream
 *
 * Returns false if the table or the size field of a section property block cannot be read.
 * A section whose property block cannot be read completely is skipped.
 */
bool OleMainStream::readSectionsInfoTable(const char *headerBuffer, const OleEntry &tableEntry) {
    //PlcfSed structure is a section table
    unsigned int beginOfText = OleUtil::getU4Bytes(headerBuffer, 0x18); //address of text's begin in main stream
    unsigned int beginSectInfo = OleUtil::getU4Bytes(headerBuffer, 0xca); //address of PlcfSed structure
    size_t sectInfoLen = (size_t)OleUtil::getU4Bytes(headerBuffer, 0xce); //length of PlcfSed structure
    if (sectInfoLen < 4) {
        return false;
    }

    OleStream tableStream(myStorage, tableEntry, myBaseStream);
    std::string buffer;
    if (!readToBuffer(buffer, beginSectInfo, sectInfoLen, tableStream)) {
        return false;
    }

    size_t decriptorsCount = (sectInfoLen - 4) / 16;

    //saving the section offsets (in character positions)
    std::vector<unsigned int> charPos;
    for (size_t index = 0, tOffset = 0; index < decriptorsCount; ++index, tOffset += 4) {
        unsigned int ulTextOffset = OleUtil::getU4Bytes(buffer.c_str(), tOffset);
        charPos.push_back(beginOfText + ulTextOffset);
    }

    //saving sepx offsets
    std::vector<unsigned int> sectPage;
    for (size_t index = 0, tOffset = (decriptorsCount + 1) * 4; index < decriptorsCount; ++index, tOffset += 12) {
        sectPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset + 2));
    }

    //reading the section properties
    char tmpBuffer[2];
    for (size_t index = 0; index < decriptorsCount; ++index) {
        if (sectPage.at(index) == 0xffffffffUL) { //check for invalid record, to make default section info
            SectionInfo sectionInfo;
            sectionInfo.charPos = charPos.at(index);
            mySectionInfoList.push_back(sectionInfo);
            continue;
        }
        //getting number of bytes to read
        seek(sectPage.at(index), true);
        if (read(tmpBuffer, 2) != 2) {
            return false;
        }
        size_t bytes = 2 + (size_t)OleUtil::getU2Bytes(tmpBuffer, 0);

        //bytes >= 2, so the vector is never empty; it frees the memory automatically
        std::vector<char> pageStorage(bytes);
        char *formatPageBuffer = &pageStorage[0];
        seek(sectPage.at(index), true);
        if (read(formatPageBuffer, bytes) != bytes) {
            continue;
        }
        SectionInfo sectionInfo;
        sectionInfo.charPos = charPos.at(index);
        getSectionInfo(formatPageBuffer + 2, bytes - 2, sectionInfo);
        mySectionInfoList.push_back(sectionInfo);
    }
    return true;
}
void OleMainStream::getStyleInfo(unsigned int papxOffset, const char *grpprlBuffer, unsigned int bytes, Style &styleInfo) {
int tmp, toDelete, toAdd;
unsigned int offset = 0;
while (bytes >= offset + 2) {
unsigned int curPrlLength = 0;
switch (OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset)) {
case 0x2403:
styleInfo.alignment = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2);
break;
case 0x4610:
styleInfo.leftIndent += OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
if (styleInfo.leftIndent < 0) {
styleInfo.leftIndent = 0;
}
break;
case 0xc60d: // ChgTabsPapx
case 0xc615: // ChgTabs
tmp = OleUtil::get1Byte(grpprlBuffer, papxOffset + offset + 2);
if (tmp < 2) {
curPrlLength = 1;
break;
}
toDelete = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 3);
if (tmp < 2 + 2 * toDelete) {
curPrlLength = 1;
break;
}
toAdd = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 4 + 2 * toDelete);
if (tmp < 2 + 2 * toDelete + 2 * toAdd) {
curPrlLength = 1;
break;
}
break;
case 0x840e:
styleInfo.rightIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
break;
case 0x840f:
styleInfo.leftIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
break;
case 0x8411:
styleInfo.firstLineIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
break;
case 0xa413:
styleInfo.beforeIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
break;
case 0xa414:
styleInfo.afterIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
break;
case 0x2407:
styleInfo.hasPageBreakBefore = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2) == 0x01;
break;
default:
break;
}
if (curPrlLength == 0) {
curPrlLength = getPrlLength(grpprlBuffer, papxOffset + offset);
}
offset += curPrlLength;
}
}
/*
 * Walks over a sequence of character property modifiers and applies bold, italic
 * and font size changes to charInfo; all other modifiers are skipped.
 *
 * chpxOffset   -- offset added to every position read from grpprlBuffer
 * grpprlBuffer -- buffer with the modifiers
 * bytes        -- number of bytes to process
 * charInfo     -- character info that is updated in place
 */
void OleMainStream::getCharInfo(unsigned int chpxOffset, unsigned int /*istd*/, const char *grpprlBuffer, unsigned int bytes, CharInfo &charInfo) {
    for (unsigned int consumed = 0; bytes >= consumed + 2; ) {
        const unsigned int prlStart = chpxOffset + consumed;

        //bit of fontStyle touched by the current modifier, 0 if it is not a font style modifier
        unsigned int styleBit = 0;
        switch (OleUtil::getU2Bytes(grpprlBuffer, prlStart)) {
            case 0x0835: //bold
                styleBit = CharInfo::BOLD;
                break;
            case 0x0836: //italic
                styleBit = CharInfo::ITALIC;
                break;
            case 0x4a43: //size of font
                charInfo.fontSize = OleUtil::getU2Bytes(grpprlBuffer, prlStart + 2);
                break;
            default:
                break;
        }

        if (styleBit != 0) {
            const unsigned int flag = OleUtil::getU1Byte(grpprlBuffer, prlStart + 2);
            if (flag == UNSET) {
                charInfo.fontStyle &= ~styleBit;
            } else if (flag == SET) {
                charInfo.fontStyle |= styleBit;
            } else if (flag == NEGATION) {
                charInfo.fontStyle ^= styleBit;
            }
            //UNCHANGED and unknown values leave the style as it is
        }

        consumed += getPrlLength(grpprlBuffer, prlStart);
    }
}
/*
 * Walks over a sequence of section property modifiers and updates sectionInfo.
 * Only modifier 0x3009 is interpreted: newPage is set unless its operand is 0 or 1.
 *
 * grpprlBuffer -- buffer with the modifiers
 * bytes        -- number of bytes to process
 * sectionInfo  -- section info that is updated in place
 */
void OleMainStream::getSectionInfo(const char *grpprlBuffer, size_t bytes, SectionInfo &sectionInfo) {
    for (size_t position = 0; bytes >= position + 2; position += getPrlLength(grpprlBuffer, position)) {
        if (OleUtil::getU2Bytes(grpprlBuffer, position) == 0x3009) { //new page
            const unsigned int breakCode = OleUtil::getU1Byte(grpprlBuffer, position + 2);
            sectionInfo.newPage = !(breakCode == 0 || breakCode == 1);
        }
    }
}
/*
 * Returns a copy of the first stylesheet entry whose istd equals the given one.
 * If istd is one of ISTD_INVALID, STI_NIL, STI_USER, or no entry matches,
 * a default-constructed Style with its istd set to the requested value is returned.
 */
OleMainStream::Style OleMainStream::getStyleFromStylesheet(unsigned int istd, const StyleSheet &stylesheet) {
    //TODO optimize it: StyleSheet can be map structure with istd key
    const bool searchable = !(istd == ISTD_INVALID || istd == STI_NIL || istd == STI_USER);
    if (searchable) {
        for (StyleSheet::const_iterator it = stylesheet.begin(); it != stylesheet.end(); ++it) {
            if (it->istd == istd) {
                return *it;
            }
        }
    }
    Style fallback;
    fallback.istd = istd;
    return fallback;
}
/*
 * Returns the index of the first already filled stylesheet entry with the given istd,
 * or -1 if istd is ISTD_INVALID or there is no such entry.
 *
 * isFilled -- flags telling which stylesheet entries are already filled
 */
int OleMainStream::getStyleIndex(unsigned int istd, const std::vector<bool> &isFilled, const StyleSheet &stylesheet) {
    //TODO optimize it: StyleSheet can be map structure with istd key
    //in that case, this method will be excess
    if (istd != ISTD_INVALID) {
        const int count = (int)stylesheet.size();
        int position = 0;
        while (position < count) {
            if (isFilled.at(position) && stylesheet.at(position).istd == istd) {
                return position;
            }
            ++position;
        }
    }
    return -1;
}
/*
 * Returns istd of the style that covers the given character position.
 *
 * An entry covers the range from its own position up to (not including) the position
 * of the next entry; the last entry is used when no earlier entry matches.
 * Returns ISTD_INVALID for an empty list.
 */
unsigned int OleMainStream::getIstdByCharPos(unsigned int charPos, const StyleInfoList &styleInfoList) {
    const size_t count = styleInfoList.size();
    for (size_t current = 0; current < count; ++current) {
        const bool isLast = (current == count - 1);
        if (isLast ||
                (charPos >= styleInfoList.at(current).first && charPos < styleInfoList.at(current + 1).first)) {
            return styleInfoList.at(current).second.istd;
        }
    }
    return ISTD_INVALID;
}
bool OleMainStream::offsetToCharPos(unsigned int offset, unsigned int &charPos, const Pieces &pieces) {
if (pieces.empty()) {
return false;
}
if ((unsigned int)pieces.front().offset > offset) {
return false;
}
if ((unsigned int)(pieces.back().offset + pieces.back().length) <= offset) {
return false;
}
size_t pieceNumber = 0;
for (size_t i = 0; i < pieces.size(); ++i) {
if (i == pieces.size() - 1) { //if last
pieceNumber = i;
break;
}
unsigned int curOffset = pieces.at(i).offset;
unsigned int nextOffset = pieces.at(i + 1).offset;
if (offset >= curOffset && offset < nextOffset) {
pieceNumber = i;
break;
}
}
const Piece &piece = pieces.at(pieceNumber);
unsigned int diffOffset = offset - piece.offset;
if (!piece.isANSI) {
diffOffset /= 2;
}
charPos = piece.startCP + diffOffset;
return true;
}
/*
 * Reads exactly 'length' bytes starting at 'offset' of the given stream into 'result'.
 *
 * Returns false (leaving 'result' untouched) if fewer bytes could be read.
 */
bool OleMainStream::readToBuffer(std::string &result, unsigned int offset, size_t length, OleStream &stream) {
    //the vector releases the memory on every return path;
    //it is never empty, so taking the address of its first element is always valid
    std::vector<char> storage(length == 0 ? 1 : length);
    char *buffer = &storage[0];
    stream.seek(offset, true);
    if (stream.read(buffer, length) != length) {
        return false;
    }
    result.assign(buffer, length);
    return true;
}
/*
 * Returns the full length in bytes (operation code + operand) of the property modifier
 * that starts at byteNumber of grpprlBuffer.
 *
 * The three highest bits of the 2-byte operation code select the operand size.
 * For the 0xC000 class the operand size is stored in the byte after the code;
 * for code 0xc615 with size byte 255 it is computed from the counters of the operand.
 */
unsigned int OleMainStream::getPrlLength(const char *grpprlBuffer, unsigned int byteNumber) {
    const unsigned int opCode = OleUtil::getU2Bytes(grpprlBuffer, byteNumber);
    const unsigned int sizeClass = opCode & 0xe000;

    if (sizeClass == 0x0000 || sizeClass == 0x2000) {
        return 3;
    }
    if (sizeClass == 0x4000 || sizeClass == 0x8000 || sizeClass == 0xA000) {
        return 4;
    }
    if (sizeClass == 0xE000) {
        return 5;
    }
    if (sizeClass == 0x6000) {
        return 6;
    }
    if (sizeClass != 0xC000) {
        return 1;
    }

    //counting of info length
    unsigned int operandLength = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 2);
    if (opCode == 0xc615 && operandLength == 255) {
        const unsigned int deleted = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 3);
        const unsigned int added = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 4 + deleted * 4);
        operandLength = 2 + deleted * 4 + added * 3;
    }
    return 3 + operandLength;
}

View file

@ -0,0 +1,178 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __OLEMAINSTREAM_H__
#define __OLEMAINSTREAM_H__
#include <vector>
#include <string>
#include "OleStream.h"
// Stream over the main entry of a MS Word document.
// Besides the OleStream functionality it collects the pieces table, the stylesheet,
// character/paragraph/section formatting and bookmarks of the document.
class OleMainStream : public OleStream {
public:
// One continuous piece of text in the stream
struct Piece {
enum PieceType {
TEXT,
FOOTNOTE,
OTHER
};
int offset; //maybe make it unsigned int
int length; //maybe make it unsigned int
bool isANSI; //if false, distances inside of the piece are halved when converted to character positions
PieceType type;
unsigned int startCP; //character position of the piece's first character
};
typedef std::vector<Piece> Pieces;
// Character formatting
struct CharInfo {
// Bit flags stored in fontStyle
enum Font {
REGULAR = 0x0000,
BOLD = 0x0001,
ITALIC = 0x0002,
UNDERLINE = 0x0004,
CAPITALS = 0x0008,
SMALL_CAPITALS = 0x0010,
STRIKE = 0x0020,
HIDDEN = 0x0040,
MARKDEL = 0x0080,
SUPERSCRIPT = 0x0100,
SUBSCRIPT = 0x0200
};
unsigned int fontStyle; //combination of Font flags
unsigned int fontSize;
CharInfo();
};
// Pair (character position, formatting that starts at this position)
typedef std::pair<unsigned int, CharInfo> CharPosToCharInfo;
typedef std::vector<CharPosToCharInfo > CharInfoList;
// Paragraph style
struct Style {
enum Alignment {
LEFT = 0x00,
CENTER = 0x01,
RIGHT = 0x02,
JUSTIFY = 0x03
};
unsigned int istd; //Current style
unsigned int istdNext; //Next style unless overruled
bool hasPageBreakBefore;
unsigned int beforeIndent; //Vertical indent before paragraph
unsigned int afterIndent; //Vertical indent after paragraph
int leftIndent; //Left indent
int firstLineIndent; //First line left indent
int rightIndent; //Right indent
unsigned int alignment;
CharInfo charInfo;
Style();
};
// Pair (character position, style that starts at this position)
typedef std::pair<unsigned int, Style> CharPosToStyle;
typedef std::vector<CharPosToStyle> StyleInfoList;
typedef std::vector<Style> StyleSheet;
// Special values of style identifiers (istd)
enum StyleID {
H1 = 0x1,
H2 = 0x2,
H3 = 0x3,
STI_USER = 0xFFE,
STI_NIL = 0xFFF,
ISTD_INVALID = 0xFFFF
};
// Section formatting
struct SectionInfo {
unsigned int charPos; //character position where the section starts
bool newPage;
SectionInfo();
};
typedef std::vector<SectionInfo> SectionInfoList;
// Named position in the text
struct Bookmark {
unsigned int charPos;
std::string name;
};
typedef std::vector<Bookmark> Bookmarks;
public:
OleMainStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream);
public:
bool open();
const Pieces &getPieces() const;
const CharInfoList &getCharInfoList() const;
const StyleInfoList &getStyleInfoList() const;
const Bookmarks &getBookmarks() const;
private:
// Readers of the document structures; headerBuffer is the document header,
// tableEntry is the OLE entry of the table stream where the structure is stored.
// Every reader returns false if its structure cannot be read.
bool readFIB(const char *headerBuffer);
bool readPieceTable(const char *headerBuffer, const OleEntry &tableEntry);
bool readBookmarks(const char *headerBuffer, const OleEntry &tableEntry);
bool readStylesheet(const char *headerBuffer, const OleEntry &tableEntry);
bool readSectionsInfoTable(const char *headerBuffer, const OleEntry &tableEntry);
bool readParagraphStyleTable(const char *headerBuffer, const OleEntry &tableEntry);
bool readCharInfoTable(const char *headerBuffer, const OleEntry &tableEntry);
private: //readPieceTable helpers methods
static std::string getPiecesTableBuffer(const char *headerBuffer, OleStream &tableStream);
static void splitPieces(const Pieces &source, Pieces &dest1, Pieces &dest2, Piece::PieceType type1, Piece::PieceType type2, int boundary);
private: //formatting reader helpers methods
// Returns the length in bytes of the property modifier located at byteNumber
static unsigned int getPrlLength(const char *grpprlBuffer, unsigned int byteNumber);
// The get*Info methods apply 'bytes' bytes of property modifiers to the last argument
static void getCharInfo(unsigned int chpxOffset, unsigned int istd, const char *grpprlBuffer, unsigned int bytes, CharInfo &charInfo);
static void getStyleInfo(unsigned int papxOffset, const char *grpprlBuffer, unsigned int bytes, Style &styleInfo);
static void getSectionInfo(const char *grpprlBuffer, size_t bytes, SectionInfo &sectionInfo);
// Returns the stylesheet entry with the given istd, or a default style if there is none
static Style getStyleFromStylesheet(unsigned int istd, const StyleSheet &stylesheet);
// Returns the index of the filled stylesheet entry with the given istd, or -1
static int getStyleIndex(unsigned int istd, const std::vector<bool> &isFilled, const StyleSheet &stylesheet);
// Returns istd of the style that covers the given character position
static unsigned int getIstdByCharPos(unsigned int offset, const StyleInfoList &styleInfoList);
// Converts an offset in the stream into a character position; returns false if no piece contains the offset
static bool offsetToCharPos(unsigned int offset, unsigned int &charPos, const Pieces &pieces);
// Reads exactly 'length' bytes at 'offset' of the stream into 'result'
static bool readToBuffer(std::string &result, unsigned int offset, size_t length, OleStream &stream);
private:
// Values of the operand of on/off character modifiers (e.g. bold, italic)
enum PrlFlag {
UNSET = 0, //clear the flag
SET = 1, //set the flag
UNCHANGED = 128, //leave the flag as it is
NEGATION = 129 //invert the flag
};
private:
int myStartOfText;
int myEndOfText;
Pieces myPieces;
StyleSheet myStyleSheet;
CharInfoList myCharInfoList;
StyleInfoList myStyleInfoList;
SectionInfoList mySectionInfoList;
Bookmarks myBookmarks;
};
#endif /* __OLEMAINSTREAM_H__ */

View file

@ -0,0 +1,268 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <ZLLogger.h>
#include "OleStorage.h"
#include "OleUtil.h"
#include <cstring>
// Size in bytes of the OLE file header; sector offsets in the file are counted from its end.
// The same value is used as the size of the formatting pages read by OleMainStream.
const size_t OleStorage::BBD_BLOCK_SIZE = 512;
// Creates an empty storage; init() must be called before the storage can be used.
OleStorage::OleStorage() {
// brings all fields to the "nothing is loaded" state
clear();
}
/*
 * Resets the storage to the empty state: drops the input stream,
 * forgets all sizes and removes all loaded tables and entries.
 */
void OleStorage::clear() {
    //loaded tables
    myEntries.clear();
    myProperties.clear();
    mySBD.clear();
    myBBD.clear();
    myDIFAT.clear();

    //scalar state
    myRootEntryIndex = -1;
    myStreamSize = 0;
    myShortSectorSize = 0;
    mySectorSize = 0;

    //input
    myInputStream = 0;
}
/*
 * Loads the structure of an OLE compound file: header, DIFAT, big and small block
 * depots, properties and entries.
 *
 * stream     -- stream with the whole file
 * streamSize -- size of the file in bytes
 *
 * Returns true on success. On any failure the storage is cleared and false is returned.
 */
bool OleStorage::init(shared_ptr<ZLInputStream> stream, size_t streamSize) {
    clear();

    myInputStream = stream;
    myStreamSize = streamSize;
    myInputStream->seek(0, true);

    char oleBuf[BBD_BLOCK_SIZE];
    size_t ret = myInputStream->read(oleBuf, BBD_BLOCK_SIZE);
    if (ret != BBD_BLOCK_SIZE) {
        clear();
        return false;
    }

    //the signature is binary data, so it is kept as unsigned bytes and compared with memcmp
    static const unsigned char OLE_SIGN[] = {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1};
    if (memcmp(oleBuf, OLE_SIGN, sizeof(OLE_SIGN)) != 0) {
        clear();
        return false;
    }

    //sector sizes are stored as powers of two
    const unsigned int sectorShift = OleUtil::getU2Bytes(oleBuf, 0x1e); //offset for value of big sector size
    const unsigned int shortSectorShift = OleUtil::getU2Bytes(oleBuf, 0x20); //offset for value of small sector size
    //These values come from the file and must be checked before they are used:
    // - a big sector must hold at least one 128-byte property;
    // - sector buffers are allocated by the readers, so the size is limited to 64 KiB;
    // - a small sector cannot be bigger than a big one (their ratio is used as a divisor).
    if (sectorShift < 7 || sectorShift > 16 || shortSectorShift > sectorShift) {
        clear();
        return false;
    }
    mySectorSize = 1 << sectorShift;
    myShortSectorSize = 1 << shortSectorShift;

    if (readDIFAT(oleBuf) && readBBD(oleBuf) && readSBD(oleBuf) && readProperties(oleBuf) && readAllEntries()) {
        return true;
    }
    clear();
    return false;
}
/*
 * Fills myDIFAT (double-indirect file allocation table).
 *
 * The first records are taken from the file header (oleBuf); additional DIFAT sectors
 * are read from the input stream if the header refers to them.
 * Trailing "free sector" records are removed.
 *
 * Returns false if an additional DIFAT sector cannot be read.
 */
bool OleStorage::readDIFAT(char *oleBuf) {
    int difatBlock = OleUtil::get4Bytes(oleBuf, 0x44); //address for first difat sector
    int difatSectorNumbers = OleUtil::get4Bytes(oleBuf, 0x48); //numbers of additional difat records

    //436 bytes of difat records are stored in header, by offset 0x4c
    for (unsigned int i = 0; i < 436; i += 4) {
        myDIFAT.push_back(OleUtil::get4Bytes(oleBuf + 0x4c, i));
    }

    //for files > 6.78 mb we need read additional DIFAT fields
    if (difatBlock > 0 && difatSectorNumbers > 0) {
        if (mySectorSize < 4) {
            //the sector cannot hold even the link to the next DIFAT sector
            ZLLogger::Instance().println("OleStorage", "Error read DIFAT!");
            return false;
        }
        //heap buffer allocated once; its size depends on the data read from the file
        std::vector<char> sector(mySectorSize);
        char *buffer = &sector[0];
        for (int i = 0; difatBlock > 0 && i < difatSectorNumbers; ++i) {
            ZLLogger::Instance().println("OleStorage", "Read additional data for DIFAT");
            myInputStream->seek(BBD_BLOCK_SIZE + difatBlock * mySectorSize, true);
            if (myInputStream->read(buffer, mySectorSize) != mySectorSize) {
                ZLLogger::Instance().println("OleStorage", "Error read DIFAT!");
                return false;
            }
            for (unsigned int j = 0; j < (mySectorSize - 4); j += 4) {
                myDIFAT.push_back(OleUtil::get4Bytes(buffer, j));
            }
            difatBlock = OleUtil::get4Bytes(buffer, mySectorSize - 4); //next DIFAT block is pointed at the end of the sector
        }
    }

    //removing unusable DIFAT links
    //0xFFFFFFFF means "free section"
    while (!myDIFAT.empty() && myDIFAT.back() == (int)0xFFFFFFFF) {
        myDIFAT.pop_back();
    }
    return true;
}
/*
 * Fills myBBD (big block depot) from the sectors listed in myDIFAT.
 *
 * oleBuf -- file header; offset 0x2c holds the number of depot sectors
 *
 * Returns false if the header refers to more sectors than myDIFAT contains,
 * if a sector number is out of the file, or if a sector cannot be read.
 */
bool OleStorage::readBBD(char *oleBuf) {
    if (mySectorSize == 0) {
        //nothing can be read, and the sector size is used as a divisor below
        ZLLogger::Instance().println("OleStorage", "Can't read BBD!");
        return false;
    }
    //heap buffer: its size depends on the data read from the file
    std::vector<char> sector(mySectorSize);
    char *buffer = &sector[0];

    unsigned int bbdNumberBlocks = OleUtil::getU4Bytes(oleBuf, 0x2c); //number of big blocks
    for (unsigned int i = 0; i < bbdNumberBlocks; ++i) {
        if (i >= myDIFAT.size()) {
            //the header promises more depot sectors than DIFAT describes
            ZLLogger::Instance().println("OleStorage", "Bad BBD entry!");
            return false;
        }
        int bbdSector = myDIFAT[i];
        if (bbdSector >= (int)(myStreamSize / mySectorSize) || bbdSector < 0) {
            ZLLogger::Instance().println("OleStorage", "Bad BBD entry!");
            return false;
        }
        myInputStream->seek(BBD_BLOCK_SIZE + bbdSector * mySectorSize, true);
        if (myInputStream->read(buffer, mySectorSize) != mySectorSize) {
            ZLLogger::Instance().println("OleStorage", "Can't read BBD!");
            return false;
        }
        for (unsigned int j = 0; j + 4 <= mySectorSize; j += 4) {
            myBBD.push_back(OleUtil::get4Bytes(buffer, j));
        }
    }
    return true;
}
/*
 * Fills mySBD (small block depot) by following the chain of its sectors in myBBD.
 *
 * oleBuf -- file header; offsets 0x3c/0x40 hold the first sector and the number of sectors
 *
 * Reading is best-effort: a broken chain or an unreadable sector stops the reading
 * and the records collected so far are kept. Returns false only if the sector size is unknown.
 */
bool OleStorage::readSBD(char *oleBuf) {
    int sbdCur = OleUtil::get4Bytes(oleBuf, 0x3c); //address of first small sector
    int sbdCount = OleUtil::get4Bytes(oleBuf, 0x40); //count of small sectors

    if (sbdCur <= 0) {
        ZLLogger::Instance().println("OleStorage", "There's no SBD, don't read it");
        return true;
    }
    if (mySectorSize == 0) {
        return false;
    }

    //heap buffer: its size depends on the data read from the file
    std::vector<char> sector(mySectorSize);
    char *buffer = &sector[0];
    for (int i = 0; i < sbdCount; ++i) {
        if (i != 0) {
            if ((size_t)sbdCur >= myBBD.size()) {
                //the chain points outside of the big block depot
                break;
            }
            sbdCur = myBBD[sbdCur];
        }
        if (sbdCur <= 0) {
            break;
        }
        myInputStream->seek(BBD_BLOCK_SIZE + sbdCur * mySectorSize, true);
        if (myInputStream->read(buffer, mySectorSize) != mySectorSize) {
            //do not add records from a partially read sector
            ZLLogger::Instance().println("OleStorage", "Can't read SBD!");
            break;
        }
        for (unsigned int j = 0; j + 4 <= mySectorSize; j += 4) {
            mySBD.push_back(OleUtil::get4Bytes(buffer, j));
        }
    }
    return true;
}
/*
 * Fills myProperties with raw 128-byte property records, following the chain
 * of property sectors in myBBD.
 *
 * oleBuf -- file header; offset 0x30 holds the first property sector
 *
 * Returns false if the first sector number is negative or the sector size is unknown.
 * An unreadable sector or a chain that is longer than the depot itself (i.e. cyclic)
 * stops the reading; the records collected so far are kept.
 */
bool OleStorage::readProperties(char *oleBuf) {
    int propCur = OleUtil::get4Bytes(oleBuf, 0x30); //offset for address of sector with first property
    if (propCur < 0) {
        ZLLogger::Instance().println("OleStorage", "Wrong first directory sector location");
        return false;
    }
    if (mySectorSize == 0) {
        return false;
    }

    //heap buffer: its size depends on the data read from the file
    std::vector<char> sector(mySectorSize);
    char *buffer = &sector[0];
    //a valid chain cannot contain more links than the depot has records
    size_t linksLeft = myBBD.size();
    do {
        myInputStream->seek(BBD_BLOCK_SIZE + propCur * mySectorSize, true);
        if (myInputStream->read(buffer, mySectorSize) != mySectorSize) {
            //do not add records from a partially read sector
            ZLLogger::Instance().println("OleStorage", "Can't read properties!");
            break;
        }
        //only complete 128-byte records are taken
        for (unsigned int j = 0; j + 128 <= mySectorSize; j += 128) {
            myProperties.push_back(std::string(buffer + j, 128));
        }
        if (propCur < 0 || (size_t)propCur >= myBBD.size()) {
            break;
        }
        if (linksLeft == 0) {
            ZLLogger::Instance().println("OleStorage", "Cyclic chain of property sectors");
            break;
        }
        --linksLeft;
        propCur = myBBD[propCur];
    } while (propCur >= 0 && propCur < (int)(myStreamSize / mySectorSize));
    return true;
}
bool OleStorage::readAllEntries() {
int propCount = myProperties.size();
for (int i = 0; i < propCount; ++i) {
OleEntry entry;
bool result = readOleEntry(i, entry);
if (!result) {
break;
}
if (entry.type == OleEntry::ROOT_DIR) {
myRootEntryIndex = i;
}
myEntries.push_back(entry);
}
if (myRootEntryIndex < 0) {
return false;
}
return true;
}
/*
 * Parses the raw property record number propNumber into the entry e:
 * type, name, length and the chain of blocks that hold the entry's data.
 *
 * Returns false if the record has an unknown type.
 */
bool OleStorage::readOleEntry(int propNumber, OleEntry &e) {
    static const std::string ROOT_ENTRY = "Root Entry";

    std::string property = myProperties.at(propNumber);

    char oleType = property.at(0x42); //offset for Ole Type
    if (oleType != 1 && oleType != 2 && oleType != 3 && oleType != 5) {
        ZLLogger::Instance().println("OleStorage", "entry -- not right ole type");
        return false;
    }

    e.type = (OleEntry::Type)oleType;

    int nameLength = OleUtil::getU2Bytes(property.c_str(), 0x40); //offset for value entry's name length
    //the name occupies the bytes before the length field (offset 0x40);
    //a bigger value is a damaged record and must not lead outside of the name
    if (nameLength > 0x40) {
        nameLength = 0x40;
    }
    e.name.clear();
    e.name.reserve(33); //max size of entry name
    //only the first byte of every 2-byte character is taken
    for (int i = 0; i < nameLength; i+=2) {
        char c = property.at(i);
        if (c != 0) {
            e.name += c;
        }
    }

    e.length = OleUtil::getU4Bytes(property.c_str(), 0x78); //offset for entry's length value
    e.isBigBlock = e.length >= 0x1000 || e.name == ROOT_ENTRY;
    const unsigned int blockSize = e.isBigBlock ? mySectorSize : myShortSectorSize;

    // Read sector chain
    e.blocks.clear();
    int chainCur = OleUtil::get4Bytes(property.c_str(), 0x74); //offset for start block of entry
    if (chainCur >= 0 && (chainCur <= (int)(myStreamSize / blockSize))) {
        //filling blocks with chains
        do {
            e.blocks.push_back((unsigned int)chainCur);
            if (e.isBigBlock && (size_t)chainCur < myBBD.size()) {
                chainCur = myBBD.at(chainCur);
            } else if (!mySBD.empty() && (size_t)chainCur < mySBD.size()) {
                chainCur = mySBD.at(chainCur);
            } else {
                chainCur = -1;
            }
        } while (chainCur > 0 &&
                         chainCur < (int)(e.isBigBlock ? myBBD.size() : mySBD.size()) &&
                         e.blocks.size() <= e.length / blockSize);
    }
    //the entry cannot be longer than its blocks; both arguments of std::min
    //are explicitly unsigned int, so the call does not depend on the width of size_t
    const unsigned int capacity = (unsigned int)(blockSize * e.blocks.size());
    e.length = std::min(e.length, capacity);
    return true;
}
/*
 * Returns the offset in the file of the block number blockNumber of the entry e.
 *
 * Big blocks are located directly after the file header. Small blocks are stored
 * inside of the big blocks of the root entry.
 */
unsigned int OleStorage::getFileOffsetOfBlock(OleEntry &e, unsigned int blockNumber) {
    if (e.isBigBlock) {
        return BBD_BLOCK_SIZE + e.blocks.at(blockNumber) * mySectorSize;
    }

    const unsigned int smallPerBig = mySectorSize / myShortSectorSize;
    const unsigned int smallBlock = e.blocks.at(blockNumber);
    const unsigned int hostIndex = smallBlock / smallPerBig; //index in the root entry's block chain
    const unsigned int slotInHost = smallBlock % smallPerBig; //position inside of that big block
    const OleEntry &root = myEntries.at(myRootEntryIndex);
    return BBD_BLOCK_SIZE + root.blocks.at(hostIndex) * mySectorSize + slotInHost * myShortSectorSize;
}
bool OleStorage::getEntryByName(std::string name, OleEntry &returnEntry) const {
for (size_t i = 0; i < myEntries.size(); ++i) {
const OleEntry &entry = myEntries.at(i);
if (entry.name == name) {
returnEntry = entry;
return true;
}
}
return false;
}

View file

@ -0,0 +1,92 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __OLESTORAGE_H__
#define __OLESTORAGE_H__
#include <algorithm>
#include <vector>
#include <string>
#include <ZLInputStream.h>
// Description of one entry (directory or stream) of an OLE compound file
struct OleEntry {
// Entry type as stored in the property record
enum Type {
DIR = 1,
STREAM = 2,
ROOT_DIR = 5,
LOCK_BYTES =3
};
typedef std::vector<unsigned int> Blocks;
std::string name;
unsigned int length; //length of the entry's data in bytes
Type type;
Blocks blocks; //numbers of the blocks that hold the entry's data, in reading order
bool isBigBlock; //true if blocks are big sectors, false if they are small ones
};
// Reader of the structure of an OLE compound file: allocation tables and the list of entries
class OleStorage {
public:
static const size_t BBD_BLOCK_SIZE;
public:
OleStorage();
// Loads the file structure from the stream; returns false (and clears the storage) on failure
bool init(shared_ptr<ZLInputStream>, size_t streamSize);
// Resets the storage to the empty state
void clear();
const std::vector<OleEntry> &getEntries() const;
// Copies the first entry with the given name into 'entry'; returns false if there is none
bool getEntryByName(std::string name, OleEntry &entry) const;
unsigned int getSectorSize();
unsigned int getShortSectorSize();
public: //TODO make private
// Returns the offset in the file of the block number blockNumber of the entry e
unsigned int getFileOffsetOfBlock(OleEntry &e, unsigned int blockNumber);
private:
// Loaders of the file structure, called by init() in this order; oleBuf is the file header
bool readDIFAT(char *oleBuf);
bool readBBD(char *oleBuf);
bool readSBD(char *oleBuf);
bool readProperties(char *oleBuf);
bool readAllEntries();
bool readOleEntry(int propNumber, OleEntry &entry);
private:
shared_ptr<ZLInputStream> myInputStream;
unsigned int mySectorSize, myShortSectorSize;
size_t myStreamSize;
std::vector<int> myDIFAT; //double-indirect file allocation table
std::vector<int> myBBD; //Big Block Depot
std::vector<int> mySBD; //Small Block Depot
std::vector<std::string> myProperties; //raw 128-byte property records
std::vector<OleEntry> myEntries; //entries parsed from myProperties
int myRootEntryIndex; //index of the root entry in myEntries, -1 if it is not found
};
// Returns all entries loaded by init()
inline const std::vector<OleEntry> &OleStorage::getEntries() const { return myEntries; }
// Returns the size of a big sector in bytes
inline unsigned int OleStorage::getSectorSize() { return mySectorSize; }
// Returns the size of a small sector in bytes
inline unsigned int OleStorage::getShortSectorSize() { return myShortSectorSize; }
#endif /* __OLESTORAGE_H__ */

View file

@ -0,0 +1,127 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <ZLLogger.h>
#include "OleStream.h"
#include "OleUtil.h"
// Binds the stream to its storage, to a copy of the OLE entry it exposes and
// to the underlying input stream; reading starts at offset 0 of the entry.
OleStream::OleStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream) :
	myStorage(storage),
	myOleEntry(oleEntry),
	myBaseStream(stream),
	myOleOffset(0) {
}
// Succeeds only when the wrapped entry is of type OleEntry::STREAM.
// No I/O is performed here.
bool OleStream::open() {
	return myOleEntry.type == OleEntry::STREAM;
}
size_t OleStream::read(char *buffer, size_t maxSize) {
size_t length = maxSize;
size_t readedBytes = 0;
size_t bytesLeftInCurBlock;
unsigned int newFileOffset;
unsigned int curBlockNumber, modBlock;
size_t toReadBlocks, toReadBytes;
if (myOleOffset + length > myOleEntry.length) {
length = myOleEntry.length - myOleOffset;
}
size_t sectorSize = (size_t)(myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize());
curBlockNumber = myOleOffset / sectorSize;
if (curBlockNumber >= myOleEntry.blocks.size()) {
return 0;
}
modBlock = myOleOffset % sectorSize;
bytesLeftInCurBlock = sectorSize - modBlock;
if (bytesLeftInCurBlock < length) {
toReadBlocks = (length - bytesLeftInCurBlock) / sectorSize;
toReadBytes = (length - bytesLeftInCurBlock) % sectorSize;
} else {
toReadBlocks = toReadBytes = 0;
}
newFileOffset = myStorage->getFileOffsetOfBlock(myOleEntry, curBlockNumber) + modBlock;
myBaseStream->seek(newFileOffset, true);
readedBytes = myBaseStream->read(buffer, std::min(length, bytesLeftInCurBlock));
for (size_t i = 0; i < toReadBlocks; ++i) {
size_t readbytes;
++curBlockNumber;
newFileOffset = myStorage->getFileOffsetOfBlock(myOleEntry, curBlockNumber);
myBaseStream->seek(newFileOffset, true);
readbytes = myBaseStream->read(buffer + readedBytes, std::min(length - readedBytes, sectorSize));
readedBytes += readbytes;
}
if (toReadBytes > 0) {
size_t readbytes;
++curBlockNumber;
newFileOffset = myStorage->getFileOffsetOfBlock(myOleEntry, curBlockNumber);
myBaseStream->seek(newFileOffset, true);
readbytes = myBaseStream->read(buffer + readedBytes, toReadBytes);
readedBytes += readbytes;
}
myOleOffset += readedBytes;
return readedBytes;
}
// True once the current offset has reached the length of the entry.
bool OleStream::eof() const {
	return myOleEntry.length <= myOleOffset;
}
// Currently a no-op: the body is empty and no state is changed.
void OleStream::close() {
}
bool OleStream::seek(unsigned int offset, bool absoluteOffset) {
unsigned int newOleOffset = 0;
unsigned int newFileOffset;
if (absoluteOffset) {
newOleOffset = offset;
} else {
newOleOffset = myOleOffset + offset;
}
newOleOffset = std::min(newOleOffset, myOleEntry.length);
unsigned int sectorSize = (myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize());
unsigned int blockNumber = newOleOffset / sectorSize;
if (blockNumber >= myOleEntry.blocks.size()) {
return false;
}
unsigned int modBlock = newOleOffset % sectorSize;
newFileOffset = myStorage->getFileOffsetOfBlock(myOleEntry, blockNumber) + modBlock;
myBaseStream->seek(newFileOffset, true);
myOleOffset = newOleOffset;
return true;
}
// Returns the current read position (myOleOffset) inside the OLE entry.
size_t OleStream::offset() {
return myOleOffset;
}

View file

@ -0,0 +1,53 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __OLESTREAM_H__
#define __OLESTREAM_H__
#include "OleStorage.h"
// Sequential/seekable reader for the contents of a single OLE entry.
// The entry's data is located through the block list of the entry and the
// file offsets reported by the owning OleStorage; bytes are fetched from the
// shared base input stream.
class OleStream {
public:
// Keeps the storage, a copy of the entry and the base stream; the read position starts at 0.
OleStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream);
public:
// Returns true only if the entry has type OleEntry::STREAM.
bool open();
// Reads up to maxSize bytes from the current position into buffer, never past the entry length;
// returns the number of bytes read and advances the position by that amount.
size_t read(char *buffer, size_t maxSize);
// Has an empty implementation.
void close();
public:
// Sets the position (absolute, or relative to the current one), clamped to the entry length.
// Returns false when the target block is outside the entry's block list.
bool seek(unsigned int offset, bool absoluteOffset);
// Current position inside the entry.
size_t offset();
public:
// True when the current position has reached the entry length.
bool eof() const;
protected:
shared_ptr<OleStorage> myStorage; //used to obtain sector sizes and file offsets of blocks
OleEntry myOleEntry; //entry being read (type, length, block list, big/short block flag)
shared_ptr<ZLInputStream> myBaseStream; //underlying stream the bytes are read from
unsigned int myOleOffset; //current position inside the entry, in bytes
};
#endif /* __OLESTREAM_H__ */

View file

@ -0,0 +1,230 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include <cctype>
#include <cstring>
#include <ZLLogger.h>
#include "OleMainStream.h"
#include "DocBookReader.h"
#include "OleUtil.h"
#include "OleStreamReader.h"
//Definitions of the static character constants declared in OleStreamReader.
//Control characters found in the text of a Word document
//(readStream() dispatches on most of them; values below 32 without a case go to handleOtherControlChar()):
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_FOOTNOTE_MARK = 0x0002;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_TABLE_SEPARATOR = 0x0007;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_HORIZONTAL_TAB = 0x0009;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_HARD_LINEBREAK = 0x000b;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_PAGE_BREAK = 0x000c;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_END_OF_PARAGRAPH = 0x000d;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_SHORT_DEFIS = 0x001e;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_SOFT_HYPHEN = 0x001f;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_START_FIELD = 0x0013;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_SEPARATOR_FIELD = 0x0014;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_END_FIELD = 0x0015;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_ZERO_WIDTH_UNBREAKABLE_SPACE = 0xfeff;
//Plain Unicode code points, provided as named constants:
const ZLUnicodeUtil::Ucs2Char OleStreamReader::NULL_SYMBOL = 0x0;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::FILE_SEPARATOR = 0x1c;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::LINE_FEED = 0x000a;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::SOFT_HYPHEN = 0xad;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::START_OF_HEADING = 0x0001;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::SPACE = 0x20;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::SHORT_DEFIS = 0x2D;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::VERTICAL_LINE = 0x7C;
// Remembers the name of the encoding to use for ANSI text pieces and resets
// all reading state via clear(). The converter itself is not created here;
// fillBuffer() creates it the first time an ANSI piece is met.
OleStreamReader::OleStreamReader(const std::string &encoding) :
myEncoding(encoding) {
clear();
}
// Resets the reader to its initial state: empty character buffer, first
// piece, character position 0 and all style/char-info/bookmark cursors at the
// beginning of their lists. The encoding converter is left as it is.
void OleStreamReader::clear() {
	myBuffer.clear();
	myCurBufferPosition = myNextPieceNumber = 0;
	myNextStyleInfoIndex = myNextCharInfoIndex = myNextBookmarkIndex = 0;
	myCurCharPos = 0;
}
// Reads the whole text of the given main stream character by character and
// dispatches every character to the matching handle*() callback.
//
// Returns false when the stream cannot be opened; otherwise returns true after
// getUcs2Char() reports that no more characters are available.
bool OleStreamReader::readStream(OleMainStream &oleMainStream) {
	clear();
	if (!oleMainStream.open()) {
		ZLLogger::Instance().println("OleStreamReader", "doesn't open correct");
		return false;
	}

	// Set after a table separator was seen: the meaning of that separator is
	// decided by the character that follows it. A second separator in a row
	// ends the table row, anything else makes it a plain cell separator.
	bool pendingTableSeparator = false;
	ZLUnicodeUtil::Ucs2Char symbol;
	while (getUcs2Char(oleMainStream, symbol)) {
		if (pendingTableSeparator) {
			pendingTableSeparator = false;
			if (symbol == WORD_TABLE_SEPARATOR) {
				handleTableEndRow();
				continue;
			}
			handleTableSeparator();
		}

		if (symbol >= 32) {
			// ordinary text; the zero-width no-break space is dropped silently
			if (symbol != WORD_ZERO_WIDTH_UNBREAKABLE_SPACE) {
				handleChar(symbol);
			}
			continue;
		}

		// values below 32 are control symbols
		switch (symbol) {
			case NULL_SYMBOL:
				break;
			case WORD_HARD_LINEBREAK:
				handleHardLinebreak();
				break;
			case WORD_END_OF_PARAGRAPH:
			case WORD_PAGE_BREAK:
				handleParagraphEnd();
				break;
			case WORD_TABLE_SEPARATOR:
				pendingTableSeparator = true;
				break;
			case WORD_FOOTNOTE_MARK:
				handleFootNoteMark();
				break;
			case WORD_START_FIELD:
				handleStartField();
				break;
			case WORD_SEPARATOR_FIELD:
				handleSeparatorField();
				break;
			case WORD_END_FIELD:
				handleEndField();
				break;
			case START_OF_HEADING:
				handleStartOfHeading();
				break;
			default:
				handleOtherControlChar(symbol);
				break;
		}
	}
	return true;
}
// Fetches the next character of the document text into ucs2char.
//
// Before the character is returned, every paragraph style, character info and
// bookmark whose recorded character position equals the current one is
// reported through handleParagraphStyle(), handleFontStyle() and
// handleBookmark() respectively.
//
// Returns false when no more text can be loaded (fillBuffer() failed);
// ucs2char is left untouched in that case.
bool OleStreamReader::getUcs2Char(OleMainStream &stream, ZLUnicodeUtil::Ucs2Char &ucs2char) {
	// A successfully loaded piece may contain no characters at all (e.g. a
	// zero-length piece), so keep loading until there is something to return;
	// a single "if" here would let myBuffer.at() below run past the end.
	while (myCurBufferPosition >= myBuffer.size()) {
		if (!fillBuffer(stream)) {
			return false;
		}
	}

	const OleMainStream::StyleInfoList &styleInfoList = stream.getStyleInfoList();
	while (myNextStyleInfoIndex < styleInfoList.size() &&
				 styleInfoList.at(myNextStyleInfoIndex).first == myCurCharPos) {
		handleParagraphStyle(styleInfoList.at(myNextStyleInfoIndex).second);
		++myNextStyleInfoIndex;
	}

	const OleMainStream::CharInfoList &charInfoList = stream.getCharInfoList();
	while (myNextCharInfoIndex < charInfoList.size() &&
				 charInfoList.at(myNextCharInfoIndex).first == myCurCharPos) {
		handleFontStyle(charInfoList.at(myNextCharInfoIndex).second.fontStyle);
		++myNextCharInfoIndex;
	}

	const OleMainStream::Bookmarks &bookmarksList = stream.getBookmarks();
	while (myNextBookmarkIndex < bookmarksList.size() &&
				 bookmarksList.at(myNextBookmarkIndex).charPos == myCurCharPos) {
		handleBookmark(bookmarksList.at(myNextBookmarkIndex).name);
		++myNextBookmarkIndex;
	}

	ucs2char = myBuffer.at(myCurBufferPosition++);
	++myCurCharPos;
	return true;
}
// Loads the text of the next piece(s) of the document into myBuffer as UCS-2
// characters and resets the buffer position to 0.
//
// Pieces are taken in order starting at myNextPieceNumber. A piece of type
// FOOTNOTE triggers handlePageBreak() before it is loaded; a piece of type
// OTHER stops reading. Pieces that yield no characters (zero length, failed
// seek, nothing read) are skipped, so on success the buffer is never empty.
//
// Returns false when there are no more pieces to read or an OTHER piece is
// reached; true when myBuffer holds at least one character.
bool OleStreamReader::fillBuffer(OleMainStream &stream) {
	const OleMainStream::Pieces &pieces = stream.getPieces();
	myBuffer.clear();
	myCurBufferPosition = 0;

	while (myBuffer.empty()) {
		if (myNextPieceNumber >= pieces.size()) {
			return false; //end of reading
		}
		const OleMainStream::Piece &piece = pieces.at(myNextPieceNumber);
		if (piece.type == OleMainStream::Piece::FOOTNOTE) {
			handlePageBreak();
		} else if (piece.type == OleMainStream::Piece::OTHER) {
			return false;
		}
		++myNextPieceNumber;

		if (piece.length <= 0) {
			continue;
		}

		// std::string owns the raw bytes, so there is no manual new[]/delete[]
		// pairing to get wrong and nothing leaks on an early exit.
		std::string rawText(static_cast<size_t>(piece.length), '\0');
		if (!stream.seek(piece.offset, true)) {
			continue; // piece points outside of the stream
		}
		const size_t bytesRead = stream.read(&rawText[0], rawText.size());
		if (bytesRead < rawText.size()) {
			// decode only what was really read, never stale filler bytes
			rawText.erase(bytesRead);
		}

		if (!piece.isANSI) {
			// two bytes per character; a dangling odd byte at the end is ignored
			for (size_t i = 0; i + 1 < rawText.size(); i += 2) {
				const ZLUnicodeUtil::Ucs2Char ch = OleUtil::getU2Bytes(rawText.data(), static_cast<unsigned int>(i));
				myBuffer.push_back(ch);
			}
		} else {
			if (myConverter.isNull()) {
				//lazy converter loading, because documents can be in Unicode only and don't need to be converted
				ZLEncodingCollection &collection = ZLEncodingCollection::Instance();
				myConverter = collection.converter(myEncoding);
				if (myConverter.isNull()) {
					myConverter = collection.defaultConverter();
				}
			}
			std::string utf8String;
			myConverter->convert(utf8String, rawText);
			ZLUnicodeUtil::utf8ToUcs2(myBuffer, utf8String);
		}
	}
	return true;
}

View file

@ -0,0 +1,99 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __OLESTREAMREADER_H__
#define __OLESTREAMREADER_H__
#include <ZLUnicodeUtil.h>
#include <ZLEncodingConverter.h>
#include "OleMainStream.h"
// Base class for readers of the text of a Word document's main stream.
//
// readStream() walks through the text pieces of an OleMainStream, converts
// them to UCS-2 and reports every character, control symbol, style change and
// bookmark to the subclass through the pure virtual handle*() callbacks.
class OleStreamReader {

public:
	//word's control chars:
	static const ZLUnicodeUtil::Ucs2Char WORD_FOOTNOTE_MARK;
	static const ZLUnicodeUtil::Ucs2Char WORD_TABLE_SEPARATOR;
	static const ZLUnicodeUtil::Ucs2Char WORD_HORIZONTAL_TAB;
	static const ZLUnicodeUtil::Ucs2Char WORD_HARD_LINEBREAK;
	static const ZLUnicodeUtil::Ucs2Char WORD_PAGE_BREAK;
	static const ZLUnicodeUtil::Ucs2Char WORD_END_OF_PARAGRAPH;
	static const ZLUnicodeUtil::Ucs2Char WORD_SHORT_DEFIS;
	static const ZLUnicodeUtil::Ucs2Char WORD_SOFT_HYPHEN;
	static const ZLUnicodeUtil::Ucs2Char WORD_START_FIELD;
	static const ZLUnicodeUtil::Ucs2Char WORD_SEPARATOR_FIELD;
	static const ZLUnicodeUtil::Ucs2Char WORD_END_FIELD;
	static const ZLUnicodeUtil::Ucs2Char WORD_ZERO_WIDTH_UNBREAKABLE_SPACE;

	//unicode values:
	static const ZLUnicodeUtil::Ucs2Char NULL_SYMBOL;
	static const ZLUnicodeUtil::Ucs2Char FILE_SEPARATOR;
	static const ZLUnicodeUtil::Ucs2Char LINE_FEED;
	static const ZLUnicodeUtil::Ucs2Char SOFT_HYPHEN;
	static const ZLUnicodeUtil::Ucs2Char START_OF_HEADING;
	static const ZLUnicodeUtil::Ucs2Char SPACE;
	static const ZLUnicodeUtil::Ucs2Char SHORT_DEFIS;
	static const ZLUnicodeUtil::Ucs2Char VERTICAL_LINE;

public:
	// encoding is the name of the encoding used to convert ANSI text pieces.
	OleStreamReader(const std::string &encoding);
	// The class is meant to be derived from (all callbacks are pure virtual),
	// so destroying an object through a base pointer must be well defined.
	virtual ~OleStreamReader() {}

	// Opens the stream and feeds its whole text to the handle*() callbacks.
	// Returns false if the stream cannot be opened.
	bool readStream(OleMainStream &stream);
	// Resets buffer, piece and character positions to the initial state.
	void clear();

protected:
	// Called for every character >= 32 except WORD_ZERO_WIDTH_UNBREAKABLE_SPACE.
	virtual void handleChar(ZLUnicodeUtil::Ucs2Char ucs2char) = 0;
	// Called for WORD_HARD_LINEBREAK.
	virtual void handleHardLinebreak() = 0;
	// Called for WORD_END_OF_PARAGRAPH and WORD_PAGE_BREAK.
	virtual void handleParagraphEnd() = 0;
	// Called when a text piece of type FOOTNOTE is about to be loaded.
	virtual void handlePageBreak() = 0;
	// Called for a WORD_TABLE_SEPARATOR that is followed by any other character.
	virtual void handleTableSeparator() = 0;
	// Called for two WORD_TABLE_SEPARATOR characters in a row.
	virtual void handleTableEndRow() = 0;
	// Called for WORD_FOOTNOTE_MARK.
	virtual void handleFootNoteMark() = 0;
	// Called for WORD_START_FIELD.
	virtual void handleStartField() = 0;
	// Called for WORD_SEPARATOR_FIELD.
	virtual void handleSeparatorField() = 0;
	// Called for WORD_END_FIELD.
	virtual void handleEndField() = 0;
	// Called for START_OF_HEADING.
	virtual void handleStartOfHeading() = 0;
	// Called for every other character below 32 (except NULL_SYMBOL, which is ignored).
	virtual void handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char) = 0;
	// Called when a character info entry is recorded at the current character position.
	virtual void handleFontStyle(unsigned int fontStyle) = 0;
	// Called when a paragraph style entry is recorded at the current character position.
	virtual void handleParagraphStyle(const OleMainStream::Style &styleInfo) = 0;
	// Called when a bookmark is recorded at the current character position.
	virtual void handleBookmark(const std::string &name) = 0;

private:
	// Returns the next character of the text; false when the text is exhausted.
	bool getUcs2Char(OleMainStream &stream, ZLUnicodeUtil::Ucs2Char &ucs2char);
	// Loads the next text piece into myBuffer; false when there is nothing left to load.
	bool fillBuffer(OleMainStream &stream);

private:
	ZLUnicodeUtil::Ucs2String myBuffer; //characters of the piece loaded last
	size_t myCurBufferPosition; //index of the next unread character in myBuffer
	size_t myNextPieceNumber; //index of the next piece to load

	shared_ptr<ZLEncodingConverter> myConverter; //created lazily, used for ANSI pieces only
	const std::string myEncoding;

	unsigned int myCurCharPos; //number of characters returned so far

	size_t myNextStyleInfoIndex; //next unreported entry of the style info list
	size_t myNextCharInfoIndex; //next unreported entry of the char info list
	size_t myNextBookmarkIndex; //next unreported bookmark
};
#endif /* __OLESTREAMREADER_H__ */

View file

@ -0,0 +1,58 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#include "OleUtil.h"
// Reads four bytes starting at buffer[offset] as a little-endian 32-bit value
// (the byte at offset is the least significant one) and returns it as a
// signed int. The caller must make sure that offset + 3 is inside the buffer.
int OleUtil::get4Bytes(const char *buffer, unsigned int offset) {
	const unsigned char *buf = (const unsigned char*)buffer;
	// Assemble the value in unsigned arithmetic: shifting a byte >= 0x80 left
	// by 24 as a signed int would overflow into the sign bit.
	const unsigned int value =
			(unsigned int)buf[offset]
		| ((unsigned int)buf[offset+1] << 8)
		| ((unsigned int)buf[offset+2] << 16)
		| ((unsigned int)buf[offset+3] << 24);
	return (int)value;
}
// Reads four bytes starting at buffer[offset] as an unsigned little-endian
// 32-bit value (the byte at offset is the least significant one).
unsigned int OleUtil::getU4Bytes(const char *buffer, unsigned int offset) {
	const unsigned char *bytes = (const unsigned char*)buffer + offset;
	unsigned int value = 0;
	// walk from the most significant byte down to the least significant one
	for (int i = 3; i >= 0; --i) {
		value = (value << 8) | (unsigned int)bytes[i];
	}
	return value;
}
// Reads two bytes starting at buffer[offset] as an unsigned little-endian
// 16-bit value (the byte at offset is the low byte).
unsigned int OleUtil::getU2Bytes(const char *buffer, unsigned int offset) {
	const unsigned char *bytes = (const unsigned char*)buffer + offset;
	const unsigned int low = bytes[0];
	const unsigned int high = bytes[1];
	return (high << 8) | low;
}
// Returns the byte at buffer[offset] as an unsigned value in the range 0..255.
unsigned int OleUtil::getU1Byte(const char *buffer, unsigned int offset) {
	const unsigned char byte = (unsigned char)buffer[offset];
	return byte;
}
// Returns the byte at buffer[offset] converted to int.
// The byte is read through an unsigned char pointer, so the result is always
// in the range 0..255 - the same value getU1Byte() yields.
// NOTE(review): the name suggests a signed read (-128..127); confirm against
// the callers whether the unsigned interpretation is intended.
int OleUtil::get1Byte(const char *buffer, unsigned int offset) {
const unsigned char *buf = (const unsigned char*)buffer;
return (int)buf[offset];
}

View file

@ -0,0 +1,32 @@
/*
* Copyright (C) 2009-2010 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
#ifndef __OLEUTIL_H__
#define __OLEUTIL_H__
// Static helpers that decode little-endian integers from a raw byte buffer.
// In every function offset is the index of the first byte to read; the
// functions do no bounds checking themselves.
class OleUtil {
public:
// Four bytes at offset as a signed 32-bit value.
static int get4Bytes(const char *buffer, unsigned int offset);
// Four bytes at offset as an unsigned 32-bit value.
static unsigned int getU4Bytes(const char *buffer, unsigned int offset);
// Two bytes at offset as an unsigned 16-bit value.
static unsigned int getU2Bytes(const char *buffer, unsigned int offset);
// One byte at offset as an unsigned value (0..255).
static unsigned int getU1Byte(const char *buffer, unsigned int offset);
// One byte at offset returned as int; the implementation reads it as unsigned char, so the value is 0..255.
static int get1Byte(const char *buffer, unsigned int offset);
};
#endif /* __OLEUTIL_H__ */

View file

@ -39,6 +39,7 @@ public class FileTypeCollection {
addType(new SimpleFileType("PDF", "pdf", MimeType.TYPES_PDF));
addType(new FileTypeDjVu());
addType(new SimpleFileType("ZIP archive", "zip", Collections.singletonList(MimeType.APP_ZIP)));
addType(new SimpleFileType("DOC", "doc", MimeType.TYPES_DOC));
}
private void addType(FileType type) {