1
0
Fork 0
mirror of https://github.com/geometer/FBReaderJ.git synced 2025-10-03 09:49:19 +02:00

doc (ms word) files encoding auto detection

This commit is contained in:
Nikolay Pultsin 2012-10-26 09:47:34 +04:00
parent e2e72cc046
commit 7f30c5fa73
13 changed files with 298 additions and 72 deletions

View file

@ -17,7 +17,6 @@
* 02110-1301, USA.
*/
#include <cctype>
#include <cstring>
@ -37,7 +36,7 @@ const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_HORIZONTAL_TAB = 0x0009;
// UCS-2 codes of the WORD_-prefixed special characters. Every value in this
// run lies in the ASCII control range 0x00-0x1f.
// NOTE(review): presumably these are the marker characters found in Word
// text streams -- confirm against the code that consumes them.
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_HARD_LINEBREAK = 0x000b; // same code as ASCII vertical tab
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_PAGE_BREAK = 0x000c; // same code as ASCII form feed
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_END_OF_PARAGRAPH = 0x000d; // same code as ASCII carriage return
// NOTE(review): WORD_SHORT_DEFIS and WORD_MINUS carry the same value (0x001e);
// this looks like the old and the new name of a single constant -- confirm
// that only one of the two definitions is meant to remain.
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_SHORT_DEFIS = 0x001e;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_MINUS = 0x001e;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_SOFT_HYPHEN = 0x001f;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_START_FIELD = 0x0013;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::WORD_SEPARATOR_FIELD = 0x0014;
@ -52,7 +51,7 @@ const ZLUnicodeUtil::Ucs2Char OleStreamReader::FILE_SEPARATOR = 0x1c;
// UCS-2 codes of ordinary (non WORD_-prefixed) characters.
// NOTE(review): presumably these are the output characters that the WORD_*
// markers get mapped to -- confirm against the code that uses them.
const ZLUnicodeUtil::Ucs2Char OleStreamReader::LINE_FEED = 0x000a; // ASCII LF
const ZLUnicodeUtil::Ucs2Char OleStreamReader::SOFT_HYPHEN = 0xad; // U+00AD SOFT HYPHEN
const ZLUnicodeUtil::Ucs2Char OleStreamReader::SPACE = 0x20; // ASCII space
// NOTE(review): SHORT_DEFIS and MINUS carry the same value (0x2D, ASCII
// hyphen-minus '-'); this looks like the old and the new name of a single
// constant -- confirm that only one of the two definitions is meant to remain.
const ZLUnicodeUtil::Ucs2Char OleStreamReader::SHORT_DEFIS = 0x2D;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::MINUS = 0x2D;
const ZLUnicodeUtil::Ucs2Char OleStreamReader::VERTICAL_LINE = 0x7C; // ASCII '|'
OleStreamReader::OleStreamReader(const std::string &encoding) :
@ -73,6 +72,26 @@ void OleStreamReader::clear() {
myNextFloatImageInfoIndex = 0;
}
// Opens inputStream as an OLE storage, looks up the entry named
// "WordDocument" in it and passes the corresponding main stream to
// readStream().
//
// Returns false when the storage cannot be initialised (a message is sent
// to ZLLogger in that case) or when the storage has no "WordDocument"
// entry; otherwise returns the result of readStream().
bool OleStreamReader::readDocument(shared_ptr<ZLInputStream> inputStream) {
	static const std::string wordDocumentName = "WordDocument";

	shared_ptr<OleStorage> oleStorage = new OleStorage;
	if (!oleStorage->init(inputStream, inputStream->sizeOfOpened())) {
		ZLLogger::Instance().println("DocBookReader", "Broken OLE file!");
		return false;
	}

	// The entry is filled in by getEntryByName() on success.
	OleEntry entry;
	if (!oleStorage->getEntryByName(wordDocumentName, entry)) {
		return false;
	}

	OleMainStream mainStream(oleStorage, entry, inputStream);
	return readStream(mainStream);
}
bool OleStreamReader::readStream(OleMainStream &oleMainStream) {
clear();
@ -252,29 +271,19 @@ bool OleStreamReader::fillBuffer(OleMainStream &stream) {
return false;
}
char *textBuffer = new char[piece.Length];
size_t readedBytes = stream.read(textBuffer, piece.Length);
if (readedBytes != (unsigned int)piece.Length) {
ZLLogger::Instance().println("OleStreamReader", "not all bytes has been readed from piece");
size_t readBytes = stream.read(textBuffer, piece.Length);
if (readBytes != (size_t)piece.Length) {
ZLLogger::Instance().println("OleStreamReader", "not all bytes have been read from piece");
}
myBuffer.clear();
if (!piece.IsANSI) {
for (unsigned int i = 0; i < readedBytes; i += 2) {
for (size_t i = 0; i < readBytes; i += 2) {
ZLUnicodeUtil::Ucs2Char ch = OleUtil::getU2Bytes(textBuffer, i);
myBuffer.push_back(ch);
}
} else {
if (myConverter.isNull()) {
//lazy convertor loading, because documents can be in Unicode only and don't need to be converted
ZLEncodingCollection &collection = ZLEncodingCollection::Instance();
myConverter = collection.converter(myEncoding);
if (myConverter.isNull()) {
myConverter = collection.defaultConverter();
}
}
std::string utf8String;
myConverter->convert(utf8String, std::string(textBuffer, readedBytes));
ZLUnicodeUtil::utf8ToUcs2(myBuffer, utf8String);
dataHandler(textBuffer, readBytes);
}
myCurBufferPosition = 0;
++myNextPieceNumber;
@ -282,3 +291,17 @@ bool OleStreamReader::fillBuffer(OleMainStream &stream) {
return true;
}
void OleStreamReader::dataHandler(const char *buffer, size_t len) {
if (myConverter.isNull()) {
// lazy converter initialization
const ZLEncodingCollection &collection = ZLEncodingCollection::Instance();
myConverter = collection.converter(myEncoding);
if (myConverter.isNull()) {
myConverter = collection.defaultConverter();
}
}
std::string utf8String;
myConverter->convert(utf8String, buffer, buffer + len);
ZLUnicodeUtil::utf8ToUcs2(myBuffer, utf8String);
}