From 7aa4f39bc765c49ef40c53164f6fc91d6c377f5f Mon Sep 17 00:00:00 2001 From: Nikolay Pultsin Date: Fri, 23 Mar 2012 03:19:16 +0000 Subject: [PATCH] language detection for RTF's --- .../fbreader/src/formats/FormatPlugin.cpp | 2 +- .../fbreader/src/formats/rtf/RtfPlugin.cpp | 6 +-- .../core/src/language/ZLLanguageDetector.cpp | 40 +++++++++++-------- .../core/src/language/ZLLanguageDetector.h | 1 + 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/jni/NativeFormats/fbreader/src/formats/FormatPlugin.cpp b/jni/NativeFormats/fbreader/src/formats/FormatPlugin.cpp index ca971cc87..05ad133a5 100644 --- a/jni/NativeFormats/fbreader/src/formats/FormatPlugin.cpp +++ b/jni/NativeFormats/fbreader/src/formats/FormatPlugin.cpp @@ -73,7 +73,7 @@ void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) { const size_t size = stream.read(buffer, BUFSIZE); stream.close(); shared_ptr info = - ZLLanguageDetector().findInfo(buffer, size); + ZLLanguageDetector().findInfoForEncoding(book.encoding(), buffer, size, -20000); delete[] buffer; if (!info.isNull()) { if (!info->Language.empty()) { diff --git a/jni/NativeFormats/fbreader/src/formats/rtf/RtfPlugin.cpp b/jni/NativeFormats/fbreader/src/formats/rtf/RtfPlugin.cpp index 5c3c96115..c42cc9a05 100644 --- a/jni/NativeFormats/fbreader/src/formats/rtf/RtfPlugin.cpp +++ b/jni/NativeFormats/fbreader/src/formats/rtf/RtfPlugin.cpp @@ -43,12 +43,10 @@ bool RtfPlugin::readMetaInfo(Book &book) const { if (book.encoding().empty()) { book.setEncoding("utf-8"); - } - - if (book.language().empty()) { + } else if (book.language().empty()) { shared_ptr stream = new RtfReaderStream(book.file(), 50000); if (!stream.isNull()) { - //detectLanguage(book, *stream); + detectLanguage(book, *stream); } } diff --git a/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.cpp b/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.cpp index 138bd02ec..1c3546483 100644 --- a/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.cpp +++ b/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.cpp @@ -89,8 +89,6 @@ static std::string naiveEncodingDetection(const unsigned char *buffer, size_t le } shared_ptr ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) { - shared_ptr info; - std::map > statisticsMap; std::string naive; if ((unsigned char)buffer[0] == 0xFE && (unsigned char)buffer[1] == 0xFF) { @@ -101,22 +99,30 @@ shared_ptr ZLLanguageDetector::findInfo(const } else { naive = naiveEncodingDetection((const unsigned char*)buffer, length); } + return findInfoForEncoding(naive, buffer, length, matchingCriterion); +} + +shared_ptr ZLLanguageDetector::findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion) { + shared_ptr info; + std::map > statisticsMap; for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) { - if (naive.empty() || (*it)->info()->Encoding == naive) { - const int charSequenceLength = (*it)->charSequenceLength(); - shared_ptr stat = statisticsMap[charSequenceLength]; - if (stat.isNull()) { - stat = new ZLMapBasedStatistics(); - ZLStatisticsGenerator("\r\n ").generate( - buffer, length, charSequenceLength, *stat - ); - statisticsMap[charSequenceLength] = stat; - } - const int criterion = (*it)->criterion(*stat); - if (criterion > matchingCriterion) { - info = (*it)->info(); - matchingCriterion = criterion; - } + if (!encoding.empty() && (*it)->info()->Encoding != encoding) { + continue; + } + + const int charSequenceLength = (*it)->charSequenceLength(); + shared_ptr stat = statisticsMap[charSequenceLength]; + if (stat.isNull()) { + stat = new ZLMapBasedStatistics(); + ZLStatisticsGenerator("\r\n ").generate( + buffer, length, charSequenceLength, *stat + ); + statisticsMap[charSequenceLength] = stat; + } + const int criterion = (*it)->criterion(*stat); + if (criterion > matchingCriterion) { + info = (*it)->info(); + matchingCriterion = criterion; } } return info; diff --git a/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.h b/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.h index f70798be3..cef024d4f 100644 --- a/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.h +++ b/jni/NativeFormats/zlibrary/core/src/language/ZLLanguageDetector.h @@ -41,6 +41,7 @@ public: ~ZLLanguageDetector(); shared_ptr findInfo(const char *buffer, size_t length, int matchingCriterion = 0); + shared_ptr findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion = 0); private: typedef std::vector > SBVector;