1
0
Fork 0
mirror of https://github.com/geometer/FBReaderJ.git synced 2025-10-04 10:19:33 +02:00

Language detection for RTF files

This commit is contained in:
Nikolay Pultsin 2012-03-23 03:19:16 +00:00
parent eddccf34ec
commit 7aa4f39bc7
4 changed files with 27 additions and 22 deletions

View file

@ -73,7 +73,7 @@ void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) {
const size_t size = stream.read(buffer, BUFSIZE); const size_t size = stream.read(buffer, BUFSIZE);
stream.close(); stream.close();
shared_ptr<ZLLanguageDetector::LanguageInfo> info = shared_ptr<ZLLanguageDetector::LanguageInfo> info =
ZLLanguageDetector().findInfo(buffer, size); ZLLanguageDetector().findInfoForEncoding(book.encoding(), buffer, size, -20000);
delete[] buffer; delete[] buffer;
if (!info.isNull()) { if (!info.isNull()) {
if (!info->Language.empty()) { if (!info->Language.empty()) {

View file

@ -43,12 +43,10 @@ bool RtfPlugin::readMetaInfo(Book &book) const {
if (book.encoding().empty()) { if (book.encoding().empty()) {
book.setEncoding("utf-8"); book.setEncoding("utf-8");
} } else if (book.language().empty()) {
if (book.language().empty()) {
shared_ptr<ZLInputStream> stream = new RtfReaderStream(book.file(), 50000); shared_ptr<ZLInputStream> stream = new RtfReaderStream(book.file(), 50000);
if (!stream.isNull()) { if (!stream.isNull()) {
//detectLanguage(book, *stream); detectLanguage(book, *stream);
} }
} }

View file

@ -89,8 +89,6 @@ static std::string naiveEncodingDetection(const unsigned char *buffer, size_t le
} }
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) { shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
shared_ptr<LanguageInfo> info;
std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;
std::string naive; std::string naive;
if ((unsigned char)buffer[0] == 0xFE && if ((unsigned char)buffer[0] == 0xFE &&
(unsigned char)buffer[1] == 0xFF) { (unsigned char)buffer[1] == 0xFF) {
@ -101,22 +99,30 @@ shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const
} else { } else {
naive = naiveEncodingDetection((const unsigned char*)buffer, length); naive = naiveEncodingDetection((const unsigned char*)buffer, length);
} }
return findInfoForEncoding(naive, buffer, length, matchingCriterion);
}
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion) {
shared_ptr<LanguageInfo> info;
std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;
for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) { for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
if (naive.empty() || (*it)->info()->Encoding == naive) { if (!encoding.empty() && (*it)->info()->Encoding != encoding) {
const int charSequenceLength = (*it)->charSequenceLength(); continue;
shared_ptr<ZLMapBasedStatistics> stat = statisticsMap[charSequenceLength]; }
if (stat.isNull()) {
stat = new ZLMapBasedStatistics(); const int charSequenceLength = (*it)->charSequenceLength();
ZLStatisticsGenerator("\r\n ").generate( shared_ptr<ZLMapBasedStatistics> stat = statisticsMap[charSequenceLength];
buffer, length, charSequenceLength, *stat if (stat.isNull()) {
); stat = new ZLMapBasedStatistics();
statisticsMap[charSequenceLength] = stat; ZLStatisticsGenerator("\r\n ").generate(
} buffer, length, charSequenceLength, *stat
const int criterion = (*it)->criterion(*stat); );
if (criterion > matchingCriterion) { statisticsMap[charSequenceLength] = stat;
info = (*it)->info(); }
matchingCriterion = criterion; const int criterion = (*it)->criterion(*stat);
} if (criterion > matchingCriterion) {
info = (*it)->info();
matchingCriterion = criterion;
} }
} }
return info; return info;

View file

@ -41,6 +41,7 @@ public:
~ZLLanguageDetector(); ~ZLLanguageDetector();
shared_ptr<LanguageInfo> findInfo(const char *buffer, size_t length, int matchingCriterion = 0); shared_ptr<LanguageInfo> findInfo(const char *buffer, size_t length, int matchingCriterion = 0);
shared_ptr<LanguageInfo> findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion = 0);
private: private:
typedef std::vector<shared_ptr<ZLStatisticsBasedMatcher> > SBVector; typedef std::vector<shared_ptr<ZLStatisticsBasedMatcher> > SBVector;