mirror of
https://github.com/geometer/FBReaderJ.git
synced 2025-10-04 18:29:23 +02:00
Language detection for RTF files
This commit is contained in:
parent
eddccf34ec
commit
7aa4f39bc7
4 changed files with 27 additions and 22 deletions
|
@ -73,7 +73,7 @@ void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) {
|
||||||
const size_t size = stream.read(buffer, BUFSIZE);
|
const size_t size = stream.read(buffer, BUFSIZE);
|
||||||
stream.close();
|
stream.close();
|
||||||
shared_ptr<ZLLanguageDetector::LanguageInfo> info =
|
shared_ptr<ZLLanguageDetector::LanguageInfo> info =
|
||||||
ZLLanguageDetector().findInfo(buffer, size);
|
ZLLanguageDetector().findInfoForEncoding(book.encoding(), buffer, size, -20000);
|
||||||
delete[] buffer;
|
delete[] buffer;
|
||||||
if (!info.isNull()) {
|
if (!info.isNull()) {
|
||||||
if (!info->Language.empty()) {
|
if (!info->Language.empty()) {
|
||||||
|
|
|
@ -43,12 +43,10 @@ bool RtfPlugin::readMetaInfo(Book &book) const {
|
||||||
|
|
||||||
if (book.encoding().empty()) {
|
if (book.encoding().empty()) {
|
||||||
book.setEncoding("utf-8");
|
book.setEncoding("utf-8");
|
||||||
}
|
} else if (book.language().empty()) {
|
||||||
|
|
||||||
if (book.language().empty()) {
|
|
||||||
shared_ptr<ZLInputStream> stream = new RtfReaderStream(book.file(), 50000);
|
shared_ptr<ZLInputStream> stream = new RtfReaderStream(book.file(), 50000);
|
||||||
if (!stream.isNull()) {
|
if (!stream.isNull()) {
|
||||||
//detectLanguage(book, *stream);
|
detectLanguage(book, *stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -89,8 +89,6 @@ static std::string naiveEncodingDetection(const unsigned char *buffer, size_t le
|
||||||
}
|
}
|
||||||
|
|
||||||
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
|
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
|
||||||
shared_ptr<LanguageInfo> info;
|
|
||||||
std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;
|
|
||||||
std::string naive;
|
std::string naive;
|
||||||
if ((unsigned char)buffer[0] == 0xFE &&
|
if ((unsigned char)buffer[0] == 0xFE &&
|
||||||
(unsigned char)buffer[1] == 0xFF) {
|
(unsigned char)buffer[1] == 0xFF) {
|
||||||
|
@ -101,8 +99,17 @@ shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const
|
||||||
} else {
|
} else {
|
||||||
naive = naiveEncodingDetection((const unsigned char*)buffer, length);
|
naive = naiveEncodingDetection((const unsigned char*)buffer, length);
|
||||||
}
|
}
|
||||||
|
return findInfoForEncoding(naive, buffer, length, matchingCriterion);
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion) {
|
||||||
|
shared_ptr<LanguageInfo> info;
|
||||||
|
std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;
|
||||||
for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
|
for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
|
||||||
if (naive.empty() || (*it)->info()->Encoding == naive) {
|
if (!encoding.empty() && (*it)->info()->Encoding != encoding) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
const int charSequenceLength = (*it)->charSequenceLength();
|
const int charSequenceLength = (*it)->charSequenceLength();
|
||||||
shared_ptr<ZLMapBasedStatistics> stat = statisticsMap[charSequenceLength];
|
shared_ptr<ZLMapBasedStatistics> stat = statisticsMap[charSequenceLength];
|
||||||
if (stat.isNull()) {
|
if (stat.isNull()) {
|
||||||
|
@ -118,6 +125,5 @@ shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const
|
||||||
matchingCriterion = criterion;
|
matchingCriterion = criterion;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return info;
|
return info;
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,6 +41,7 @@ public:
|
||||||
~ZLLanguageDetector();
|
~ZLLanguageDetector();
|
||||||
|
|
||||||
shared_ptr<LanguageInfo> findInfo(const char *buffer, size_t length, int matchingCriterion = 0);
|
shared_ptr<LanguageInfo> findInfo(const char *buffer, size_t length, int matchingCriterion = 0);
|
||||||
|
shared_ptr<LanguageInfo> findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion = 0);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
typedef std::vector<shared_ptr<ZLStatisticsBasedMatcher> > SBVector;
|
typedef std::vector<shared_ptr<ZLStatisticsBasedMatcher> > SBVector;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue