mirror of
https://github.com/geometer/FBReaderJ.git
synced 2025-10-04 10:19:33 +02:00
language detection for RTF's
This commit is contained in:
parent
eddccf34ec
commit
7aa4f39bc7
4 changed files with 27 additions and 22 deletions
|
@ -73,7 +73,7 @@ void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) {
|
|||
const size_t size = stream.read(buffer, BUFSIZE);
|
||||
stream.close();
|
||||
shared_ptr<ZLLanguageDetector::LanguageInfo> info =
|
||||
ZLLanguageDetector().findInfo(buffer, size);
|
||||
ZLLanguageDetector().findInfoForEncoding(book.encoding(), buffer, size, -20000);
|
||||
delete[] buffer;
|
||||
if (!info.isNull()) {
|
||||
if (!info->Language.empty()) {
|
||||
|
|
|
@ -43,12 +43,10 @@ bool RtfPlugin::readMetaInfo(Book &book) const {
|
|||
|
||||
if (book.encoding().empty()) {
|
||||
book.setEncoding("utf-8");
|
||||
}
|
||||
|
||||
if (book.language().empty()) {
|
||||
} else if (book.language().empty()) {
|
||||
shared_ptr<ZLInputStream> stream = new RtfReaderStream(book.file(), 50000);
|
||||
if (!stream.isNull()) {
|
||||
//detectLanguage(book, *stream);
|
||||
detectLanguage(book, *stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -89,8 +89,6 @@ static std::string naiveEncodingDetection(const unsigned char *buffer, size_t le
|
|||
}
|
||||
|
||||
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
|
||||
shared_ptr<LanguageInfo> info;
|
||||
std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;
|
||||
std::string naive;
|
||||
if ((unsigned char)buffer[0] == 0xFE &&
|
||||
(unsigned char)buffer[1] == 0xFF) {
|
||||
|
@ -101,22 +99,30 @@ shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const
|
|||
} else {
|
||||
naive = naiveEncodingDetection((const unsigned char*)buffer, length);
|
||||
}
|
||||
return findInfoForEncoding(naive, buffer, length, matchingCriterion);
|
||||
}
|
||||
|
||||
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion) {
|
||||
shared_ptr<LanguageInfo> info;
|
||||
std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;
|
||||
for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
|
||||
if (naive.empty() || (*it)->info()->Encoding == naive) {
|
||||
const int charSequenceLength = (*it)->charSequenceLength();
|
||||
shared_ptr<ZLMapBasedStatistics> stat = statisticsMap[charSequenceLength];
|
||||
if (stat.isNull()) {
|
||||
stat = new ZLMapBasedStatistics();
|
||||
ZLStatisticsGenerator("\r\n ").generate(
|
||||
buffer, length, charSequenceLength, *stat
|
||||
);
|
||||
statisticsMap[charSequenceLength] = stat;
|
||||
}
|
||||
const int criterion = (*it)->criterion(*stat);
|
||||
if (criterion > matchingCriterion) {
|
||||
info = (*it)->info();
|
||||
matchingCriterion = criterion;
|
||||
}
|
||||
if (!encoding.empty() && (*it)->info()->Encoding != encoding) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int charSequenceLength = (*it)->charSequenceLength();
|
||||
shared_ptr<ZLMapBasedStatistics> stat = statisticsMap[charSequenceLength];
|
||||
if (stat.isNull()) {
|
||||
stat = new ZLMapBasedStatistics();
|
||||
ZLStatisticsGenerator("\r\n ").generate(
|
||||
buffer, length, charSequenceLength, *stat
|
||||
);
|
||||
statisticsMap[charSequenceLength] = stat;
|
||||
}
|
||||
const int criterion = (*it)->criterion(*stat);
|
||||
if (criterion > matchingCriterion) {
|
||||
info = (*it)->info();
|
||||
matchingCriterion = criterion;
|
||||
}
|
||||
}
|
||||
return info;
|
||||
|
|
|
@ -41,6 +41,7 @@ public:
|
|||
~ZLLanguageDetector();
|
||||
|
||||
shared_ptr<LanguageInfo> findInfo(const char *buffer, size_t length, int matchingCriterion = 0);
|
||||
shared_ptr<LanguageInfo> findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion = 0);
|
||||
|
||||
private:
|
||||
typedef std::vector<shared_ptr<ZLStatisticsBasedMatcher> > SBVector;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue