Use the "charset" extended attribute, when it is set, as the input character set for text files instead of the default input charset

This commit is contained in:
Jean-Francois Dockes 2013-01-23 12:04:02 +01:00
parent 17ecc9d7bb
commit 78e889d984
2 changed files with 14 additions and 4 deletions

View file

@ -33,6 +33,7 @@ using namespace std;
#include "readfile.h" #include "readfile.h"
#include "md5.h" #include "md5.h"
#include "rclconfig.h" #include "rclconfig.h"
#include "pxattr.h"
const int MB = 1024*1024; const int MB = 1024*1024;
const int KB = 1024; const int KB = 1024;
@ -53,6 +54,10 @@ bool MimeHandlerText::set_document_file(const string &fn)
return false; return false;
} }
// Check for charset defined in extended attribute as per:
// http://freedesktop.org/wiki/CommonExtendedAttributes
pxattr::get(m_fn, "charset", &m_charsetfromxattr);
// Max file size parameter: texts over this size are not indexed // Max file size parameter: texts over this size are not indexed
int maxmbs = 20; int maxmbs = 20;
m_config->getConfParam("textfilemaxmbs", &maxmbs); m_config->getConfParam("textfilemaxmbs", &maxmbs);
@ -115,14 +120,18 @@ bool MimeHandlerText::next_document()
if (m_havedoc == false) if (m_havedoc == false)
return false; return false;
// We transcode even if defcharset is supposedly already utf-8: if (m_charsetfromxattr.empty())
// this validates the encoding. m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; else
m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr;
m_metaData[cstr_dj_keymt] = cstr_textplain; m_metaData[cstr_dj_keymt] = cstr_textplain;
size_t srclen = m_text.length(); size_t srclen = m_text.length();
m_metaData[cstr_dj_keycontent].swap(m_text); m_metaData[cstr_dj_keycontent].swap(m_text);
// We transcode even if defcharset is supposedly already utf-8:
// this validates the encoding.
// txtdcode() truncates the text if transcoding fails // txtdcode() truncates the text if transcoding fails
(void)txtdcode("mh_text"); (void)txtdcode("mh_text");

View file

@ -56,7 +56,8 @@ private:
string m_fn; string m_fn;
off_t m_offs; // Offset of next read in file if we're paging off_t m_offs; // Offset of next read in file if we're paging
size_t m_pagesz; size_t m_pagesz;
string m_charsetfromxattr;
bool readnext(); bool readnext();
}; };