From 78e889d9842d4bc99e2cd3e8b7ed736c5537e1f4 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 23 Jan 2013 12:04:02 +0100 Subject: [PATCH] use the "charset" extended attribute for text files if it is set --- src/internfile/mh_text.cpp | 15 ++++++++++++--- src/internfile/mh_text.h | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp index 03a2461b..5f5fd023 100644 --- a/src/internfile/mh_text.cpp +++ b/src/internfile/mh_text.cpp @@ -33,6 +33,7 @@ using namespace std; #include "readfile.h" #include "md5.h" #include "rclconfig.h" +#include "pxattr.h" const int MB = 1024*1024; const int KB = 1024; @@ -53,6 +54,10 @@ bool MimeHandlerText::set_document_file(const string &fn) return false; } + // Check for charset defined in extended attribute as per: + // http://freedesktop.org/wiki/CommonExtendedAttributes + pxattr::get(m_fn, "charset", &m_charsetfromxattr); + // Max file size parameter: texts over this size are not indexed int maxmbs = 20; m_config->getConfParam("textfilemaxmbs", &maxmbs); @@ -115,14 +120,18 @@ bool MimeHandlerText::next_document() if (m_havedoc == false) return false; - // We transcode even if defcharset is supposedly already utf-8: - // this validates the encoding. - m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; + if (m_charsetfromxattr.empty()) + m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; + else + m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr; + m_metaData[cstr_dj_keymt] = cstr_textplain; size_t srclen = m_text.length(); m_metaData[cstr_dj_keycontent].swap(m_text); + // We transcode even if defcharset is supposedly already utf-8: + // this validates the encoding. // txtdcode() truncates the text if transcoding fails (void)txtdcode("mh_text"); diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h index cb97aef5..800822fe 100644 --- a/src/internfile/mh_text.h +++ b/src/internfile/mh_text.h @@ -56,7 +56,8 @@ private: string m_fn; off_t m_offs; // Offset of next read in file if we're paging size_t m_pagesz; - + string m_charsetfromxattr; + bool readnext(); };