From 6670b36bb7f144216bf2e09e6103626c36a168f1 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Mon, 5 Dec 2016 15:19:02 +0100
Subject: [PATCH] added "all in one" rclepub1 filter (no individual indexing of chapters)

---
 src/Makefile.am      |   1 +
 src/filters/rclepub1 | 124 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100755 src/filters/rclepub1

diff --git a/src/Makefile.am b/src/Makefile.am
index dad39bc3..a2c966fe 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -581,6 +581,7 @@ filters/rcldjvu.py \
 filters/rcldoc.py \
 filters/rcldvi \
 filters/rclepub \
+filters/rclepub1 \
 filters/rclexec1.py \
 filters/rclexecm.py \
 filters/rclfb2 \
diff --git a/src/filters/rclepub1 b/src/filters/rclepub1
new file mode 100755
index 00000000..22922652
--- /dev/null
+++ b/src/filters/rclepub1
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+"""Extract HTML content from an EPUB file (.epub), concatenating all sections."""
+from __future__ import print_function
+
+import sys
+import os
+import re
+
+import rclexecm
+
+try:
+    import epub
+except Exception:
+    # Report the missing helper module on stdout, then give up.
+    print("RECFILTERROR HELPERNOTFOUND python:epub")
+    sys.exit(1)
+
+# XML declarations / processing instructions. Non-greedy so that two
+# instructions on the same line do not swallow the text between them.
+_XML_PI_RE = re.compile(br'<\?.*?\?>')
+# Everything from the opening <html> tag through the opening <body> tag.
+_DOC_START_RE = re.compile(br'<html.*<body[^>]*>', re.IGNORECASE | re.DOTALL)
+# Closing </body> and </html> tags.
+_DOC_END_RE = re.compile(br'</(?:body|html)>', re.IGNORECASE)
+
+
+class rclEPUB:
+    """RclExecM slave worker for extracting all text from an EPUB file.
+
+    This version concatenates all sections into one HTML document.
+    """
+
+    def __init__(self, em):
+        """Keep the protocol handler `em` and declare text/html output."""
+        self.em = em
+        self.em.setmimetype("text/html")
+        # Number of documents already returned for the current file.
+        self.currentindex = 0
+
+    def _header(self):
+        """Return the UTF-8 encoded HTML head built from the OPF metadata.
+
+        The result ends with the opening <body> tag.
+        """
+        meta = self.book.opf.metadata
+        title = " ".join(tt for tt, lang in meta.titles)
+        author = " ".join(name for name, role, fileas in meta.creators)
+        # NOTE(review): the attribute values below rely on em.htmlescape()
+        # also escaping double quotes -- confirm against rclexecm.
+        parts = ["<html>\n<head>\n"]
+        if title:
+            parts.append("<title>" + self.em.htmlescape(title) + "</title>\n")
+        if author:
+            parts.append('<meta name="author" content="' +
+                         self.em.htmlescape(author).strip() + '">\n')
+        if meta.description:
+            parts.append('<meta name="description" content="' +
+                         self.em.htmlescape(meta.description) + '">\n')
+        parts.append("</head><body>")
+        return "".join(parts).encode('UTF-8')
+
+    def extractone(self, params):
+        """Extract EPUB data as concatenated HTML; `params` is not used.
+
+        Returns a 4-tuple: success flag, document bytes, an empty string
+        and rclexecm.RclExecM.eofnext.
+        """
+        opf = self.book.opf
+        if opf.spine:
+            # Use the item order listed in the spine.
+            ids = [idref for idref, linear in opf.spine.itemrefs]
+        else:
+            # No spine: fall back to every manifest entry.
+            ids = [ident for ident, item in opf.manifest.items()]
+
+        chunks = [self._header()]
+        for ident in ids:
+            item = self.book.get_item(ident)
+            if item is None or item.media_type != 'application/xhtml+xml':
+                continue
+            doc = self.book.read_item(item)
+            # Reduce each section to its body content so that the result
+            # is a single document.
+            doc = _XML_PI_RE.sub(b'', doc)
+            doc = _DOC_START_RE.sub(b'', doc, count=1)
+            doc = _DOC_END_RE.sub(b'', doc)
+            chunks.append(doc)
+        chunks.append(b'</body></html>')
+
+        return (True, b''.join(chunks), "", rclexecm.RclExecM.eofnext)
+
+    def openfile(self, params):
+        """Open the EPUB file named by params["filename:"].
+
+        Returns True on success, False (after logging) on failure.
+        """
+        self.currentindex = 0
+        if "filename:" not in params:
+            self.em.rclog("openfile: no file name")
+            return False
+
+        try:
+            self.book = epub.open_epub(params["filename:"].decode('UTF-8'))
+        except Exception as err:
+            self.em.rclog("openfile: epub.open failed: [%s]" % err)
+            return False
+        return True
+
+    def getipath(self, params):
+        """Return the whole concatenated document, whatever is requested."""
+        return self.extractone(params)
+
+    def getnext(self, params):
+        """Return the concatenated document once, then report end of file."""
+        if self.currentindex >= 1:
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        ret = self.extractone(params)
+        self.currentindex += 1
+        return ret
+
+
+proto = rclexecm.RclExecM()
+extract = rclEPUB(proto)
+rclexecm.main(proto, extract)