diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py index f6dcf334..8fb7a31a 100755 --- a/src/filters/rcldoc.py +++ b/src/filters/rcldoc.py @@ -140,7 +140,13 @@ class WordFilter: return ([],None) if __name__ == '__main__': + # Remember where we execute filters from, in case we need to exec another execdir = os.path.dirname(sys.argv[0]) + # Check that we have antiword. We could fallback to wvWare, but + # this is not what the old filter did. + if not rclexecm.which("antiword"): + print("RECFILTERROR HELPERNOTFOUND antiword") + sys.exit(1) proto = rclexecm.RclExecM() filter = WordFilter(proto, execdir) extract = rclexecm.Executor(proto, filter) diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index e31753df..ebb659df 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -20,6 +20,8 @@ import sys import os import subprocess +import tempfile +import shutil ############################################ # RclExecM implements the @@ -217,8 +219,11 @@ class Executor: proc = subprocess.Popen(cmd + [filename], stdout = subprocess.PIPE) stdout = proc.stdout - except subprocess.CalledProcessError, err: - self.em.rclog("extractone: extract failed: [%s]" % err) + except subprocess.CalledProcessError as err: + self.em.rclog("extractone: Popen() error: %s" % err) + return (False, "") + except OSError as err: + self.em.rclog("extractone: Popen OS error: %s" % err) return (False, "") for line in stdout: @@ -237,7 +242,7 @@ class Executor: ok = False if not params.has_key("filename:"): self.em.rclog("extractone: no mime or file name") - return (ok, docdata, "", RclExecM.eofnow) + return (ok, "", "", RclExecM.eofnow) fn = params["filename:"] while True: @@ -253,7 +258,6 @@ class Executor: else: return (ok, "", "", RclExecM.eofnow) - ###### File type handler api, used by rclexecm ----------> def openfile(self, params): self.currentindex = 0 @@ -270,7 +274,56 @@ class Executor: self.currentindex += 1 return ret - +# Helper routine to test for program 
# accessibility: find an executable by explicit path or by searching PATH.
def which(program):
    """Return the path of an executable file for *program*, or None.

    If *program* contains a directory component it is tested as given and
    PATH is not searched. Otherwise every PATH entry is tried, first with
    the bare name and then with each non-empty suffix listed in the
    PATHEXT environment variable.

    Only regular files with the execute permission are accepted, so a
    directory which happens to be named like the program is not reported.
    """
    def is_exe(fpath):
        # isfile() and not exists(): directories normally carry the
        # execute (search) bit too and must not be taken for programs.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    def ext_candidates(fpath):
        yield fpath
        for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
            # An unset/empty PATHEXT splits into [""]: skip it so that the
            # bare name is not tested a second time.
            if ext:
                yield fpath + ext

    dirpart = os.path.split(program)[0]
    if dirpart:
        if is_exe(program):
            return program
        return None

    # Fall back to the platform default search path when PATH is not set
    # instead of failing with KeyError.
    for path in os.environ.get("PATH", os.defpath).split(os.pathsep):
        exe_file = os.path.join(path, program)
        for candidate in ext_candidates(exe_file):
            if is_exe(candidate):
                return candidate
    return None


# Temp dir helper
class SafeTmpDir:
    """Lazily created private temporary directory, removed on destruction.

    Nothing is created until getpath() is first called. The top directory
    is made with tempfile.mkdtemp() (inside $RECOLL_TMPDIR when that
    variable is set and non-empty) and the working directory handed to the
    caller is the 'rclsofftmp' subdirectory of it.

    em -- object providing an rclog(message) method, used to report a
          cleanup failure.
    """
    def __init__(self, em):
        self.em = em
        # Directory returned by mkdtemp(); empty until getpath() runs.
        self.toptmp = ""
        # Working subdirectory handed out by getpath(); empty until then.
        self.tmpdir = ""

    def __del__(self):
        if not self.toptmp:
            return
        try:
            if self.tmpdir:
                # Best effort on the contents, errors ignored...
                shutil.rmtree(self.tmpdir, True)
            # ...but removing the (then empty) top directory must succeed.
            os.rmdir(self.toptmp)
        except Exception:
            self.em.rclog("delete dir failed for " + self.toptmp)
        # Forget the paths so that a second call has nothing left to do.
        self.toptmp = ""
        self.tmpdir = ""

    def getpath(self):
        """Create the directories on first use and return the working one."""
        if not self.tmpdir:
            if not self.toptmp:
                # dir=None makes mkdtemp() use its default location.
                envrcltmp = os.getenv('RECOLL_TMPDIR') or None
                self.toptmp = tempfile.mkdtemp(prefix='rcltmp',
                                               dir=envrcltmp)
            workdir = os.path.join(self.toptmp, 'rclsofftmp')
            os.makedirs(workdir)
            # Only remember the path once it really exists, so that a
            # failed makedirs() is not cached as a valid directory.
            self.tmpdir = workdir

        return self.tmpdir

+ +
+ + +

+ +
+ + + + + + +
+
+ + + + + +
# ---- src/filters/rclsoff.py (after the stylesheet definitions) ----

class OOExtractor:
    """Document handler for zip archives holding meta.xml and content.xml.

    Each archive yields a single output document: the result of applying
    stylesheet_meta to the "meta.xml" member, followed by the result of
    applying stylesheet_content to the "content.xml" member.

    em -- protocol object; only its rclog(message) method is used here.
    """
    def __init__(self, em):
        self.em = em
        self.currentindex = 0

    def extractone(self, params):
        """Convert the file named by params["filename:"].

        Returns a 4-tuple whose first item is the success flag and whose
        second item is the converted text (empty on failure). The third
        item is always "" and the fourth always rclexecm.RclExecM.eofnow.
        """
        # 'in' works with both Python 2 and 3, dict.has_key() does not.
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            archive = ZipFile(fn)
        except Exception as err:
            self.em.rclog("unzip failed: " + str(err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        # From here on the archive is open: close it on every exit path.
        try:
            docdata = ''

            try:
                metadata = archive.read("meta.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(stylesheet_meta, metadata)
                    docdata += res
            except Exception:
                # Metadata is optional: a missing or unusable meta.xml is
                # deliberately ignored (the original author noted this
                # seems to happen when nothing matches). Exception rather
                # than a bare except, so that SystemExit and
                # KeyboardInterrupt are not swallowed.
                #self.em.rclog("no/bad metadata in %s" % fn)
                pass

            try:
                content = archive.read("content.xml")
                if content:
                    res = rclxslt.apply_sheet_data(stylesheet_content,
                                                   content)
                    docdata += res
                # NOTE(review): as visible here this appends an empty
                # string; the literal may have lost markup when the text
                # was extracted - confirm against the real file.
                docdata += ''
            except Exception:
                self.em.rclog("bad data in %s" % fn)
                return (False, "", "", rclexecm.RclExecM.eofnow)

            return (True, docdata, "", rclexecm.RclExecM.eofnow)
        finally:
            archive.close()

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the per-file document counter; always returns True."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Same as extractone(): the file holds a single document."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the document on the first call, a failure tuple after."""
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret = self.extractone(params)
            self.currentindex += 1
            return ret


if __name__ == '__main__':
    # Check for unzip
    # NOTE(review): the extractor above reads the archive with
    # zipfile.ZipFile and nothing visible here runs the unzip program -
    # confirm that this check is still wanted.
    if not rclexecm.which("unzip"):
        print("RECFILTERROR HELPERNOTFOUND unzip")
        sys.exit(1)
    proto = rclexecm.RclExecM()
    extract = OOExtractor(proto)
    rclexecm.main(proto, extract)


# ---- src/filters/rclxslt.py ----

import sys

try:
    import libxml2
    import libxslt
except ImportError:
    # print() with a single string behaves the same under Python 2 and 3,
    # and matches the calls in the other filters.
    print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
    sys.exit(1)

libxml2.substituteEntitiesDefault(1)


def _apply_sheet(sheet, load_doc):
    """Apply the stylesheet text *sheet* to the document from load_doc().

    load_doc is called after the stylesheet has been parsed (same order as
    the original code) and must return a libxml2 document. Every libxml2 /
    libxslt object created here is released, including when a step raises.
    """
    styledoc = None
    style = None
    doc = None
    result = None
    try:
        styledoc = libxml2.parseMemory(sheet, len(sheet))
        style = libxslt.parseStylesheetDoc(styledoc)
        doc = load_doc()
        result = style.applyStylesheet(doc, None)
        return style.saveResultToString(result)
    finally:
        if style is not None:
            # The compiled stylesheet owns styledoc and frees it too.
            style.freeStylesheet()
        elif styledoc is not None:
            # The stylesheet was never built: the document is still ours.
            styledoc.freeDoc()
        if doc is not None:
            doc.freeDoc()
        if result is not None:
            result.freeDoc()


def apply_sheet_data(sheet, data):
    """Transform the XML held in the string *data* with stylesheet *sheet*.

    Returns what saveResultToString() produces for the transformation.
    """
    return _apply_sheet(sheet,
                        lambda: libxml2.parseMemory(data, len(data)))


def apply_sheet_file(sheet, fn):
    """Transform the XML file named *fn* with stylesheet *sheet*.

    Returns what saveResultToString() produces for the transformation.
    """
    return _apply_sheet(sheet, lambda: libxml2.parseFile(fn))