#!/usr/bin/env python
# Recoll filter for MS Word documents (src/filters/rcldoc.py).
#
# Reconstructed from patch dac19964f352f0ad6d4417f1af5d7228d02b357a
# ("temp ckpt", Jean-Francois Dockes, 2015-09-06), which creates
# src/filters/rcldoc.py and modifies src/filters/rclexecm.py.

import os
import re
import sys


# Processing the output from antiword: create the HTML header and tail,
# process continuation lines, escape HTML special characters, and
# accumulate the data.
class WordProcessData:
    # NOTE(review): the markup inside the string literals below was lost
    # from the patch text (the literals were empty or broken across
    # lines). It has been reconstructed as minimal HTML consistent with
    # the "text/html" MIME type set in wrapData() -- confirm against the
    # upstream file.
    def __init__(self, em):
        # em: protocol object; must provide htmlescape() and setmimetype()
        self.em = em
        self.out = ""
        # Word fragment carried over from a line which ended with '-'
        self.cont = ""
        self.gotdata = False
        # Line with continued word (ending in -)
        # we strip the - which is not nice for actually hyphenated word.
        # What to do ?
        self.patcont = re.compile(r'''[\w][-]$''')
        # Pattern for breaking continuation at last word start
        self.patws = re.compile(r'''([\s])([\w]+)(-)$''')

    def takeLine(self, line):
        '''Accumulate one line of antiword text output as HTML.'''
        if not self.gotdata:
            # Skip leading empty lines, emit the header on first data
            if line == "":
                return
            self.out = '<html><head><title></title>' + \
                       '<meta http-equiv="Content-Type" ' + \
                       'content="text/html;charset=UTF-8">' + \
                       '</head><body><p>'
            self.gotdata = True

        if self.cont:
            line = self.cont + line
            self.cont = ""

        if line == "\f":
            # Form feed: page break
            self.out += "</p><hr><p>"
            return

        if self.patcont.search(line):
            # Break at last whitespace
            match = self.patws.search(line)
            if match:
                self.cont = line[match.start(2):match.end(2)]
                line = line[0:match.start(1)]
            else:
                # NOTE(review): no whitespace before the word, the whole
                # line (including the trailing '-') is carried over.
                self.cont = line
                line = ""

        if line:
            self.out += self.em.htmlescape(line) + "<br>"
        else:
            self.out += "<br>"

    def wrapData(self):
        '''Close the document, set the MIME type and return the HTML.'''
        if self.gotdata:
            if self.cont:
                # Do not lose a continuation pending at end of input
                self.out += self.em.htmlescape(self.cont)
                self.cont = ""
            self.out += "</p></body></html>"
        self.em.setmimetype("text/html")
        return self.out


# Null data accumulator. We use this when antiword has failed, and the
# data actually comes from rclrtf, rcltext or wvWare, which all
# output HTML
class WordPassData:
    def __init__(self, em):
        self.out = ""
        self.em = em

    def takeLine(self, line):
        # The caller hands us lines without their terminator: put one
        # back so that words on consecutive lines are not glued together.
        self.out += line + "\n"

    def wrapData(self):
        self.em.setmimetype("text/html")
        return self.out


# Filter for msword docs. Try antiword, and if this fails, check for
# an rtf or text document (.doc are sometimes like this). Also try
# wvWare if the doc is actually a word doc
class WordFilter:
    # Control codes which are acceptable in a text file: \t \n \f \r
    textcontrols = frozenset((9, 10, 12, 13))

    def __init__(self, em, td):
        # em: protocol object handed to the data accumulators
        # td: directory holding the helper filters (rcltext, rclrtf)
        self.em = em
        self.thisdir = td
        self.reset()

    def reset(self):
        '''Forget the state kept for the current document.'''
        self.ntry = 0
        self.curfn = None

    def hasControlChars(self, data):
        '''Return True if data (bytes or text) holds a control character
        other than tab, newline, form feed or carriage return.'''
        if isinstance(data, bytes):
            codes = bytearray(data)
        else:
            codes = [ord(c) for c in data]
        for c in codes:
            if c < 32 and c not in self.textcontrols:
                return True
        return False

    def mimetype(self, fn):
        '''Guess the MIME type of file fn from its first 100 bytes.
        Return "" if the file can't be read.'''
        rtfprolog = b"{\\rtf1"
        docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
        try:
            # Binary mode: the msword signature is not valid text
            with open(fn, "rb") as f:
                data = f.read(100)
        except (IOError, OSError):
            return ""
        if data[0:6] == rtfprolog:
            return "text/rtf"
        elif data[0:8] == docprolog:
            return "application/msword"
        elif self.hasControlChars(data):
            return "application/octet-stream"
        else:
            return "text/plain"

    def getCmd(self, fn):
        '''Return command to execute and postprocessor according to
        our state: first try antiword, then others depending on mime
        identification. Do 2 tries at most. Return ([], None) when
        there is nothing left to try.'''
        if fn != self.curfn:
            # New document: start again from the first try
            self.curfn = fn
            self.ntry = 0

        if self.ntry == 0:
            self.ntry = 1
            # antiword -t outputs plain text, which needs processing
            return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
                    WordProcessData(self.em))
        elif self.ntry == 1:
            # Must update the instance state, else a failing fallback
            # would be returned again and again.
            self.ntry = 2
            # antiword failed. Check for an rtf file, or text and
            # process accordingly. If the doc is actually msword, try
            # wvWare.
            mt = self.mimetype(fn)
            if mt == "text/plain":
                return ([os.path.join(self.thisdir, "rcltext")],
                        WordPassData(self.em))
            elif mt == "text/rtf":
                return ([os.path.join(self.thisdir, "rclrtf")],
                        WordPassData(self.em))
            elif mt == "application/msword":
                return (["wvWare", "--nographics", "--charset=utf-8"],
                        WordPassData(self.em))
            else:
                return ([], None)
        else:
            return ([], None)


if __name__ == '__main__':
    # Imported here so that the classes above can be used without the
    # recoll protocol module being present.
    import rclexecm
    thisdir = os.path.dirname(sys.argv[0])
    proto = rclexecm.RclExecM()
    flt = WordFilter(proto, thisdir)
    extract = rclexecm.Executor(proto, flt)
    rclexecm.main(proto, extract)


#################################
# What follows belongs to src/filters/rclexecm.py, which the same patch
# modifies (the '#!/usr/bin/env python' first line is removed there).
#
# Copyright (C) 2014 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
########################################################
## Recoll multifilter communication module and utilities

import subprocess

############################################
# RclExecM implements the
# communication protocol with the recollindex process.
# RclExecM (defined in the part of rclexecm.py which this patch leaves
# untouched) calls the object specific to the document type to actually
# get the data. It supplies the noteof / eofnext / eofnow end-of-file
# codes and the rclog() logging method used below.

####################################################################
# Common code for replacing the shell scripts: this implements the basic
# functions for a filter which executes a command to translate a
# simple file (like rclword with antiword).
#
# This was motivated by the Windows port: to replace shell and Unix
# utility (awk, etc.) usage. We can't just execute python scripts,
# this would be too slow. So this helps implementing a permanent script
# to repeatedly execute single commands.
#
# This class has the code to execute the subprocess and call a
# data-specific post-processor. Command and processor are supplied by
# the object which we receive as a parameter, which in turn is defined
# in the actual executable filter (e.g. rcldoc)
class Executor:
    def __init__(self, em, flt):
        # em: protocol object, used here for logging through rclog()
        # flt: document-type object, must provide getCmd(filename)
        self.em = em
        self.flt = flt
        self.currentindex = 0

    def runCmd(self, cmd, filename, postproc):
        ''' Substitute parameters and execute command, process output
        with the specific postprocessor and return the complete text.
        We expect cmd as a list of command name + arguments.
        Return (ok, data). ok is False if the command could not be
        started or exited with a non-zero status.'''

        try:
            proc = subprocess.Popen(cmd + [filename],
                                    stdout=subprocess.PIPE)
        except OSError as err:
            # This is what Popen raises when the command is missing or
            # not executable.
            self.em.rclog("extractone: extract failed: [%s]" % err)
            return (False, "")

        try:
            # NOTE(review): strip() also removes a lone form feed, so a
            # postprocessor testing for "\f" never sees one -- confirm
            # whether only the line terminator should be removed.
            for line in proc.stdout:
                postproc.takeLine(line.strip())
        finally:
            proc.stdout.close()
            proc.wait()

        if proc.returncode:
            return False, postproc.wrapData()
        else:
            return True, postproc.wrapData()

    def extractone(self, params):
        '''Run the commands proposed by the filter object on
        params["filename:"] until one succeeds or none is left.
        Return (ok, data, ipath, eofcode) with an empty ipath.'''
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        #                                     params["mimetype:"]))
        ok = False
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (ok, "", "", RclExecM.eofnow)

        fn = params["filename:"]
        # The filter object may keep state about its successive tries:
        # let it start afresh for each document if it knows how to.
        reset = getattr(self.flt, "reset", None)
        if reset is not None:
            reset()

        data = ""
        while True:
            cmd, postproc = self.flt.getCmd(fn)
            if not cmd:
                # Nothing left to try
                break
            ok, data = self.runCmd(cmd, fn, postproc)
            if ok:
                break

        if ok:
            return (ok, data, "", RclExecM.eofnext)
        else:
            return (ok, "", "", RclExecM.eofnow)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        self.currentindex = 0
        return True

    def getipath(self, params):
        return self.extractone(params)

    def getnext(self, params):
        # There is a single document per file
        if self.currentindex >= 1:
            return (False, "", "", RclExecM.eofnow)
        else:
            ret = self.extractone(params)
            self.currentindex += 1
            return ret

# The last hunk of the patch (rclexecm.py, in 'def main(proto, extract)',
# the common main routine for all python execm filters) re-enables the
# commented-out 'sys.stdout.write(bdata)' in the local test loop. That
# function is only partially visible in the patch and is not reproduced
# here.