temp ckpt

2015-09-06 19:55:43 +02:00 · 2015-09-06 19:55:43 +02:00 · dac19964f3
commit dac19964f3
parent 0896177cdf
2 changed files with 246 additions and 4 deletions
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@ -0,0 +1,141 @@
 #!/usr/bin/env python
 import rclexecm
 import re
 import sys
 import os
 # Processing the output from antiword: create html header and tail, process
 # continuation lines, escape HTML special characters, accumulate the data
 class WordProcessData:
    def __init__(self, em):
        self.em = em
        self.out = ""
        self.cont = ""
        self.gotdata = False
        # Line with continued word (ending in -)
        # we strip the - which is not nice for actually hyphenated word.
        # What to do ?
        self.patcont = re.compile('''[\w][-]$''')
        # Pattern for breaking continuation at last word start
        self.patws = re.compile('''([\s])([\w]+)(-)$''')
    def takeLine(self, line):
        if not self.gotdata:
            if line == "":
                return
            self.out = '<html><head><title></title>' + \
                       '<meta http-equiv="Content-Type"' + \
                       'content="text/html;charset=UTF-8">' + \
                       '</head><body><p>'
            self.gotdata = True
        if self.cont:
            line = self.cont + line
            self.cont = ""
        if line == "\f":
            self.out += "</p><hr><p>"
            return
        if self.patcont.search(line):
            # Break at last whitespace
            match = self.patws.search(line)
            if match:
                self.cont = line[match.start(2):match.end(2)]
                line = line[0:match.start(1)]
            else:
                self.cont = line
                line = ""
        if line:
            self.out += self.em.htmlescape(line) + "<br>"
        else:
            self.out += "<br>"
    def wrapData(self):
        if self.gotdata:
            self.out += "</p></body></html>"
        self.em.setmimetype("text/html")
        return self.out
 # Null data accumulator. We use this when antiword has failed, and the
 # data actually comes from rclrtf, rcltext or wvWare, which all
 # output HTML
 class WordPassData:
    '''Pass-through accumulator: input lines are concatenated as they
    come, with no separator and no escaping, and returned unchanged by
    wrapData().'''

    def __init__(self, em):
        '''em: protocol object, only used for its setmimetype() method.'''
        self.em, self.out = em, ""

    def takeLine(self, line):
        '''Append one line to the accumulated output.'''
        self.out = self.out + line

    def wrapData(self):
        '''Declare the output as text/html and return everything
        accumulated so far.'''
        self.em.setmimetype("text/html")
        accumulated = self.out
        return accumulated
 # Filter for msword docs. Try antiword, and if this fails, check for
 # an rtf or text document (.doc are sometimes like this). Also try
 # wvWare if the doc is actually a word doc
 class WordFilter:
    '''Choose the command used to extract text from a .doc file.

    getCmd() is meant to be called repeatedly for the same file until
    a command succeeds or an empty command is returned. The first call
    proposes antiword; the second one looks at the beginning of the
    file and proposes rcltext, rclrtf or wvWare accordingly.
    '''

    def __init__(self, em, td):
        '''em: protocol object, handed to the output post-processors.
        td: directory where the rcltext and rclrtf helpers are found.'''
        self.em = em
        # Number of commands already proposed for the current file
        self.ntry = 0
        self.thisdir = td
        # File name the ntry counter applies to
        self._lastfn = None

    def reset(self):
        '''Forget the try state, so that the next getCmd() call starts
        again with the first command.'''
        self.ntry = 0
        self._lastfn = None

    def hasControlChars(self, data):
        '''Return True if data holds a character code below 32 other
        than tab, line feed, form feed or carriage return. data may be
        a byte string or a text string.'''
        allowed = (9, 10, 12, 13)
        if isinstance(data, bytes):
            # bytearray yields integers on both Python 2 and Python 3
            codes = bytearray(data)
        else:
            codes = (ord(c) for c in data)
        for code in codes:
            if code < 32 and code not in allowed:
                return True
        return False

    def mimetype(self, fn):
        '''Guess the type of file fn from its first 100 bytes.

        Returns "text/rtf", "application/msword",
        "application/octet-stream" or "text/plain", or an empty string
        if the file can't be read.'''
        rtfprolog = b"{\\rtf1"
        docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
        try:
            # Binary mode: we compare raw signatures, and the handle is
            # released as soon as the prolog has been read.
            with open(fn, "rb") as f:
                data = f.read(100)
        except (IOError, OSError, TypeError, ValueError):
            return ""
        if data[0:6] == rtfprolog:
            return "text/rtf"
        elif data[0:8] == docprolog:
            return "application/msword"
        elif self.hasControlChars(data):
            return "application/octet-stream"
        else:
            return "text/plain"

    def getCmd(self, fn):
        '''Return command to execute and postprocessor according to
        our state: first try antiword, then others depending on mime
        identification. Do 2 tries at most.

        The return value is a (command, postprocessor) pair, where
        command is a list of command name + arguments to which the
        caller appends the file name. ([], None) means that there is
        nothing left to try. The try counter restarts when fn differs
        from the file name of the previous call.'''
        if fn != self._lastfn:
            # New document: start again with the first command
            self.ntry = 0
            self._lastfn = fn

        if self.ntry == 0:
            self.ntry = 1
            # antiword -t outputs plain text, which has to be turned
            # into HTML
            return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
                    WordProcessData(self.em))

        if self.ntry == 1:
            # Record the second try on the instance, else a failing
            # command would be returned again and again.
            self.ntry = 2
            # antiword failed. Check for an rtf file, or text and
            # process accordingly. If the doc is actually msword, try
            # wvWare.
            mt = self.mimetype(fn)
            if mt == "text/plain":
                return ([os.path.join(self.thisdir, "rcltext")],
                        WordPassData(self.em))
            elif mt == "text/rtf":
                return ([os.path.join(self.thisdir, "rclrtf")],
                        WordPassData(self.em))
            elif mt == "application/msword":
                return (["wvWare", "--nographics", "--charset=utf-8"],
                        WordPassData(self.em))
            return ([], None)

        return ([], None)
 if __name__ == '__main__':
    # The directory holding this script is handed to the filter, which
    # uses it to locate its helper commands.
    script_dir = os.path.dirname(sys.argv[0])
    protocol = rclexecm.RclExecM()
    word_filter = WordFilter(protocol, script_dir)
    executor = rclexecm.Executor(protocol, word_filter)
    # Hand control over to the common main routine
    rclexecm.main(protocol, executor)
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@ -1,10 +1,30 @@
-#!/usr/bin/env python
+#################################
 # Copyright (C) 2014 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
 #   the Free Software Foundation; either version 2 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU General Public License for more details.
 #
 #   You should have received a copy of the GNU General Public License
 #   along with this program; if not, write to the
 #   Free Software Foundation, Inc.,
 #   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ########################################################
 ## Recoll multifilter communication module and utilities
 ###########################################
 ## Generic recoll multifilter communication code
 import sys
 import os
 import subprocess
 ############################################
 # RclExecM implements the
 # communication protocol with the recollindex process. It calls the
 # object specific of the document type to actually get the data.
 class RclExecM:
    noteof  = 0
    eofnext = 1
@ -168,6 +188,87 @@ class RclExecM:
            self.processmessage(processor, params)
 ####################################################################
 # Common code for replacing the shell scripts: this implements the basic
 # functions for a filter which executes a command to translate a
 # simple file (like rclword with antiword).
 #
 # This was motivated by the Windows port: to replace shell and Unix
 # utility (awk, etc.) usage. We can't just execute python scripts,
 # this would be too slow. So this helps implementing a permanent script
 # to repeatedly execute single commands.
 #
 # This class has the code to execute the subprocess and call a
 # data-specific post-processor. Command and processor are supplied by
 # the object which we receive as a parameter, which in turn is defined
 # in the actual executable filter (e.g. rcldoc)
 class Executor:
    '''Run an external command on a file and post-process its output.

    The command and the post-processor come from the filter object
    (flt), through its getCmd(filename) method, which returns a
    (command list, postprocessor) pair. An empty command means that
    there is nothing left to try. The post-processor is fed through
    takeLine() and its wrapData() result is the extracted document.
    '''

    def __init__(self, em, flt):
        '''em: protocol object, used here for its rclog() method.
        flt: filter object supplying commands and post-processors.'''
        self.em = em
        self.flt = flt
        # Number of documents already returned for the current file
        self.currentindex = 0

    def runCmd(self, cmd, filename, postproc):
        ''' Substitute parameters and execute command, process output
        with the specific postprocessor and return the complete text.
        We expect cmd as a list of command name + arguments.

        Returns an (ok, data) pair. ok is False if the command could
        not be started (data is then an empty string) or if it exited
        with a non-zero status.'''
        try:
            proc = subprocess.Popen(cmd + [filename],
                                    stdout=subprocess.PIPE)
        except (OSError, ValueError) as err:
            # Popen raises OSError when the command can't be executed
            # (e.g. not installed), and ValueError for bad arguments.
            self.em.rclog("extractone: extract failed: [%s]" % err)
            return (False, "")

        try:
            for line in proc.stdout:
                postproc.takeLine(line.strip())
        finally:
            # Release the pipe and reap the child in all cases
            proc.stdout.close()
            proc.wait()

        return (not proc.returncode, postproc.wrapData())

    def extractone(self, params):
        '''Extract the document for the file named by params["filename:"].

        The commands proposed by the filter are tried in turn until one
        succeeds or the filter returns an empty command. Returns an
        (ok, data, ipath, eof indicator) tuple, the ipath being always
        empty.'''
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", RclExecM.eofnow)
        fn = params["filename:"]

        # Give filters keeping per-document state a chance to restart
        # from scratch. The method is optional.
        reset = getattr(self.flt, "reset", None)
        if callable(reset):
            reset()

        ok = False
        data = ""
        while True:
            cmd, postproc = self.flt.getCmd(fn)
            if not cmd:
                break
            ok, data = self.runCmd(cmd, fn, postproc)
            if ok:
                break

        if ok:
            return (ok, data, "", RclExecM.eofnext)
        return (ok, "", "", RclExecM.eofnow)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        '''Start processing a new file: rewind the document index.'''
        self.currentindex = 0
        return True

    def getipath(self, params):
        '''Same as extracting the single document of the file.'''
        return self.extractone(params)

    def getnext(self, params):
        '''Return the single document on the first call after
        openfile(), and an end of file indication afterwards.'''
        if self.currentindex >= 1:
            return (False, "", "", RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
 # Common main routine for all python execm filters: either run the
 # normal protocol engine or a local loop to test without recollindex
@ -225,7 +326,7 @@ def main(proto, extract):
                    bdata = data.encode("UTF-8")
                else:
                    bdata = data
-                #sys.stdout.write(bdata)
+                sys.stdout.write(bdata)
                print
                if eof != RclExecM.noteof:
                    break