From dac19964f352f0ad6d4417f1af5d7228d02b357a Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Sun, 6 Sep 2015 19:55:43 +0200
Subject: [PATCH] temp ckpt

---
 src/filters/rcldoc.py   | 141 ++++++++++++++++++++++++++++++++++++++++
 src/filters/rclexecm.py | 109 +++++++++++++++++++++++++++++--
 2 files changed, 246 insertions(+), 4 deletions(-)
 create mode 100755 src/filters/rcldoc.py
diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py
new file mode 100755
index 00000000..07ea248e
--- /dev/null
+++ b/src/filters/rcldoc.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+import rclexecm
+import re
+import sys
+import os
+
+# Processing the output from antiword: create html header and tail, process
+# continuation lines, escape HTML special characters, accumulate the data
+class WordProcessData:
+    def __init__(self, em):
+        self.em = em
+        self.out = ""
+        self.cont = ""
+        self.gotdata = False
+        # Line ending with a continued word (trailing -). The - is dropped
+        # on rejoin: not nice for actually hyphenated words. What to do ?
+        self.patcont = re.compile(r'''[\w][-]$''')
+        # Pattern for breaking continuation at last word start
+        self.patws = re.compile(r'''([\s])([\w]+)(-)$''')
+
+    def takeLine(self, line):
+        '''Accumulate one line of antiword output as escaped HTML'''
+        if not self.gotdata:
+            if line == "":
+                return
+            self.out = '<html><head><title></title>' + \
+                       '<meta http-equiv="Content-Type" ' + \
+                       'content="text/html;charset=UTF-8">' + \
+                       '</head><body><p>'
+            self.gotdata = True
+        # Prepend the word fragment held back from the previous line
+        if self.cont:
+            line = self.cont + line
+            self.cont = ""
+        # A lone form feed is a page break
+        if line == "\f":
+            self.out += "</p><hr><p>"
+            return
+
+        if self.patcont.search(line):
+            # Break at last whitespace
+            match = self.patws.search(line)
+            if match:
+                self.cont = line[match.start(2):match.end(2)]
+                line = line[0:match.start(1)]
+            else:
+                # Single word: hold it all back, minus the hyphen as above
+                self.cont = line[:-1]
+                line = ""
+        if line:
+            self.out += self.em.htmlescape(line) + "<br>"
+        else:
+            self.out += "<br>"
+
+    def wrapData(self):
+        if self.gotdata:
+            self.out += "</p></body></html>"
+        self.em.setmimetype("text/html")
+        return self.out
+
+# Null data accumulator. We use this when antiword has failed, and the
+# data actually comes from rclrtf, rcltext or wvWare, which all
+# output HTML
+class WordPassData:
+    def __init__(self, em):
+        self.out = ""
+        self.em = em
+    def takeLine(self, line):
+        self.out += line + "\n"
+    def wrapData(self):
+        self.em.setmimetype("text/html")
+        return self.out
+        
+# Filter for msword docs. Try antiword, and if this fails, check for
+# an rtf or text document (.doc are sometimes like this). Also try
+# wvWare if the doc is actually a word doc
+class WordFilter:
+    def __init__(self, em, td):
+        self.em = em
+        self.ntry = 0
+        self.curfn = None
+        self.thisdir = td
+
+    def hasControlChars(self, data):
+        for c in data:
+            if c < chr(32) and c != '\n' and c != '\t' and \
+                   c !=  '\f' and c != '\r':
+                return True
+        return False
+
+    def mimetype(self, fn):
+        rtfprolog = b"{\\rtf1"
+        docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
+        try:
+            with open(fn, "rb") as f:
+                data = f.read(100)
+        except (IOError, OSError):
+            return ""
+        if data[0:6] == rtfprolog:
+            return "text/rtf"
+        elif data[0:8] == docprolog:
+            return "application/msword"
+        elif self.hasControlChars(data):
+            return "application/octet-stream"
+        else:
+            return "text/plain"
+
+    def getCmd(self, fn):
+        '''Return command to execute and postprocessor according to
+        our state: first try antiword, then others depending on mime
+        identification. Do 2 tries at most for a given document'''
+        if fn != self.curfn:
+            # New document: restart the sequence of tries
+            self.curfn = fn
+            self.ntry = 0
+        self.ntry += 1
+        if self.ntry == 1:
+            return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
+                    WordProcessData(self.em))
+        elif self.ntry == 2:
+            # antiword failed. Check for an rtf or text file and process
+            # accordingly. If the doc is actually msword, try wvWare.
+            mt = self.mimetype(fn)
+            if mt == "text/plain":
+                return ([os.path.join(self.thisdir,"rcltext")],
+                       WordPassData(self.em))
+            elif mt == "text/rtf":
+                return ([os.path.join(self.thisdir, "rclrtf")],
+                        WordPassData(self.em))
+            elif mt == "application/msword":
+                return (["wvWare", "--nographics", "--charset=utf-8"],
+                        WordPassData(self.em))
+        return ([],None)
+            
+if __name__ == '__main__':
+    # The helper filters (rcltext, rclrtf) are looked up next to this script
+    proto = rclexecm.RclExecM()
+    flt = WordFilter(proto, os.path.dirname(sys.argv[0]))
+    extract = rclexecm.Executor(proto, flt)
+    rclexecm.main(proto, extract)
diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py
index 24316f7f..148a571a 100644
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@@ -1,10 +1,30 @@
-#!/usr/bin/env python
+#################################
+# Copyright (C) 2014 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+########################################################
+## Recoll multifilter communication module and utilities
 
-###########################################
-## Generic recoll multifilter communication code
 import sys
 import os
+import subprocess
 
+############################################
+# RclExecM implements the
+# communication protocol with the recollindex process. It calls the
+# object specific to the document type to actually get the data.
 class RclExecM:
     noteof  = 0
     eofnext = 1
@@ -168,6 +188,87 @@ class RclExecM:
             self.processmessage(processor, params)
 
 
+####################################################################
+# Common code for replacing the shell scripts: this implements the basic
+# functions for a filter which executes a command to translate a
+# simple file (like rclword with antiword).
+#
+# This was motivated by the Windows port: to replace the use of the shell
+# and Unix utilities (awk, etc.). We can't just execute python scripts,
+# this would be too slow. So this helps implementing a permanent script
+# to repeatedly execute single commands.
+#
+# This class has the code to execute the subprocess and call a
+# data-specific post-processor. Command and processor are supplied by
+# the object which we receive as a parameter, which in turn is defined
+# in the actual executable filter (e.g. rcldoc)
+class Executor:
+    def __init__(self, em, flt):
+        self.em = em
+        self.flt = flt
+        self.currentindex = 0
+
+    def runCmd(self, cmd, filename, postproc):
+        ''' Substitute parameters and execute command, process output
+        with the specific postprocessor and return the complete text.
+        We expect cmd as a list of command name + arguments'''
+
+        try:
+            proc = subprocess.Popen(cmd + [filename],
+                                    stdout = subprocess.PIPE)
+            stdout = proc.stdout
+        except OSError as err:
+            self.em.rclog("extractone: extract failed: [%s]" % err)
+            return (False, "")
+
+        for line in stdout:
+            postproc.takeLine(line.strip())
+
+        proc.wait()
+        if proc.returncode:
+            return False, postproc.wrapData()
+        else:
+            return True, postproc.wrapData()
+
+    def extractone(self, params):
+        '''Run the commands proposed by the filter for the file until one
+        succeeds or none is left. Return (ok, data, "", eof indicator)'''
+        ok = False
+        if "filename:" not in params:
+            self.em.rclog("extractone: no mime or file name")
+            return (ok, "", "", RclExecM.eofnow)
+
+        fn = params["filename:"]
+        while True:
+            cmd, postproc = self.flt.getCmd(fn)
+            if cmd:
+                ok, data = self.runCmd(cmd, fn, postproc)
+                if ok:
+                    break
+            else:
+                break
+        if ok:
+            return (ok, data, "", RclExecM.eofnext)
+        else:
+            return (ok, "", "", RclExecM.eofnow)
+
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        self.currentindex = 0
+        return True
+
+    def getipath(self, params):
+        return self.extractone(params)
+
+    def getnext(self, params):
+        if self.currentindex >= 1:
+            return (False, "", "", RclExecM.eofnow)
+        else:
+            ret= self.extractone(params)
+            self.currentindex += 1
+            return ret
+
   
 # Common main routine for all python execm filters: either run the
 # normal protocol engine or a local loop to test without recollindex
@@ -225,7 +326,7 @@ def main(proto, extract):
                     bdata = data.encode("UTF-8")
                 else:
                     bdata = data
-                #sys.stdout.write(bdata)
+                sys.stdout.write(bdata)
                 print
                 if eof != RclExecM.noteof:
                     break