temp ckpt
This commit is contained in:
parent
0896177cdf
commit
dac19964f3
2 changed files with 246 additions and 4 deletions
141
src/filters/rcldoc.py
Executable file
141
src/filters/rcldoc.py
Executable file
|
@ -0,0 +1,141 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import rclexecm
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Processing the output from antiword: create html header and tail, process
|
||||||
|
# continuation lines, escape HTML special characters, accumulate the data
|
||||||
|
class WordProcessData:
    '''Post-processor turning plain text lines into a small HTML document.

    Lines are fed one at a time to takeLine(). The first non-empty line
    triggers the emission of the HTML header. Words split across lines
    with a trailing hyphen are re-joined with the start of the next line.
    wrapData() closes the document and returns the accumulated text.
    '''

    def __init__(self, em):
        '''em: object providing htmlescape(text) and setmimetype(mtype).'''
        self.em = em
        # Accumulated HTML output
        self.out = ""
        # Pending word fragment carried over to the next line
        self.cont = ""
        # Set once the first non-empty line was seen (header emitted)
        self.gotdata = False
        # Line with continued word (ending in -)
        # we strip the - which is not nice for actually hyphenated word.
        # What to do ?
        self.patcont = re.compile(r'''[\w][-]$''')
        # Pattern for breaking continuation at last word start
        self.patws = re.compile(r'''([\s])([\w]+)(-)$''')

    def takeLine(self, line):
        '''Process one input line (without its line terminator).'''
        if not self.gotdata:
            # Skip leading empty lines, emit the header on first data
            if line == "":
                return
            self.out = ('<html><head><title></title>'
                        '<meta http-equiv="Content-Type" '
                        'content="text/html;charset=UTF-8">'
                        '</head><body><p>')
            self.gotdata = True

        # Page break. This is tested before merging a pending
        # continuation: otherwise the merged text would never equal "\f"
        # and the form feed would end up as a raw character in the
        # output. The pending fragment stays queued for the next line.
        if line == "\f":
            self.out += "</p><hr><p>"
            return

        if self.cont:
            line = self.cont + line
            self.cont = ""

        if self.patcont.search(line):
            # Break at last whitespace
            match = self.patws.search(line)
            if match:
                # Keep the last word (hyphen dropped) for the next line
                self.cont = line[match.start(2):match.end(2)]
                line = line[0:match.start(1)]
            else:
                # Single hyphenated token: carry the whole line over
                self.cont = line
                line = ""

        if line:
            self.out += self.em.htmlescape(line) + "<br>"
        else:
            self.out += "<br>"

    def wrapData(self):
        '''Close the HTML document, set the MIME type on em, and return
        the accumulated output (empty string if no data was seen).'''
        if self.gotdata:
            if self.cont:
                # Input ended while a continuation was pending: emit it
                # instead of silently dropping the text.
                self.out += self.em.htmlescape(self.cont) + "<br>"
                self.cont = ""
            self.out += "</p></body></html>"
        self.em.setmimetype("text/html")
        return self.out
|
||||||
|
|
||||||
|
# Null data accumulator. We use this when antiword has failed, and the
|
||||||
|
# data actually comes from rclrtf, rcltext or wvWare, which all
|
||||||
|
# output HTML
|
||||||
|
class WordPassData:
    '''Pass-through accumulator: input lines are concatenated unchanged
    and returned as-is, with the MIME type set to text/html.'''

    def __init__(self, em):
        '''em: object providing setmimetype(mtype).'''
        self.em, self.out = em, ""

    def takeLine(self, line):
        '''Append the line to the accumulated output, no processing.'''
        self.out = self.out + line

    def wrapData(self):
        '''Declare the data as HTML and return everything accumulated.'''
        em = self.em
        em.setmimetype("text/html")
        return self.out
|
||||||
|
|
||||||
|
# Filter for msword docs. Try antiword, and if this fails, check for
|
||||||
|
# an rtf or text document (.doc are sometimes like this). Also try
|
||||||
|
# wvWare if the doc is actually a word doc
|
||||||
|
class WordFilter:
    '''Chooses the command and post-processor used to extract a .doc file.

    getCmd() is called repeatedly for a document: the first call proposes
    antiword, the second one a fallback chosen from the file signature,
    and further calls return an empty command.
    '''

    def __init__(self, em, td):
        '''em: object handed to the post-processors.
        td: directory where the rcltext and rclrtf commands are looked up.
        '''
        self.em = em
        # Number of commands already proposed by getCmd().
        # NOTE(review): nothing in this class resets ntry for a new
        # document - confirm how the caller is expected to handle that.
        self.ntry = 0
        self.thisdir = td

    def hasControlChars(self, data):
        '''Return True if data holds a character below 0x20 other than
        tab, newline, form feed or carriage return. Accepts bytes or
        text.'''
        if not isinstance(data, (bytes, bytearray)):
            # Code points below 0x80 are preserved by UTF-8 and multi-byte
            # sequences only use bytes >= 0x80, so testing the encoded
            # form is equivalent for our purpose.
            data = data.encode("utf-8", "replace")
        allowed = (9, 10, 12, 13)
        for c in bytearray(data):
            if c < 32 and c not in allowed:
                return True
        return False

    def mimetype(self, fn):
        '''Guess a MIME type from the first 100 bytes of file fn.

        Returns "text/rtf", "application/msword",
        "application/octet-stream" or "text/plain", or "" if the file
        can't be read.
        '''
        rtfprolog = b"{\\rtf1"
        docprolog = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
        try:
            # Binary mode: we compare against byte signatures, and text
            # mode could alter or fail to decode non-text content.
            with open(fn, "rb") as f:
                data = f.read(100)
        except (IOError, OSError):
            return ""
        if data.startswith(rtfprolog):
            return "text/rtf"
        elif data.startswith(docprolog):
            return "application/msword"
        elif self.hasControlChars(data):
            return "application/octet-stream"
        else:
            return "text/plain"

    def getCmd(self, fn):
        '''Return command to execute and postprocessor according to
        our state: first try antiword, then others depending on mime
        identification. Do 2 tries at most.

        The result is a (command list, postprocessor) pair, or ([], None)
        when there is nothing more to try.'''
        if self.ntry == 0:
            self.ntry = 1
            # antiword -t outputs plain text: it needs the HTML-building
            # postprocessor, not the pass-through one.
            return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
                    WordProcessData(self.em))
        elif self.ntry == 1:
            # Record the second try on the instance, so that a further
            # call ends the sequence instead of repeating this branch.
            self.ntry = 2
            # antiword failed. Check for an rtf file, or text and
            # process accordingly. If the doc is actually msword, try
            # wvWare.
            mt = self.mimetype(fn)
            if mt == "text/plain":
                return ([os.path.join(self.thisdir, "rcltext")],
                        WordPassData(self.em))
            elif mt == "text/rtf":
                return ([os.path.join(self.thisdir, "rclrtf")],
                        WordPassData(self.em))
            elif mt == "application/msword":
                return (["wvWare", "--nographics", "--charset=utf-8"],
                        WordPassData(self.em))
            else:
                return ([], None)
        else:
            return ([], None)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Directory of this script, passed to WordFilter which uses it to
    # build the paths of the rcltext and rclrtf commands.
    thisdir = os.path.dirname(sys.argv[0])
    proto = rclexecm.RclExecM()
    # Not named 'filter', to avoid shadowing the builtin.
    wordfilter = WordFilter(proto, thisdir)
    extract = rclexecm.Executor(proto, wordfilter)
    rclexecm.main(proto, extract)
|
|
@ -1,10 +1,30 @@
|
||||||
#!/usr/bin/env python
|
#################################
|
||||||
|
# Copyright (C) 2014 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
########################################################
|
||||||
|
## Recoll multifilter communication module and utilities
|
||||||
|
|
||||||
###########################################
|
|
||||||
## Generic recoll multifilter communication code
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# RclExecM implements the
|
||||||
|
# communication protocol with the recollindex process. It calls the
|
||||||
|
# object specific of the document type to actually get the data.
|
||||||
class RclExecM:
|
class RclExecM:
|
||||||
noteof = 0
|
noteof = 0
|
||||||
eofnext = 1
|
eofnext = 1
|
||||||
|
@ -168,6 +188,87 @@ class RclExecM:
|
||||||
self.processmessage(processor, params)
|
self.processmessage(processor, params)
|
||||||
|
|
||||||
|
|
||||||
|
####################################################################
|
||||||
|
# Common code for replacing the shell scripts: this implements the basic
|
||||||
|
# functions for a filter which executes a command to translate a
|
||||||
|
# simple file (like rclword with antiword).
|
||||||
|
#
|
||||||
|
# This was motivated by the Windows port: to replace shell and Unix
|
||||||
|
# utility (awk , etc usage). We can't just execute python scripts,
|
||||||
|
# this would be too slow. So this helps implementing a permanent script
|
||||||
|
# to repeatedly execute single commands.
|
||||||
|
#
|
||||||
|
# This class has the code to execute the subprocess and call a
|
||||||
|
# data-specific post-processor. Command and processor are supplied by
|
||||||
|
# the object which we receive as a parameter, which in turn is defined
|
||||||
|
# in the actual executable filter (e.g. rcldoc)
|
||||||
|
class Executor:
    '''Runs an external command on a file and post-processes its output.

    The command and the post-processor are obtained from the filter
    object (flt.getCmd(filename)), which is called repeatedly until a
    command succeeds or an empty command is returned.
    '''

    def __init__(self, em, flt):
        '''em: object providing rclog(msg).
        flt: object providing getCmd(filename) -> (cmd list, postproc).
        '''
        self.em = em
        self.flt = flt
        self.currentindex = 0

    def runCmd(self, cmd, filename, postproc):
        ''' Substitute parameters and execute command, process output
        with the specific postprocessor and return the complete text.
        We expect cmd as a list of command name + arguments.

        Returns (ok, data): ok is False if the command could not be
        started or exited with a non-zero status.'''

        try:
            proc = subprocess.Popen(cmd + [filename],
                                    stdout=subprocess.PIPE)
        except (OSError, ValueError) as err:
            # Popen signals a missing or non-executable command with
            # OSError (it does not raise CalledProcessError).
            self.em.rclog("extractone: extract failed: [%s]" % err)
            return (False, "")

        try:
            # NOTE(review): strip() removes all surrounding whitespace,
            # which includes form feeds and line terminators - confirm
            # that the postprocessors are meant to receive lines this way.
            for line in proc.stdout:
                postproc.takeLine(line.strip())
        finally:
            # Always release the pipe and reap the child
            proc.stdout.close()
            proc.wait()

        if proc.returncode:
            return False, postproc.wrapData()
        else:
            return True, postproc.wrapData()

    def extractone(self, params):
        '''Extract the data for the file named by params["filename:"].

        Returns (ok, data, "", eof indicator). The third element is always
        the empty string here (presumably the ipath - confirm against
        RclExecM).'''
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        # params["mimetype:"]))
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", RclExecM.eofnow)

        fn = params["filename:"]
        ok = False
        data = ""
        # Try the successive commands proposed by the filter until one
        # succeeds or the filter has nothing more to offer.
        while True:
            cmd, postproc = self.flt.getCmd(fn)
            if not cmd:
                break
            ok, data = self.runCmd(cmd, fn, postproc)
            if ok:
                break

        if ok:
            return (ok, data, "", RclExecM.eofnext)
        else:
            return (ok, "", "", RclExecM.eofnow)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        '''Start processing a new file: reset the document index.'''
        self.currentindex = 0
        return True

    def getipath(self, params):
        '''Same as extractone(): there is a single document per file.'''
        return self.extractone(params)

    def getnext(self, params):
        '''Return the document on the first call after openfile(), and an
        end-of-file result on the following ones.'''
        if self.currentindex >= 1:
            return (False, "", "", RclExecM.eofnow)
        else:
            ret = self.extractone(params)
            self.currentindex += 1
            return ret
|
||||||
|
|
||||||
|
|
||||||
# Common main routine for all python execm filters: either run the
|
# Common main routine for all python execm filters: either run the
|
||||||
# normal protocol engine or a local loop to test without recollindex
|
# normal protocol engine or a local loop to test without recollindex
|
||||||
|
@ -225,7 +326,7 @@ def main(proto, extract):
|
||||||
bdata = data.encode("UTF-8")
|
bdata = data.encode("UTF-8")
|
||||||
else:
|
else:
|
||||||
bdata = data
|
bdata = data
|
||||||
#sys.stdout.write(bdata)
|
sys.stdout.write(bdata)
|
||||||
print
|
print
|
||||||
if eof != RclExecM.noteof:
|
if eof != RclExecM.noteof:
|
||||||
break
|
break
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue