converted/duplicated rclsoff to rclsoff.py, using python-libxslt/xml

This commit is contained in:
Jean-Francois Dockes 2015-09-07 15:34:39 +02:00
parent b3092151dc
commit 4a90074482
4 changed files with 278 additions and 5 deletions

View file

@ -140,7 +140,13 @@ class WordFilter:
return ([],None) return ([],None)
if __name__ == '__main__': if __name__ == '__main__':
# Remember where we execute filters from, in case we need to exec another
execdir = os.path.dirname(sys.argv[0]) execdir = os.path.dirname(sys.argv[0])
# Check that we have antiword. We could fall back to wvWare, but
# this is not what the old filter did.
if not rclexecm.which("antiword"):
print("RECFILTERROR HELPERNOTFOUND antiword")
sys.exit(1)
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()
filter = WordFilter(proto, execdir) filter = WordFilter(proto, execdir)
extract = rclexecm.Executor(proto, filter) extract = rclexecm.Executor(proto, filter)

View file

@ -20,6 +20,8 @@
import sys import sys
import os import os
import subprocess import subprocess
import tempfile
import shutil
############################################ ############################################
# RclExecM implements the # RclExecM implements the
@ -217,8 +219,11 @@ class Executor:
proc = subprocess.Popen(cmd + [filename], proc = subprocess.Popen(cmd + [filename],
stdout = subprocess.PIPE) stdout = subprocess.PIPE)
stdout = proc.stdout stdout = proc.stdout
except subprocess.CalledProcessError, err: except subprocess.CalledProcessError as err:
self.em.rclog("extractone: extract failed: [%s]" % err) self.em.rclog("extractone: Popen() error: %s" % err)
return (False, "")
except OSError as err:
self.em.rclog("extractone: Popen OS error: %s" % err)
return (False, "") return (False, "")
for line in stdout: for line in stdout:
@ -237,7 +242,7 @@ class Executor:
ok = False ok = False
if not params.has_key("filename:"): if not params.has_key("filename:"):
self.em.rclog("extractone: no mime or file name") self.em.rclog("extractone: no mime or file name")
return (ok, docdata, "", RclExecM.eofnow) return (ok, "", "", RclExecM.eofnow)
fn = params["filename:"] fn = params["filename:"]
while True: while True:
@ -253,7 +258,6 @@ class Executor:
else: else:
return (ok, "", "", RclExecM.eofnow) return (ok, "", "", RclExecM.eofnow)
###### File type handler api, used by rclexecm ----------> ###### File type handler api, used by rclexecm ---------->
def openfile(self, params): def openfile(self, params):
self.currentindex = 0 self.currentindex = 0
@ -270,6 +274,55 @@ class Executor:
self.currentindex += 1 self.currentindex += 1
return ret return ret
# Helper routine to test for program accessibility
def which(program):
    """Locate an executable, in the manner of the shell's ``which``.

    If ``program`` contains a directory component it is tested as given.
    Otherwise each directory listed in the PATH environment variable is
    searched in order. In both cases the suffixes listed in PATHEXT (when
    that variable is set) are also tried.

    Returns the path of the first matching executable file, or None if
    there is none.
    """
    def is_exe(fpath):
        # isfile() rather than exists(): a searchable directory also
        # passes the X_OK test but is not something we can execute.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    def ext_candidates(fpath):
        yield fpath
        for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
            # Skip empty entries (PATHEXT unset or with stray separators):
            # they would only repeat the bare path.
            if ext:
                yield fpath + ext

    fpath, fname = os.path.split(program)
    if fpath:
        for candidate in ext_candidates(program):
            if is_exe(candidate):
                return candidate
    else:
        # Do not fail with KeyError when PATH is not set: use the
        # platform default search path instead.
        for path in os.environ.get("PATH", os.defpath).split(os.pathsep):
            exe_file = os.path.join(path, program)
            for candidate in ext_candidates(exe_file):
                if is_exe(candidate):
                    return candidate
    return None
# Temp dir helper
class SafeTmpDir:
    """Lazily created temporary directory, removed when the object dies.

    A private top-level directory is created on the first getpath() call
    (under $RECOLL_TMPDIR if that variable is set, else in the default
    temporary location) and a 'rclsofftmp' work directory is created
    inside it. ``em`` only needs an rclog(message) method; it is used to
    report a cleanup failure.
    """

    def __init__(self, em):
        self.em = em
        # Both stay empty until getpath() has actually created something.
        self.toptmp = ""
        self.tmpdir = ""

    def __del__(self):
        try:
            if not self.toptmp:
                return
            # Empty the work directory (errors ignored), then remove the
            # top directory, which must be empty by then.
            shutil.rmtree(self.tmpdir, ignore_errors=True)
            os.rmdir(self.toptmp)
        except Exception:
            self.em.rclog("delete dir failed for " + self.toptmp)

    def getpath(self):
        """Return the work directory path, creating it on first use."""
        if self.tmpdir:
            return self.tmpdir
        mkargs = {'prefix': 'rcltmp'}
        base = os.getenv('RECOLL_TMPDIR')
        if base:
            mkargs['dir'] = base
        self.toptmp = tempfile.mkdtemp(**mkargs)
        self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp')
        os.makedirs(self.tmpdir)
        return self.tmpdir
# Common main routine for all python execm filters: either run the # Common main routine for all python execm filters: either run the
# normal protocol engine or a local loop to test without recollindex # normal protocol engine or a local loop to test without recollindex

179
src/filters/rclsoff.py Executable file
View file

@ -0,0 +1,179 @@
#!/usr/bin/env python
import sys
import os
import subprocess
import rclexecm
import rclxslt
from zipfile import ZipFile
# XSLT stylesheet applied to the meta.xml member of the document archive.
# It produces HTML header elements from the office:document-meta tree:
#   dc:title       -> <title>
#   dc:description -> <meta name="abstract">
#   dc:subject     -> <meta name="keywords">
#   meta:keyword   -> <meta name="keywords">
#   dc:creator     -> <meta name="author">
# Each generated element is followed by a newline text node.
stylesheet_meta = '''<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
xmlns:ooo="http://openoffice.org/2004/office"
exclude-result-prefixes="office xlink meta ooo dc"
>
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="/office:document-meta">
<xsl:apply-templates select="office:meta/dc:description"/>
<xsl:apply-templates select="office:meta/dc:subject"/>
<xsl:apply-templates select="office:meta/dc:title"/>
<xsl:apply-templates select="office:meta/meta:keyword"/>
<xsl:apply-templates select="office:meta/dc:creator"/>
</xsl:template>
<xsl:template match="dc:title">
<title> <xsl:value-of select="."/> </title><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:description">
<meta>
<xsl:attribute name="name">abstract</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:subject">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="meta:keyword">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
'''
# XSLT stylesheet applied to the content.xml member of the document archive.
# It turns the text: namespace elements into minimal HTML:
#   text:p and text:h -> <p> paragraph followed by a newline
#   text:s and text:tab -> a whitespace text node
#   text:line-break   -> <br />
stylesheet_content = '''<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
exclude-result-prefixes="text"
>
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="text:p">
<p><xsl:apply-templates/></p><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="text:h">
<p><xsl:apply-templates/></p><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="text:s">
<xsl:text> </xsl:text>
</xsl:template>
<xsl:template match="text:line-break">
<br />
</xsl:template>
<xsl:template match="text:tab">
<xsl:text> </xsl:text>
</xsl:template>
</xsl:stylesheet>
'''
class OOExtractor:
    """Document handler turning a zipped XML office document into HTML.

    The document is opened as a zip archive; its meta.xml and content.xml
    members are run through the stylesheet_meta and stylesheet_content
    XSLT sheets and the results are concatenated into a single HTML page.

    ``em`` is the protocol object; only its rclog(message) method is used
    here, for error reporting.
    """

    def __init__(self, em):
        self.em = em
        # Number of documents already returned by getnext() for the
        # current file (each file yields exactly one).
        self.currentindex = 0

    def extractone(self, params):
        """Extract the document named by params["filename:"].

        Returns a 4-tuple. The first element is a success flag and the
        second the HTML text (empty on failure). The third is always the
        empty string and the fourth always rclexecm.RclExecM.eofnow.
        """
        # 'in' works with both Python 2 and 3, unlike dict.has_key().
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            archive = ZipFile(fn)
        except Exception as err:
            self.em.rclog("unzip failed: " + str(err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        # From here on, make sure the archive is closed whatever happens:
        # the filter process handles many files in sequence.
        try:
            # Note the space separating the two attributes of the tag.
            docdata = '<html><head><meta http-equiv="Content-Type" ' \
                      'content="text/html; charset=UTF-8"></head><body>'

            try:
                metadata = archive.read("meta.xml")
                if metadata:
                    docdata += rclxslt.apply_sheet_data(stylesheet_meta,
                                                        metadata)
            except Exception:
                # Metadata is optional: deliberately best-effort.
                # To be checked. I'm under the impression that I get this
                # when nothing matches?
                #self.em.rclog("no/bad metadata in %s" % fn)
                pass

            try:
                content = archive.read("content.xml")
                if content:
                    docdata += rclxslt.apply_sheet_data(stylesheet_content,
                                                        content)
                docdata += '</body></html>'
            except Exception as err:
                self.em.rclog("bad data in %s: %s" % (fn, err))
                return (False, "", "", rclexecm.RclExecM.eofnow)

            return (True, docdata, "", rclexecm.RclExecM.eofnow)
        finally:
            archive.close()

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the per-file document counter. Always returns True."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the single document of the file (see extractone())."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the document on the first call after openfile(), then
        a failure tuple carrying the eofnow marker on later calls."""
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
if __name__ == '__main__':
    # No check for an external "unzip" program: the archive is read with
    # the zipfile module, so requiring the command would only make the
    # filter fail needlessly on systems where it is not installed.
    proto = rclexecm.RclExecM()
    extract = OOExtractor(proto)
    rclexecm.main(proto, extract)

35
src/filters/rclxslt.py Normal file
View file

@ -0,0 +1,35 @@
#!/usr/bin/env python
import sys
try:
    import libxml2
    import libxslt
except Exception:
    # Any failure to load the bindings is reported as a missing helper.
    # 'Exception' rather than a bare except, so that KeyboardInterrupt and
    # SystemExit are not trapped. print() with a single string argument
    # gives the same output under Python 2 and Python 3.
    print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
    sys.exit(1)

# Module-wide libxml2 parser setting: turn on entity substitution.
libxml2.substituteEntitiesDefault(1)
def apply_sheet_data(sheet, data):
    """Apply an XSLT stylesheet to an in-memory XML document.

    sheet: the stylesheet text.
    data: the XML document text.

    Returns the serialized transformation result. Raises ValueError if the
    stylesheet cannot be compiled or the transformation yields no result;
    parse errors raised by libxml2 are propagated. The libxml2/libxslt
    objects are explicitly freed on every path.
    """
    styledoc = libxml2.parseMemory(sheet, len(sheet))
    style = libxslt.parseStylesheetDoc(styledoc)
    if style is None:
        # The stylesheet object did not take ownership of the parsed
        # document, so release it here.
        styledoc.freeDoc()
        raise ValueError("could not compile XSLT stylesheet")

    doc = None
    result = None
    try:
        doc = libxml2.parseMemory(data, len(data))
        result = style.applyStylesheet(doc, None)
        if result is None:
            raise ValueError("XSLT transformation failed")
        return style.saveResultToString(result)
    finally:
        # Freeing the stylesheet also releases styledoc.
        style.freeStylesheet()
        if doc is not None:
            doc.freeDoc()
        if result is not None:
            result.freeDoc()
def apply_sheet_file(sheet, fn):
    """Apply an XSLT stylesheet to an XML document stored in a file.

    sheet: the stylesheet text.
    fn: path of the XML file to transform.

    Returns the serialized transformation result. Raises ValueError if the
    stylesheet cannot be compiled or the transformation yields no result;
    parse errors raised by libxml2 are propagated. The libxml2/libxslt
    objects are explicitly freed on every path.
    """
    styledoc = libxml2.parseMemory(sheet, len(sheet))
    style = libxslt.parseStylesheetDoc(styledoc)
    if style is None:
        # The stylesheet object did not take ownership of the parsed
        # document, so release it here.
        styledoc.freeDoc()
        raise ValueError("could not compile XSLT stylesheet")

    doc = None
    result = None
    try:
        doc = libxml2.parseFile(fn)
        result = style.applyStylesheet(doc, None)
        if result is None:
            raise ValueError("XSLT transformation failed")
        return style.saveResultToString(result)
    finally:
        # Freeing the stylesheet also releases styledoc.
        style.freeStylesheet()
        if doc is not None:
            doc.freeDoc()
        if result is not None:
            result.freeDoc()