converted/duplicated rclsoff to rclsoff.py, using python-libxslt/xml
This commit is contained in:
parent
b3092151dc
commit
4a90074482
4 changed files with 278 additions and 5 deletions
|
@ -140,7 +140,13 @@ class WordFilter:
|
|||
return ([],None)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Remember where we execute filters from, in case we need to exec another
|
||||
execdir = os.path.dirname(sys.argv[0])
|
||||
# Check that we have antiword. We could fallback to wvWare, but
|
||||
# this is not what the old filter did.
|
||||
if not rclexecm.which("antiword"):
|
||||
print("RECFILTERROR HELPERNOTFOUND antiword")
|
||||
sys.exit(1)
|
||||
proto = rclexecm.RclExecM()
|
||||
filter = WordFilter(proto, execdir)
|
||||
extract = rclexecm.Executor(proto, filter)
|
||||
|
|
|
@ -20,6 +20,8 @@
|
|||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
############################################
|
||||
# RclExecM implements the
|
||||
|
@ -217,8 +219,11 @@ class Executor:
|
|||
proc = subprocess.Popen(cmd + [filename],
|
||||
stdout = subprocess.PIPE)
|
||||
stdout = proc.stdout
|
||||
except subprocess.CalledProcessError, err:
|
||||
self.em.rclog("extractone: extract failed: [%s]" % err)
|
||||
except subprocess.CalledProcessError as err:
|
||||
self.em.rclog("extractone: Popen() error: %s" % err)
|
||||
return (False, "")
|
||||
except OSError as err:
|
||||
self.em.rclog("extractone: Popen OS error: %s" % err)
|
||||
return (False, "")
|
||||
|
||||
for line in stdout:
|
||||
|
@ -237,7 +242,7 @@ class Executor:
|
|||
ok = False
|
||||
if not params.has_key("filename:"):
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (ok, docdata, "", RclExecM.eofnow)
|
||||
return (ok, "", "", RclExecM.eofnow)
|
||||
|
||||
fn = params["filename:"]
|
||||
while True:
|
||||
|
@ -253,7 +258,6 @@ class Executor:
|
|||
else:
|
||||
return (ok, "", "", RclExecM.eofnow)
|
||||
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
|
@ -270,7 +274,56 @@ class Executor:
|
|||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
|
||||
# Helper routine to test for program accessibility
|
||||
def which(program):
    """Locate an executable, in the manner of the shell ``which`` command.

    If *program* contains a directory component it is checked as given.
    Otherwise each directory listed in the PATH environment variable is
    searched in order. In both cases the bare name is tried first, then
    the name with each extension listed in PATHEXT (if that variable is
    set) appended.

    Returns the path of the first matching executable file, or None if
    nothing was found.
    """
    def is_exe(fpath):
        # isfile() rather than exists(): directories normally carry the
        # execute (search) permission bit too, and are not programs.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    def ext_candidates(fpath):
        yield fpath
        for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
            # Splitting an empty or unset PATHEXT gives [""]: skip it so
            # that the bare name is not tested twice.
            if ext:
                yield fpath + ext

    fpath, fname = os.path.split(program)
    if fpath:
        # Explicit location: no PATH search.
        for candidate in ext_candidates(program):
            if is_exe(candidate):
                return candidate
    else:
        # Fall back to the platform default search path when PATH is
        # unset, instead of failing with KeyError.
        for path in os.environ.get("PATH", os.defpath).split(os.pathsep):
            exe_file = os.path.join(path, program)
            for candidate in ext_candidates(exe_file):
                if is_exe(candidate):
                    return candidate
    return None
|
||||
|
||||
# Temp dir helper
|
||||
class SafeTmpDir:
    """Lazily created private temporary directory, removed on deletion.

    The first call to getpath() creates a fresh top-level directory with
    tempfile.mkdtemp() (below $RECOLL_TMPDIR when that variable is set)
    and a working subdirectory inside it. Both are deleted when the
    object is garbage-collected.

    em: object providing an rclog(message) method, used to report
        cleanup failures.
    """

    def __init__(self, em):
        self.em = em
        # Both stay empty until getpath() has successfully created the
        # directories.
        self.toptmp = ""
        self.tmpdir = ""

    def __del__(self):
        try:
            if self.toptmp:
                # Best-effort removal of the working directory and its
                # contents (errors ignored), then of the top directory,
                # which must be empty by now.
                shutil.rmtree(self.tmpdir, True)
                os.rmdir(self.toptmp)
        except Exception as err:
            self.em.rclog("delete dir failed for " + self.toptmp +
                          ": " + str(err))

    def getpath(self):
        """Return the working directory path, creating it on first use.

        Raises whatever tempfile.mkdtemp() or os.makedirs() raise if the
        directories cannot be created; in that case no state is kept, so
        that a later call retries from scratch.
        """
        if not self.tmpdir:
            envrcltmp = os.getenv('RECOLL_TMPDIR')
            if envrcltmp:
                toptmp = tempfile.mkdtemp(prefix='rcltmp', dir=envrcltmp)
            else:
                toptmp = tempfile.mkdtemp(prefix='rcltmp')

            tmpdir = os.path.join(toptmp, 'rclsofftmp')
            try:
                os.makedirs(tmpdir)
            except Exception:
                # Do not leave an orphan top directory behind, and do
                # not record a working directory which does not exist.
                shutil.rmtree(toptmp, True)
                raise

            # Record the paths only once everything exists.
            self.toptmp = toptmp
            self.tmpdir = tmpdir

        return self.tmpdir
|
||||
|
||||
|
||||
# Common main routine for all python execm filters: either run the
|
||||
# normal protocol engine or a local loop to test without recollindex
|
||||
def main(proto, extract):
|
||||
|
|
179
src/filters/rclsoff.py
Executable file
179
src/filters/rclsoff.py
Executable file
|
@ -0,0 +1,179 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import rclexecm
|
||||
import rclxslt
|
||||
from zipfile import ZipFile
|
||||
|
||||
stylesheet_meta = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
||||
xmlns:ooo="http://openoffice.org/2004/office"
|
||||
exclude-result-prefixes="office xlink meta ooo dc"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/office:document-meta">
|
||||
<xsl:apply-templates select="office:meta/dc:description"/>
|
||||
<xsl:apply-templates select="office:meta/dc:subject"/>
|
||||
<xsl:apply-templates select="office:meta/dc:title"/>
|
||||
<xsl:apply-templates select="office:meta/meta:keyword"/>
|
||||
<xsl:apply-templates select="office:meta/dc:creator"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:title">
|
||||
<title> <xsl:value-of select="."/> </title><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:description">
|
||||
<meta>
|
||||
<xsl:attribute name="name">abstract</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:subject">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="dc:creator">
|
||||
<meta>
|
||||
<xsl:attribute name="name">author</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="meta:keyword">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</meta><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
stylesheet_content = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
||||
exclude-result-prefixes="text"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="text:p">
|
||||
<p><xsl:apply-templates/></p><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:h">
|
||||
<p><xsl:apply-templates/></p><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:s">
|
||||
<xsl:text> </xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:line-break">
|
||||
<br />
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text:tab">
|
||||
<xsl:text> </xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
class OOExtractor:
    """Filter turning an OpenDocument file into a single HTML document.

    The file is opened as a zip archive. Its "meta.xml" and "content.xml"
    members are converted with the stylesheet_meta and stylesheet_content
    XSLT sheets, and the results are concatenated inside an HTML skeleton.

    em: the rclexecm protocol object; only its rclog() method is used
        here.
    """

    def __init__(self, em):
        self.em = em
        self.currentindex = 0

    def extractone(self, params):
        """Extract the document named by params["filename:"].

        Returns a (ok, data, ipath, eof) tuple. ipath is always empty
        and eof is always rclexecm.RclExecM.eofnow. On failure ok is
        False and data is empty.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            zipf = ZipFile(fn)
        except Exception as err:
            self.em.rclog("unzip failed: " + str(err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        # The two literals are concatenated: the separating space before
        # "content=" is needed to keep the attributes apart.
        docdata = '<html><head><meta http-equiv="Content-Type" ' \
                  'content="text/html; charset=UTF-8"></head><body>'

        try:
            try:
                metadata = zipf.read("meta.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(stylesheet_meta, metadata)
                    docdata += res
            except Exception:
                # Metadata is best-effort: a problem here must not
                # prevent indexing the main text.
                # To be checked. I'm under the impression that I get
                # this when nothing matches?
                #self.em.rclog("no/bad metadata in %s" % fn)
                pass

            try:
                content = zipf.read("content.xml")
                if content:
                    res = rclxslt.apply_sheet_data(stylesheet_content,
                                                   content)
                    docdata += res
                docdata += '</body></html>'
            except Exception:
                self.em.rclog("bad data in %s" % fn)
                return (False, "", "", rclexecm.RclExecM.eofnow)
        finally:
            # Release the archive file handle on every exit path.
            zipf.close()

        return (True, docdata, "", rclexecm.RclExecM.eofnow)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the document counter for a new file. Always True."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the extraction result for the file named in params."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the single document on the first call after openfile().

        Later calls return a failure tuple with the eofnow indicator.
        """
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret = self.extractone(params)
            self.currentindex += 1
            return ret
|
||||
|
||||
if __name__ == '__main__':
    # No external helper check here: the archive members are read with
    # the zipfile module imported at the top of this file, so the
    # "unzip" command is not needed.
    proto = rclexecm.RclExecM()
    extract = OOExtractor(proto)
    rclexecm.main(proto, extract)
|
35
src/filters/rclxslt.py
Normal file
35
src/filters/rclxslt.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
||||
try:
    import libxml2
    import libxslt
except ImportError:
    # Report the missing Python bindings on stdout in the RECFILTERROR
    # format, then give up. The parenthesized single-argument form
    # prints the same text whether print is a statement or a function.
    print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
    sys.exit(1)

# Ask libxml2 to substitute entities by default when parsing.
# NOTE(review): entity substitution on documents from untrusted sources
# may deserve a security review -- confirm this is intended.
libxml2.substituteEntitiesDefault(1)
|
||||
|
||||
def _apply_sheet(sheet, doc):
    """Apply the XSLT stylesheet text *sheet* to the parsed document *doc*.

    Takes ownership of *doc*: it is freed before returning, whether the
    transformation succeeded or not. Returns the serialized result.
    Errors raised by libxml2/libxslt propagate to the caller.
    """
    styledoc = None
    style = None
    result = None
    try:
        styledoc = libxml2.parseMemory(sheet, len(sheet))
        style = libxslt.parseStylesheetDoc(styledoc)
        result = style.applyStylesheet(doc, None)
        return style.saveResultToString(result)
    finally:
        # These objects wrap C structures which have to be released
        # explicitly, so do it on the error paths too.
        if style is not None:
            # Freeing the stylesheet also releases the document it was
            # compiled from, hence no separate styledoc.freeDoc().
            style.freeStylesheet()
        elif styledoc is not None:
            # Stylesheet compilation failed: the parsed sheet document
            # is still ours to free.
            styledoc.freeDoc()
        doc.freeDoc()
        if result is not None:
            result.freeDoc()


def apply_sheet_data(sheet, data):
    """Transform the XML text *data* with the XSLT stylesheet text *sheet*.

    Returns the transformation result serialized as a string.
    """
    doc = libxml2.parseMemory(data, len(data))
    return _apply_sheet(sheet, doc)


def apply_sheet_file(sheet, fn):
    """Transform the XML file *fn* with the XSLT stylesheet text *sheet*.

    Returns the transformation result serialized as a string.
    """
    doc = libxml2.parseFile(fn)
    return _apply_sheet(sheet, doc)
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue