Replace catdoc with mso-dumper for XLS too

This commit is contained in:
Jean-Francois Dockes 2014-01-09 17:44:05 +01:00
parent 458e51efca
commit 49fbaf1a81
6 changed files with 326 additions and 51 deletions

Binary file not shown.

View file

@ -1,36 +1,14 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
########################################################################
# #
# Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens # This Source Code Form is subject to the terms of the Mozilla Public
# # License, v. 2.0. If a copy of the MPL was not distributed with this
# Permission is hereby granted, free of charge, to any person # file, You can obtain one at http://mozilla.org/MPL/2.0/.
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# #
########################################################################
import sys, os.path, getopt import sys, os.path, getopt
sys.path.append(sys.path[0]+"/msodump.zip/src") sys.path.append(sys.path[0]+"/msodump.zip")
import ole, pptstream, globals, olestream from msodumper import ole, pptstream, globals, olestream
from msodumper.globals import error
from globals import error
def usage (exname): def usage (exname):
exname = os.path.basename(exname) exname = os.path.basename(exname)
@ -38,9 +16,11 @@ def usage (exname):
Options: Options:
--help displays this help message. --help displays this help message.
--no-struct-output suppress normal disassembly output --no-struct-output suppress normal structure analysis output
--dump-text print the textual content --dump-text extract and print the textual content
"""%exname --no-raw-dumps suppress raw hex dumps of uninterpreted areas
--id-select=id1[,id2 ...] limit output to selected record Ids
""" % exname
print msg print msg
@ -104,24 +84,29 @@ def main (args):
usage(exname) usage(exname)
return return
params = globals.Params()
try: try:
opts, args = getopt.getopt(args, "h", opts, args = getopt.getopt(args, "h",
["help", "debug", "show-sector-chain", ["help", "debug", "show-sector-chain",
"no-struct-output", "dump-text"]) "no-struct-output", "dump-text",
"id-select=", "no-raw-dumps"])
for opt, arg in opts: for opt, arg in opts:
if opt in ['-h', '--help']: if opt in ['-h', '--help']:
usage(exname) usage(exname)
return return
elif opt in ['--debug']: elif opt in ['--debug']:
params.debug = True globals.params.debug = True
elif opt in ['--show-sector-chain']: elif opt in ['--show-sector-chain']:
params.showSectorChain = True globals.params.showSectorChain = True
elif opt in ['--no-struct-output']: elif opt in ['--no-struct-output']:
globals.muteOutput(1) globals.params.noStructOutput = True
params.noStructOutput = True
elif opt in ['--dump-text']: elif opt in ['--dump-text']:
params.dumpText = True globals.params.dumpText = True
elif opt in ['--no-raw-dumps']:
globals.params.noRawDumps = True
elif opt in ['--id-select']:
globals.params.dumpedIds = arg.split(",")
globals.params.dumpedIds = \
set([int(val) for val in globals.params.dumpedIds if val])
else: else:
error("unknown option %s\n"%opt) error("unknown option %s\n"%opt)
usage() usage()
@ -131,11 +116,13 @@ def main (args):
usage(exname) usage(exname)
return return
dumper = PPTDumper(args[0], params) dumper = PPTDumper(args[0], globals.params)
if not dumper.dump(): if not dumper.dump():
error("FAILURE\n") error("FAILURE\n")
if params.dumpText: if globals.params.dumpText:
print(globals.textdump.replace("\r", "\n")) print(globals.textdump.replace("\r", "\n"))
if __name__ == '__main__': if __name__ == '__main__':
main(sys.argv) main(sys.argv)
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:

View file

@ -17,12 +17,6 @@
#================================================================ #================================================================
# Handle excel files for recoll. # Handle excel files for recoll.
# Uses xls2csv from the catdoc utilities
# (http://ftp.45.free.net/~vitus/software/catdoc/)
# Note: xls2csv is supposed to detect the source charset from the excel
# file but this does not always work. If you see unexpected russian chars
# (the russian author's default charset) in the output, you may want to add
# ie a -s 8859-1 option to the xls2csv command line.
#================================================================ #================================================================
# set variables # set variables
LANG=C ; export LANG LANG=C ; export LANG
@ -32,8 +26,6 @@ filetype=excel
#RECFILTCOMMONCODE #RECFILTCOMMONCODE
############################################################################## ##############################################################################
# !! Leave the previous line unmodified!! Code imported from the # !! Leave the previous line unmodified!! Code imported from the
@ -100,7 +92,11 @@ umask 77
# !! Leave the following line unmodified ! # !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE #ENDRECFILTCOMMONCODE
checkcmds xls2csv top=`dirname $0`
XLSDUMP="$top/xls-dump.py"
XMLTOCSV="$top/xlsxmltocsv.py"
# Make sure both helper scripts are available before producing any output.
# The converter path is stored in XMLTOCSV (there is no XLSTOCSV variable).
checkcmds $XLSDUMP $XMLTOCSV
# output the result # output the result
echo '<html><head>' echo '<html><head>'
@ -109,7 +105,8 @@ echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
echo '</head><body>' echo '</head><body>'
echo '<pre>' echo '<pre>'
xls2csv -c' ' -b"<hr>" -d utf-8 "$infile" | \ $XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
$XMLTOCSV | \
sed -e 's/</&lt;/g' -e 's/&/&amp;/g' sed -e 's/</&lt;/g' -e 's/&/&amp;/g'
echo '</pre>' echo '</pre>'

243
src/filters/xls-dump.py Executable file
View file

@ -0,0 +1,243 @@
#!/usr/bin/env python2
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
import sys, os.path, optparse
sys.path.append(sys.path[0]+"/msodump.zip")
from msodumper import ole, xlsstream, globals, node, xlsmodel, olestream
from msodumper import xlsparser, msocrypto
from msodumper.globals import error
def equalsName (name, array):
    """Tell whether the characters of `name` match the codes in `array`.

    Returns True only when both sequences have the same length and the
    ordinal of each character of `name` equals the integer found at the
    same position in `array`; returns False otherwise.
    """
    if len(name) != len(array):
        return False
    # Lengths are equal here, so zip() pairs every character with its code.
    return all(ord(char) == code for char, code in zip(name, array))
def isOleStream (dirname):
    """Report whether a directory entry name designates an OLE stream.

    According to the spec, an OLE stream is always named with the byte
    0x01 followed by 'Ole'.
    """
    # Expected name as character codes: 0x01, 0x4F, 0x6C, 0x65.
    return equalsName(dirname, [0x01] + [ord(char) for char in 'Ole'])
def isCompObjStream (dirname):
    """Report whether a directory entry name is 0x01 followed by 'CompObj'."""
    # Expected name as character codes:
    # 0x01, 0x43, 0x6F, 0x6D, 0x70, 0x4F, 0x62, 0x6A.
    return equalsName(dirname, [0x01] + [ord(char) for char in 'CompObj'])
class XLDumper(object):
    """Dump the content of an Excel compound file in one of three forms.

    The file is loaded into an xlsstream.XLStream; its directory entries
    are then printed as a flat listing (dump), as XML built from the
    record handlers (dumpXML), or as XML built from a workbook model
    (dumpCanonicalXML).
    """

    def __init__ (self, filepath, params):
        """Store the input path and the parameters; nothing is read yet."""
        self.filepath = filepath
        self.params = params
        self.strm = None      # xlsstream.XLStream, created by __parseFile
        self.strmData = None  # xlsstream.StreamData, created by __parseFile

    def __printDirHeader (self, direntry, byteLen):
        """Print a banner naming a directory entry.

        Storages are labelled as such; streams also show `byteLen`, their
        size in bytes.
        """
        dirname = direntry.Name
        dirname = globals.encodeName(dirname)
        print("")
        print("="*globals.OutputWidth)
        if direntry.isStorage():
            print("%s (storage)"%dirname)
        else:
            print("%s (stream, size: %d bytes)"%(dirname, byteLen))
        print("-"*globals.OutputWidth)

    def __parseFile (self):
        """Read the whole input file and wrap it in an XLStream."""
        # Use a context manager so the handle is released even when reading
        # fails or XLStream raises on the data.
        with open(self.filepath, 'rb') as xlsfile:
            content = xlsfile.read()
        self.strmData = xlsstream.StreamData()
        self.strm = xlsstream.XLStream(content, self.params, self.strmData)

    def dumpXML (self):
        """Write the "Workbook" stream to stdout as XML."""
        self.__parseFile()
        dirs = self.strm.getDirectoryEntries()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        for d in dirs:
            if d.Name != "Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(d)
            data = self.__readSubStreamXML(dirstrm)
            self.__dumpDataAsXML(data, root)

        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)

    def dumpCanonicalXML (self):
        """Write the "Workbook" stream to stdout as XML built from a model."""
        self.__parseFile()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if dirname != "Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            wbmodel = self.__buildWorkbookModel(dirstrm)
            # Carry the encryption flag recorded while parsing into the model.
            wbmodel.encrypted = self.strmData.encrypted
            root.appendChild(wbmodel.createDOM())

        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)

    def dump (self):
        """Print the container structure, then every directory entry."""
        self.__parseFile()
        self.strm.printStreamInfo()
        self.strm.printHeader()
        self.strm.printMSAT()
        self.strm.printSAT()
        self.strm.printSSAT()
        self.strm.printDirectory()
        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if len(dirname) == 0:
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            self.__printDirHeader(entry, len(dirstrm.bytes))
            if entry.isStorage():
                continue

            elif dirname == "Workbook":
                # Keep reading sub-streams until the end of the stream
                # is reached (__readSubStream then returns False).
                success = True
                while success:
                    success = self.__readSubStream(dirstrm)

            elif dirname == "Revision Log":
                dirstrm.type = xlsstream.DirType.RevisionLog
                self.__readSubStream(dirstrm)

            elif dirname == "EncryptionInfo":
                globals.dumpBytes(dirstrm.bytes, 512)
                print("-"*globals.OutputWidth)
                info = msocrypto.EncryptionInfo(dirstrm.bytes)
                info.read()
                info.output()

            elif self.strmData.isPivotCacheStream(dirname):
                dirstrm.type = xlsstream.DirType.PivotTableCache
                self.__readSubStream(dirstrm)
            elif isOleStream(dirname):
                self.__readOleStream(dirstrm)
            elif isCompObjStream(dirname):
                self.__readCompObjStream(dirstrm)
            else:
                # Unrecognized stream: show its raw bytes only.
                globals.dumpBytes(dirstrm.bytes, 512)

    def __readSubStream (self, strm):
        """Read records until one with header 0x000A has been read.

        Returns True when that record was reached, False when the stream
        ended first (xlsstream.EndOfStream).
        """
        try:
            # read bytes from BOF to EOF.
            header = 0x0000
            while header != 0x000A:
                header = strm.readRecord()
            return True

        except xlsstream.EndOfStream:
            return False

    def __readOleStream (self, dirstrm):
        """Parse the bytes of `dirstrm` with olestream.OLEStream."""
        strm = olestream.OLEStream(dirstrm.bytes)
        strm.read()

    def __readCompObjStream (self, dirstrm):
        """Parse the bytes of `dirstrm` as a CompObj stream.

        A parse failure is reported through globals.error and not raised.
        """
        try:
            strm = olestream.CompObjStream(dirstrm.bytes)
            strm.read()
        except olestream.CompObjStreamError:
            globals.error("failed to parse CompObj stream.\n")

    def __dumpDataAsXML(self, data, root):
        """Recursively append `data` below the XML element `root`.

        A tuple is (name, attrs-dict[, children]) or (name, child); a list
        is a sequence of such items; anything else is ignored.
        """
        if isinstance(data, tuple):
            newRoot = root.appendElement(data[0])
            if isinstance(data[1], dict): # attrs
                # items() behaves the same as iteritems() here and also
                # exists on Python 3.
                for key, val in data[1].items():
                    newRoot.setAttr(key, val)
                if len(data) > 2: # data has a list of children
                    self.__dumpDataAsXML(data[2], newRoot)
            else:
                self.__dumpDataAsXML(data[1], newRoot)
        elif isinstance(data, list):
            for x in data:
                self.__dumpDataAsXML(x, root)
        else:
            pass # we're skipping all unknown elems

    def __readSubStreamXML (self, strm):
        """Collect every record handler of `strm` and return the parsed data."""
        handlers = []
        try:
            while True:
                handler = strm.getNextRecordHandler()
                handlers.append(handler)
        except xlsstream.EndOfStream:
            # Normal termination: all handlers have been collected.
            pass
        parser = xlsparser.XlsParser(handlers)
        return parser.dumpData()

    def __buildWorkbookModel (self, strm):
        """Fill an xlsmodel.Workbook from `strm` until the stream ends."""
        model = xlsmodel.Workbook()
        try:
            while True:
                strm.fillModel(model)
        except xlsstream.EndOfStream:
            # Normal termination: the whole stream went into the model.
            pass

        return model
def main ():
    """Parse the command line and dump the given file in the chosen mode.

    Exits with status 1 when no input file is given or when the dump mode
    is not recognized.
    """
    optparser = optparse.OptionParser()
    optparser.add_option(
        "-d", "--debug", action="store_true", dest="debug", default=False,
        help="Turn on debug mode")
    optparser.add_option(
        "--show-sector-chain", action="store_true",
        dest="show_sector_chain", default=False,
        help="Show sector chain information at the start of the output.")
    optparser.add_option(
        "--show-stream-pos", action="store_true",
        dest="show_stream_pos", default=False,
        help="Show the position of each record relative to the stream.")
    optparser.add_option(
        "--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
        help="Specify the dump mode. Possible values are: 'flat', 'xml', "
             "or 'canonical-xml'. The default value is 'flat'.")
    optparser.add_option(
        "--catch", action="store_true", dest="catch_exceptions",
        default=False,
        help="Catch exceptions and try to continue.")
    optparser.add_option(
        "--utf-8", action="store_true", dest="utf8", default=False,
        help="Output strings as UTF-8.")
    options, args = optparser.parse_args()

    # Copy the command line switches into the shared parameter object.
    params = globals.params
    params.debug = options.debug
    params.showSectorChain = options.show_sector_chain
    params.showStreamPos = options.show_stream_pos
    params.catchExceptions = options.catch_exceptions
    params.utf8 = options.utf8

    if not args:
        globals.error("takes at least one argument\n")
        optparser.print_help()
        sys.exit(1)

    dumper = XLDumper(args[0], params)
    # Map each accepted mode name to the method producing that output.
    dumpActions = {
        'flat': dumper.dump,
        'xml': dumper.dumpXML,
        'canonical-xml': dumper.dumpCanonicalXML,
        'cxml': dumper.dumpCanonicalXML,
    }
    action = dumpActions.get(options.dump_mode)
    if action is None:
        error("unknown dump mode: '%s'\n"%options.dump_mode)
        optparser.print_help()
        sys.exit(1)
    action()

if __name__ == '__main__':
    main()
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:

48
src/filters/xlsxmltocsv.py Executable file
View file

@ -0,0 +1,48 @@
#!/usr/bin/env python
import sys
import xml.sax
# Output format switch: when true, fields are separated by tabs and left
# unquoted; otherwise they are separated by commas and double-quoted.
dtt = True
sepstring, dquote = ("\t", '') if dtt else (",", '"')
class XlsXmlHandler(xml.sax.handler.ContentHandler):
    """SAX handler printing worksheet names and cell rows as separated text.

    Each "worksheet" element prints its name, each "row" element prints
    one line made of the values of its "label-cell", "number-cell" and
    "formula-cell" children, placed according to their "col" attribute.
    The field separator and quote come from the module-level `sepstring`
    and `dquote`.
    """

    def __init__(self):
        # ContentHandler is an old-style class on Python 2, so call the
        # base initializer directly rather than through super().
        xml.sax.handler.ContentHandler.__init__(self)
        # Column index -> text for the current row. Initialized here so a
        # cell met before any "row" element does not raise AttributeError.
        self.cells = {}

    @staticmethod
    def _utf8(text):
        """Return `text` as a native string, UTF-8 encoded on Python 2."""
        if isinstance(text, bytes):
            return text
        if str is bytes:
            # Python 2: `text` is a unicode object.
            return text.encode("UTF-8")
        return text

    @staticmethod
    def _formatRow(cells, sep, quote):
        """Build the output line for a mapping of column index to text.

        Cells are visited in ascending column order because the number of
        separators written before a cell is its distance from the previous
        one; dictionary iteration order gives no such guarantee.
        """
        pieces = []
        curidx = 0
        for idx, value in sorted(cells.items()):
            pieces.append(sep * (idx - curidx))
            pieces.append('%s%s%s' % (quote, value, quote))
            curidx = idx
        return "".join(pieces)

    def startElement(self, name, attrs):
        """Print a worksheet name, start a new row, or record a cell value."""
        if name == "worksheet":
            if "name" in attrs:
                print("%s" % self._utf8(attrs["name"]))
        elif name == "row":
            self.cells = {}
        elif name == "label-cell" or name == "number-cell":
            if "value" in attrs:
                value = self._utf8(attrs["value"])
            else:
                value = ""
            if "col" in attrs:
                self.cells[int(attrs["col"])] = value
            else:
                # No column information: write the value out at once. It
                # is already encoded; encoding it a second time fails on
                # non-ASCII text under Python 2.
                sys.stdout.write("%s%s" % (value, sepstring))
        elif name == "formula-cell":
            if "formula-result" in attrs and "col" in attrs:
                self.cells[int(attrs["col"])] = \
                    self._utf8(attrs["formula-result"])

    def endElement(self, name):
        """Write out the finished row, or a blank line after a worksheet."""
        if name == "row":
            sys.stdout.write(self._formatRow(self.cells, sepstring, dquote))
            sys.stdout.write("\n")
        elif name == "worksheet":
            print("")
# Parse the XML document arriving on standard input; XlsXmlHandler writes
# the converted text to standard output as the elements are encountered.
xml.sax.parse(sys.stdin, XlsXmlHandler())

View file

@ -61,7 +61,7 @@ application/vnd.ms-office = exec rcldoc
application/ogg = execm rclaudio application/ogg = execm rclaudio
application/pdf = exec rclpdf application/pdf = exec rclpdf
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain application/vnd.ms-excel = exec rclxls
application/vnd.ms-powerpoint = exec rclppt application/vnd.ms-powerpoint = exec rclppt
application/vnd.oasis.opendocument.text = exec rclsoff application/vnd.oasis.opendocument.text = exec rclsoff
application/vnd.oasis.opendocument.text-template = exec rclsoff application/vnd.oasis.opendocument.text-template = exec rclsoff