Replace catdoc with mso-dumper for XLS too
This commit is contained in:
parent
458e51efca
commit
49fbaf1a81
6 changed files with 326 additions and 51 deletions
Binary file not shown.
|
@ -1,36 +1,14 @@
|
|||
#!/usr/bin/env python2
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person
|
||||
# obtaining a copy of this software and associated documentation
|
||||
# files (the "Software"), to deal in the Software without
|
||||
# restriction, including without limitation the rights to use,
|
||||
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following
|
||||
# conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be
|
||||
# included in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
# OTHER DEALINGS IN THE SOFTWARE.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
import sys, os.path, getopt
|
||||
sys.path.append(sys.path[0]+"/msodump.zip/src")
|
||||
import ole, pptstream, globals, olestream
|
||||
|
||||
from globals import error
|
||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
||||
from msodumper import ole, pptstream, globals, olestream
|
||||
from msodumper.globals import error
|
||||
|
||||
def usage (exname):
|
||||
exname = os.path.basename(exname)
|
||||
|
@ -38,9 +16,11 @@ def usage (exname):
|
|||
|
||||
Options:
|
||||
--help displays this help message.
|
||||
--no-struct-output suppress normal disassembly output
|
||||
--dump-text print the textual content
|
||||
"""%exname
|
||||
--no-struct-output suppress normal structure analysis output
|
||||
--dump-text extract and print the textual content
|
||||
--no-raw-dumps suppress raw hex dumps of uninterpreted areas
|
||||
--id-select=id1[,id2 ...] limit output to selected record Ids
|
||||
""" % exname
|
||||
print msg
|
||||
|
||||
|
||||
|
@ -104,24 +84,29 @@ def main (args):
|
|||
usage(exname)
|
||||
return
|
||||
|
||||
params = globals.Params()
|
||||
try:
|
||||
opts, args = getopt.getopt(args, "h",
|
||||
["help", "debug", "show-sector-chain",
|
||||
"no-struct-output", "dump-text"])
|
||||
"no-struct-output", "dump-text",
|
||||
"id-select=", "no-raw-dumps"])
|
||||
for opt, arg in opts:
|
||||
if opt in ['-h', '--help']:
|
||||
usage(exname)
|
||||
return
|
||||
elif opt in ['--debug']:
|
||||
params.debug = True
|
||||
globals.params.debug = True
|
||||
elif opt in ['--show-sector-chain']:
|
||||
params.showSectorChain = True
|
||||
globals.params.showSectorChain = True
|
||||
elif opt in ['--no-struct-output']:
|
||||
globals.muteOutput(1)
|
||||
params.noStructOutput = True
|
||||
globals.params.noStructOutput = True
|
||||
elif opt in ['--dump-text']:
|
||||
params.dumpText = True
|
||||
globals.params.dumpText = True
|
||||
elif opt in ['--no-raw-dumps']:
|
||||
globals.params.noRawDumps = True
|
||||
elif opt in ['--id-select']:
|
||||
globals.params.dumpedIds = arg.split(",")
|
||||
globals.params.dumpedIds = \
|
||||
set([int(val) for val in globals.params.dumpedIds if val])
|
||||
else:
|
||||
error("unknown option %s\n"%opt)
|
||||
usage()
|
||||
|
@ -131,11 +116,13 @@ def main (args):
|
|||
usage(exname)
|
||||
return
|
||||
|
||||
dumper = PPTDumper(args[0], params)
|
||||
dumper = PPTDumper(args[0], globals.params)
|
||||
if not dumper.dump():
|
||||
error("FAILURE\n")
|
||||
if params.dumpText:
|
||||
if globals.params.dumpText:
|
||||
print(globals.textdump.replace("\r", "\n"))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv)
|
||||
|
||||
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
|
||||
|
|
|
@ -17,12 +17,6 @@
|
|||
|
||||
#================================================================
|
||||
# Handle excel files for recoll.
|
||||
# Uses xls2csv from the catdoc utilities
|
||||
# (http://ftp.45.free.net/~vitus/software/catdoc/)
|
||||
# Note: xls2csv is supposed to detect the source charset from the excel
|
||||
# file but this does not always work. If you see unexpected russian chars
|
||||
# (the russian author's default charset) in the output, you may want to add
|
||||
# ie a -s 8859-1 option to the xls2csv command line.
|
||||
#================================================================
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
|
@ -32,8 +26,6 @@ filetype=excel
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
|
@ -100,7 +92,11 @@ umask 77
|
|||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xls2csv
|
||||
top=`dirname $0`
|
||||
XLSDUMP="$top/xls-dump.py"
|
||||
XMLTOCSV="$top/xlsxmltocsv.py"
|
||||
|
||||
# Verify that both helper scripts are usable before producing any output.
# The converter variable is XMLTOCSV; the previous XLSTOCSV was never set,
# so the second helper was silently left unchecked.
checkcmds $XLSDUMP $XMLTOCSV
|
||||
|
||||
# output the result
|
||||
echo '<html><head>'
|
||||
|
@ -109,7 +105,8 @@ echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
|
|||
echo '</head><body>'
|
||||
echo '<pre>'
|
||||
|
||||
xls2csv -c' ' -b"<hr>" -d utf-8 "$infile" | \
|
||||
# Dump the workbook as canonical XML, flatten it to separated text, then
# escape the characters that are special inside the HTML <pre> block.
# '&' is escaped first so that the '&' of a generated '&lt;' is not escaped
# a second time; the backslash stops sed from expanding '&' in the
# replacement to "the matched text".
$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
    $XMLTOCSV | \
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'
|
||||
|
||||
echo '</pre>'
|
||||
|
|
243
src/filters/xls-dump.py
Executable file
243
src/filters/xls-dump.py
Executable file
|
@ -0,0 +1,243 @@
|
|||
#!/usr/bin/env python2
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
|
||||
import sys, os.path, optparse
|
||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
||||
|
||||
from msodumper import ole, xlsstream, globals, node, xlsmodel, olestream
|
||||
from msodumper import xlsparser, msocrypto
|
||||
|
||||
from msodumper.globals import error
|
||||
|
||||
def equalsName (name, array):
    """Return True when the characters of name match the codes in array.

    name  -- string whose characters are compared through ord().
    array -- sequence of integer character codes, one per expected character.

    Both arguments must have the same length and agree position by position.
    """
    if len(name) != len(array):
        return False

    # zip() pairs each character with its expected code, which avoids index
    # arithmetic and the Python 2-only xrange().
    return all(ord(ch) == code for ch, code in zip(name, array))
|
||||
|
||||
def isOleStream (dirname):
    """Tell whether dirname is the name of an OLE stream.

    According to the spec, an OLE stream is always named with the
    character 0x01 followed by 'Ole'.
    """
    expected = [0x01] + [ord(ch) for ch in "Ole"]
    return equalsName(dirname, expected)
|
||||
|
||||
def isCompObjStream (dirname):
    """Tell whether dirname is the character 0x01 followed by 'CompObj'."""
    expected = [0x01] + [ord(ch) for ch in "CompObj"]
    return equalsName(dirname, expected)
|
||||
|
||||
class XLDumper(object):
    """Dump the content of an Excel compound file in one of three formats.

    The file is loaded through xlsstream.XLStream and can then be printed
    as a flat record listing (dump), as XML built from the record handlers
    (dumpXML) or as XML built from a workbook model (dumpCanonicalXML).
    """

    def __init__ (self, filepath, params):
        """Remember the input path and the parameter object; nothing is read yet."""
        self.filepath = filepath
        self.params = params
        self.strm = None
        self.strmData = None

    def __printDirHeader (self, direntry, byteLen):
        """Print a banner naming a directory entry and, for streams, its size."""
        dirname = direntry.Name
        dirname = globals.encodeName(dirname)
        print("")
        print("="*globals.OutputWidth)
        if direntry.isStorage():
            print("%s (storage)"%dirname)
        else:
            print("%s (stream, size: %d bytes)"%(dirname, byteLen))
        print("-"*globals.OutputWidth)

    def __parseFile (self):
        """Read the whole input file and set up self.strm and self.strmData."""
        # 'with' closes the handle even when read() raises, which the former
        # open()/close() pair did not guarantee.
        with open(self.filepath, 'rb') as infile:
            chars = infile.read()
        self.strmData = xlsstream.StreamData()
        self.strm = xlsstream.XLStream(chars, self.params, self.strmData)

    def dumpXML (self):
        """Print the "Workbook" stream as XML built from its record handlers."""
        self.__parseFile()
        dirs = self.strm.getDirectoryEntries()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        for d in dirs:
            if d.Name != "Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(d)
            data = self.__readSubStreamXML(dirstrm)
            self.__dumpDataAsXML(data, root)
            # NOTE(review): nesting was reconstructed from a listing that lost
            # its indentation; this call is assumed to sit inside the loop --
            # confirm against the upstream file.
            node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)

    def dumpCanonicalXML (self):
        """Print the "Workbook" stream as XML generated from a workbook model."""
        self.__parseFile()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if dirname != "Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            wbmodel = self.__buildWorkbookModel(dirstrm)
            wbmodel.encrypted = self.strmData.encrypted
            root.appendChild(wbmodel.createDOM())

        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)

    def dump (self):
        """Print the container structure, then every named directory entry."""
        self.__parseFile()
        self.strm.printStreamInfo()
        self.strm.printHeader()
        self.strm.printMSAT()
        self.strm.printSAT()
        self.strm.printSSAT()
        self.strm.printDirectory()
        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if len(dirname) == 0:
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            self.__printDirHeader(entry, len(dirstrm.bytes))
            if entry.isStorage():
                continue

            elif dirname == "Workbook":
                # Keep reading sub-streams until the end of the stream.
                success = True
                while success:
                    success = self.__readSubStream(dirstrm)

            elif dirname == "Revision Log":
                dirstrm.type = xlsstream.DirType.RevisionLog
                self.__readSubStream(dirstrm)

            elif dirname == "EncryptionInfo":
                globals.dumpBytes(dirstrm.bytes, 512)
                print("-"*globals.OutputWidth)
                info = msocrypto.EncryptionInfo(dirstrm.bytes)
                info.read()
                info.output()

            elif self.strmData.isPivotCacheStream(dirname):
                dirstrm.type = xlsstream.DirType.PivotTableCache
                self.__readSubStream(dirstrm)
            elif isOleStream(dirname):
                self.__readOleStream(dirstrm)
            elif isCompObjStream(dirname):
                self.__readCompObjStream(dirstrm)
            else:
                globals.dumpBytes(dirstrm.bytes, 512)

    def __readSubStream (self, strm):
        """Read records until one with header 0x000A is met.

        Returns True when that record was reached and False when
        xlsstream.EndOfStream was raised first.
        """
        try:
            # read bytes from BOF to EOF.
            header = 0x0000
            while header != 0x000A:
                header = strm.readRecord()
            return True
        except xlsstream.EndOfStream:
            return False

    def __readOleStream (self, dirstrm):
        """Parse the bytes of dirstrm as an OLE stream."""
        strm = olestream.OLEStream(dirstrm.bytes)
        strm.read()

    def __readCompObjStream (self, dirstrm):
        """Parse the bytes of dirstrm as a CompObj stream, reporting failure."""
        try:
            strm = olestream.CompObjStream(dirstrm.bytes)
            strm.read()
        except olestream.CompObjStreamError:
            globals.error("failed to parse CompObj stream.\n")

    def __dumpDataAsXML(self, data, root):
        """Append data to root recursively.

        A tuple is (name, attrs[, children]) or (name, children); a list is
        a sequence of such items; anything else is ignored.
        """
        if isinstance(data, tuple):
            newRoot = root.appendElement(data[0])
            if isinstance(data[1], dict): # attrs
                # items() behaves the same as iteritems() here and also
                # exists on Python 3 dictionaries.
                for key, val in data[1].items():
                    newRoot.setAttr(key, val)
                if len(data) > 2: # data has a list of children
                    self.__dumpDataAsXML(data[2], newRoot)
            else:
                self.__dumpDataAsXML(data[1], newRoot)
        elif isinstance(data, list):
            for x in data:
                self.__dumpDataAsXML(x, root)
        else:
            pass # we're skipping all unknown elems

    def __readSubStreamXML (self, strm):
        """Collect every record handler of strm and return the parsed data."""
        handlers = []
        try:
            while True:
                handler = strm.getNextRecordHandler()
                handlers.append(handler)
        except xlsstream.EndOfStream:
            pass
        parser = xlsparser.XlsParser(handlers)
        return parser.dumpData()

    def __buildWorkbookModel (self, strm):
        """Fill a new xlsmodel.Workbook from strm until the stream ends."""
        model = xlsmodel.Workbook()
        try:
            while True:
                strm.fillModel(model)
        except xlsstream.EndOfStream:
            pass

        return model
|
||||
|
||||
def main ():
    """Parse the command line and dump the given XLS file in the chosen mode.

    Exits with status 1 when no file argument is given or when the dump
    mode is not one of 'flat', 'xml', 'canonical-xml' or 'cxml'.
    """
    parser = optparse.OptionParser()
    parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,
        help="Turn on debug mode")
    parser.add_option("--show-sector-chain", action="store_true", dest="show_sector_chain", default=False,
        help="Show sector chain information at the start of the output.")
    parser.add_option("--show-stream-pos", action="store_true", dest="show_stream_pos", default=False,
        help="Show the position of each record relative to the stream.")
    parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
        help="Specify the dump mode.  Possible values are: 'flat', 'xml', or 'canonical-xml'.  The default value is 'flat'.")
    parser.add_option("--catch", action="store_true", dest="catch_exceptions", default=False,
        help="Catch exceptions and try to continue.")
    parser.add_option("--utf-8", action="store_true", dest="utf8", default=False,
        help="Output strings as UTF-8.")
    options, args = parser.parse_args()

    # Copy the parsed options onto the shared parameter object.
    params = globals.params
    settings = (
        ("debug", options.debug),
        ("showSectorChain", options.show_sector_chain),
        ("showStreamPos", options.show_stream_pos),
        ("catchExceptions", options.catch_exceptions),
        ("utf8", options.utf8),
    )
    for attrName, attrValue in settings:
        setattr(params, attrName, attrValue)

    if not args:
        globals.error("takes at least one argument\n")
        parser.print_help()
        sys.exit(1)

    dumper = XLDumper(args[0], params)

    # Map every accepted mode name to the method producing that output.
    actions = {
        'flat': dumper.dump,
        'xml': dumper.dumpXML,
        'canonical-xml': dumper.dumpCanonicalXML,
        'cxml': dumper.dumpCanonicalXML,
    }
    action = actions.get(options.dump_mode)
    if action is None:
        error("unknown dump mode: '%s'\n"%options.dump_mode)
        parser.print_help()
        sys.exit(1)

    action()
|
||||
|
||||
# Run the dumper only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
||||
|
||||
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
|
48
src/filters/xlsxmltocsv.py
Executable file
48
src/filters/xlsxmltocsv.py
Executable file
|
@ -0,0 +1,48 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
import xml.sax
|
||||
|
||||
# Output flavour switch: when true, fields are separated by tabs and left
# unquoted; otherwise they are separated by commas and wrapped in double
# quotes.
dtt = True

sepstring, dquote = ("\t", '') if dtt else (",", '"')
|
||||
|
||||
class XlsXmlHandler(xml.sax.handler.ContentHandler):
    """SAX handler printing the cells of an XML workbook dump as text.

    Each worksheet name is printed on its own line, each row becomes one
    output line with its cells placed by column index, and a blank line is
    printed after every worksheet.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        # Cells of the row being read, keyed by column index.  Created here
        # so that a cell element met before any <row> does not raise
        # AttributeError.
        self.cells = dict()

    def startElement(self, name, attrs):
        """Print worksheet names and collect the cell values of a row."""
        if name == "worksheet":
            if "name" in attrs:
                print("%s" % attrs["name"].encode("UTF-8"))
        elif name == "row":
            self.cells = dict()
        elif name == "label-cell" or name == "number-cell":
            if "value" in attrs:
                value = attrs["value"].encode("UTF-8")
            else:
                value = ""
            if "col" in attrs:
                self.cells[int(attrs["col"])] = value
            else:
                # No column index: emit the value right away.  It is already
                # UTF-8 encoded; encoding it a second time made Python 2
                # decode it as ASCII first and fail on non-ASCII text.
                sys.stdout.write("%s%s" % (value, sepstring))
        elif name == "formula-cell":
            if "formula-result" in attrs and "col" in attrs:
                self.cells[int(attrs["col"])] = \
                    attrs["formula-result"].encode("UTF-8")

    def endElement(self, name, ):
        """Write the collected row, or the blank line closing a worksheet."""
        if name == "row":
            curidx = 0
            # Dictionaries do not guarantee ordering by key: sort so that
            # cells come out left to right and the separator count between
            # two cells is never negative.
            for idx, value in sorted(self.cells.items()):
                sys.stdout.write(sepstring * (idx - curidx))
                sys.stdout.write('%s%s%s' % (dquote, value, dquote))
                curidx = idx
            sys.stdout.write("\n")
        elif name == "worksheet":
            print("")
|
||||
|
||||
# Run the XML document read from standard input through the handler.
xml.sax.parse(sys.stdin, XlsXmlHandler())
|
|
@ -61,7 +61,7 @@ application/vnd.ms-office = exec rcldoc
|
|||
application/ogg = execm rclaudio
|
||||
application/pdf = exec rclpdf
|
||||
application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
|
||||
application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain
|
||||
application/vnd.ms-excel = exec rclxls
|
||||
application/vnd.ms-powerpoint = exec rclppt
|
||||
application/vnd.oasis.opendocument.text = exec rclsoff
|
||||
application/vnd.oasis.opendocument.text-template = exec rclsoff
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue