diff --git a/src/filters/msodump.zip b/src/filters/msodump.zip index 115763c7..22a3e5cd 100644 Binary files a/src/filters/msodump.zip and b/src/filters/msodump.zip differ diff --git a/src/filters/ppt-dump.py b/src/filters/ppt-dump.py index e94ce291..d41c5891 100755 --- a/src/filters/ppt-dump.py +++ b/src/filters/ppt-dump.py @@ -1,36 +1,14 @@ #!/usr/bin/env python2 -######################################################################## # -# Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens -# -# Permission is hereby granted, free of charge, to any person -# obtaining a copy of this software and associated documentation -# files (the "Software"), to deal in the Software without -# restriction, including without limitation the rights to use, -# copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be -# included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
# -######################################################################## import sys, os.path, getopt -sys.path.append(sys.path[0]+"/msodump.zip/src") -import ole, pptstream, globals, olestream - -from globals import error +sys.path.append(sys.path[0]+"/msodump.zip") +from msodumper import ole, pptstream, globals, olestream +from msodumper.globals import error def usage (exname): exname = os.path.basename(exname) @@ -38,9 +16,11 @@ def usage (exname): Options: --help displays this help message. - --no-struct-output suppress normal disassembly output - --dump-text print the textual content -"""%exname + --no-struct-output suppress normal structure analysis output + --dump-text extract and print the textual content + --no-raw-dumps suppress raw hex dumps of uninterpreted areas + --id-select=id1[,id2 ...] limit output to selected record Ids +""" % exname print msg @@ -104,24 +84,29 @@ def main (args): usage(exname) return - params = globals.Params() try: opts, args = getopt.getopt(args, "h", ["help", "debug", "show-sector-chain", - "no-struct-output", "dump-text"]) + "no-struct-output", "dump-text", + "id-select=", "no-raw-dumps"]) for opt, arg in opts: if opt in ['-h', '--help']: usage(exname) return elif opt in ['--debug']: - params.debug = True + globals.params.debug = True elif opt in ['--show-sector-chain']: - params.showSectorChain = True + globals.params.showSectorChain = True elif opt in ['--no-struct-output']: - globals.muteOutput(1) - params.noStructOutput = True + globals.params.noStructOutput = True elif opt in ['--dump-text']: - params.dumpText = True + globals.params.dumpText = True + elif opt in ['--no-raw-dumps']: + globals.params.noRawDumps = True + elif opt in ['--id-select']: + globals.params.dumpedIds = arg.split(",") + globals.params.dumpedIds = \ + set([int(val) for val in globals.params.dumpedIds if val]) else: error("unknown option %s\n"%opt) usage() @@ -131,11 +116,13 @@ def main (args): usage(exname) return - dumper = PPTDumper(args[0], 
params) + dumper = PPTDumper(args[0], globals.params) if not dumper.dump(): error("FAILURE\n") - if params.dumpText: + if globals.params.dumpText: print(globals.textdump.replace("\r", "\n")) if __name__ == '__main__': main(sys.argv) + +# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab: diff --git a/src/filters/rclxls b/src/filters/rclxls index d0f5159e..eb5d4904 100755 --- a/src/filters/rclxls +++ b/src/filters/rclxls @@ -17,12 +17,6 @@ #================================================================ # Handle excel files for recoll. -# Uses xls2csv from the catdoc utilities -# (http://ftp.45.free.net/~vitus/software/catdoc/) -# Note: xls2csv is supposed to detect the source charset from the excel -# file but this does not always work. If you see unexpected russian chars -# (the russian author's default charset) in the output, you may want to add -# ie a -s 8859-1 option to the xls2csv command line. #================================================================ # set variables LANG=C ; export LANG @@ -32,8 +26,6 @@ filetype=excel - - #RECFILTCOMMONCODE ############################################################################## # !! Leave the previous line unmodified!! Code imported from the @@ -100,7 +92,11 @@ umask 77 # !! Leave the following line unmodified ! #ENDRECFILTCOMMONCODE -checkcmds xls2csv +top=`dirname $0` +XLSDUMP="$top/xls-dump.py" +XMLTOCSV="$top/xlsxmltocsv.py" + +checkcmds $XLSDUMP $XMLTOCSV # output the result echo '' @@ -109,7 +105,8 @@ echo '' echo '' echo '
'
 
-xls2csv -c'	' -b"
" -d utf-8 "$infile" | \ +$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \ + $XMLTOCSV | \ sed -e 's/' diff --git a/src/filters/xls-dump.py b/src/filters/xls-dump.py new file mode 100755 index 00000000..e224da71 --- /dev/null +++ b/src/filters/xls-dump.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python2 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +import sys, os.path, optparse +sys.path.append(sys.path[0]+"/msodump.zip") + +from msodumper import ole, xlsstream, globals, node, xlsmodel, olestream +from msodumper import xlsparser, msocrypto + +from msodumper.globals import error + +def equalsName (name, array): + if len(name) != len(array): + return False + + for i in xrange(0, len(name)): + if ord(name[i]) != array[i]: + return False + + return True + +def isOleStream (dirname): + """Determine whether or not a stream is an OLE stream. 
+ +According to the spec, an OLE stream is always named '\1Ole'.""" + + name = [0x01, 0x4F, 0x6C, 0x65] # 0x01, 'Ole' + return equalsName(dirname, name) + +def isCompObjStream (dirname): + name = [0x01, 0x43, 0x6F, 0x6D, 0x70, 0x4F, 0x62, 0x6A] # 0x01, 'CompObj' + return equalsName(dirname, name) + +class XLDumper(object): + + def __init__ (self, filepath, params): + self.filepath = filepath + self.params = params + self.strm = None + self.strmData = None + + def __printDirHeader (self, direntry, byteLen): + dirname = direntry.Name + dirname = globals.encodeName(dirname) + print("") + print("="*globals.OutputWidth) + if direntry.isStorage(): + print("%s (storage)"%dirname) + else: + print("%s (stream, size: %d bytes)"%(dirname, byteLen)) + print("-"*globals.OutputWidth) + + def __parseFile (self): + file = open(self.filepath, 'rb') + self.strmData = xlsstream.StreamData() + self.strm = xlsstream.XLStream(file.read(), self.params, self.strmData) + file.close() + + def dumpXML (self): + self.__parseFile() + dirs = self.strm.getDirectoryEntries() + docroot = node.Root() + root = docroot.appendElement('xls-dump') + + for d in dirs: + if d.Name != "Workbook": + # for now, we only dump the Workbook directory stream. + continue + + dirstrm = self.strm.getDirectoryStream(d) + data = self.__readSubStreamXML(dirstrm) + self.__dumpDataAsXML(data, root) + node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8) + + def dumpCanonicalXML (self): + self.__parseFile() + docroot = node.Root() + root = docroot.appendElement('xls-dump') + + dirEntries = self.strm.getDirectoryEntries() + for entry in dirEntries: + dirname = entry.Name + if dirname != "Workbook": + # for now, we only dump the Workbook directory stream.
+ continue + + dirstrm = self.strm.getDirectoryStream(entry) + wbmodel = self.__buildWorkbookModel(dirstrm) + wbmodel.encrypted = self.strmData.encrypted + root.appendChild(wbmodel.createDOM()) + + node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8) + + def dump (self): + self.__parseFile() + self.strm.printStreamInfo() + self.strm.printHeader() + self.strm.printMSAT() + self.strm.printSAT() + self.strm.printSSAT() + self.strm.printDirectory() + dirEntries = self.strm.getDirectoryEntries() + for entry in dirEntries: + dirname = entry.Name + if len(dirname) == 0: + continue + + dirstrm = self.strm.getDirectoryStream(entry) + self.__printDirHeader(entry, len(dirstrm.bytes)) + if entry.isStorage(): + continue + + elif dirname == "Workbook": + success = True + while success: + success = self.__readSubStream(dirstrm) + + elif dirname == "Revision Log": + dirstrm.type = xlsstream.DirType.RevisionLog + self.__readSubStream(dirstrm) + + elif dirname == "EncryptionInfo": + globals.dumpBytes(dirstrm.bytes, 512) + print("-"*globals.OutputWidth) + info = msocrypto.EncryptionInfo(dirstrm.bytes) + info.read() + info.output() + + elif self.strmData.isPivotCacheStream(dirname): + dirstrm.type = xlsstream.DirType.PivotTableCache + self.__readSubStream(dirstrm) + elif isOleStream(dirname): + self.__readOleStream(dirstrm) + elif isCompObjStream(dirname): + self.__readCompObjStream(dirstrm) + else: + globals.dumpBytes(dirstrm.bytes, 512) + + def __readSubStream (self, strm): + try: + # read bytes from BOF to EOF. 
+ header = 0x0000 + while header != 0x000A: + header = strm.readRecord() + return True + except xlsstream.EndOfStream: + return False + + def __readOleStream (self, dirstrm): + strm = olestream.OLEStream(dirstrm.bytes) + strm.read() + + def __readCompObjStream (self, dirstrm): + try: + strm = olestream.CompObjStream(dirstrm.bytes) + strm.read() + except olestream.CompObjStreamError: + globals.error("failed to parse CompObj stream.\n") + + def __dumpDataAsXML(self, data, root): + if isinstance(data, tuple): + newRoot = root.appendElement(data[0]) + if isinstance(data[1], dict): # attrs + for key,val in data[1].iteritems(): + newRoot.setAttr(key, val) + if len(data) > 2: # data has a list of children + self.__dumpDataAsXML(data[2], newRoot) + else: + self.__dumpDataAsXML(data[1], newRoot) + elif isinstance(data, list): + for x in data: + self.__dumpDataAsXML(x, root) + else: + pass # we're skipping all unknown elems + + def __readSubStreamXML (self, strm): + handlers = [] + try: + while True: + handler = strm.getNextRecordHandler() + handlers.append(handler) + except xlsstream.EndOfStream: + pass + parser = xlsparser.XlsParser(handlers) + return parser.dumpData() + + def __buildWorkbookModel (self, strm): + model = xlsmodel.Workbook() + try: + while True: + strm.fillModel(model) + except xlsstream.EndOfStream: + pass + + return model + +def main (): + parser = optparse.OptionParser() + parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, + help="Turn on debug mode") + parser.add_option("--show-sector-chain", action="store_true", dest="show_sector_chain", default=False, + help="Show sector chain information at the start of the output.") + parser.add_option("--show-stream-pos", action="store_true", dest="show_stream_pos", default=False, + help="Show the position of each record relative to the stream.") + parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE", + help="Specify the dump mode. 
Possible values are: 'flat', 'xml', or 'canonical-xml'. The default value is 'flat'.") + parser.add_option("--catch", action="store_true", dest="catch_exceptions", default=False, + help="Catch exceptions and try to continue.") + parser.add_option("--utf-8", action="store_true", dest="utf8", default=False, + help="Output strings as UTF-8.") + options, args = parser.parse_args() + params = globals.params + params.debug = options.debug + params.showSectorChain = options.show_sector_chain + params.showStreamPos = options.show_stream_pos + params.catchExceptions = options.catch_exceptions + params.utf8 = options.utf8 + + if len(args) < 1: + globals.error("takes at least one argument\n") + parser.print_help() + sys.exit(1) + + dumper = XLDumper(args[0], params) + if options.dump_mode == 'flat': + dumper.dump() + elif options.dump_mode == 'xml': + dumper.dumpXML() + elif options.dump_mode == 'canonical-xml' or options.dump_mode == 'cxml': + dumper.dumpCanonicalXML() + else: + error("unknown dump mode: '%s'\n"%options.dump_mode) + parser.print_help() + sys.exit(1) + +if __name__ == '__main__': + main() + +# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab: diff --git a/src/filters/xlsxmltocsv.py b/src/filters/xlsxmltocsv.py new file mode 100755 index 00000000..84b54c3b --- /dev/null +++ b/src/filters/xlsxmltocsv.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +import sys +import xml.sax + +dtt = True + +if dtt: + sepstring = "\t" + dquote = '' +else: + sepstring = "," + dquote = '"' + +class XlsXmlHandler(xml.sax.handler.ContentHandler): + def startElement(self, name, attrs): + if name == "worksheet": + if "name" in attrs: + print("%s" % attrs["name"].encode("UTF-8")) + elif name == "row": + self.cells = dict() + elif name == "label-cell" or name == "number-cell": + if "value" in attrs: + value = attrs["value"].encode("UTF-8") + else: + value = unicode() + if "col" in attrs: + self.cells[int(attrs["col"])] = value + else: + #?? 
+ sys.stdout.write("%s%s"%(value,sepstring)) + elif name == "formula-cell": + if "formula-result" in attrs and "col" in attrs: + self.cells[int(attrs["col"])] = \ + attrs["formula-result"].encode("UTF-8") + + def endElement(self, name, ): + if name == "row": + curidx = 0 + for idx, value in sorted(self.cells.iteritems()): # dict order is arbitrary: emit cells by ascending column + sys.stdout.write(sepstring * (idx - curidx)) + sys.stdout.write('%s%s%s' % (dquote, value, dquote)) + curidx = idx + sys.stdout.write("\n") + elif name == "worksheet": + print("") + +xml.sax.parse(sys.stdin, XlsXmlHandler()) diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index b6a744cc..58129f17 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -61,7 +61,7 @@ application/vnd.ms-office = exec rcldoc application/ogg = execm rclaudio application/pdf = exec rclpdf application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain -application/vnd.ms-excel = exec xls2csv -c " " -d utf-8;mimetype=text/plain +application/vnd.ms-excel = exec rclxls application/vnd.ms-powerpoint = exec rclppt application/vnd.oasis.opendocument.text = exec rclsoff application/vnd.oasis.opendocument.text-template = exec rclsoff