Replace catdoc with mso-dumper for XLS too

2014-01-09 17:44:05 +01:00 · 2014-01-09 17:44:05 +01:00 · 49fbaf1a81
commit 49fbaf1a81
parent 458e51efca
6 changed files with 326 additions and 51 deletions
--- a/src/filters/msodump.zip
+++ b/src/filters/msodump.zip
--- a/src/filters/ppt-dump.py
+++ b/src/filters/ppt-dump.py
@ -1,36 +1,14 @@
 #!/usr/bin/env python2
 ########################################################################
 #
-#  Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens
+# This Source Code Form is subject to the terms of the Mozilla Public
-#  
+# License, v. 2.0. If a copy of the MPL was not distributed with this
-#  Permission is hereby granted, free of charge, to any person
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #  obtaining a copy of this software and associated documentation
 #  files (the "Software"), to deal in the Software without
 #  restriction, including without limitation the rights to use,
 #  copy, modify, merge, publish, distribute, sublicense, and/or sell
 #  copies of the Software, and to permit persons to whom the
 #  Software is furnished to do so, subject to the following
 #  conditions:
 #  
 #  The above copyright notice and this permission notice shall be
 #  included in all copies or substantial portions of the Software.
 #  
 #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 #  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 #  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 #  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 #  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 #  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 #  OTHER DEALINGS IN THE SOFTWARE.
 #
 ########################################################################
 import sys, os.path, getopt
-sys.path.append(sys.path[0]+"/msodump.zip/src")
+sys.path.append(sys.path[0]+"/msodump.zip")
-import ole, pptstream, globals, olestream
+from msodumper import ole, pptstream, globals, olestream
-
+from msodumper.globals import error
 from globals import error
 def usage (exname):
    exname = os.path.basename(exname)
@ -38,9 +16,11 @@ def usage (exname):
 Options:
  --help        displays this help message.
-  --no-struct-output suppress normal disassembly output
+  --no-struct-output suppress normal structure analysis output
-  --dump-text   print the textual content
+  --dump-text   extract and print the textual content
-"""%exname
+  --no-raw-dumps suppress raw hex dumps of uninterpreted areas
  --id-select=id1[,id2 ...] limit output to selected record Ids
 """ % exname
    print msg
@ -104,24 +84,29 @@ def main (args):
        usage(exname)
        return
    params = globals.Params()
    try:
        opts, args = getopt.getopt(args, "h",
                                   ["help", "debug", "show-sector-chain",
-                                    "no-struct-output", "dump-text"])
+                                    "no-struct-output", "dump-text",
                                    "id-select=", "no-raw-dumps"])
        for opt, arg in opts:
            if opt in ['-h', '--help']:
                usage(exname)
                return
            elif opt in ['--debug']:
-                params.debug = True
+                globals.params.debug = True
            elif opt in ['--show-sector-chain']:
-                params.showSectorChain = True
+                globals.params.showSectorChain = True
            elif opt in ['--no-struct-output']:
-                globals.muteOutput(1)
+                globals.params.noStructOutput = True
                params.noStructOutput = True
            elif opt in ['--dump-text']:
-                params.dumpText = True
+                globals.params.dumpText = True
            elif opt in ['--no-raw-dumps']:
                globals.params.noRawDumps = True
            elif opt in ['--id-select']:
                globals.params.dumpedIds = arg.split(",")
                globals.params.dumpedIds = \
                    set([int(val) for val in globals.params.dumpedIds if val])
            else:
                error("unknown option %s\n"%opt)
                usage()
@ -131,11 +116,13 @@ def main (args):
        usage(exname)
        return
-    dumper = PPTDumper(args[0], params)
+    dumper = PPTDumper(args[0], globals.params)
    if not dumper.dump():
        error("FAILURE\n")
-    if params.dumpText:
+    if globals.params.dumpText:
        print(globals.textdump.replace("\r", "\n"))
 if __name__ == '__main__':
    main(sys.argv)
 # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
--- a/src/filters/rclxls
+++ b/src/filters/rclxls
@ -17,12 +17,6 @@
 #================================================================
 # Handle excel files for recoll. 
 # Uses xls2csv from the catdoc utilities
 # (http://ftp.45.free.net/~vitus/software/catdoc/)
 # Note: xls2csv is supposed to detect the source charset from the excel
 # file but this does not always work. If you see unexpected russian chars
 # (the russian author's default charset) in the output, you may want to add
 # ie a -s 8859-1 option to the xls2csv command line.
 #================================================================
 # set variables
 LANG=C ; export LANG
@ -32,8 +26,6 @@ filetype=excel
 #RECFILTCOMMONCODE
 ##############################################################################
 # !! Leave the previous line unmodified!! Code imported from the
@ -100,7 +92,11 @@ umask 77
 # !! Leave the following line unmodified !
 #ENDRECFILTCOMMONCODE
-checkcmds xls2csv
+top=`dirname $0`
 XLSDUMP="$top/xls-dump.py"
 XMLTOCSV="$top/xlsxmltocsv.py"
 # Pass the two helper variables defined just above; the previous call named
 # an undefined variable ($XLSTOCSV), so the CSV converter was never checked.
 checkcmds $XLSDUMP $XMLTOCSV
 # output the result
 echo '<html><head>'
@ -109,7 +105,8 @@ echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
 echo '</head><body>'
 echo '<pre>'
-xls2csv -c'	' -b"<hr>" -d utf-8 "$infile" | \
+# HTML-escape the text: '&' must be handled first so the entities produced for
+# '<' are not escaped again, and the '&' in each replacement must be
+# backslash-escaped because a bare '&' means "the matched text" to sed.
+$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
   $XMLTOCSV | \
   sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'
 echo '</pre>'
--- a/src/filters/xls-dump.py
+++ b/src/filters/xls-dump.py
@ -0,0 +1,243 @@
 #!/usr/bin/env python2
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 import sys, os.path, optparse
 sys.path.append(sys.path[0]+"/msodump.zip")
 from msodumper import ole, xlsstream, globals, node, xlsmodel, olestream
 from msodumper import xlsparser, msocrypto
 from msodumper.globals import error
 def equalsName (name, array):
    """Tell whether the characters of name match a list of character codes.

    name  -- sequence of single characters (each one is passed to ord()).
    array -- sequence of integer character codes.

    Returns True when both have the same length and every character of
    name has the code found at the same position in array, else False.
    """
    if len(name) != len(array):
        return False
    # zip/all instead of an xrange index loop: same result, stops at the
    # first mismatch, and works on both Python 2 and Python 3.
    return all(ord(char) == code for char, code in zip(name, array))
 def isOleStream (dirname):
    """Tell whether a directory entry name designates an OLE stream.

    According to the spec, an OLE stream is always named '\1Ole'."""
    # Byte 0x01 followed by the codes of the letters 'O', 'l', 'e'.
    expected = [0x01] + [ord(letter) for letter in "Ole"]
    return equalsName(dirname, expected)
 def isCompObjStream (dirname):
    """Tell whether a directory entry name is 0x01 followed by 'CompObj'."""
    expected = [0x01] + [ord(letter) for letter in "CompObj"]
    return equalsName(dirname, expected)
 class XLDumper(object):
    """Dump the content of an XLS compound file to stdout.

    Three output flavours are available: dump() (flat listing),
    dumpXML() and dumpCanonicalXML(). Each of them re-reads the input
    file before producing its output.
    """

    def __init__ (self, filepath, params):
        """Remember the input path and the dump parameters.

        filepath -- path of the file to dump.
        params   -- parameter object handed to xlsstream.XLStream; its
                    utf8 attribute is also read when printing XML.
        """
        self.filepath = filepath
        self.params = params
        # Both are set by __parseFile().
        self.strm = None
        self.strmData = None

    def __printDirHeader (self, direntry, byteLen):
        """Print a banner naming a directory entry and, for streams, its size."""
        dirname = direntry.Name
        dirname = globals.encodeName(dirname)
        print("")
        print("="*globals.OutputWidth)
        if direntry.isStorage():
            print("%s (storage)"%dirname)
        else:
            print("%s (stream, size: %d bytes)"%(dirname, byteLen))
        print("-"*globals.OutputWidth)

    def __parseFile (self):
        """Read the whole input file and wrap its bytes in an XLStream."""
        # The 'with' block closes the handle even when read() raises, and
        # avoids shadowing the 'file' builtin.
        with open(self.filepath, 'rb') as infile:
            rawBytes = infile.read()
        self.strmData = xlsstream.StreamData()
        self.strm = xlsstream.XLStream(rawBytes, self.params, self.strmData)

    def dumpXML (self):
        """Print the record data of the Workbook stream as XML."""
        self.__parseFile()
        dirs = self.strm.getDirectoryEntries()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        for d in dirs:
            if d.Name != "Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(d)
            data = self.__readSubStreamXML(dirstrm)
            self.__dumpDataAsXML(data, root)

        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)

    def dumpCanonicalXML (self):
        """Print the workbook model built from the Workbook stream as XML."""
        self.__parseFile()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if dirname != "Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            wbmodel = self.__buildWorkbookModel(dirstrm)
            # Propagate the encryption state recorded while parsing.
            wbmodel.encrypted = self.strmData.encrypted
            root.appendChild(wbmodel.createDOM())

        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)

    def dump (self):
        """Print the container structure, then every directory entry.

        Each named entry gets a banner; streams are then decoded by a
        reader chosen from the entry name, or hex-dumped when no
        specific reader applies. Storages only get the banner.
        """
        self.__parseFile()
        self.strm.printStreamInfo()
        self.strm.printHeader()
        self.strm.printMSAT()
        self.strm.printSAT()
        self.strm.printSSAT()
        self.strm.printDirectory()
        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if len(dirname) == 0:
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            self.__printDirHeader(entry, len(dirstrm.bytes))
            if entry.isStorage():
                continue

            elif dirname == "Workbook":
                # Keep reading substreams until the stream is exhausted.
                success = True
                while success:
                    success = self.__readSubStream(dirstrm)

            elif dirname == "Revision Log":
                dirstrm.type = xlsstream.DirType.RevisionLog
                self.__readSubStream(dirstrm)

            elif dirname == "EncryptionInfo":
                globals.dumpBytes(dirstrm.bytes, 512)
                print("-"*globals.OutputWidth)
                info = msocrypto.EncryptionInfo(dirstrm.bytes)
                info.read()
                info.output()

            elif self.strmData.isPivotCacheStream(dirname):
                dirstrm.type = xlsstream.DirType.PivotTableCache
                self.__readSubStream(dirstrm)
            elif isOleStream(dirname):
                self.__readOleStream(dirstrm)
            elif isCompObjStream(dirname):
                self.__readCompObjStream(dirstrm)
            else:
                globals.dumpBytes(dirstrm.bytes, 512)

    def __readSubStream (self, strm):
        """Read records until one with header 0x000A has been read.

        Returns True when that record was reached, False when the stream
        ended first (xlsstream.EndOfStream).
        """
        try:
            # read bytes from BOF to EOF.
            header = 0x0000
            while header != 0x000A:
                header = strm.readRecord()
            return True

        except xlsstream.EndOfStream:
            return False

    def __readOleStream (self, dirstrm):
        """Decode the bytes of a directory stream as an OLE stream."""
        strm = olestream.OLEStream(dirstrm.bytes)
        strm.read()

    def __readCompObjStream (self, dirstrm):
        """Decode a CompObj stream, reporting a parse failure as an error."""
        try:
            strm = olestream.CompObjStream(dirstrm.bytes)
            strm.read()
        except olestream.CompObjStreamError:
            globals.error("failed to parse CompObj stream.\n")

    def __dumpDataAsXML(self, data, root):
        """Recursively append parsed data to the XML element root.

        A tuple is (name, attrs-dict[, children]) or (name, child-data)
        and creates a new element; a list is processed item by item;
        anything else is ignored.
        """
        if isinstance(data, tuple):
            newRoot = root.appendElement(data[0])
            if isinstance(data[1], dict): # attrs
                # items() works on both Python 2 and Python 3.
                for key, val in data[1].items():
                    newRoot.setAttr(key, val)
                if len(data) > 2: # data has a list of children
                    self.__dumpDataAsXML(data[2], newRoot)
            else:
                self.__dumpDataAsXML(data[1], newRoot)
        elif isinstance(data, list):
            for x in data:
                self.__dumpDataAsXML(x, root)
        else:
            pass # we're skipping all unknown elems

    def __readSubStreamXML (self, strm):
        """Collect every record handler of strm and return the parsed data."""
        handlers = []
        try:
            # The loop ends when the stream raises EndOfStream.
            while True:
                handler = strm.getNextRecordHandler()
                handlers.append(handler)
        except xlsstream.EndOfStream:
            pass
        parser = xlsparser.XlsParser(handlers)
        return parser.dumpData()

    def __buildWorkbookModel (self, strm):
        """Feed strm into a new xlsmodel.Workbook until the stream ends."""
        model = xlsmodel.Workbook()
        try:
            while True:
                strm.fillModel(model)
        except xlsstream.EndOfStream:
            pass

        return model
 def main ():
    """Command line entry point: parse the options, then dump the input file.

    Exits with status 1 when no input file is given or when the dump mode
    is not recognized.
    """
    parser = optparse.OptionParser()

    def addFlag (dest, helpText, *names):
        # All boolean switches share the same shape: off unless given.
        parser.add_option(*names, action="store_true", dest=dest,
            default=False, help=helpText)

    # Options are registered in the order in which the help text lists them.
    addFlag("debug", "Turn on debug mode", "-d", "--debug")
    addFlag("show_sector_chain",
        "Show sector chain information at the start of the output.",
        "--show-sector-chain")
    addFlag("show_stream_pos",
        "Show the position of each record relative to the stream.",
        "--show-stream-pos")
    parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
        help="Specify the dump mode.  Possible values are: 'flat', 'xml', or 'canonical-xml'.  The default value is 'flat'.")
    addFlag("catch_exceptions", "Catch exceptions and try to continue.",
        "--catch")
    addFlag("utf8", "Output strings as UTF-8.", "--utf-8")
    options, args = parser.parse_args()

    # Copy the parsed switches into the shared parameter object.
    params = globals.params
    params.debug = options.debug
    params.showSectorChain = options.show_sector_chain
    params.showStreamPos = options.show_stream_pos
    params.catchExceptions = options.catch_exceptions
    params.utf8 = options.utf8

    if not args:
        globals.error("takes at least one argument\n")
        parser.print_help()
        sys.exit(1)

    dumper = XLDumper(args[0], params)
    # Map each accepted mode name to the method producing that output.
    dumpActions = {
        'flat': dumper.dump,
        'xml': dumper.dumpXML,
        'canonical-xml': dumper.dumpCanonicalXML,
        'cxml': dumper.dumpCanonicalXML,
    }
    action = dumpActions.get(options.dump_mode)
    if action is None:
        error("unknown dump mode: '%s'\n"%options.dump_mode)
        parser.print_help()
        sys.exit(1)
    action()
 # Run the command line interface only when executed as a script.
 if __name__ == '__main__':
    main()
 # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
--- a/src/filters/xlsxmltocsv.py
+++ b/src/filters/xlsxmltocsv.py
@ -0,0 +1,48 @@
 #!/usr/bin/env python
 import sys
 import xml.sax
 # Output format switch: when dtt is true, cells are separated by tabs and
 # written unquoted; otherwise they are comma-separated and double-quoted.
 dtt = True
 sepstring, dquote = ("\t", '') if dtt else (",", '"')
 class XlsXmlHandler(xml.sax.handler.ContentHandler):
    """SAX handler turning an xls-dump XML document into separated text.

    Each worksheet name is printed on its own line, then every row is
    written as one line with its cells placed according to their 'col'
    attribute. An empty line follows each worksheet.
    """

    def __init__(self, sep=None, quote=None):
        """Set up the handler.

        sep   -- cell separator; defaults to the module-level sepstring.
        quote -- string written before and after each cell value;
                 defaults to the module-level dquote.
        """
        xml.sax.handler.ContentHandler.__init__(self)
        self.sep = sepstring if sep is None else sep
        self.quote = dquote if quote is None else quote
        # Column index -> value for the row being read. Defined here so a
        # cell met before any <row> element does not raise AttributeError.
        self.cells = {}

    @staticmethod
    def _text(value):
        """Return value as the native str type.

        On Python 2 the text is encoded to UTF-8 bytes (exactly once); on
        Python 3 it is returned unchanged.
        """
        if sys.version_info[0] < 3:
            return value.encode("UTF-8")
        return value

    def startElement(self, name, attrs):
        """Record cell values and print worksheet names."""
        if name == "worksheet":
            if "name" in attrs:
                print("%s" % self._text(attrs["name"]))
        elif name == "row":
            self.cells = {}
        elif name == "label-cell" or name == "number-cell":
            if "value" in attrs:
                value = self._text(attrs["value"])
            else:
                value = ""
            if "col" in attrs:
                self.cells[int(attrs["col"])] = value
            else:
                # No column information: emit the value right away. The
                # value is already in its final form and must not be
                # encoded a second time.
                sys.stdout.write("%s%s" % (value, self.sep))
        elif name == "formula-cell":
            if "formula-result" in attrs and "col" in attrs:
                self.cells[int(attrs["col"])] = \
                             self._text(attrs["formula-result"])

    def endElement(self, name):
        """Write out the finished row, or the blank line ending a worksheet."""
        if name == "row":
            curidx = 0
            # Walk the cells in column order: the separator count between
            # two values is the distance between their columns, which is
            # only meaningful when the columns are visited in increasing
            # order.
            for idx in sorted(self.cells):
                sys.stdout.write(self.sep * (idx - curidx))
                sys.stdout.write('%s%s%s' %
                                 (self.quote, self.cells[idx], self.quote))
                curidx = idx
            sys.stdout.write("\n")
        elif name == "worksheet":
            print("")
 # Parse the XML document read from standard input with the handler above.
 xml.sax.parse(sys.stdin, XlsXmlHandler())
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -61,7 +61,7 @@ application/vnd.ms-office = exec rcldoc
 application/ogg = execm rclaudio
 application/pdf = exec rclpdf
 application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
-application/vnd.ms-excel = exec xls2csv -c "	" -d utf-8;mimetype=text/plain
+application/vnd.ms-excel = exec rclxls
 application/vnd.ms-powerpoint = exec rclppt
 application/vnd.oasis.opendocument.text = exec rclsoff
 application/vnd.oasis.opendocument.text-template = exec rclsoff