Replace catdoc with mso-dumper for XLS too

2014-01-09 17:44:05 +01:00 · 2014-01-09 17:44:05 +01:00 · 49fbaf1a81
commit 49fbaf1a81
parent 458e51efca
6 changed files with 326 additions and 51 deletions
--- a/src/filters/msodump.zip
+++ b/src/filters/msodump.zip
--- a/src/filters/ppt-dump.py
+++ b/src/filters/ppt-dump.py
@ -1,36 +1,14 @@
 #!/usr/bin/env python2
-########################################################################
 #
-#  Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens
-#  
-#  Permission is hereby granted, free of charge, to any person
-#  obtaining a copy of this software and associated documentation
-#  files (the "Software"), to deal in the Software without
-#  restriction, including without limitation the rights to use,
-#  copy, modify, merge, publish, distribute, sublicense, and/or sell
-#  copies of the Software, and to permit persons to whom the
-#  Software is furnished to do so, subject to the following
-#  conditions:
-#  
-#  The above copyright notice and this permission notice shall be
-#  included in all copies or substantial portions of the Software.
-#  
-#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-#  OTHER DEALINGS IN THE SOFTWARE.
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
-########################################################################

 import sys, os.path, getopt
-sys.path.append(sys.path[0]+"/msodump.zip/src")
-import ole, pptstream, globals, olestream
-
-from globals import error
+sys.path.append(sys.path[0]+"/msodump.zip")
+from msodumper import ole, pptstream, globals, olestream
+from msodumper.globals import error

 def usage (exname):
    exname = os.path.basename(exname)
@ -38,9 +16,11 @@ def usage (exname):

 Options:
  --help        displays this help message.
-  --no-struct-output suppress normal disassembly output
-  --dump-text   print the textual content
-"""%exname
+  --no-struct-output suppress normal structure analysis output
+  --dump-text   extract and print the textual content
+  --no-raw-dumps suppress raw hex dumps of uninterpreted areas
+  --id-select=id1[,id2 ...] limit output to selected record Ids
+""" % exname
    print msg


@ -104,24 +84,29 @@ def main (args):
        usage(exname)
        return

-    params = globals.Params()
    try:
        opts, args = getopt.getopt(args, "h",
                                   ["help", "debug", "show-sector-chain",
-                                    "no-struct-output", "dump-text"])
+                                    "no-struct-output", "dump-text",
+                                    "id-select=", "no-raw-dumps"])
        for opt, arg in opts:
            if opt in ['-h', '--help']:
                usage(exname)
                return
            elif opt in ['--debug']:
-                params.debug = True
+                globals.params.debug = True
            elif opt in ['--show-sector-chain']:
-                params.showSectorChain = True
+                globals.params.showSectorChain = True
            elif opt in ['--no-struct-output']:
-                globals.muteOutput(1)
-                params.noStructOutput = True
+                globals.params.noStructOutput = True
            elif opt in ['--dump-text']:
-                params.dumpText = True
+                globals.params.dumpText = True
+            elif opt in ['--no-raw-dumps']:
+                globals.params.noRawDumps = True
+            elif opt in ['--id-select']:
+                globals.params.dumpedIds = arg.split(",")
+                globals.params.dumpedIds = \
+                    set([int(val) for val in globals.params.dumpedIds if val])
            else:
                error("unknown option %s\n"%opt)
                usage()
@ -131,11 +116,13 @@ def main (args):
        usage(exname)
        return

-    dumper = PPTDumper(args[0], params)
+    dumper = PPTDumper(args[0], globals.params)
    if not dumper.dump():
        error("FAILURE\n")
-    if params.dumpText:
+    if globals.params.dumpText:
        print(globals.textdump.replace("\r", "\n"))

 if __name__ == '__main__':
    main(sys.argv)
+
+# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
--- a/src/filters/rclxls
+++ b/src/filters/rclxls
@ -17,12 +17,6 @@

 #================================================================
 # Handle excel files for recoll. 
-# Uses xls2csv from the catdoc utilities
-# (http://ftp.45.free.net/~vitus/software/catdoc/)
-# Note: xls2csv is supposed to detect the source charset from the excel
-# file but this does not always work. If you see unexpected russian chars
-# (the russian author's default charset) in the output, you may want to add
-# ie a -s 8859-1 option to the xls2csv command line.
 #================================================================
 # set variables
 LANG=C ; export LANG
@ -32,8 +26,6 @@ filetype=excel



-
-
 #RECFILTCOMMONCODE
 ##############################################################################
 # !! Leave the previous line unmodified!! Code imported from the
@ -100,7 +92,11 @@ umask 77
 # !! Leave the following line unmodified !
 #ENDRECFILTCOMMONCODE

-checkcmds xls2csv
+top=`dirname $0`
+XLSDUMP="$top/xls-dump.py"
+XMLTOCSV="$top/xlsxmltocsv.py"
+
+checkcmds $XLSDUMP $XLSTOCSV

 # output the result
 echo '<html><head>'
@ -109,7 +105,8 @@ echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
 echo '</head><body>'
 echo '<pre>'

-xls2csv -c'	' -b"<hr>" -d utf-8 "$infile" | \
+# Dump the workbook as canonical XML, flatten it to delimited text, then
+# HTML-escape the text for the surrounding <pre> element.  In a sed
+# replacement a bare '&' stands for the matched text, so it has to be
+# written '\&'; '&' is escaped first so that the '&' of '&lt;' is not
+# escaped a second time.
+$XLSDUMP --dump-mode=canonical-xml --utf-8 --catch "$infile" | \
+   $XMLTOCSV | \
-  sed -e 's/</&lt;/g' -e 's/&/&amp;/g' 
+  sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'

 echo '</pre>'
--- a/src/filters/xls-dump.py
+++ b/src/filters/xls-dump.py
@ -0,0 +1,243 @@
+#!/usr/bin/env python2
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+import sys, os.path, optparse
+sys.path.append(sys.path[0]+"/msodump.zip")
+
+from msodumper import ole, xlsstream, globals, node, xlsmodel, olestream
+from msodumper import xlsparser, msocrypto
+
+from msodumper.globals import error
+
+def equalsName (name, array):
+    if len(name) != len(array):
+        return False
+
+    for i in xrange(0, len(name)):
+        if ord(name[i]) != array[i]:
+            return False
+
+    return True
+
+def isOleStream (dirname):
+    """Determine whether or not a stream is an OLE stream.
+
+Accodring to the spec, an OLE stream is always named '\1Ole'."""
+
+    name = [0x01, 0x4F, 0x6C, 0x65] # 0x01, 'Ole'
+    return equalsName(dirname, name)
+
+def isCompObjStream (dirname):
+    name = [0x01, 0x43, 0x6F, 0x6D, 0x70, 0x4F, 0x62, 0x6A] # 0x01, 'CompObj'
+    return equalsName(dirname, name)
+
+class XLDumper(object):
+
+    def __init__ (self, filepath, params):
+        self.filepath = filepath
+        self.params = params
+        self.strm = None
+        self.strmData = None
+
+    def __printDirHeader (self, direntry, byteLen):
+        dirname = direntry.Name
+        dirname = globals.encodeName(dirname)
+        print("")
+        print("="*globals.OutputWidth)
+        if direntry.isStorage():
+            print("%s (storage)"%dirname)
+        else:
+            print("%s (stream, size: %d bytes)"%(dirname, byteLen))
+        print("-"*globals.OutputWidth)
+
+    def __parseFile (self):
+        file = open(self.filepath, 'rb')
+        self.strmData = xlsstream.StreamData()
+        self.strm = xlsstream.XLStream(file.read(), self.params, self.strmData)
+        file.close()
+
+    def dumpXML (self):
+        self.__parseFile()
+        dirs = self.strm.getDirectoryEntries()
+        docroot = node.Root()
+        root = docroot.appendElement('xls-dump')
+
+        for d in dirs:
+            if d.Name != "Workbook":
+                # for now, we only dump the Workbook directory stream.
+                continue
+
+            dirstrm = self.strm.getDirectoryStream(d)
+            data = self.__readSubStreamXML(dirstrm)
+            self.__dumpDataAsXML(data, root)
+        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
+
+    def dumpCanonicalXML (self):
+        self.__parseFile()
+        docroot = node.Root()
+        root = docroot.appendElement('xls-dump')
+
+        dirEntries = self.strm.getDirectoryEntries()
+        for entry in dirEntries:
+            dirname = entry.Name
+            if dirname != "Workbook":
+                # for now, we only dump the Workbook directory stream.
+                continue
+
+            dirstrm = self.strm.getDirectoryStream(entry)
+            wbmodel = self.__buildWorkbookModel(dirstrm)
+            wbmodel.encrypted = self.strmData.encrypted
+            root.appendChild(wbmodel.createDOM())
+
+        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
+
+    def dump (self):
+        self.__parseFile()
+        self.strm.printStreamInfo()
+        self.strm.printHeader()
+        self.strm.printMSAT()
+        self.strm.printSAT()
+        self.strm.printSSAT()
+        self.strm.printDirectory()
+        dirEntries = self.strm.getDirectoryEntries()
+        for entry in dirEntries:
+            dirname = entry.Name
+            if len(dirname) == 0:
+                continue
+
+            dirstrm = self.strm.getDirectoryStream(entry)
+            self.__printDirHeader(entry, len(dirstrm.bytes))
+            if entry.isStorage():
+                continue
+
+            elif dirname == "Workbook":
+                success = True
+                while success:
+                    success = self.__readSubStream(dirstrm)
+
+            elif dirname == "Revision Log":
+                dirstrm.type = xlsstream.DirType.RevisionLog
+                self.__readSubStream(dirstrm)
+
+            elif dirname == "EncryptionInfo":
+                globals.dumpBytes(dirstrm.bytes, 512)
+                print("-"*globals.OutputWidth)
+                info = msocrypto.EncryptionInfo(dirstrm.bytes)
+                info.read()
+                info.output()
+
+            elif self.strmData.isPivotCacheStream(dirname):
+                dirstrm.type = xlsstream.DirType.PivotTableCache
+                self.__readSubStream(dirstrm)
+            elif isOleStream(dirname):
+                self.__readOleStream(dirstrm)
+            elif isCompObjStream(dirname):
+                self.__readCompObjStream(dirstrm)
+            else:
+                globals.dumpBytes(dirstrm.bytes, 512)
+
+    def __readSubStream (self, strm):
+        try:
+            # read bytes from BOF to EOF.
+            header = 0x0000
+            while header != 0x000A:
+                header = strm.readRecord()
+            return True
+        except xlsstream.EndOfStream:
+            return False
+
+    def __readOleStream (self, dirstrm):
+        strm = olestream.OLEStream(dirstrm.bytes)
+        strm.read()
+
+    def __readCompObjStream (self, dirstrm):
+        try:
+            strm = olestream.CompObjStream(dirstrm.bytes)
+            strm.read()
+        except olestream.CompObjStreamError:
+            globals.error("failed to parse CompObj stream.\n")
+
+    def __dumpDataAsXML(self, data, root):
+        if isinstance(data, tuple):
+            newRoot = root.appendElement(data[0])
+            if isinstance(data[1], dict): # attrs
+                for key,val in data[1].iteritems():
+                    newRoot.setAttr(key, val)
+                if len(data) > 2: # data has a list of children
+                    self.__dumpDataAsXML(data[2], newRoot)
+            else:
+                self.__dumpDataAsXML(data[1], newRoot)
+        elif isinstance(data, list):
+            for x in data:
+                self.__dumpDataAsXML(x, root)
+        else:
+            pass # we're skipping all unknown elems
+
+    def __readSubStreamXML (self, strm):
+        handlers = []
+        try:
+            while True:
+                handler = strm.getNextRecordHandler()
+                handlers.append(handler)
+        except xlsstream.EndOfStream:
+            pass
+        parser = xlsparser.XlsParser(handlers)
+        return parser.dumpData()
+
+    def __buildWorkbookModel (self, strm):
+        model = xlsmodel.Workbook()
+        try:
+            while True:
+                strm.fillModel(model)
+        except xlsstream.EndOfStream:
+            pass
+
+        return model
+
+def main ():
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,
+        help="Turn on debug mode")
+    parser.add_option("--show-sector-chain", action="store_true", dest="show_sector_chain", default=False,
+        help="Show sector chain information at the start of the output.")
+    parser.add_option("--show-stream-pos", action="store_true", dest="show_stream_pos", default=False,
+        help="Show the position of each record relative to the stream.")
+    parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
+        help="Specify the dump mode.  Possible values are: 'flat', 'xml', or 'canonical-xml'.  The default value is 'flat'.")
+    parser.add_option("--catch", action="store_true", dest="catch_exceptions", default=False,
+        help="Catch exceptions and try to continue.")
+    parser.add_option("--utf-8", action="store_true", dest="utf8", default=False,
+        help="Output strings as UTF-8.")
+    options, args = parser.parse_args()
+    params = globals.params
+    params.debug = options.debug
+    params.showSectorChain = options.show_sector_chain
+    params.showStreamPos = options.show_stream_pos
+    params.catchExceptions = options.catch_exceptions
+    params.utf8 = options.utf8
+    
+    if len(args) < 1:
+        globals.error("takes at least one argument\n")
+        parser.print_help()
+        sys.exit(1)
+
+    dumper = XLDumper(args[0], params)
+    if options.dump_mode == 'flat':
+        dumper.dump()
+    elif options.dump_mode == 'xml':
+        dumper.dumpXML()
+    elif options.dump_mode == 'canonical-xml' or options.dump_mode == 'cxml':
+        dumper.dumpCanonicalXML()
+    else:
+        error("unknown dump mode: '%s'\n"%options.dump_mode)
+        parser.print_help()
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
+
+# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
--- a/src/filters/xlsxmltocsv.py
+++ b/src/filters/xlsxmltocsv.py
@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+import sys
+import xml.sax
+
+dtt = True
+
+if dtt:
+    sepstring = "\t"
+    dquote = ''
+else:
+    sepstring = ","
+    dquote = '"'
+    
+class XlsXmlHandler(xml.sax.handler.ContentHandler):
+    def startElement(self, name, attrs):
+        if name == "worksheet":
+            if "name" in attrs:
+                print("%s" % attrs["name"].encode("UTF-8"))
+        elif name == "row":
+            self.cells = dict()
+        elif name == "label-cell" or name == "number-cell":
+            if "value" in attrs:
+                value = attrs["value"].encode("UTF-8")
+            else:
+                value = unicode()
+            if "col" in attrs:
+                self.cells[int(attrs["col"])] = value
+            else:
+                #??
+                sys.stdout.write("%s%s"%(value.encode("UTF-8"),sepstring))
+        elif name == "formula-cell":
+            if "formula-result" in attrs and "col" in attrs:
+                self.cells[int(attrs["col"])] = \
+                             attrs["formula-result"].encode("UTF-8")
+            
+    def endElement(self, name, ):
+        if name == "row":
+            curidx = 0
+            for idx, value in self.cells.iteritems():
+                sys.stdout.write(sepstring * (idx - curidx))
+                sys.stdout.write('%s%s%s' % (dquote, value, dquote))
+                curidx = idx
+            sys.stdout.write("\n")
+        elif name == "worksheet":
+            print("")
+
+xml.sax.parse(sys.stdin, XlsXmlHandler())
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -61,7 +61,7 @@ application/vnd.ms-office = exec rcldoc
 application/ogg = execm rclaudio
 application/pdf = exec rclpdf
 application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
-application/vnd.ms-excel = exec xls2csv -c "	" -d utf-8;mimetype=text/plain
+application/vnd.ms-excel = exec rclxls
 application/vnd.ms-powerpoint = exec rclppt
 application/vnd.oasis.opendocument.text = exec rclsoff
 application/vnd.oasis.opendocument.text-template = exec rclsoff