PPT filter: use mso-dump

This commit is contained in:
Jean-Francois Dockes 2013-11-19 14:42:05 +01:00
parent 2f73e1df75
commit 44995858f5
3 changed files with 154 additions and 70 deletions

BIN
src/filters/msodump.zip Normal file

Binary file not shown.

141
src/filters/ppt-dump.py Executable file
View file

@ -0,0 +1,141 @@
#!/usr/bin/env python2
########################################################################
#
# Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
########################################################################
import sys, os.path, getopt
sys.path.append(sys.path[0]+"/msodump.zip/src")
import ole, pptstream, globals, olestream
from globals import error
def usage (exname):
    """Print the command-line help text on standard output.

    exname -- the path the script was invoked as; only its basename is
              shown in the usage line.
    """
    exname = os.path.basename(exname)
    msg = """Usage: %s [options] [ppt file]
Options:
--help displays this help message.
--no-struct-output suppress normal disassembly output
--dump-text print the textual content
"""%exname
    # Function-call form, like the print() calls in main(): with a single
    # parenthesised argument it prints the same text under Python 2 and
    # also parses under Python 3.
    print(msg)
class PPTDumper(object):
    """Dump the contents of a PowerPoint OLE file, one directory stream
    at a time, using the msodump modules (pptstream, olestream, globals).
    """

    def __init__ (self, filepath, params):
        """Remember the input path and the option object.

        filepath -- path of the .ppt file to read.
        params   -- option object handed unchanged to pptstream.PPTFile.
        """
        self.filepath = filepath
        self.params = params

    def __printDirHeader (self, dirname, byteLen):
        """Output a banner with the stream name and its size in bytes."""
        dirname = globals.encodeName(dirname)
        globals.outputln("")
        globals.outputln("="*68)
        globals.outputln("%s (size: %d bytes)"%(dirname, byteLen))
        globals.outputln("-"*68)

    def dump (self):
        """Read the file and dump every directory stream it contains.

        Returns False when reading the records of the "PowerPoint
        Document" or "Current User" stream reported failure, True
        otherwise. A failure to fetch a directory stream is reported
        through error() and stops the walk without changing the result.
        """
        # The context manager closes the file even if PPTFile() raises.
        with open(self.filepath, 'rb') as pptfile:
            strm = pptstream.PPTFile(pptfile.read(), self.params)
        strm.printStreamInfo()
        strm.printHeader()
        strm.printDirectory()
        dirnames = strm.getDirectoryNames()
        result = True
        for dirname in dirnames:
            if not dirname or dirname == 'Root Entry':
                continue

            try:
                dirstrm = strm.getDirectoryStreamByName(dirname)
            except Exception as err:
                error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err)))
                # The previous version was killed by the exception
                # here, so the equivalent is to break, but maybe there
                # is no reason to do so.
                break
            self.__printDirHeader(dirname, len(dirstrm.bytes))
            if dirname in ("PowerPoint Document", "Current User"):
                if not self.__readSubStream(dirstrm):
                    result = False
            elif dirname == "\x05DocumentSummaryInformation":
                # Use a separate name: rebinding 'strm' here would make
                # the next loop iteration call getDirectoryStreamByName()
                # on the property-set stream instead of the PPT file.
                propstrm = olestream.PropertySetStream(dirstrm.bytes)
                propstrm.read()
            else:
                globals.dumpBytes(dirstrm.bytes, 512)
        return result

    def __readSubStream (self, strm):
        """Read all records in the substream; return readRecords()'s result."""
        return strm.readRecords()
def main (args):
    """Command-line entry point.

    args -- the full argument vector, program name first (sys.argv).

    Parses the options, dumps the PowerPoint file named by the first
    non-option argument and, when --dump-text was given, prints the text
    collected in globals.textdump with carriage returns turned into
    newlines. Always returns None.
    """
    exname, args = args[0], args[1:]
    try:
        opts, args = getopt.getopt(args, "h",
                                   ["help", "debug", "show-sector-chain",
                                    "no-struct-output", "dump-text"])
    except getopt.GetoptError:
        error("error parsing input options\n")
        usage(exname)
        return

    if any(opt in ('-h', '--help') for opt, arg in opts):
        usage(exname)
        return

    # Checked after option parsing: counting raw arguments would accept a
    # command line made of options only and then fail on args[0] below.
    if len(args) < 1:
        print("takes at least one argument")
        usage(exname)
        return

    params = globals.Params()
    for opt, arg in opts:
        if opt in ['--debug']:
            params.debug = True
        elif opt in ['--show-sector-chain']:
            params.showSectorChain = True
        elif opt in ['--no-struct-output']:
            globals.muteOutput(1)
            params.noStructOutput = True
        elif opt in ['--dump-text']:
            params.dumpText = True
        else:
            # getopt already rejects options it was not told about, so
            # this is only a safety net.
            error("unknown option %s\n"%opt)
            usage(exname)
            return

    dumper = PPTDumper(args[0], params)
    if not dumper.dump():
        error("FAILURE\n")
    if params.dumpText:
        print(globals.textdump.replace("\r", "\n"))


if __name__ == '__main__':
    main(sys.argv)

View file

@ -17,11 +17,9 @@
#================================================================
# Handle powerpoint files for recoll.
# Uses catppt from the catdoc utilities
# (http://ftp.45.free.net/~vitus/software/catdoc/)
# In my experience, this sometimes fails to extract text, printing only
# "Default Design" or "format par defaut" instead.
#
# Use unoconv, this is very slow, but catppt just can't handle the majority
# of semi-modern ppt files
#================================================================
# set variables
@ -96,72 +94,17 @@ umask 77
# !! Leave the following line unmodified !
#ENDRECFILTCOMMONCODE
# The flag and the probed command must be spelled "catppt": that is the
# name tested as $havecatppt and the command actually run further down.
havecatppt=no
iscmd catppt && havecatppt=yes
haveunoconv=no
iscmd unoconv && haveunoconv=yes
iscmd pdftotext || haveunoconv=no
filtersdir=`dirname $0`
checkcmds $filtersdir/ppt-dump.py
if test X$havecatppt = Xno -a X$haveunoconv = Xno ; then
# checkcmds will exit with the appropriate salutations
checkcmds catppt unoconv pdftotext
fi
mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text"
# This needs a temp dir because we first output pdf (outputting html
# would produce one file per page), and pdftotext can't read from
# stdin
if test z"$RECOLL_TMPDIR" != z; then
ttdir=$RECOLL_TMPDIR
elif test z"$TMPDIR" != z ; then
ttdir=$TMPDIR
else
ttdir=/tmp
fi
cat <<EOF
<html><head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
</head><body><pre>
EOF
tmpdir=$ttdir/rclppt_tmp$$
mkdir $tmpdir || exit 1
mkdir $tmpdir/rclppttmp || exit 1
# Escape HTML metacharacters in the extracted text. In a sed replacement a
# bare '&' stands for the matched text, so the entity must be written with
# '\&'. '&' is escaped first so the '&' of '&lt;' is not escaped again.
$mso "$infile" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'
# We have to use a directory as output parameter to unoconv. Up to
# version 0.5, it could not use a file name for this
unopdf=$tmpdir/rclppttmp
cattxt=$tmpdir/rclppttmp/output.txt
cleanup()
{
    # Remove the temporary work area. The fixed "rclppttmp" component
    # hopefully guarantees that we can't do big mistakes here; the paths
    # are quoted so a temp dir name containing spaces or glob characters
    # is not split or expanded.
    rm -rf "$tmpdir/rclppttmp"
    rmdir "$tmpdir"
}
trap cleanup EXIT HUP QUIT INT TERM
# Try catppt. If the output looks too small and unoconv is available,
# use this instead. unoconv is very slow but it handles newer files
# that catppt will not convert.
#
# I'm not sure of the right test for detecting catppt failure. On the
# sample I have, it outputs Azure\n1_Azure\n\n. I don't know if Azure
# is a good marker of failure. Anyway, it seems unlikely that a real
# ppt would have fewer than 5 lines
catppt -d utf-8 "$infile" > $cattxt
lines=`wc -l < $cattxt`
if test $lines -lt 5 -a X$haveunoconv = Xyes; then
    # catppt produced almost nothing: convert to pdf with unoconv and let
    # the pdf filter produce the output.
    unoconv -f pdf -o "$unopdf" "$infile"
    sinfile=`basename "$infile"`
    `dirname $0`/rclpdf "$unopdf/${sinfile%.*}.pdf"
else
    # output the catppt result
    echo '<html><head>'
    #echo '<title>' "$title" '</title>'
    echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
    echo '</head><body>'
    echo '<pre>'
    # The text is already in $cattxt, so catppt is not run a second time.
    # '&' must be escaped as '\&' in a sed replacement, and the '&' rule
    # runs first so the '&' of '&lt;' is not escaped again.
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' < "$cattxt"
    echo '</pre>'
    echo '</body></html>'
fi
echo '</pre></body></html>'