PPT filter: use mso-dump
This commit is contained in:
parent
2f73e1df75
commit
44995858f5
3 changed files with 154 additions and 70 deletions
BIN
src/filters/msodump.zip
Normal file
BIN
src/filters/msodump.zip
Normal file
Binary file not shown.
141
src/filters/ppt-dump.py
Executable file
141
src/filters/ppt-dump.py
Executable file
|
@ -0,0 +1,141 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
########################################################################
|
||||||
|
#
|
||||||
|
# Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person
|
||||||
|
# obtaining a copy of this software and associated documentation
|
||||||
|
# files (the "Software"), to deal in the Software without
|
||||||
|
# restriction, including without limitation the rights to use,
|
||||||
|
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
# copies of the Software, and to permit persons to whom the
|
||||||
|
# Software is furnished to do so, subject to the following
|
||||||
|
# conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be
|
||||||
|
# included in all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
# OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
#
|
||||||
|
########################################################################
|
||||||
|
|
||||||
|
import sys, os.path, getopt
|
||||||
|
sys.path.append(sys.path[0]+"/msodump.zip/src")
|
||||||
|
import ole, pptstream, globals, olestream
|
||||||
|
|
||||||
|
from globals import error
|
||||||
|
|
||||||
|
def usage (exname):
    """Print the command-line help text to standard output.

    exname -- path the script was invoked as; only its base name is shown
              in the usage line.
    """
    exname = os.path.basename(exname)
    # Every option accepted by the getopt specification in main() is listed
    # here so that the help text and the parser stay in sync.
    msg = """Usage: %s [options] [ppt file]

Options:
  -h, --help            displays this help message.
  --debug               turn on debug mode
  --show-sector-chain   show the sector chain
  --no-struct-output    suppress normal disassembly output
  --dump-text           print the textual content
""" % exname
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(msg)
|
||||||
|
|
||||||
|
|
||||||
|
class PPTDumper(object):
    """Dump the OLE directory streams of a PowerPoint file.

    filepath -- path of the file to read.
    params   -- option holder passed unchanged to pptstream.PPTFile
                (main() passes a globals.Params instance).
    """

    def __init__ (self, filepath, params):
        self.filepath = filepath
        self.params = params

    def __printDirHeader (self, dirname, byteLen):
        """Output a banner showing the stream name and its size in bytes."""
        dirname = globals.encodeName(dirname)
        globals.outputln("")
        globals.outputln("="*68)
        globals.outputln("%s (size: %d bytes)"%(dirname, byteLen))
        globals.outputln("-"*68)

    def dump (self):
        """Dump the file header, the directory and every directory stream.

        Returns False when reading the records of the "PowerPoint Document"
        or "Current User" stream reports a failure, True otherwise.  If a
        stream cannot be fetched, the error is reported, the loop stops and
        the result gathered so far is returned.
        """
        # The context manager closes the file even when read() raises.
        with open(self.filepath, 'rb') as infile:
            chars = infile.read()
        strm = pptstream.PPTFile(chars, self.params)

        strm.printStreamInfo()
        strm.printHeader()
        strm.printDirectory()

        result = True
        for dirname in strm.getDirectoryNames():
            if len(dirname) == 0 or dirname == 'Root Entry':
                continue

            try:
                dirstrm = strm.getDirectoryStreamByName(dirname)
            except Exception as err:
                error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err)))
                # The previous version was killed by the exception
                # here, so the equivalent is to break, but maybe there
                # is no reason to do so.
                break

            self.__printDirHeader(dirname, len(dirstrm.bytes))
            if dirname in ("PowerPoint Document", "Current User"):
                if not self.__readSubStream(dirstrm):
                    result = False
            elif dirname == "\x05DocumentSummaryInformation":
                # Use a separate name: rebinding strm here would make the
                # next getDirectoryStreamByName() call go to the property
                # set stream instead of the PPT file object.
                propstrm = olestream.PropertySetStream(dirstrm.bytes)
                propstrm.read()
            else:
                globals.dumpBytes(dirstrm.bytes, 512)

        return result

    def __readSubStream (self, strm):
        """Read all records in the substream; return readRecords()'s result."""
        return strm.readRecords()
|
||||||
|
|
||||||
|
|
||||||
|
def main (args):
    """Parse the command line and dump the requested PowerPoint file.

    args -- full argument vector; args[0] is the program name.

    Always returns None.  Problems are reported through error() and the
    usage text.
    """
    exname, args = args[0], args[1:]
    if len(args) < 1:
        print("takes at least one argument")
        usage(exname)
        return

    params = globals.Params()
    # Only the getopt call itself can raise GetoptError, so the try block
    # is limited to it.
    try:
        opts, args = getopt.getopt(args, "h",
                                   ["help", "debug", "show-sector-chain",
                                    "no-struct-output", "dump-text"])
    except getopt.GetoptError:
        error("error parsing input options\n")
        usage(exname)
        return

    for opt, arg in opts:
        if opt in ['-h', '--help']:
            usage(exname)
            return
        elif opt in ['--debug']:
            params.debug = True
        elif opt in ['--show-sector-chain']:
            params.showSectorChain = True
        elif opt in ['--no-struct-output']:
            globals.muteOutput(1)
            params.noStructOutput = True
        elif opt in ['--dump-text']:
            params.dumpText = True
        else:
            # Defensive only: getopt rejects unknown options before this
            # point.  usage() needs the program name, and nothing should be
            # dumped after an option error.
            error("unknown option %s\n"%opt)
            usage(exname)
            return

    if not args:
        # Only options were supplied, so there is no file to open.
        error("no input file specified\n")
        usage(exname)
        return

    dumper = PPTDumper(args[0], params)
    if not dumper.dump():
        error("FAILURE\n")
    if params.dumpText:
        print(globals.textdump.replace("\r", "\n"))
|
||||||
|
|
||||||
|
# Run the dumper only when this file is executed as a script; the complete
# argument vector, program name included, is handed to main().
if __name__ == "__main__":
    main(sys.argv)
|
|
@ -17,11 +17,9 @@
|
||||||
|
|
||||||
#================================================================
|
#================================================================
|
||||||
# Handle powerpoint files for recoll.
|
# Handle powerpoint files for recoll.
|
||||||
# Uses catppt from the catdoc utilities
|
# Use unoconv, this is very slow, but catppt just can't handle the majority
|
||||||
# (http://ftp.45.free.net/~vitus/software/catdoc/)
|
# of semi-modern ppt files
|
||||||
# In my experience, this sometimes fail to extract text, printing "Default
|
|
||||||
# Design" ou "format par defaut" instead and only.
|
|
||||||
#
|
|
||||||
#================================================================
|
#================================================================
|
||||||
|
|
||||||
# set variables
|
# set variables
|
||||||
|
@ -96,72 +94,17 @@ umask 77
|
||||||
# !! Leave the following line unmodified !
|
# !! Leave the following line unmodified !
|
||||||
#ENDRECFILTCOMMONCODE
|
#ENDRECFILTCOMMONCODE
|
||||||
|
|
||||||
havecappt=no
|
filtersdir=`dirname $0`
|
||||||
iscmd cappt && havecappt=yes
|
checkcmds $filtersdir/ppt-dump.py
|
||||||
haveunoconv=no
|
|
||||||
iscmd unoconv && haveunoconv=yes
|
|
||||||
iscmd pdftotext || haveunoconv=no
|
|
||||||
|
|
||||||
if test X$havecatppt = Xno -a X$haveunoconv = Xno ; then
|
mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text"
|
||||||
# checkcmds will exit with the appropriate salutations
|
|
||||||
checkcmds catppt unoconv pdftotext
|
|
||||||
fi
|
|
||||||
|
|
||||||
# This needs a temp dir because we first output pdf (outputting html
|
cat <<EOF
|
||||||
# would produce one file per page), and pdftotext can't read from
|
<html><head>
|
||||||
# stdin
|
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
|
||||||
if test z"$RECOLL_TMPDIR" != z; then
|
</head><body><pre>
|
||||||
ttdir=$RECOLL_TMPDIR
|
EOF
|
||||||
elif test z"$TMPDIR" != z ; then
|
|
||||||
ttdir=$TMPDIR
|
|
||||||
else
|
|
||||||
ttdir=/tmp
|
|
||||||
fi
|
|
||||||
|
|
||||||
tmpdir=$ttdir/rclppt_tmp$$
|
$mso "$infile"| sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g'
|
||||||
mkdir $tmpdir || exit 1
|
|
||||||
mkdir $tmpdir/rclppttmp || exit 1
|
|
||||||
|
|
||||||
# We have to use a directory as output parameter to unoconv. Up to
|
echo '</pre></body></html>'
|
||||||
# version 0.5, it could not use a file name for this
|
|
||||||
unopdf=$tmpdir/rclppttmp
|
|
||||||
cattxt=$tmpdir/rclppttmp/output.txt
|
|
||||||
cleanup()
|
|
||||||
{
|
|
||||||
# Note that we're using a constant part (rclppttmp), which
|
|
||||||
# hopefully guarantees that we can't do big mistakes here.
|
|
||||||
rm -rf $tmpdir/rclppttmp
|
|
||||||
rmdir $tmpdir
|
|
||||||
}
|
|
||||||
trap cleanup EXIT HUP QUIT INT TERM
|
|
||||||
|
|
||||||
# Try catppt. If the output looks too small and unoconv is available,
|
|
||||||
# use this instead. unoconv is very slow but it handles newer files
|
|
||||||
# that catppt will not convert.
|
|
||||||
#
|
|
||||||
# I'm not sure of the right test for detecting catppt failure. On the
|
|
||||||
# sample I have, it outputs Azure\n1_Azure\n\n. I don't know if Azure
|
|
||||||
# is a good marker of failure. Anyway, it seems unlikely that a real
|
|
||||||
# ppt would have fewer than 5 lines
|
|
||||||
|
|
||||||
catppt -d utf-8 "$infile" > $cattxt
|
|
||||||
lines=`wc -l < $cattxt`
|
|
||||||
|
|
||||||
if test $lines -lt 5 -a X$haveunoconv = Xyes; then
|
|
||||||
unoconv -f pdf -o "$unopdf" "$infile"
|
|
||||||
sinfile=`basename "$infile"`
|
|
||||||
`dirname $0`/rclpdf "$unopdf/${sinfile%.*}.pdf"
|
|
||||||
else
|
|
||||||
# output the catppt result
|
|
||||||
echo '<html><head>'
|
|
||||||
#echo '<title>' "$title" '</title>'
|
|
||||||
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
|
|
||||||
echo '</head><body>'
|
|
||||||
echo '<pre>'
|
|
||||||
|
|
||||||
catppt -d utf-8 "$infile" | \
|
|
||||||
sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' < $cattxt
|
|
||||||
|
|
||||||
echo '</pre>'
|
|
||||||
echo '</body></html>'
|
|
||||||
fi
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue