PPT filter: use mso-dump
This commit is contained in:
parent
2f73e1df75
commit
44995858f5
3 changed files with 154 additions and 70 deletions
BIN
src/filters/msodump.zip
Normal file
BIN
src/filters/msodump.zip
Normal file
Binary file not shown.
141
src/filters/ppt-dump.py
Executable file
141
src/filters/ppt-dump.py
Executable file
|
@ -0,0 +1,141 @@
|
|||
#!/usr/bin/env python2
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person
|
||||
# obtaining a copy of this software and associated documentation
|
||||
# files (the "Software"), to deal in the Software without
|
||||
# restriction, including without limitation the rights to use,
|
||||
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following
|
||||
# conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be
|
||||
# included in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
# OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
import sys, os.path, getopt
|
||||
sys.path.append(sys.path[0]+"/msodump.zip/src")
|
||||
import ole, pptstream, globals, olestream
|
||||
|
||||
from globals import error
|
||||
|
||||
def usage (exname):
    """Print the command-line help text to standard output.

    exname is the path the script was invoked as; only its base name is
    shown in the usage line.
    """
    exname = os.path.basename(exname)
    msg = """Usage: %s [options] [ppt file]

Options:
--help displays this help message.
--no-struct-output suppress normal disassembly output
--dump-text print the textual content
"""%exname
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(msg)
|
||||
|
||||
|
||||
class PPTDumper(object):
    """Dump the directory streams of a PowerPoint file through mso-dump.

    The file is parsed with pptstream.PPTFile and every directory stream
    except the root entry is written out via the helpers in the project
    `globals` module.
    """

    def __init__ (self, filepath, params):
        """Store the input path and the options object.

        filepath -- path of the file to dump.
        params   -- options object forwarded unchanged to pptstream.PPTFile.
        """
        self.filepath = filepath
        self.params = params

    def __printDirHeader (self, dirname, byteLen):
        """Output a banner with the encoded stream name and its byte length."""
        dirname = globals.encodeName(dirname)
        globals.outputln("")
        globals.outputln("="*68)
        globals.outputln("%s (size: %d bytes)"%(dirname, byteLen))
        globals.outputln("-"*68)

    def dump (self):
        """Dump every directory stream of the file.

        Returns True unless reading the records of a "PowerPoint Document"
        or "Current User" stream reported failure.  A failure to fetch a
        stream is reported through error() and stops the loop without
        changing the result, as before.
        """
        # The context manager closes the file even if read() raises.
        with open(self.filepath, 'rb') as infile:
            data = infile.read()
        strm = pptstream.PPTFile(data, self.params)
        strm.printStreamInfo()
        strm.printHeader()
        strm.printDirectory()
        dirnames = strm.getDirectoryNames()
        result = True
        for dirname in dirnames:
            if len(dirname) == 0 or dirname == 'Root Entry':
                continue

            try:
                dirstrm = strm.getDirectoryStreamByName(dirname)
            except Exception as err:
                error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err)))
                # The previous version was killed by the exception
                # here, so the equivalent is to break, but maybe there
                # is no reason to do so.
                break
            self.__printDirHeader(dirname, len(dirstrm.bytes))
            if dirname in ("PowerPoint Document", "Current User"):
                if not self.__readSubStream(dirstrm):
                    result = False
            elif dirname == "\x05DocumentSummaryInformation":
                # Use a separate name: rebinding `strm` here would replace
                # the PPTFile object that later iterations still query.
                propstrm = olestream.PropertySetStream(dirstrm.bytes)
                propstrm.read()
            else:
                globals.dumpBytes(dirstrm.bytes, 512)
        return result

    def __readSubStream (self, strm):
        """Read all records in the substream and return readRecords()'s result."""
        return strm.readRecords()
|
||||
|
||||
|
||||
def main (args):
    """Command-line entry point.

    args is the full argument vector, program name first.  Options are
    parsed with getopt, the first remaining argument is dumped through
    PPTDumper, and the collected text is printed when --dump-text was given.
    Returns None in every case.
    """
    exname, args = args[0], args[1:]
    if len(args) < 1:
        print("takes at least one argument")
        usage(exname)
        return

    params = globals.Params()
    try:
        opts, args = getopt.getopt(args, "h",
                                   ["help", "debug", "show-sector-chain",
                                    "no-struct-output", "dump-text"])
    except getopt.GetoptError:
        error("error parsing input options\n")
        usage(exname)
        return

    for opt, _ in opts:
        if opt in ('-h', '--help'):
            usage(exname)
            return
        elif opt == '--debug':
            params.debug = True
        elif opt == '--show-sector-chain':
            params.showSectorChain = True
        elif opt == '--no-struct-output':
            globals.muteOutput(1)
            params.noStructOutput = True
        elif opt == '--dump-text':
            params.dumpText = True
        else:
            # getopt rejects unknown options itself, so this is only a
            # safety net; usage() needs the program name to format its text.
            error("unknown option %s\n"%opt)
            usage(exname)
            return

    # A command line made only of options passes the first length check but
    # leaves no file name once getopt has consumed the options.
    if len(args) < 1:
        print("takes at least one argument")
        usage(exname)
        return

    dumper = PPTDumper(args[0], params)
    if not dumper.dump():
        error("FAILURE\n")
    if params.dumpText:
        print(globals.textdump.replace("\r", "\n"))
|
||||
|
||||
# Run only when executed as a script; main() receives the complete argv,
# program name included.
if __name__ == "__main__":
    main(sys.argv)
|
|
@ -17,11 +17,9 @@
|
|||
|
||||
#================================================================
|
||||
# Handle powerpoint files for recoll.
|
||||
# Uses catppt from the catdoc utilities
|
||||
# (http://ftp.45.free.net/~vitus/software/catdoc/)
|
||||
# In my experience, this sometimes fails to extract text, printing only "Default
|
||||
# Design" or "format par defaut" instead.
|
||||
#
|
||||
# Use unoconv, this is very slow, but catppt just can't handle the majority
|
||||
# of semi-modern ppt files
|
||||
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
|
@ -96,72 +94,17 @@ umask 77
|
|||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
havecappt=no
|
||||
iscmd cappt && havecappt=yes
|
||||
haveunoconv=no
|
||||
iscmd unoconv && haveunoconv=yes
|
||||
iscmd pdftotext || haveunoconv=no
|
||||
filtersdir=`dirname $0`
|
||||
checkcmds $filtersdir/ppt-dump.py
|
||||
|
||||
if test X$havecatppt = Xno -a X$haveunoconv = Xno ; then
|
||||
# checkcmds will exit with the appropriate salutations
|
||||
checkcmds catppt unoconv pdftotext
|
||||
fi
|
||||
mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text"
|
||||
|
||||
# This needs a temp dir because we first output pdf (outputting html
|
||||
# would produce one file per page), and pdftotext can't read from
|
||||
# stdin
|
||||
if test z"$RECOLL_TMPDIR" != z; then
|
||||
ttdir=$RECOLL_TMPDIR
|
||||
elif test z"$TMPDIR" != z ; then
|
||||
ttdir=$TMPDIR
|
||||
else
|
||||
ttdir=/tmp
|
||||
fi
|
||||
cat <<EOF
|
||||
<html><head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
|
||||
</head><body><pre>
|
||||
EOF
|
||||
|
||||
tmpdir=$ttdir/rclppt_tmp$$
|
||||
mkdir $tmpdir || exit 1
|
||||
mkdir $tmpdir/rclppttmp || exit 1
|
||||
$mso "$infile"| sed -e 's/</</g' -e 's/&/&/g'
|
||||
|
||||
# We have to use a directory as output parameter to unoconv. Up to
|
||||
# version 0.5, it could not use a file name for this
|
||||
unopdf=$tmpdir/rclppttmp
|
||||
cattxt=$tmpdir/rclppttmp/output.txt
|
||||
cleanup()
|
||||
{
|
||||
# Note that we're using a constant part (rclppttmp), which
|
||||
# hopefully guarantees that we can't do big mistakes here.
|
||||
rm -rf $tmpdir/rclppttmp
|
||||
rmdir $tmpdir
|
||||
}
|
||||
trap cleanup EXIT HUP QUIT INT TERM
|
||||
|
||||
# Try catppt. If the output looks too small and unoconv is available,
|
||||
# use this instead. unoconv is very slow but it handles newer files
|
||||
# that catppt will not convert.
|
||||
#
|
||||
# I'm not sure of the right test for detecting catppt failure. On the
|
||||
# sample I have, it outputs Azure\n1_Azure\n\n. I don't know if Azure
|
||||
# is a good marker of failure. Anyway, it seems unlikely that a real
|
||||
# ppt would have fewer than 5 lines
|
||||
|
||||
catppt -d utf-8 "$infile" > $cattxt
|
||||
lines=`wc -l < $cattxt`
|
||||
|
||||
if test $lines -lt 5 -a X$haveunoconv = Xyes; then
|
||||
unoconv -f pdf -o "$unopdf" "$infile"
|
||||
sinfile=`basename "$infile"`
|
||||
`dirname $0`/rclpdf "$unopdf/${sinfile%.*}.pdf"
|
||||
else
|
||||
# output the catppt result
|
||||
echo '<html><head>'
|
||||
#echo '<title>' "$title" '</title>'
|
||||
echo '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
|
||||
echo '</head><body>'
|
||||
echo '<pre>'
|
||||
|
||||
catppt -d utf-8 "$infile" | \
|
||||
sed -e 's/</</g' -e 's/&/&/g' < $cattxt
|
||||
|
||||
echo '</pre>'
|
||||
echo '</body></html>'
|
||||
fi
|
||||
echo '</pre></body></html>'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue