From c98bdb0edd5ebf5b222e1df1470a038a68f288f6 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 8 Apr 2016 10:24:52 +0200 Subject: [PATCH] converted rcldjvu to python --- src/Makefile.am | 2 +- src/filters/rcldjvu | 180 ---------------------------------------- src/filters/rcldjvu.py | 107 ++++++++++++++++++++++++ src/sampleconf/mimeconf | 2 +- 4 files changed, 109 insertions(+), 182 deletions(-) delete mode 100755 src/filters/rcldjvu create mode 100755 src/filters/rcldjvu.py diff --git a/src/Makefile.am b/src/Makefile.am index 6b0f25b7..72133dd7 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -566,7 +566,7 @@ filters/rclcheckneedretry.sh \ filters/rclchm \ filters/rclconfig.py \ filters/rcldia \ -filters/rcldjvu \ +filters/rcldjvu.py \ filters/rcldoc.py \ filters/rcldvi \ filters/rclepub \ diff --git a/src/filters/rcldjvu b/src/filters/rcldjvu deleted file mode 100755 index 93210a71..00000000 --- a/src/filters/rcldjvu +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/sh -# @(#$Id: rcldjvu,v 1.6 2008-10-08 08:27:34 dockes Exp $ (C) 2005 J.F.Dockes - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- -#================================================================ -# Extract text from a djvu file by executing djvused and djvutxt -# -# We use djvused to extract a possible title, djvutxt for the text -# -# Of course this only means anything if the djvu document actually has -# a text layer ! -# -# djvu utilities (04-2010) have a bug in which they try to interpret -# and convert file paths as character data, and fail miserably if the -# locale is not consistent with the actual encoding of the path (which -# could be arbitrary binary for all they know). We use a temporary -# symbolic link to get around this. -# -#================================================================ - -progname="rcldjvu" -filetype=dejavu - - -#RECFILTCOMMONCODE -############################################################################## -# !! Leave the previous line unmodified!! Code imported from the -# recfiltcommon file - -# Utility code common to all shell filters. This could be sourced at run -# time, but it's slightly more efficient to include the code in the -# filters at build time (with a sed script). - -# Describe error in a way that can be interpreted by our caller -senderror() -{ - echo RECFILTERROR $* - # Also alert on stderr just in case - echo ":2:$progname::: $*" 1>&2 - exit 1 -} - -iscmd() -{ - cmd=$1 - case $cmd in - */*) - if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;; - *) - oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs - for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done - return 1 ;; - esac -} - -checkcmds() -{ - for cmd in $*;do - if iscmd $cmd - then - a=1 - else - senderror HELPERNOTFOUND $cmd - fi - done -} - -# show help message -if test $# -ne 1 -o "$1" = "--help" -then - echo "Convert a $filetype file to HTML text for Recoll indexing." - echo "Usage: $progname [infile]" - exit 1 -fi - -infile="$1" - -# check the input file existence (may be '-' for stdin) -if test "X$infile" != X- -a ! 
-f "$infile" -then - senderror INPUTNOSUCHFILE "$infile" -fi - -# protect access to our temp files and directories -umask 77 - -############################################################################## -# !! Leave the following line unmodified ! -#ENDRECFILTCOMMONCODE - -checkcmds djvutxt djvused awk - -# We need a temporary symlink to avoid path encoding issues -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi -tmplink=$ttdir/rcldjvu_tmp$$.djvu -rm -f $tmplink -ln -s "$infile" $tmplink || exit 1 - -cleanup() -{ - rm -f $tmplink -} - -trap cleanup EXIT HUP QUIT INT TERM - -# Title: we try to extract it from the annotations. djvused outputs string -# in C/awk \-escaped notation. Awk can only process this in string -# constants, so we have a first awk pass to create an awk program to parse -# the string as a constant (...). This is not exactly robust or nice -title=`djvused "$tmplink" -e 'select 1;output-ant' | \ -grep ' (title ' | sed -e 's/^.* (title //' -e 's/)$//' |\ -awk ' -{ - printf("BEGIN" " {s = %s; print s}\n", $0) -}' | awk -f -` - - -cat < - - $title - - - -
-EOF
-
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-djvutxt "$tmplink" | sed -e 's/[ 	][ 	]*$//' | \
-awk 'BEGIN'\
-' {
-  cont = ""
-}
-{
-    $0 = cont $0
-    cont = ""
-
-    if ($0 == "\f") {
-       print "

\n
\n

" - next - } else if ($0 ~ /[-]$/) { - # Break at last whitespace - match($0, "[ \t][^ \t]+$") - line = substr($0, 0, RSTART) - cont = substr($0, RSTART, RLENGTH) - $0 = line - gsub("-", "", cont) - } - gsub(/&/, "\\&", $0) - gsub(//, "\\>", $0) - print $0 -}' - -cat < - - -EOF diff --git a/src/filters/rcldjvu.py b/src/filters/rcldjvu.py new file mode 100755 index 00000000..46198a51 --- /dev/null +++ b/src/filters/rcldjvu.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# Copyright (C) 2016 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
class DJVUExtractor:
    """Recoll execm handler turning a DjVu document into indexable HTML.

    The main text is produced by the ``djvutxt`` command, which is
    required. The title and author are read with ``djvused``, which is
    optional: when it is missing or fails, the document is still indexed,
    just without these two fields.
    """

    def __init__(self, em):
        """Remember the protocol handler; helper lookup happens in openfile().

        em is the protocol object created by the main program
        (rclexecm.RclExecM). It is used here for logging (rclog), HTML
        escaping (htmlescape) and for setting the output MIME type.
        """
        self.currentindex = 0
        self.djvused = None
        self.djvutxt = None
        self.em = em

    def _readmeta(self):
        """Return (title, author) read through djvused, "" when unavailable."""
        # Start from empty output so that a missing or failing djvused
        # degrades to "no metadata" instead of an unbound variable.
        metadata = b""
        if self.djvused:
            try:
                metadata = subprocess.check_output(
                    [self.djvused, self.filename, "-e", "select 1;print-meta"])
            except Exception as e:
                self.em.rclog("djvused failed: %s" % e)

        title = ""
        author = ""
        # Each useful output line is parsed as: name "value". Splitting on
        # the double quote leaves the field name in the first element.
        for line in metadata.decode('UTF-8', 'replace').split('\n'):
            fields = line.split('"')
            if len(fields) < 2:
                continue
            name = fields[0].strip()
            value = ' '.join(fields[1:]).strip()
            if name == "author":
                author = value
            elif name == "title":
                title = value
        return title, author

    def extractone(self, params):
        """Build the HTML version of the currently opened file.

        Returns the (ok, data, ipath, eof) tuple expected by rclexecm. ok
        is False, with eofnow, when djvutxt could not produce the text.
        """
        self.em.setmimetype('text/html')

        title, author = self._readmeta()

        # Main text
        try:
            txtdata = subprocess.check_output(
                [self.djvutxt, "--escape", self.filename])
        except Exception as e:
            self.em.rclog("djvutxt failed: %s" % e)
            return (False, "", "", rclexecm.RclExecM.eofnow)
        txtdata = txtdata.decode('UTF-8', 'replace')

        parts = ['''<html><head><title>''', self.em.htmlescape(title),
                 '''</title>''',
                 '''<meta http-equiv="Content-Type" '''
                 '''content="text/html;charset=UTF-8">''']
        if author:
            parts += ['''<meta name="author" content="''',
                      self.em.htmlescape(author).strip(), '''">''']
        parts += ['''</head><body><pre>''', self.em.htmlescape(txtdata),
                  '''</pre></body></html>''']
        return (True, "".join(parts), "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Record the file to process and locate the helper commands.

        The commands are only looked up while djvutxt is still unknown.
        The filter exits with a HELPERNOTFOUND error line on stdout when
        djvutxt cannot be found. djvused is optional and may stay unset.
        """
        self.filename = params["filename:"]
        self.currentindex = 0
        #self.em.rclog("openfile: [%s]" % self.filename)

        if not self.djvutxt:
            self.djvutxt = rclexecm.which("djvutxt")
            if not self.djvutxt:
                print("RECFILTERROR HELPERNOTFOUND djvutxt")
                sys.exit(1)
            self.djvused = rclexecm.which("djvused")

        return True

    def getipath(self, params):
        """Return the document: there is a single one, whatever the ipath."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the document on the first call, then signal end of file."""
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret