From c98bdb0edd5ebf5b222e1df1470a038a68f288f6 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Fri, 8 Apr 2016 10:24:52 +0200
Subject: [PATCH] converted rcldjvu to python

---
 src/Makefile.am         |   2 +-
 src/filters/rcldjvu     | 180 ----------------------------------------
 src/filters/rcldjvu.py  | 107 ++++++++++++++++++++++++
 src/sampleconf/mimeconf |   2 +-
 4 files changed, 109 insertions(+), 182 deletions(-)
 delete mode 100755 src/filters/rcldjvu
 create mode 100755 src/filters/rcldjvu.py

diff --git a/src/Makefile.am b/src/Makefile.am
index 6b0f25b7..72133dd7 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -566,7 +566,7 @@ filters/rclcheckneedretry.sh \
 filters/rclchm \
 filters/rclconfig.py \
 filters/rcldia \
-filters/rcldjvu \
+filters/rcldjvu.py \
 filters/rcldoc.py \
 filters/rcldvi \
 filters/rclepub \
diff --git a/src/filters/rcldjvu b/src/filters/rcldjvu
deleted file mode 100755
index 93210a71..00000000
--- a/src/filters/rcldjvu
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/bin/sh
-# @(#$Id: rcldjvu,v 1.6 2008-10-08 08:27:34 dockes Exp $  (C) 2005 J.F.Dockes
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
-
-#================================================================
-# Extract text from a djvu file by executing djvused and djvutxt
-#
-# We use djvused to extract a possible title, djvutxt for the text
-#
-# Of course this only means anything if the djvu document actually has
-# a text layer !
-#
-# djvu utilities (04-2010) have a bug in which they try to interpret
-# and convert file paths as character data, and fail miserably if the
-# locale is not consistent with the actual encoding of the path (which
-# could be arbitrary binary for all they know). We use a temporary
-# symbolic link to get around this.
-# 
-#================================================================
-
-progname="rcldjvu"
-filetype=dejavu
-
-
-#RECFILTCOMMONCODE
-##############################################################################
-# !! Leave the previous line unmodified!! Code imported from the
-# recfiltcommon file
-
-# Utility code common to all shell filters. This could be sourced at run
-# time, but it's slightly more efficient to include the code in the
-# filters at build time (with a sed script).
-
-# Describe error in a way that can be interpreted by our caller
-senderror()
-{
-    echo RECFILTERROR $*
-    # Also alert on stderr just in case
-    echo ":2:$progname::: $*" 1>&2
-    exit 1
-}
-
-iscmd()
-{
-    cmd=$1
-    case $cmd in
-    */*)
-	if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
-    *)
-      oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
-      for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
-      return 1 ;;
-    esac
-}
-
-checkcmds()
-{
-    for cmd in $*;do
-      if iscmd $cmd 
-      then 
-        a=1
-      else 
-        senderror HELPERNOTFOUND $cmd
-      fi
-    done
-}
-
-# show help message
-if test $# -ne 1 -o "$1" = "--help" 
-then
-  echo "Convert a $filetype file to HTML text for Recoll indexing."
-  echo "Usage: $progname [infile]"
-  exit 1
-fi
-
-infile="$1"
-
-# check the input file existence (may be '-' for stdin)
-if test "X$infile" != X- -a ! -f "$infile"
-then
-  senderror INPUTNOSUCHFILE "$infile"
-fi
-
-# protect access to our temp files and directories
-umask 77
-
-##############################################################################
-# !! Leave the following line unmodified !
-#ENDRECFILTCOMMONCODE
-
-checkcmds djvutxt djvused awk
-
-# We need a temporary symlink to avoid path encoding issues
-if test z"$RECOLL_TMPDIR" != z; then
-   ttdir=$RECOLL_TMPDIR
-elif test z"$TMPDIR" != z ; then
-   ttdir=$TMPDIR
-else
-   ttdir=/tmp
-fi
-tmplink=$ttdir/rcldjvu_tmp$$.djvu
-rm -f $tmplink
-ln -s "$infile" $tmplink || exit 1
-
-cleanup()
-{
-    rm -f $tmplink
-}
-    
-trap cleanup EXIT HUP QUIT INT TERM
-
-# Title: we try to extract it from the annotations. djvused outputs string
-# in C/awk \-escaped notation. Awk can only process this in string
-# constants, so we have a first awk pass to create an awk program to parse
-# the string as a constant (...). This is not exactly robust or nice
-title=`djvused "$tmplink" -e 'select 1;output-ant' | \
-grep ' (title ' | sed -e 's/^.* (title //' -e 's/)$//' |\
-awk '
-{
-      printf("BEGIN" " {s = %s; print s}\n", $0)
-}' | awk -f -`
-
-
-cat <<EOF
-<html>
-<head>
-    <title>$title</title>
-    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
-</head>
-<body>
-<pre>
-EOF
-
-# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
-# is an awk program
-djvutxt "$tmplink" | sed -e 's/[ 	][ 	]*$//' | \
-awk 'BEGIN'\
-' {
-  cont = ""
-}
-{
-    $0 = cont $0
-    cont = ""
-
-    if ($0 == "\f") {
-       print "</p>\n<hr>\n<p>"
-       next
-    } else if ($0 ~ /[-]$/) {
-      # Break at last whitespace
-      match($0, "[ \t][^ \t]+$")
-      line = substr($0, 0, RSTART)
-      cont = substr($0, RSTART, RLENGTH)
-      $0 = line
-      gsub("-", "", cont)
-    }
-    gsub(/&/, "\\&amp;", $0)
-    gsub(/</, "\\&lt;", $0)
-    gsub(/>/, "\\&gt;", $0)
-    print $0      
-}'
-
-cat <<EOF
-</pre>
-</body>
-</html>
-EOF
diff --git a/src/filters/rcldjvu.py b/src/filters/rcldjvu.py
new file mode 100755
index 00000000..46198a51
--- /dev/null
+++ b/src/filters/rcldjvu.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# Copyright (C) 2016 J.F.Dockes
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+# Recoll DJVU extractor
+
+from __future__ import print_function
+
+import os
+import sys
+import re
+import rclexecm
+import subprocess
+
+class DJVUExtractor:
+    def __init__(self, em):
+        self.currentindex = 0
+        self.djvused = None
+        self.djvutxt = None
+        self.em = em
+
+    def extractone(self, params):
+        self.em.setmimetype('text/html')
+
+        # Extract metadata
+        if self.djvused:
+            try:
+                metadata = subprocess.check_output([self.djvused, self.filename,
+                                                    "-e", "select 1;print-meta"])
+            except Exception as e:
+                self.em.rclog("djvused failed: %s" % e)
+        author = ""
+        title = ""
+        metadata = metadata.decode('UTF-8', 'replace')
+        for line in metadata.split('\n'):
+            line = line.split('"')
+            if len(line) >= 2:
+                nm = line[0].strip()
+                if nm == "author":
+                    author = ' '.join(line[1:])
+                elif nm == "title":
+                    title = ' '.join(line[1:])
+
+        # Main text
+        try:
+            txtdata = subprocess.check_output([self.djvutxt, "--escape", self.filename])
+        except Exception as e:
+            self.em.rclog("djvused failed: %s" % e)
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        txtdata = txtdata.decode('UTF-8', 'replace')
+
+        data = '''<html><head><title>''' + self.em.htmlescape(title) + '''</title>'''
+        data += '''<meta http-equiv="Content-Type" '''
+        data += '''content="text/html;charset=UTF-8">'''
+        if author:
+            data += '''<meta name="author" content="''' + \
+                    self.em.htmlescape(author) + '''">'''
+        data += '''</head><body><pre>'''
+
+        data += self.em.htmlescape(txtdata)
+        data += '''</pre></body></html>'''
+        return (True, data, "", rclexecm.RclExecM.eofnext)
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        self.filename = params["filename:"]
+        self.currentindex = 0
+        #self.em.rclog("openfile: [%s]" % self.filename)
+
+        if not self.djvutxt:
+            self.djvutxt = rclexecm.which("djvutxt")
+            if not self.djvutxt:
+                print("RECFILTERROR HELPERNOTFOUND djvutxt")
+                sys.exit(1);
+            self.djvused = rclexecm.which("djvused")
+
+        return True
+
+    def getipath(self, params):
+        return self.extractone(params)
+        return (ok, data, ipath, eof)
+        
+    def getnext(self, params):
+        if self.currentindex >= 1:
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        else:
+            ret= self.extractone(params)
+            self.currentindex += 1
+            return ret
+
+# Main program: create protocol handler and extractor and run them
+proto = rclexecm.RclExecM()
+extract = DJVUExtractor(proto)
+rclexecm.main(proto, extract)
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf
index 50b73ea0..7e3fbc04 100644
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -128,7 +128,7 @@ image/jp2 = execm rclimg
 image/jpeg = execm rclimg
 image/png = execm rclimg
 image/tiff = execm rclimg
-image/vnd.djvu = exec rcldjvu
+image/vnd.djvu = execm rcldjvu.py
 image/svg+xml = execm rclsvg.py
 image/x-xcf = execm rclimg
 inode/symlink = internal