#!/usr/bin/env python

# Tar-file filter for Recoll
# Thanks to Recoll user Martin Ziegler
# This is a modified version of /usr/share/recoll/filters/rclzip
# It works not only for tar-files, but automatically for gzipped and
# bzipped tar-files as well.

import sys

import rclexecm

try:
    import tarfile
except ImportError:
    # Tell the indexer which helper is missing, then give up.
    print("RECFILTERROR HELPERNOTFOUND python:tarfile")
    sys.exit(1)

try:
    _text_type = unicode
except NameError:
    # No separate unicode type on this interpreter: str is the text type.
    _text_type = str


class TarExtractor:
    """Serve the regular-file members of a tar archive one at a time.

    An instance is passed to rclexecm.main() together with the protocol
    object. extractone(), getipath() and getnext() all return a tuple
    (ok, data, ipath, eof) where eof is one of the rclexecm.RclExecM
    end-of-file markers (noteof, eofnext, eofnow).
    """

    def __init__(self, em):
        """Remember the protocol object *em* and start with no archive."""
        self.currentindex = 0
        self.em = em
        # Names of the regular-file members of the currently open archive.
        self.namen = []
        # Currently open tarfile.TarFile, or None before the first openfile().
        self.tar = None

    def _closetar(self):
        """Close the currently open archive, if any, and forget it."""
        if self.tar is None:
            return
        try:
            self.tar.close()
        except Exception:
            # Best effort: a failing close must not prevent opening the
            # next archive.
            pass
        self.tar = None

    def extractone(self, ipath):
        """Read the member named *ipath* from the open archive.

        Returns (ok, data, ipath, eof). ok is False and data is "" when the
        member cannot be read (no archive open, unknown name, member
        without data, read error). ipath is returned utf-8 encoded if it
        was passed as a unicode string.
        """
        docdata = ""
        ok = False
        try:
            member = self.tar.extractfile(ipath)
            # extractfile() returns None for members that carry no data.
            if member is not None:
                try:
                    docdata = member.read()
                finally:
                    member.close()
                ok = True
        except Exception:
            docdata = ""
            ok = False

        # Announce the end one step early when this is the last name.
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.namen) - 1:
            iseof = rclexecm.RclExecM.eofnext
        if isinstance(ipath, _text_type):
            ipath = ipath.encode("utf-8")
        return (ok, docdata, ipath, iseof)

    def openfile(self, params):
        """Open the archive named by params["filename:"].

        Mode 'r' lets tarfile detect the compression by itself. Returns
        True on success, False if the archive cannot be opened or listed.
        Any previously opened archive is closed first, and the member list
        is left empty on failure so that stale names are never served.
        """
        self.currentindex = 0
        self.namen = []
        self._closetar()
        try:
            self.tar = tarfile.open(name=params["filename:"], mode='r')
            self.namen = [member.name for member in self.tar.getmembers()
                          if member.isfile()]
            return True
        except Exception:
            self.namen = []
            self._closetar()
            return False

    def getipath(self, params):
        """Extract the member designated by params["ipath:"].

        The name is first tried as received. If that fails it is decoded
        from utf-8 and tried again. When decoding itself fails, the result
        of the first attempt is returned.
        """
        ipath = params["ipath:"]
        ok, data, ipath, eof = self.extractone(ipath)
        if ok:
            return (ok, data, ipath, eof)
        try:
            ipath = ipath.decode("utf-8")
            return self.extractone(ipath)
        except Exception:
            return (ok, data, ipath, eof)

    def getnext(self, params):
        """Extract the next member in archive order.

        Returns (False, "", "", eofnow) once every name has been served;
        the name list is emptied at that point.
        """
        if self.currentindex >= len(self.namen):
            self.namen = []
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(self.namen[self.currentindex])
        self.currentindex += 1
        return ret


proto = rclexecm.RclExecM()
extract = TarExtractor(proto)
rclexecm.main(proto, extract)
a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -86,6 +86,7 @@ application/x-perl = internal text/plain application/x-rar = execm rclrar;charset=default application/x-scribus = exec rclscribus application/x-shellscript = internal text/plain +#application/x-tar = execm rcltar application/x-tex = exec rcltex application/x-webarchive = execm rclwar application/zip = execm rclzip;charset=default @@ -299,6 +300,7 @@ other = application/vnd.sun.xml.draw \ application/x-fsdirectory \ application/x-mimehtml \ application/x-rar \ + application/x-tar \ application/x-webarchive \ application/zip \ diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap index c27aec14..05e7f38f 100644 --- a/src/sampleconf/mimemap +++ b/src/sampleconf/mimemap @@ -53,6 +53,16 @@ #.Z = application/x-compress .zip = application/zip +# The rcltar module can handle compressed tar formats internally, so we +# use application/x-tar for all tar files, compressed or not. Note that tar +# file indexing is disabled by default: to enable it, copy and uncomment +# the commented-out application/x-tar line from mimeconf into your personal config +.tar = application/x-tar +.tar.gz = application/x-tar +.tgz = application/x-tar +.tbz = application/x-tar +.tar.bz2 = application/x-tar + .doc = application/msword .ppt = application/vnd.ms-powerpoint .xls = application/vnd.ms-excel @@ -135,9 +145,9 @@ # indexallfilenames is set (so this is different from skippedNames). It's a # bit unconsistent to have it listed among the suffix translations, but no # problem in practice. 
-recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz .log.gz .md5 .map \ +recoll_noindex = .md5 .map \ .o .lib .dll .a .sys .exe .com \ - .dat .bak .rdf .log .db .msf .pid \ + .dat .bak .rdf .log.gz .log .db .msf .pid \ ,v ~ # # Special handling of .txt files inside ~/.gaim and ~/.purple directories diff --git a/tests/config/mimeconf b/tests/config/mimeconf index 2a092a6d..974d09ce 100644 --- a/tests/config/mimeconf +++ b/tests/config/mimeconf @@ -7,3 +7,5 @@ # with the same name in the central directory. The syntax for setting # values is identical. +[index] +application/x-tar = execm rcltar diff --git a/tests/tar/tar.sh b/tests/tar/tar.sh new file mode 100755 index 00000000..19109c83 --- /dev/null +++ b/tests/tar/tar.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +topdir=`dirname $0`/.. +. $topdir/shared.sh + +initvariables $0 + +( + recollq TARUNIQUETERM2 + +) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout + + +diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 + +checkresult diff --git a/tests/tar/tar.txt b/tests/tar/tar.txt new file mode 100644 index 00000000..b2bf94fd --- /dev/null +++ b/tests/tar/tar.txt @@ -0,0 +1,6 @@ +5 results +text/plain [file:///home/dockes/projets/fulltext/testrecoll/tar/tarfile.tar] [tarfile.tar] 15 bytes +text/plain [file:///home/dockes/projets/fulltext/testrecoll/tar/tarfile.tar.gz] [tarfile.tar.gz] 15 bytes +text/plain [file:///home/dockes/projets/fulltext/testrecoll/tar/tarfile.tgz] [tarfile.tgz] 15 bytes +text/plain [file:///home/dockes/projets/fulltext/testrecoll/tar/tarfile.tbz] [tarfile.tbz] 15 bytes +text/plain [file:///home/dockes/projets/fulltext/testrecoll/tar/tarfile.tar.bz2] [tarfile.tar.bz2] 15 bytes