more python3 tweaks
This commit is contained in:
parent
8cb67b371f
commit
ca5fe29841
6 changed files with 51 additions and 28 deletions
|
@ -32,13 +32,17 @@ class InfoExtractor:
|
||||||
nodename, docdata = self.contents[index]
|
nodename, docdata = self.contents[index]
|
||||||
nodename = self.em.htmlescape(nodename)
|
nodename = self.em.htmlescape(nodename)
|
||||||
docdata = self.em.htmlescape(docdata)
|
docdata = self.em.htmlescape(docdata)
|
||||||
|
print("type(docdata) = %s type(nodename) = %s"% \
|
||||||
|
(type(docdata), type(nodename)), file=sys.stderr)
|
||||||
# strange whitespace to avoid changing the module tests (same as old)
|
# strange whitespace to avoid changing the module tests (same as old)
|
||||||
docdata = b'\n<html>\n <head>\n <title>' + nodename + \
|
docdata = b'\n<html>\n <head>\n <title>' + \
|
||||||
|
nodename + \
|
||||||
b'</title>\n' + \
|
b'</title>\n' + \
|
||||||
' <meta name="rclaptg" content="gnuinfo">\n' + \
|
b' <meta name="rclaptg" content="gnuinfo">\n' + \
|
||||||
b' </head>\n <body>\n' + \
|
b' </head>\n <body>\n' + \
|
||||||
b' <pre style="white-space: pre-wrap">\n ' + \
|
b' <pre style="white-space: pre-wrap">\n ' + \
|
||||||
docdata + b'\n </pre></body>\n</html>\n'
|
docdata + \
|
||||||
|
b'\n </pre></body>\n</html>\n'
|
||||||
|
|
||||||
iseof = rclexecm.RclExecM.noteof
|
iseof = rclexecm.RclExecM.noteof
|
||||||
if self.currentindex >= len(self.contents) -1:
|
if self.currentindex >= len(self.contents) -1:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
# Read a .kar midi karaoke file and translate to recoll indexable format
|
# Read a .kar midi karaoke file and translate to recoll indexable format
|
||||||
# This does not work with Python3 yet because python:midi doesn't
|
# This does not work with Python3 yet because python:midi doesn't
|
||||||
|
|
|
@ -120,24 +120,24 @@ class PDFExtractor:
|
||||||
inheader = False
|
inheader = False
|
||||||
inbody = False
|
inbody = False
|
||||||
didcs = False
|
didcs = False
|
||||||
output = ''
|
output = b''
|
||||||
cont = ''
|
cont = b''
|
||||||
for line in input.split('\n'):
|
for line in input.split(b'\n'):
|
||||||
line = cont + line
|
line = cont + line
|
||||||
cont = ''
|
cont = b''
|
||||||
if re.search('</head>', line):
|
if re.search(b'</head>', line):
|
||||||
inheader = False
|
inheader = False
|
||||||
if re.search('</pre>', line):
|
if re.search(b'</pre>', line):
|
||||||
inbody = False
|
inbody = False
|
||||||
if inheader:
|
if inheader:
|
||||||
if not didcs:
|
if not didcs:
|
||||||
output += '<meta http-equiv="Content-Type"' + \
|
output += b'<meta http-equiv="Content-Type"' + \
|
||||||
'content="text/html; charset=UTF-8">\n'
|
b'content="text/html; charset=UTF-8">\n'
|
||||||
didcs = True
|
didcs = True
|
||||||
|
|
||||||
m = re.search(r'(.*<title>)(.*)(<\/title>.*)', line)
|
m = re.search(rb'(.*<title>)(.*)(<\/title>.*)', line)
|
||||||
if not m:
|
if not m:
|
||||||
m = re.search(r'(.*content=")(.*)(".*/>.*)', line)
|
m = re.search(rb'(.*content=")(.*)(".*/>.*)', line)
|
||||||
if m:
|
if m:
|
||||||
line = m.group(1) + self.em.htmlescape(m.group(2)) + \
|
line = m.group(1) + self.em.htmlescape(m.group(2)) + \
|
||||||
m.group(3)
|
m.group(3)
|
||||||
|
@ -145,7 +145,7 @@ class PDFExtractor:
|
||||||
# Recoll treats "Subject" as a "title" element
|
# Recoll treats "Subject" as a "title" element
|
||||||
# (based on emails). The PDF "Subject" metadata
|
# (based on emails). The PDF "Subject" metadata
|
||||||
# field is more like an HTML "description"
|
# field is more like an HTML "description"
|
||||||
line = re.sub('name="Subject"', 'name="Description"', line, 1)
|
line = re.sub(b'name="Subject"', b'name="Description"', line, 1)
|
||||||
|
|
||||||
elif inbody:
|
elif inbody:
|
||||||
# Remove end-of-line hyphenation. It's not clear that
|
# Remove end-of-line hyphenation. It's not clear that
|
||||||
|
@ -158,12 +158,12 @@ class PDFExtractor:
|
||||||
#cont = m.group(2).rstrip('-')
|
#cont = m.group(2).rstrip('-')
|
||||||
line = self.em.htmlescape(line)
|
line = self.em.htmlescape(line)
|
||||||
|
|
||||||
if re.search('<head>', line):
|
if re.search(b'<head>', line):
|
||||||
inheader = True
|
inheader = True
|
||||||
if re.search('<pre>', line):
|
if re.search(b'<pre>', line):
|
||||||
inbody = True
|
inbody = True
|
||||||
|
|
||||||
output += line + '\n'
|
output += line + b'\n'
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
|
@ -23,15 +23,15 @@ class TarExtractor:
|
||||||
self.namen = []
|
self.namen = []
|
||||||
|
|
||||||
def extractone(self, ipath):
|
def extractone(self, ipath):
|
||||||
docdata = ""
|
docdata = b''
|
||||||
try:
|
try:
|
||||||
info = self.tar.getmember(ipath)
|
info = self.tar.getmember(ipath)
|
||||||
if info.size > self.em.maxmembersize:
|
if info.size > self.em.maxmembersize:
|
||||||
# skip
|
# skip
|
||||||
docdata = ""
|
docdata = b''
|
||||||
self.em.rclog("extractone: entry %s size %d too big" %
|
self.em.rclog("extractone: entry %s size %d too big" %
|
||||||
(ipath, info.size))
|
(ipath, info.size))
|
||||||
docdata = "" # raise TarError("Member too big")
|
docdata = b'' # raise TarError("Member too big")
|
||||||
else:
|
else:
|
||||||
docdata = self.tar.extractfile(ipath).read()
|
docdata = self.tar.extractfile(ipath).read()
|
||||||
ok = True
|
ok = True
|
||||||
|
@ -45,7 +45,7 @@ class TarExtractor:
|
||||||
def openfile(self, params):
|
def openfile(self, params):
|
||||||
self.currentindex = -1
|
self.currentindex = -1
|
||||||
try:
|
try:
|
||||||
self.tar = tarfile.open(name=params["filename:"],mode='r')
|
self.tar = tarfile.open(name=params["filename:"], mode='r')
|
||||||
#self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
|
#self.namen = [ y.name for y in filter(lambda z:z.isfile(),self.tar.getmembers())]
|
||||||
self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]]
|
self.namen = [ y.name for y in [z for z in self.tar.getmembers() if z.isfile()]]
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ class TxtDump:
|
||||||
|
|
||||||
fn = params["filename:"]
|
fn = params["filename:"]
|
||||||
# No charset, so recoll will have to use its config to guess it
|
# No charset, so recoll will have to use its config to guess it
|
||||||
txt = '<html><head><title></title></head><body><pre>'
|
txt = b'<html><head><title></title></head><body><pre>'
|
||||||
try:
|
try:
|
||||||
f = open(fn, "rb")
|
f = open(fn, "rb")
|
||||||
txt += self.em.htmlescape(f.read())
|
txt += self.em.htmlescape(f.read())
|
||||||
|
@ -29,7 +29,7 @@ class TxtDump:
|
||||||
self.em.rclog("TxtDump: %s : %s" % (fn, err))
|
self.em.rclog("TxtDump: %s : %s" % (fn, err))
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
|
|
||||||
txt += '</pre></body></html>'
|
txt += b'</pre></body></html>'
|
||||||
return (True, txt, "", rclexecm.RclExecM.eofnext)
|
return (True, txt, "", rclexecm.RclExecM.eofnext)
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
###### File type handler api, used by rclexecm ---------->
|
||||||
|
|
|
@ -1,14 +1,33 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
|
# Copyright (C) 2015 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
# Transform XML output from xls-dump.py into csv format.
|
# Transform XML output from xls-dump.py into csv format.
|
||||||
# Note: msodumper is not compatible with python3.
|
#
|
||||||
|
# Note: this would be difficult to make compatible with python 3 <= 3.4
|
||||||
|
# because of the use of % interpolation on what should be bytes.
|
||||||
|
# The python2 restriction is not a big issue at this point because
|
||||||
|
# msodumper is not compatible with python3 anyway.
|
||||||
|
# % interpolation for bytes is planned for python 3.5, at which point
|
||||||
|
# porting this module will become trivial.
|
||||||
|
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import xml.sax
|
import xml.sax
|
||||||
sys.path.append(sys.path[0]+"/msodump.zip")
|
|
||||||
from msodumper.globals import error
|
|
||||||
|
|
||||||
dtt = True
|
dtt = True
|
||||||
|
|
||||||
|
@ -62,7 +81,7 @@ if __name__ == '__main__':
|
||||||
xml.sax.parse(sys.stdin, handler)
|
xml.sax.parse(sys.stdin, handler)
|
||||||
print(handler.output)
|
print(handler.output)
|
||||||
except BaseException as err:
|
except BaseException as err:
|
||||||
error("xml-parse: %s\n" % (str(sys.exc_info()[:2]),))
|
print("xml-parse: %s\n" % (str(sys.exc_info()[:2]),), file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue