Implement filter for .7z files. Based on rclzip and rcltar
This commit is contained in:
commit
d802e87e5a
908 changed files with 379473 additions and 0 deletions
45
tests/pythonapi/extract.py
Normal file
45
tests/pythonapi/extract.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
import sys
|
||||
import hashlib
|
||||
from recoll import recoll
|
||||
from recoll import rclextract
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
ISP3 = True
|
||||
else:
|
||||
ISP3 = False
|
||||
|
||||
def utf8string(s):
|
||||
if ISP3:
|
||||
return s
|
||||
else:
|
||||
return s.encode('utf8')
|
||||
|
||||
db = recoll.connect()
|
||||
query = db.query()
|
||||
|
||||
# This normally has only one result, a well-known html file
|
||||
nres = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
|
||||
print("Result count: %d %d" % (nres, query.rowcount))
|
||||
doc = query.fetchone()
|
||||
xtrac = rclextract.Extractor(doc)
|
||||
doc = xtrac.textextract(doc.ipath)
|
||||
print("Text length: %d"%len(doc.text))
|
||||
|
||||
refdigest = 'bfbb63f7a245c31767585b45014dbd07'
|
||||
|
||||
# This normally has 2 results, one of which is a pdf attachment.
|
||||
nres = query.execute("population_size_cultural_transmission", stemming=0)
|
||||
for doc in query:
|
||||
if doc.mimetype == 'application/pdf':
|
||||
xtrac = rclextract.Extractor(doc)
|
||||
filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
|
||||
f = open(filename, 'rb')
|
||||
data = f.read()
|
||||
f.close()
|
||||
m = hashlib.md5()
|
||||
m.update(data)
|
||||
digest = m.hexdigest()
|
||||
print(digest)
|
||||
if digest != refdigest:
|
||||
print("extract.py: wrong digest for extracted file!")
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue