Updated the python external indexer sample
This commit is contained in:
parent
aef606d529
commit
0de2faef3c
3 changed files with 204 additions and 62 deletions
27
src/python/samples/README.txt
Normal file
27
src/python/samples/README.txt
Normal file
|
@ -0,0 +1,27 @@
|
|||
Python samples:
|
||||
|
||||
rclmbox.py
|
||||
backends
|
||||
A sample external indexer and its backends link file (see the user manual
|
||||
programming section). The sample indexes a directory containing mbox files.
|
||||
|
||||
rcldlkp.py
|
||||
Another sample indexer for a simple %-separated record format.
|
||||
|
||||
recollq.py
|
||||
recollqsd.py
|
||||
Sample query programs based on the Python query interface.
|
||||
|
||||
recollgui/
|
||||
A sample GUI based on the Python query interface.
|
||||
|
||||
docdups.py
|
||||
A script based on the Xapian Python interface which explores a Recoll index
|
||||
and prints out sets of duplicate documents (based on the md5 hashes).
|
||||
|
||||
mutt-recoll.py
|
||||
Interface between recoll and mutt (based on mutt-notmuch). Not related to
|
||||
the Recoll Python API, this executes recollq.
|
||||
|
||||
trconfig.py
|
||||
Not useful at all: internal exercises for the python rclconfig interface.
|
5
src/python/samples/backends
Normal file
5
src/python/samples/backends
Normal file
|
@ -0,0 +1,5 @@
|
|||
[MBOX]
|
||||
fetch = python \
|
||||
/home/dockes/projets/fulltext/recoll/src/python/samples/rclmbox.py fetch
|
||||
makesig = python \
|
||||
/home/dockes/projets/fulltext/recoll/src/python/samples/rclmbox.py makesig
|
|
@ -1,44 +1,101 @@
|
|||
#!/usr/bin/env python
|
||||
"""An example that uses python tools to parse mbox/rfcxxx format and index
|
||||
messages. Not supposed to run as-is or be really useful"""
|
||||
"""This sample uses the Recoll Python API to index a directory
|
||||
containing mbox files. This is not particularly useful as Recoll
|
||||
itself can do this better (e.g. this script does not process
|
||||
attachments), but it shows the use of most of the Recoll interface
|
||||
features, except 'parent_udi' (we do not create a 'self' document to
|
||||
act as the parent)."""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import glob
|
||||
import os
|
||||
import stat
|
||||
import mailbox
|
||||
import email.header
|
||||
import email.utils
|
||||
#import sys
|
||||
|
||||
try:
|
||||
from recoll import recoll
|
||||
except:
|
||||
import recoll
|
||||
|
||||
import os
|
||||
import stat
|
||||
# EDIT
|
||||
# Change this for some directory with mbox files, such as a
|
||||
# Thunderbird/Icedove mail storage directory.
|
||||
mbdir = os.path.expanduser("~/mail")
|
||||
#mbdir = os.path.expanduser("~/.icedove/n8n19644.default/Mail/Local Folders/")
|
||||
|
||||
mbfile = os.path.expanduser("~/mbox")
|
||||
rclconf = os.path.expanduser("~/.recoll")
|
||||
# EDIT
|
||||
# Change this to wherever you want your recoll data to live. Create
|
||||
# the directory with a (possibly empty) recoll.conf in it before first
|
||||
# running the script
|
||||
rclconf = os.path.expanduser("~/.recoll-extern")
|
||||
|
||||
# Utility: extract text for named header
|
||||
def header_value(msg, nm, to_utf = False):
    """Return the decoded text of header `nm` from message `msg`.

    The raw header is split with email.header.decode_header() and each
    chunk is decoded with its declared charset. Byte chunks without a
    declared charset are decoded as cp1252. Every decoded chunk is
    followed by a single space, so a non-empty result ends with a space.

    A chunk which cannot be decoded with its declared charset (unknown
    charset name, or bytes not valid in it) is reported on stderr and
    then decoded as cp1252 with replacement characters instead of
    being dropped.

    Returns "" when the header is absent. Otherwise returns UTF-8
    encoded bytes when `to_utf` is true, else a unicode string.
    """
    value = msg.get(nm)
    if value is None:
        return ""
    pieces = []
    for data, charset in email.header.decode_header(value):
        if not isinstance(data, bytes):
            # Python 3 returns str chunks for headers without encoded words.
            pieces.append(data)
            continue
        try:
            pieces.append(data.decode(charset or "cp1252"))
        except (LookupError, ValueError) as err:
            print("Failed decoding header: %s" % err, file=sys.stderr)
            # Keep whatever can be salvaged so the field stays searchable.
            pieces.append(data.decode("cp1252", "replace"))
    univalue = u"".join(piece + u" " for piece in pieces)
    if to_utf:
        return univalue.encode('utf-8')
    return univalue
|
||||
|
||||
# Utility: extract text parts from body
|
||||
def extract_text(msg):
    """Extract and decode all text/plain parts from the message.

    When the RECOLL_FILTER_FORPREVIEW environment variable is "yes", the
    From, To and Subject headers followed by an empty line are output
    before the body text.

    Each non-multipart text/plain part is decoded with its declared
    charset (cp1252 when none is declared). A part which cannot be
    decoded that way is reported on stderr, then decoded again with
    replacement characters (falling back to cp1252 when the charset
    itself is unusable) instead of being dropped.

    Returns the concatenated text as a unicode string.
    """
    chunks = []
    # We only output the headers for previewing, else they're already
    # output/indexed as fields.
    if os.environ.get("RECOLL_FILTER_FORPREVIEW") == "yes":
        chunks.append(u"From: " + header_value(msg, "From") + u"\n")
        chunks.append(u"To: " + header_value(msg, "To") + u"\n")
        chunks.append(u"Subject: " + header_value(msg, "Subject") + u"\n")
        chunks.append(u"\n")
    for part in msg.walk():
        if part.is_multipart():
            continue
        if part.get_content_type().lower() != "text/plain":
            continue
        payload = part.get_payload(None, True)
        if payload is None:
            continue
        charset = part.get_content_charset("cp1252")
        try:
            text = payload.decode(charset)
        except (LookupError, ValueError) as err:
            print("Failed decoding payload: %s" % err,
                  file=sys.stderr)
            try:
                # Same charset, but bad bytes become U+FFFD.
                text = payload.decode(charset, "replace")
            except (LookupError, ValueError):
                text = payload.decode("cp1252", "replace")
        chunks.append(text)
    return u"".join(chunks)
|
||||
|
||||
|
||||
|
||||
class mbox_indexer:
|
||||
def __init__(self, mbfile):
|
||||
"""The indexer class. An object is created for indexing one mbox folder"""
|
||||
def __init__(self, db, mbfile):
|
||||
"""Initialize for writable db recoll.Db object and mbfile mbox
|
||||
file. We retrieve the file size and mtime."""
|
||||
self.db = db
|
||||
self.mbfile = mbfile
|
||||
stdata = os.stat(mbfile)
|
||||
self.fmtime = stdata[stat.ST_MTIME]
|
||||
|
@ -46,73 +103,126 @@ class mbox_indexer:
|
|||
self.msgnum = 1
|
||||
|
||||
def sig(self):
    """Return the update verification value for the mbox file.

    The value is the file modification time and the file size joined
    by a colon, which should cover most cases of file change.
    """
    fields = (str(self.fmtime), str(self.fbytes))
    return ":".join(fields)
|
||||
|
||||
def udi(self, msgnum):
    """Return the unique document identifier for message `msgnum`.

    The identifier is the mbox file name and the message number joined
    by a colon. It should stay shorter than 150 bytes, which is
    optimistically not checked here.
    """
    return ":".join((self.mbfile, str(msgnum)))
|
||||
|
||||
def index(self, db):
|
||||
if not db.needUpdate(self.udi(1), self.sig()):
|
||||
print("Index is up to date");
|
||||
def index(self):
    """Index every message of the mbox file, unless it is up to date.

    self.db.needUpdate() is asked whether the first message (udi for
    message number 1) is still current for the present file signature;
    if no update is needed, nothing is indexed. Otherwise each message
    is handed to self.index_message() and self.msgnum is incremented
    after each one.

    Messages are read one at a time instead of being loaded all at
    once, and the mailbox is closed when done, even on error.

    Returns None.
    """
    if not self.db.needUpdate(self.udi(1), self.sig()):
        print("Index is up to date for %s"%self.mbfile, file=sys.stderr)
        return None
    mb = mailbox.mbox(self.mbfile)
    try:
        # Iterating the mailbox yields the same messages as values(),
        # without building the complete list in memory first.
        for msg in mb:
            print("Indexing message %d" % self.msgnum, file=sys.stderr)
            self.index_message(msg)
            self.msgnum += 1
    finally:
        mb.close()
    return None
|
||||
|
||||
def index_message(self, db, msg):
|
||||
def getdata(self, ipath):
    """Implements the 'fetch' data access interface (called at
    query time from the command line).

    `ipath` is the message number as a string; messages are counted
    from 1 in mailbox order. The mailbox is read one message at a time,
    reading stops at the requested message, and the mailbox is always
    closed.

    Returns the text produced by extract_text() for the message, or ""
    when the mailbox has no message with that number. Raises ValueError
    if `ipath` is not an integer.
    """
    imsgnum = int(ipath)
    mb = mailbox.mbox(self.mbfile)
    try:
        for msgnum, msg in enumerate(mb, 1):
            if msgnum == imsgnum:
                return extract_text(msg)
    finally:
        mb.close()
    return ""
|
||||
|
||||
def index_message(self, msg):
    """Build a recoll.Doc for message `msg` and add it to the index.

    The document number is self.msgnum; it is used for both the ipath
    field and the udi. The document is stored through
    self.db.addOrUpdate().
    """
    doc = recoll.Doc()

    # Misc standard recoll fields
    doc.author = header_value(msg, "From")
    doc.recipient = header_value(msg, "To") + " " + header_value(msg, "Cc")
    doc.title = header_value(msg, "Subject")
    doc.fbytes = str(self.fbytes)

    # Document time: the Date header if usable, else the file mtime.
    mtime = self.fmtime
    tm = email.utils.parsedate_tz(header_value(msg, "Date"))
    if tm is not None:
        try:
            mtime = email.utils.mktime_tz(tm)
        except (OverflowError, ValueError):
            # Date parsed but out of range: keep the file mtime rather
            # than aborting the whole indexing run.
            pass
    doc.mtime = str(mtime)

    # Custom field
    doc.myfield = "some value"

    # Main document text and MIME type
    doc.text = extract_text(msg)
    doc.dbytes = str(len(doc.text.encode('UTF-8')))
    doc.mimetype = "text/plain"

    # Store data for later "up to date" checks
    doc.sig = self.sig()

    # The rclbes field is the link between the index data and this
    # script when used at query time
    doc.rclbes = "MBOX"

    # These get stored inside the index, and returned at query
    # time, but the main identifier is the condensed 'udi'
    doc.url = "file://" + self.mbfile
    doc.ipath = str(self.msgnum)

    # The udi is the unique document identifier, later used if we
    # want to e.g. delete the document index data (and other ops).
    udi = self.udi(self.msgnum)
    self.db.addOrUpdate(udi, doc)
|
||||
|
||||
db = recoll.connect(confdir=rclconf, writable=1)
|
||||
# Index a directory containing mbox files
|
||||
def index_mboxdir(dir):
    """Index every mbox file found directly inside directory `dir`.

    Entries whose name contains a '.' (e.g. .log files; the mboxes are
    expected to have no extension) and entries which are not regular
    files are skipped. Each remaining file is indexed by an
    mbox_indexer, all sharing one writable connection opened on the
    module-level `rclconf` configuration directory. db.purge() is
    called once all files have been processed.

    The directory is listed with os.listdir() rather than through a
    glob pattern, so that glob metacharacters in `dir` (such as the
    brackets of a "[Gmail]" folder) are taken literally. If `dir` is
    not a directory, an error is printed on stderr and nothing is
    indexed or purged.

    Returns None.
    """
    if not os.path.isdir(dir):
        # NOTE(review): purge() presumably drops index entries not seen
        # during this run - confirm. Returning early avoids calling it
        # when the directory is missing (e.g. not mounted).
        print("Not a directory: %s" % dir, file=sys.stderr)
        return None
    db = recoll.connect(confdir=rclconf, writable=1)
    for name in sorted(os.listdir(dir)):
        if '.' in name:
            # Skip .log and similar files: our mboxes have no extension.
            continue
        ent = os.path.join(dir, name)
        if not os.path.isfile(ent):
            continue
        print("Processing %s"%ent)
        mbidx = mbox_indexer(db, ent)
        mbidx.index()
    db.purge()
    return None
|
||||
|
||||
mbidx = mbox_indexer(mbfile)
|
||||
mbidx.index(db)
|
||||
usage_string='''Usage:
|
||||
rclmbox.py
|
||||
Index the directory (the path is hard-coded inside the script)
|
||||
rclmbox.py [fetch|makesig] udi url ipath
|
||||
fetch subdoc data or make signature (query time)
|
||||
'''
|
||||
def usage():
    """Write the usage text to stderr and exit with status 1."""
    sys.stderr.write("%s\n" % usage_string)
    raise SystemExit(1)
|
||||
|
||||
# Command line entry point.
# Without arguments: index the hard-coded mbox directory.
# With arguments: answer a query-time request for one document.
if len(sys.argv) == 1:
    index_mboxdir(mbdir)
else:
    # cmd [fetch|makesig] udi url ipath
    if len(sys.argv) != 5:
        usage()
    cmd = sys.argv[1]
    udi = sys.argv[2]
    url = sys.argv[3]
    ipath = sys.argv[4]

    # Only strip the leading scheme: a path which itself contains
    # "file://" must be left intact.
    if url.startswith('file://'):
        mbfile = url[len('file://'):]
    else:
        mbfile = url
    # no need for a db for getdata or makesig.
    mbidx = mbox_indexer(None, mbfile)

    if cmd == 'fetch':
        data = mbidx.getdata(ipath).encode('UTF-8')
        # Under Python 3, "%s" % bytes yields the b'...' repr, so the
        # encoded text is written to the binary stream when there is one.
        out = getattr(sys.stdout, "buffer", sys.stdout)
        out.write(data)
    elif cmd == 'makesig':
        print(mbidx.sig(), end="")
    else:
        usage()

sys.exit(0)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue