Updated the python external indexer sample
This commit is contained in:
parent
aef606d529
commit
0de2faef3c
3 changed files with 204 additions and 62 deletions
27
src/python/samples/README.txt
Normal file
27
src/python/samples/README.txt
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
Python samples:
|
||||||
|
|
||||||
|
rclmbox.py
|
||||||
|
backends
|
||||||
|
A sample external indexer and its backends link file (see the user manual
|
||||||
|
programming section). The sample indexes a directory containing mbox files.
|
||||||
|
|
||||||
|
rcldlkp.py
|
||||||
|
Another sample indexer for a simple %-separated record format.
|
||||||
|
|
||||||
|
recollq.py
|
||||||
|
recollqsd.py
|
||||||
|
Sample query programs based on the Python query interface.
|
||||||
|
|
||||||
|
recollgui/
|
||||||
|
A sample GUI based on the python query interface.
|
||||||
|
|
||||||
|
docdups.py
|
||||||
|
A script based on the Xapian Python interface which explores a Recoll index
|
||||||
|
and prints out sets of duplicate documents (based on the md5 hashes).
|
||||||
|
|
||||||
|
mutt-recoll.py
|
||||||
|
Interface between recoll and mutt (based on mutt-notmuch). Not related to
|
||||||
|
the Recoll Python API, this executes recollq.
|
||||||
|
|
||||||
|
trconfig.py
|
||||||
|
Not useful at all: internal exercises for the python rclconfig interface.
|
5
src/python/samples/backends
Normal file
5
src/python/samples/backends
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
[MBOX]
|
||||||
|
fetch = python \
|
||||||
|
/home/dockes/projets/fulltext/recoll/src/python/samples/rclmbox.py fetch
|
||||||
|
makesig = python \
|
||||||
|
/home/dockes/projets/fulltext/recoll/src/python/samples/rclmbox.py makesig
|
|
@ -1,44 +1,101 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
"""An example that uses python tools to parse mbox/rfcxxx format and index
|
"""This sample uses the Recoll Python API to index a directory
|
||||||
messages. Not supposed to run as-is or be really useful"""
|
containing mbox files. This is not particularly useful as Recoll
|
||||||
|
itself can do this better (e.g. this script does not process
|
||||||
|
attachments), but it shows the use of most of the Recoll interface
|
||||||
|
features, except 'parent_udi' (we do not create a 'self' document to
|
||||||
|
act as the parent)."""
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import stat
|
||||||
import mailbox
|
import mailbox
|
||||||
import email.header
|
import email.header
|
||||||
import email.utils
|
import email.utils
|
||||||
#import sys
|
|
||||||
try:
|
try:
|
||||||
from recoll import recoll
|
from recoll import recoll
|
||||||
except:
|
except:
|
||||||
import recoll
|
import recoll
|
||||||
|
|
||||||
import os
|
# EDIT
|
||||||
import stat
|
# Change this for some directory with mbox files, such as a
|
||||||
|
# Thunderbird/Icedove mail storage directory.
|
||||||
|
mbdir = os.path.expanduser("~/mail")
|
||||||
|
#mbdir = os.path.expanduser("~/.icedove/n8n19644.default/Mail/Local Folders/")
|
||||||
|
|
||||||
mbfile = os.path.expanduser("~/mbox")
|
# EDIT
|
||||||
rclconf = os.path.expanduser("~/.recoll")
|
# Change this to wherever you want your recoll data to live. Create
|
||||||
|
# the directory with a (possibly empty) recoll.conf in it before first
|
||||||
|
# running the script
|
||||||
|
rclconf = os.path.expanduser("~/.recoll-extern")
|
||||||
|
|
||||||
|
# Utility: extract text for named header
|
||||||
def header_value(msg, nm, to_utf = False):
|
def header_value(msg, nm, to_utf = False):
|
||||||
value = msg.get(nm)
|
value = msg.get(nm)
|
||||||
if value == None:
|
if value == None:
|
||||||
return ""
|
return ""
|
||||||
value = value.replace("\n", "")
|
#value = value.replace("\n", "")
|
||||||
value = value.replace("\r", "")
|
#value = value.replace("\r", "")
|
||||||
#print value
|
|
||||||
parts = email.header.decode_header(value)
|
parts = email.header.decode_header(value)
|
||||||
#print parts
|
|
||||||
univalue = u""
|
univalue = u""
|
||||||
for part in parts:
|
for part in parts:
|
||||||
if part[1] != None:
|
try:
|
||||||
univalue += unicode(part[0], part[1]) + " "
|
if part[1] != None:
|
||||||
else:
|
univalue += part[0].decode(part[1]) + u" "
|
||||||
univalue += part[0] + " "
|
else:
|
||||||
|
if isinstance(part[0], bytes):
|
||||||
|
univalue += part[0].decode("cp1252") + u" "
|
||||||
|
else:
|
||||||
|
univalue += part[0] + u" "
|
||||||
|
except Exception as err:
|
||||||
|
print("Failed decoding header: %s" % err, file=sys.stderr)
|
||||||
|
pass
|
||||||
if to_utf:
|
if to_utf:
|
||||||
return univalue.encode('utf-8')
|
return univalue.encode('utf-8')
|
||||||
else:
|
else:
|
||||||
return univalue
|
return univalue
|
||||||
|
|
||||||
|
# Utility: extract text parts from body
|
||||||
|
def extract_text(msg):
|
||||||
|
"""Extract and decode all text/plain parts from the message"""
|
||||||
|
text = u""
|
||||||
|
# We only output the headers for previewing, else they're already
|
||||||
|
# output/indexed as fields.
|
||||||
|
if "RECOLL_FILTER_FORPREVIEW" in os.environ and \
|
||||||
|
os.environ["RECOLL_FILTER_FORPREVIEW"] == "yes":
|
||||||
|
text += u"From: " + header_value(msg, "From") + u"\n"
|
||||||
|
text += u"To: " + header_value(msg, "To") + u"\n"
|
||||||
|
text += u"Subject: " + header_value(msg, "Subject") + u"\n"
|
||||||
|
# text += u"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
|
#text += u"Message-ID: " + header_value(msg, "Message-ID") + u"\n"
|
||||||
|
text += u"\n"
|
||||||
|
for part in msg.walk():
|
||||||
|
if part.is_multipart():
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
ct = part.get_content_type()
|
||||||
|
if ct.lower() == "text/plain":
|
||||||
|
charset = part.get_content_charset("cp1252")
|
||||||
|
try:
|
||||||
|
ntxt = part.get_payload(None, True).decode(charset)
|
||||||
|
text += ntxt
|
||||||
|
except Exception as err:
|
||||||
|
print("Failed decoding payload: %s" % err,
|
||||||
|
file=sys.stderr)
|
||||||
|
pass
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class mbox_indexer:
|
class mbox_indexer:
|
||||||
def __init__(self, mbfile):
|
"""The indexer class. An object is created for indexing one mbox folder"""
|
||||||
|
def __init__(self, db, mbfile):
|
||||||
|
"""Initialize for writable db recoll.Db object and mbfile mbox
|
||||||
|
file. We retrieve the file size and mtime."""
|
||||||
|
self.db = db
|
||||||
self.mbfile = mbfile
|
self.mbfile = mbfile
|
||||||
stdata = os.stat(mbfile)
|
stdata = os.stat(mbfile)
|
||||||
self.fmtime = stdata[stat.ST_MTIME]
|
self.fmtime = stdata[stat.ST_MTIME]
|
||||||
|
@ -46,73 +103,126 @@ class mbox_indexer:
|
||||||
self.msgnum = 1
|
self.msgnum = 1
|
||||||
|
|
||||||
def sig(self):
|
def sig(self):
|
||||||
|
"""Create update verification value for mbox file:
|
||||||
|
modification time concatenated with size should cover most
|
||||||
|
cases"""
|
||||||
return str(self.fmtime) + ":" + str(self.fbytes)
|
return str(self.fmtime) + ":" + str(self.fbytes)
|
||||||
|
|
||||||
def udi(self, msgnum):
|
def udi(self, msgnum):
|
||||||
|
"""Create unique document identifier for message. This should
|
||||||
|
be shorter than 150 bytes, which we optimistically don't check
|
||||||
|
here, as we just concatenate the mbox file name and message
|
||||||
|
number"""
|
||||||
return self.mbfile + ":" + str(msgnum)
|
return self.mbfile + ":" + str(msgnum)
|
||||||
|
|
||||||
def index(self, db):
|
def index(self):
|
||||||
if not db.needUpdate(self.udi(1), self.sig()):
|
if not self.db.needUpdate(self.udi(1), self.sig()):
|
||||||
print("Index is up to date");
|
print("Index is up to date for %s"%self.mbfile, file=sys.stderr);
|
||||||
return None
|
return None
|
||||||
mb = mailbox.mbox(self.mbfile)
|
mb = mailbox.mbox(self.mbfile)
|
||||||
for msg in mb.values():
|
for msg in mb.values():
|
||||||
print("Indexing message %d" % self.msgnum);
|
print("Indexing message %d" % self.msgnum, file=sys.stderr);
|
||||||
self.index_message(db, msg)
|
self.index_message(msg)
|
||||||
self.msgnum += 1
|
self.msgnum += 1
|
||||||
|
|
||||||
def index_message(self, db, msg):
|
def getdata(self, ipath):
|
||||||
|
"""Implements the 'fetch' data access interface (called at
|
||||||
|
query time from the command line)."""
|
||||||
|
#print("mbox::getdata: ipath: %s" % ipath, file=sys.stderr)
|
||||||
|
imsgnum = int(ipath)
|
||||||
|
mb = mailbox.mbox(self.mbfile)
|
||||||
|
msgnum = 0;
|
||||||
|
for msg in mb.values():
|
||||||
|
msgnum += 1
|
||||||
|
if msgnum == imsgnum:
|
||||||
|
return extract_text(msg)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def index_message(self, msg):
|
||||||
doc = recoll.Doc()
|
doc = recoll.Doc()
|
||||||
|
|
||||||
|
# Misc standard recoll fields
|
||||||
doc.author = header_value(msg, "From")
|
doc.author = header_value(msg, "From")
|
||||||
doc.recipient = header_value(msg, "To") + " " + header_value(msg, "Cc")
|
doc.recipient = header_value(msg, "To") + " " + header_value(msg, "Cc")
|
||||||
# url
|
|
||||||
doc.url = "file://" + self.mbfile
|
|
||||||
# utf8fn
|
|
||||||
# ipath
|
|
||||||
doc.ipath = str(self.msgnum)
|
|
||||||
# mimetype
|
|
||||||
doc.mimetype = "message/rfc822"
|
|
||||||
# mtime
|
|
||||||
dte = header_value(msg, "Date")
|
dte = header_value(msg, "Date")
|
||||||
tm = email.utils.parsedate_tz(dte)
|
tm = email.utils.parsedate_tz(dte)
|
||||||
if tm == None:
|
if tm == None:
|
||||||
doc.mtime = str(self.fmtime)
|
doc.mtime = str(self.fmtime)
|
||||||
else:
|
else:
|
||||||
doc.mtime = str(email.utils.mktime_tz(tm))
|
doc.mtime = str(email.utils.mktime_tz(tm))
|
||||||
# origcharset
|
|
||||||
# title
|
|
||||||
doc.title = header_value(msg, "Subject")
|
doc.title = header_value(msg, "Subject")
|
||||||
# keywords
|
|
||||||
# abstract
|
|
||||||
# author
|
|
||||||
# fbytes
|
|
||||||
doc.fbytes = str(self.fbytes)
|
doc.fbytes = str(self.fbytes)
|
||||||
# text
|
|
||||||
text = u""
|
# Custom field
|
||||||
text += u"From: " + header_value(msg, "From") + u"\n"
|
doc.myfield = "some value"
|
||||||
text += u"To: " + header_value(msg, "To") + u"\n"
|
|
||||||
text += u"Subject: " + header_value(msg, "Subject") + u"\n"
|
# Main document text and MIME type
|
||||||
#text += u"Message-ID: " + header_value(msg, "Message-ID") + u"\n"
|
doc.text = extract_text(msg)
|
||||||
text += u"\n"
|
doc.dbytes = str(len(doc.text.encode('UTF-8')))
|
||||||
for part in msg.walk():
|
doc.mimetype = "text/plain"
|
||||||
if part.is_multipart():
|
|
||||||
pass
|
# Store data for later "up to date" checks
|
||||||
else:
|
|
||||||
ct = part.get_content_type()
|
|
||||||
if ct.lower() == "text/plain":
|
|
||||||
charset = part.get_content_charset("iso-8859-1")
|
|
||||||
#print "charset: ", charset
|
|
||||||
#print "text: ", part.get_payload(None, True)
|
|
||||||
text += unicode(part.get_payload(None, True), charset)
|
|
||||||
doc.text = text
|
|
||||||
# dbytes
|
|
||||||
doc.dbytes = str(len(text))
|
|
||||||
# sig
|
|
||||||
doc.sig = self.sig()
|
doc.sig = self.sig()
|
||||||
|
|
||||||
|
# The rclbes field is the link between the index data and this
|
||||||
|
# script when used at query time
|
||||||
|
doc.rclbes = "MBOX"
|
||||||
|
|
||||||
|
# These get stored inside the index, and returned at query
|
||||||
|
# time, but the main identifier is the condensed 'udi'
|
||||||
|
doc.url = "file://" + self.mbfile
|
||||||
|
doc.ipath = str(self.msgnum)
|
||||||
|
# The udi is the unique document identifier, later used if we
|
||||||
|
# want to e.g. delete the document index data (and other ops).
|
||||||
udi = self.udi(self.msgnum)
|
udi = self.udi(self.msgnum)
|
||||||
db.addOrUpdate(udi, doc)
|
|
||||||
|
|
||||||
|
self.db.addOrUpdate(udi, doc)
|
||||||
|
|
||||||
db = recoll.connect(confdir=rclconf, writable=1)
|
# Index a directory containing mbox files
|
||||||
|
def index_mboxdir(dir):
|
||||||
|
db = recoll.connect(confdir=rclconf, writable=1)
|
||||||
|
entries = glob.glob(dir + "/*")
|
||||||
|
for ent in entries:
|
||||||
|
if '.' in os.path.basename(ent):
|
||||||
|
# Skip .log files etc.: our mbox files have no extensions
|
||||||
|
continue
|
||||||
|
if not os.path.isfile(ent):
|
||||||
|
continue
|
||||||
|
print("Processing %s"%ent)
|
||||||
|
mbidx = mbox_indexer(db, ent)
|
||||||
|
mbidx.index()
|
||||||
|
db.purge()
|
||||||
|
|
||||||
mbidx = mbox_indexer(mbfile)
|
usage_string='''Usage:
|
||||||
mbidx.index(db)
|
rclmbox.py
|
||||||
|
Index the directory (the path is hard-coded inside the script)
|
||||||
|
rclmbox.py [fetch|makesig] udi url ipath
|
||||||
|
fetch subdoc data or make signature (query time)
|
||||||
|
'''
|
||||||
|
def usage():
|
||||||
|
print("%s" % usage_string, file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if len(sys.argv) == 1:
|
||||||
|
index_mboxdir(mbdir)
|
||||||
|
else:
|
||||||
|
# cmd [fetch|makesig] udi url ipath
|
||||||
|
if len(sys.argv) != 5:
|
||||||
|
usage()
|
||||||
|
cmd = sys.argv[1]
|
||||||
|
udi = sys.argv[2]
|
||||||
|
url = sys.argv[3]
|
||||||
|
ipath = sys.argv[4]
|
||||||
|
|
||||||
|
mbfile = url.replace('file://', '')
|
||||||
|
# no need for a db for getdata or makesig.
|
||||||
|
mbidx = mbox_indexer(None, mbfile)
|
||||||
|
|
||||||
|
if cmd == 'fetch':
|
||||||
|
print("%s"%mbidx.getdata(ipath).encode('UTF-8'), end="")
|
||||||
|
elif cmd == 'makesig':
|
||||||
|
print(mbidx.sig(), end="")
|
||||||
|
else:
|
||||||
|
usage()
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue