Commit 01f2c3c1 authored by Julien Cristau
Browse files

Move parsing of headers from mboximport script to hooks

Taking information from headers and turning it into our schema's
attributes/relations shouldn't depend on the import script.

To make parsing easier on the hooks side, the 'headers' attribute is
now expected to be an undecoded ascii-only string.
parent 8cbfcbc0d921
......@@ -10,6 +10,10 @@
"""
__docformat__ = "restructuredtext en"
import re
import email
from logilab.common.umessage import UMessage
from logilab.mtconverter import TransformError
from cubicweb import UnknownEid
......@@ -84,7 +88,7 @@ class AnalyzeEmailText(hook.Operation):
fix_ownership(self.cnx, evargs['trinfo'].eid, self.email)
class AddEmailHook(hook.Hook):
class AddEmailCommentHook(hook.Hook):
"""an email has been added, check if associated content should be created
"""
__regid__ = 'extractmailcontent'
......@@ -101,3 +105,90 @@ class AddEmailHook(hook.Hook):
ExtractEmailInformation(self._cw, email=self.entity, info=info)
break
AnalyzeEmailText(self._cw, email=self.entity)
# matches a "re" token at a word boundary, optional whitespace, then a colon
# (case-insensitive, unicode-aware)
CLEANUP_RGX = re.compile(r'\bre\s*:', re.I|re.U)


def cleanup_subject(string):
    """Return `string` without its "re:" markers.

    Every occurrence matched by CLEANUP_RGX is removed, then leading and
    trailing whitespace is stripped from the result.
    """
    without_markers = CLEANUP_RGX.sub('', string)
    return without_markers.strip()
class AddEmailPreHook(hook.Hook):
    """Complete an Email being created from the content of its raw headers.

    Runs on 'before_add_entity' for Email entities.  When the entity carries
    a 'headers' value, the headers are parsed and used to set 'messageid',
    'subject', 'date', 'sender', 'reply_to' and 'in_thread' in `cw_edited`,
    each one only if the caller did not already provide it.
    """
    __regid__ = 'extractmailmetadata'
    __select__ = hook.Hook.__select__ & is_instance('Email')
    events = ('before_add_entity',)

    def address_eid(self, address, alias=None):
        """Return the eid of the EmailAddress to link to for `address`.

        An existing EmailAddress is looked up first; when one matches, the
        eid of its `prefered` form is returned.  Otherwise a new EmailAddress
        is created from `address` and `alias` (a false alias is stored as
        None) and its eid is returned.
        """
        # NOTE(review): LIKE presumably treats '%' and '_' found in `address`
        # as wildcards, so "a_b@x.org" could match a different address --
        # confirm whether an exact or case-insensitive comparison is intended.
        rql = 'Any X WHERE X is EmailAddress, X address LIKE %(addr)s'
        rset = self._cw.execute(rql, {'addr': address})
        if not rset:
            # create a new email address to link to
            alias = alias or None
            # XXX could try to link created address to a person
            eaddress = self._cw.create_entity('EmailAddress', address=address,
                                              alias=alias)
            return eaddress.eid
        # check for a preferred form if any
        return rset.get_entity(0, 0).prefered.eid

    def thread_eid(self, subject, replyeid):
        """Return the eid of the EmailThread an email should be attached to.

        The thread of the replied-to email (`replyeid`, may be false) is used
        when it has one.  Otherwise a thread whose title equals the cleaned-up
        `subject` is reused, or created when none exists.
        """
        if replyeid:
            rset = self._cw.execute('EmailThread X WHERE Y in_thread X, Y eid %(y)s',
                                    {'y': replyeid})
            if rset:
                return rset[0][0]
        subject = cleanup_subject(subject)
        # XXX too unspecific?
        rset = self._cw.execute('EmailThread X WHERE X title %(title)s',
                                {'title': subject})
        if rset:
            return rset[0][0]
        thread = self._cw.create_entity('EmailThread', title=subject)
        return thread.eid

    def __call__(self):
        """Parse the entity's headers and fill the missing `cw_edited` keys.

        Does nothing when no 'headers' value is being set.  A parse failure
        is logged and leaves the entity untouched.
        """
        msg = self.entity
        if 'headers' not in msg.cw_edited:
            return
        try:
            message = UMessage(email.message_from_string(msg.headers))
        except Exception:
            self.exception('bad message headers')
            return
        if 'messageid' not in msg.cw_edited:
            msg.cw_edited['messageid'] = message.get('message-id')
        if 'subject' not in msg.cw_edited:
            msg.cw_edited['subject'] = message.get('subject') or u'(no subject)'
        if 'date' not in msg.cw_edited:
            msg.cw_edited['date'] = message.date()
        if 'sender' not in msg.cw_edited:
            # XXX why limit to a single sender?
            senders = message.multi_addrs('from')
            # headers without a From address give no (name, address) pair:
            # leave 'sender' unset rather than fail with an IndexError
            if senders:
                name, address = senders[0]
                msg.cw_edited['sender'] = self.address_eid(address, name)
        replyto = message.get('in-reply-to')
        if replyto and 'reply_to' not in msg.cw_edited:
            rset = self._cw.find('Email', messageid=replyto)
            if rset:
                # XXX reply_to should allow multiple objects
                msg.cw_edited['reply_to'] = rset[0][0]
        if 'in_thread' not in msg.cw_edited:
            # NOTE(review): `msg.reply_to` is read on an entity that is not
            # inserted yet; verify it reflects cw_edited['reply_to'] and that
            # its first item is usable as the eid thread_eid() expects.
            msg.cw_edited['in_thread'] = self.thread_eid(msg.subject, msg.reply_to and msg.reply_to[0])
class AddEmailPostHook(AddEmailPreHook):
    """Link a newly created Email to its Cc and To addresses.

    Runs on 'after_add_entity'.  The 'cc' and 'recipients' relations are
    each filled from the parsed headers only when the email has none yet.
    """
    events = ('after_add_entity',)

    def __call__(self):
        """Set 'cc' and 'recipients' from the 'cc' and 'to' header fields."""
        entity = self.entity
        if 'headers' not in entity.cw_edited:
            return
        try:
            parsed = UMessage(email.message_from_string(entity.headers))
        except Exception:
            # unparsable headers: nothing to extract, and nothing is logged here
            return
        if not entity.cc:
            cc_eids = []
            for name, addr in parsed.multi_addrs('cc'):
                cc_eids.append(self.address_eid(addr, name))
            entity.cw_set(cc=cc_eids)
        if not entity.recipients:
            to_eids = set()
            for name, addr in parsed.multi_addrs('to'):
                to_eids.add(self.address_eid(addr, name))
            entity.cw_set(recipients=to_eids)
......@@ -6,7 +6,6 @@
"""
__docformat__ = "restructuredtext en"
import re
import mailbox
from itertools import combinations
from rfc822 import parsedate
......@@ -15,10 +14,6 @@ from logilab.common.umessage import message_from_file
from cubicweb import Binary
# "re" at a word boundary, optional whitespace, colon; case-insensitive
CLEANUP_RGX = re.compile(r'\bre\s*:', re.I|re.U)


def cleanup_subject(string):
    """Strip all "re:" markers from `string`, then trim outer whitespace."""
    stripped_of_markers = CLEANUP_RGX.sub('', string)
    return stripped_of_markers.strip()
class StreamMailbox(mailbox.mbox):
"""A read-only mbox format mailbox from stream."""
......@@ -102,35 +97,14 @@ class MBOXImporter(object):
self._notify_skipped(msgid)
return
# Email entity
subject = message.get('subject') or u'(no subject)'
sender = message.multi_addrs('from')[0]
sendereid = self.address_eid(sender[1], sender[0])
# don't use the UMessage's headers() so the decoding can be done on the server side
headers = u'\n'.join(u'%s: %s' % header for header in message.message.items())
email = self.cnx.create_entity('Email', messageid=msgid,
subject=subject, date=message.date(),
headers=message.headers(),
sender=sendereid)
headers=headers)
self._notify_created('email', email.eid)
# link to mailing list
self.mailinglist_link(message, email.eid)
# link to recipients
for name, addr in message.multi_addrs('to'):
self.execute('SET X recipients Y WHERE X eid %(x)s, Y eid %(y)s',
{'x': email.eid, 'y': self.address_eid(addr, name)})
for name, addr in message.multi_addrs('cc'):
self.execute('SET X cc Y WHERE X eid %(x)s, Y eid %(y)s',
{'x': email.eid, 'y': self.address_eid(addr, name)})
# link to replied email if any
replyto = message.get('in-reply-to')
replyeid = None
if replyto:
rset = self.execute('Email X WHERE X messageid %(id)s', {'id': replyto})
if rset:
replyeid = rset[0][0]
self.execute('SET X reply_to Y WHERE X eid %(x)s, Y eid %(y)s',
{'x': email.eid, 'y': replyeid})
# link to an EmailThread
self.execute('SET X in_thread Y WHERE X eid %(x)s, Y eid %(y)s',
{'x': email.eid, 'y': self.thread_eid(subject, replyeid)})
self._part_index = 0
self._context = None
self._alternatives = []
......@@ -209,35 +183,3 @@ class MBOXImporter(object):
if self._context == 'alternative':
self._alternatives[-1].append(epart.eid)
def address_eid(self, address, alias=None):
    """Return the eid of the EmailAddress to link to for `address`.

    The address is looked up as given, then lowercased.  When found, the
    eid of the match's `prefered` form is returned.  Otherwise a new
    EmailAddress is created with the lowercased address, reported through
    `_notify_created`, and its eid is returned.
    """
    rql = 'Any X WHERE X is EmailAddress, X address %(addr)s'
    matches = self.execute(rql, {'addr': address})
    if not matches:
        # second chance with the lowercased form
        address = address.lower()
        matches = self.execute(rql, {'addr': address})
    if matches:
        # check for a preferred form if any
        return matches.get_entity(0, 0).prefered.eid
    # create a new email address to link to
    # XXX could try to link created address to a person
    created = self.cnx.create_entity('EmailAddress', address=address,
                                     alias=alias or None)
    self._notify_created('emailaddress', created.eid)
    return created.eid
def thread_eid(self, subject, replyeid):
    """Return the eid of the EmailThread an email should be attached to.

    The thread of the replied-to email is used when `replyeid` is not None
    and that email has one.  Otherwise a thread titled with the cleaned-up
    `subject` is reused, or created and reported through `_notify_created`.
    """
    if replyeid is not None:
        parent_threads = self.execute(
            'EmailThread X WHERE Y in_thread X, Y eid %(y)s',
            {'y': replyeid})
        if parent_threads:
            return parent_threads[0][0]
    title = cleanup_subject(subject)
    same_title = self.execute('EmailThread X WHERE X title %(title)s',
                              {'title': title})
    if not same_title:
        new_thread = self.cnx.create_entity('EmailThread', title=title)
        self._notify_created('emailthread', new_thread.eid)
        return new_thread.eid
    return same_title[0][0]
......@@ -31,8 +31,7 @@ class MBOXImporterTC(CubicWebTC):
def _test_base(self, mi):
self.assertEqual(sorted([(x, len(y)) for x, y in mi.created.items()]),
[('email', 2), ('emailaddress', 4),
('emailpart', 5), ('emailthread', 2), ('file', 2)])
[('email', 2), ('emailpart', 5), ('file', 2)])
self.assertEqual(mi.skipped, [])
rset = mi.cnx.execute('Any X ORDERBY S WHERE X is Email, X subject S')
self.assertEqual(len(rset), 2)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment