2019-12-28 15:58:48 +01:00
|
|
|
'''
Revised version of the email reply parser.

Integration with nettime, crumb, spectre, etc.
'''
|
|
|
|
|
|
2019-12-27 17:31:20 +01:00
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EmailReplyParser(object):
    """ Convenience entry points for parsing an email body.
    """

    @staticmethod
    def read(text):
        """ Factory method that wraps the text in an EmailMessage and
            splits it into fragments.

            text - A string email body

            Returns the value of EmailMessage(text).read()
        """
        message = EmailMessage(text)
        return message.read()

    @staticmethod
    def parse_reply(text):
        """ Extracts only the reply portion of an email.

            text - A string email body

            Returns the ``reply`` attribute of the parsed message
        """
        parsed = EmailReplyParser.read(text)
        return parsed.reply
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EmailMessage(object):
    """ An email message represents a parsed email body.

        The body is split into fragments, each labelled as quoted, header,
        signature and/or hidden; the visible, unquoted fragments form the reply.

        Examples of quote headers recognised besides "On ... wrote:":

            Em 2019-09-10 08:19, podinski escreveu:
            El 11/07/2012, a las 11:44, Domenico Quaranta escribió:
    """

    # A line starting with '#' or '*', two or more whitespace characters and
    # the word "distributed" (presumably a mailing-list footer -- confirm).
    SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)')
    # Single-line quote headers (English, Portuguese, Spanish) and the
    # forwarded-message banner.
    QUOTE_HDR_REGEX = re.compile('(On.*wrote:$)|(Em.*escreveu:$)|(El.*escribió:$)|(-------- Forwarded Message --------)')
    # A line that begins with one or more '>' characters.
    QUOTED_REGEX = re.compile(r'(>+)')
    # "From:", "Sent:", "To:" or "Subject:" header lines, optionally wrapped in '*'.
    HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
    # An "On ... wrote:" header that may span several lines; the negative
    # lookahead rejects a candidate when another such header follows it.
    _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
    MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
    MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)

    # nettime regex (a pattern string, not compiled; not referenced by the
    # methods of this class)
    NT_REGEX = r'(?!# distributed|# <nettime>|# collaborative|# more info:|# archive:)'

    def __init__(self, text):
        """ Stores the email body with CRLF line endings normalised to LF.

            text - A string email body
        """
        self.fragments = []
        self.fragment = None
        self.text = text.replace('\r\n', '\n')
        self.lines = []
        self.found_visible = False

    def read(self):
        """ Creates new fragment for each line
            and labels as a signature, quote, or hidden.

            Returns EmailMessage instance
        """
        self.found_visible = False

        # Collapse an "On ... wrote:" header that wraps over several lines
        # onto a single line.  A callable replacement is used so the header
        # text is inserted literally: passing it as a template string makes
        # re.sub interpret backslashes in the header (e.g. "DOMAIN\user"),
        # which raises re.error or corrupts the text.
        self.text = self.MULTI_QUOTE_HDR_REGEX.sub(
            lambda match: match.group(1).replace('\n', ''), self.text)

        # Fix any outlook style replies, with the reply immediately above the
        # signature boundary line, by inserting a blank line before a run of
        # seven or more '_' or '-' characters.
        # See email_2_2.txt for an example.
        # The flag must be passed by keyword: the fourth positional argument
        # of re.sub is ``count``, so a positional re.MULTILINE would silently
        # cap the number of substitutions at 8.
        self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text,
                           flags=re.MULTILINE)

        # Lines are scanned bottom-up.
        self.lines = self.text.split('\n')
        self.lines.reverse()

        for line in self.lines:
            self._scan_line(line)

        self._finish_fragment()

        # Fragments were collected bottom-up; restore top-down order.
        self.fragments.reverse()

        return self

    @property
    def reply(self):
        """ Captures reply message within email

            Returns the content of every fragment that is neither hidden nor
            quoted, joined with newlines.
        """
        return '\n'.join(
            f.content for f in self.fragments if not (f.hidden or f.quoted))

    def _scan_line(self, line):
        """ Reviews each line in email message and determines fragment type

            line - a row of text from an email message
        """
        is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
        is_quoted = self.QUOTED_REGEX.match(line) is not None
        is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None

        # A signature line closes the fragment collected so far (the lines
        # below it, since scanning is bottom-up) and marks it as a signature;
        # the signature line itself is not added to any fragment.
        if self.fragment and self.SIG_REGEX.match(line):
            self.fragment.signature = True
            self._finish_fragment()
            return

        # The line extends the current fragment when it has the same
        # header/quoted labels, or when the current fragment is quoted and the
        # line is a quote header or blank.
        continues_fragment = self.fragment and (
            (self.fragment.headers == is_header
             and self.fragment.quoted == is_quoted)
            or (self.fragment.quoted
                and (is_quote_header or not line.strip()))
        )

        if continues_fragment:
            self.fragment.lines.append(line)
        else:
            self._finish_fragment()
            self.fragment = Fragment(is_quoted, line, headers=is_header)

    def quote_header(self, line):
        """ Determines whether line is a quote header

            line - a row of the email message

            Returns True or False
        """
        # QUOTE_HDR_REGEX is written for forward text, so the line is matched
        # as-is; matching the reversed line could never recognise a header.
        return self.QUOTE_HDR_REGEX.match(line) is not None

    def _finish_fragment(self):
        """ Finalises the current fragment, decides whether it is hidden and
            appends it to the fragment list.
        """
        if self.fragment:
            self.fragment.finish()
            if self.fragment.headers:
                # Regardless of what's been seen to this point, if we encounter a headers fragment,
                # all the previous fragments should be marked hidden and found_visible set to False.
                self.found_visible = False
                for f in self.fragments:
                    f.hidden = True
            if not self.found_visible:
                if self.fragment.quoted \
                        or self.fragment.headers \
                        or self.fragment.signature \
                        or not self.fragment.content.strip():
                    self.fragment.hidden = True
                else:
                    self.found_visible = True
            self.fragments.append(self.fragment)
        self.fragment = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Fragment(object):
    """ One labelled piece of an Email Message.

        Lines are accumulated in ``lines`` until ``finish`` is called, after
        which the joined text is available through ``content``.
    """

    def __init__(self, quoted, first_line, headers=False):
        """ Starts a fragment from its first collected line.

            quoted - whether the fragment is quoted text
            first_line - the first line added to the fragment
            headers - whether the fragment consists of header lines
        """
        self.lines = [first_line]
        self._content = None
        self.quoted = quoted
        self.headers = headers
        self.signature = False
        self.hidden = False

    def finish(self):
        """ Reverses the collected lines, joins them into the fragment's
            content and releases the line list.
        """
        collected = self.lines
        # Reversed in place, as the lines are appended in reverse order.
        collected.reverse()
        self._content = '\n'.join(collected)
        self.lines = None

    @property
    def content(self):
        """ The joined fragment text with surrounding whitespace removed.
        """
        text = self._content
        return text.strip()
|