emailreply
This commit is contained in:
parent
51a55d1968
commit
94055e4bf4
166
export/emailreply.py
Normal file
166
export/emailreply.py
Normal file
@ -0,0 +1,166 @@
|
||||
import re
|
||||
|
||||
|
||||
class EmailReplyParser(object):
|
||||
""" Represents a email message that is parsed.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def read(text):
|
||||
""" Factory method that splits email into list of fragments
|
||||
|
||||
text - A string email body
|
||||
|
||||
Returns an EmailMessage instance
|
||||
"""
|
||||
return EmailMessage(text).read()
|
||||
|
||||
@staticmethod
|
||||
def parse_reply(text):
|
||||
""" Provides the reply portion of email.
|
||||
|
||||
text - A string email body
|
||||
|
||||
Returns reply body message
|
||||
"""
|
||||
return EmailReplyParser.read(text).reply
|
||||
|
||||
|
||||
class EmailMessage(object):
|
||||
""" An email message represents a parsed email body.
|
||||
"""
|
||||
|
||||
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
|
||||
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
|
||||
QUOTED_REGEX = re.compile(r'(>+)')
|
||||
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
|
||||
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
|
||||
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
|
||||
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
|
||||
|
||||
def __init__(self, text):
|
||||
self.fragments = []
|
||||
self.fragment = None
|
||||
self.text = text.replace('\r\n', '\n')
|
||||
self.found_visible = False
|
||||
|
||||
def read(self):
|
||||
""" Creates new fragment for each line
|
||||
and labels as a signature, quote, or hidden.
|
||||
|
||||
Returns EmailMessage instance
|
||||
"""
|
||||
|
||||
self.found_visible = False
|
||||
|
||||
is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text)
|
||||
if is_multi_quote_header:
|
||||
self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text)
|
||||
|
||||
# Fix any outlook style replies, with the reply immediately above the signature boundary line
|
||||
# See email_2_2.txt for an example
|
||||
self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE)
|
||||
|
||||
self.lines = self.text.split('\n')
|
||||
self.lines.reverse()
|
||||
|
||||
for line in self.lines:
|
||||
self._scan_line(line)
|
||||
|
||||
self._finish_fragment()
|
||||
|
||||
self.fragments.reverse()
|
||||
|
||||
return self
|
||||
|
||||
@property
|
||||
def reply(self):
|
||||
""" Captures reply message within email
|
||||
"""
|
||||
reply = []
|
||||
for f in self.fragments:
|
||||
if not (f.hidden or f.quoted):
|
||||
reply.append(f.content)
|
||||
return '\n'.join(reply)
|
||||
|
||||
def _scan_line(self, line):
|
||||
""" Reviews each line in email message and determines fragment type
|
||||
|
||||
line - a row of text from an email message
|
||||
"""
|
||||
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
|
||||
is_quoted = self.QUOTED_REGEX.match(line) is not None
|
||||
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
|
||||
|
||||
if self.fragment and len(line.strip()) == 0:
|
||||
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
|
||||
self.fragment.signature = True
|
||||
self._finish_fragment()
|
||||
|
||||
if self.fragment \
|
||||
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
|
||||
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
|
||||
|
||||
self.fragment.lines.append(line)
|
||||
else:
|
||||
self._finish_fragment()
|
||||
self.fragment = Fragment(is_quoted, line, headers=is_header)
|
||||
|
||||
def quote_header(self, line):
|
||||
""" Determines whether line is part of a quoted area
|
||||
|
||||
line - a row of the email message
|
||||
|
||||
Returns True or False
|
||||
"""
|
||||
return self.QUOTE_HDR_REGEX.match(line[::-1]) is not None
|
||||
|
||||
def _finish_fragment(self):
|
||||
""" Creates fragment
|
||||
"""
|
||||
|
||||
if self.fragment:
|
||||
self.fragment.finish()
|
||||
if self.fragment.headers:
|
||||
# Regardless of what's been seen to this point, if we encounter a headers fragment,
|
||||
# all the previous fragments should be marked hidden and found_visible set to False.
|
||||
self.found_visible = False
|
||||
for f in self.fragments:
|
||||
f.hidden = True
|
||||
if not self.found_visible:
|
||||
if self.fragment.quoted \
|
||||
or self.fragment.headers \
|
||||
or self.fragment.signature \
|
||||
or (len(self.fragment.content.strip()) == 0):
|
||||
|
||||
self.fragment.hidden = True
|
||||
else:
|
||||
self.found_visible = True
|
||||
self.fragments.append(self.fragment)
|
||||
self.fragment = None
|
||||
|
||||
|
||||
class Fragment(object):
|
||||
""" A Fragment is a part of
|
||||
an Email Message, labeling each part.
|
||||
"""
|
||||
|
||||
def __init__(self, quoted, first_line, headers=False):
|
||||
self.signature = False
|
||||
self.headers = headers
|
||||
self.hidden = False
|
||||
self.quoted = quoted
|
||||
self._content = None
|
||||
self.lines = [first_line]
|
||||
|
||||
def finish(self):
|
||||
""" Creates block of content with lines
|
||||
belonging to fragment.
|
||||
"""
|
||||
self.lines.reverse()
|
||||
self._content = '\n'.join(self.lines)
|
||||
self.lines = None
|
||||
|
||||
@property
|
||||
def content(self):
|
||||
return self._content.strip()
|
||||
Loading…
x
Reference in New Issue
Block a user