diff --git a/export/emailreply.py b/export/emailreply.py new file mode 100644 index 0000000..dced2c0 --- /dev/null +++ b/export/emailreply.py @@ -0,0 +1,166 @@ +import re + + +class EmailReplyParser(object): + """ Represents a email message that is parsed. + """ + + @staticmethod + def read(text): + """ Factory method that splits email into list of fragments + + text - A string email body + + Returns an EmailMessage instance + """ + return EmailMessage(text).read() + + @staticmethod + def parse_reply(text): + """ Provides the reply portion of email. + + text - A string email body + + Returns reply body message + """ + return EmailReplyParser.read(text).reply + + +class EmailMessage(object): + """ An email message represents a parsed email body. + """ + + SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') + QUOTE_HDR_REGEX = re.compile('On.*wrote:$') + QUOTED_REGEX = re.compile(r'(>+)') + HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') + _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' + MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) + MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL) + + def __init__(self, text): + self.fragments = [] + self.fragment = None + self.text = text.replace('\r\n', '\n') + self.found_visible = False + + def read(self): + """ Creates new fragment for each line + and labels as a signature, quote, or hidden. + + Returns EmailMessage instance + """ + + self.found_visible = False + + is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text) + if is_multi_quote_header: + self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text) + + # Fix any outlook style replies, with the reply immediately above the signature boundary line + # See email_2_2.txt for an example + self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE) + + self.lines = self.text.split('\n') + self.lines.reverse() + + for line in self.lines: + self._scan_line(line) + + self._finish_fragment() + + self.fragments.reverse() + + return self + + @property + def reply(self): + """ Captures reply message within email + """ + reply = [] + for f in self.fragments: + if not (f.hidden or f.quoted): + reply.append(f.content) + return '\n'.join(reply) + + def _scan_line(self, line): + """ Reviews each line in email message and determines fragment type + + line - a row of text from an email message + """ + is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None + is_quoted = self.QUOTED_REGEX.match(line) is not None + is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None + + if self.fragment and len(line.strip()) == 0: + if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): + self.fragment.signature = True + self._finish_fragment() + + if self.fragment \ + and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or + (self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))): + + self.fragment.lines.append(line) + else: + self._finish_fragment() + self.fragment = Fragment(is_quoted, line, headers=is_header) + + def quote_header(self, line): + """ Determines whether line is part of a quoted area + + line - a row of the email message + + Returns True or False + """ + return self.QUOTE_HDR_REGEX.match(line[::-1]) is not None + + def _finish_fragment(self): + """ Creates fragment + """ + + if self.fragment: + self.fragment.finish() + if self.fragment.headers: + # Regardless of what's been seen to this point, if we encounter a headers fragment, + # all the previous fragments should be marked hidden and found_visible set to False. + self.found_visible = False + for f in self.fragments: + f.hidden = True + if not self.found_visible: + if self.fragment.quoted \ + or self.fragment.headers \ + or self.fragment.signature \ + or (len(self.fragment.content.strip()) == 0): + + self.fragment.hidden = True + else: + self.found_visible = True + self.fragments.append(self.fragment) + self.fragment = None + + +class Fragment(object): + """ A Fragment is a part of + an Email Message, labeling each part. + """ + + def __init__(self, quoted, first_line, headers=False): + self.signature = False + self.headers = headers + self.hidden = False + self.quoted = quoted + self._content = None + self.lines = [first_line] + + def finish(self): + """ Creates block of content with lines + belonging to fragment. + """ + self.lines.reverse() + self._content = '\n'.join(self.lines) + self.lines = None + + @property + def content(self): + return self._content.strip() \ No newline at end of file