"""Revised version of the email reply parser.

Integration with nettime, crumb, spectre, etc.
"""

import re


class EmailReplyParser(object):
    """Entry points for parsing an email body into fragments."""

    @staticmethod
    def read(text):
        """Split an email body into a list of fragments.

        text - A string email body

        Returns an EmailMessage instance whose ``fragments`` are populated.
        """
        return EmailMessage(text).read()

    @staticmethod
    def parse_reply(text):
        """Return only the reply portion of an email body.

        text - A string email body

        Returns the reply text as a string.
        """
        return EmailReplyParser.read(text).reply


class EmailMessage(object):
    """A parsed email body, held as an ordered list of Fragment objects.

    Examples of non-English quote headers that are recognised:
        Em 2019-09-10 08:19, podinski escreveu:
        El 11/07/2012, a las 11:44, Domenico Quaranta escribió:
    """

    # A line starting with '#' or '*', two or more whitespace characters and
    # the word "distributed" marks the list footer. An earlier variant of
    # this pattern also matched a "--" delimiter line; it is disabled.
    SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)')
    QUOTE_HDR_REGEX = re.compile(
        '(On.*wrote:$)'
        '|(Em.*escreveu:$)'
        '|(El.*escribió:$)'
        '|(-------- Forwarded Message --------)'
    )
    QUOTED_REGEX = re.compile(r'(>+)')
    HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
    _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
    MULTI_QUOTE_HDR_REGEX = re.compile(
        _MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
    MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(
        _MULTI_QUOTE_HDR_REGEX, re.DOTALL)

    # nettime regex
    NT_REGEX = r'(?!# distributed|# |# collaborative|# more info:|# archive:)'

    def __init__(self, text):
        """Store the body with CRLF line endings normalised to LF.

        text - A string email body
        """
        self.fragments = []
        self.fragment = None
        self.text = text.replace('\r\n', '\n')
        self.found_visible = False

    def read(self):
        """Scan the body bottom-up and build the list of fragments.

        Each fragment is labelled as quoted, headers, signature and/or
        hidden. ``self.text`` is normalised in place before scanning.

        Returns this EmailMessage instance.
        """
        self.found_visible = False

        # Join an "On ... wrote:" header that was wrapped over several lines
        # back onto a single line so the per-line scan can recognise it.
        multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(
            self.text)
        if multi_quote_header:
            joined_header = multi_quote_header.group(1).replace('\n', '')
            # A callable replacement inserts the header literally; a plain
            # string would be parsed as a template and choke on backslashes.
            self.text = self.MULTI_QUOTE_HDR_REGEX.sub(
                lambda _match: joined_header, self.text)

        # Fix any outlook style replies, with the reply immediately above
        # the signature boundary line: insert a blank line before a run of
        # seven or more '_' / '-' characters.
        # ``flags`` must be passed by keyword: the fourth positional
        # parameter of re.sub is ``count``.
        self.text = re.sub(
            '([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text,
            flags=re.MULTILINE)

        self.lines = self.text.split('\n')
        self.lines.reverse()

        for line in self.lines:
            self._scan_line(line)

        self._finish_fragment()

        # Fragments were collected bottom-up; restore document order.
        self.fragments.reverse()

        return self

    @property
    def reply(self):
        """The content of all fragments that are neither hidden nor quoted,
        joined with newlines.
        """
        return '\n'.join(
            f.content for f in self.fragments
            if not (f.hidden or f.quoted)
        )

    def _scan_line(self, line):
        """Classify one line and attach it to the current or a new fragment.

        Lines arrive in reverse (bottom-up) order from ``read``.

        line - a row of text from an email message
        """
        is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
        is_quoted = self.QUOTED_REGEX.match(line) is not None
        is_header = (is_quote_header
                     or self.HEADER_REGEX.match(line) is not None)

        if self.fragment and self.SIG_REGEX.match(line):
            # Footer marker reached: everything collected below it is the
            # signature. The marker line itself is not stored.
            self.fragment.signature = True
            self._finish_fragment()
            return

        same_kind = (
            self.fragment is not None
            and self.fragment.headers == is_header
            and self.fragment.quoted == is_quoted
        )
        # A quote header or a blank line directly above quoted text still
        # belongs to that quoted fragment.
        extends_quote = (
            self.fragment is not None
            and self.fragment.quoted
            and (is_quote_header or not line.strip())
        )

        if same_kind or extends_quote:
            self.fragment.lines.append(line)
        else:
            self._finish_fragment()
            self.fragment = Fragment(is_quoted, line, headers=is_header)

    def quote_header(self, line):
        """Determine whether a line is a quote header.

        line - a row of the email message

        Returns True or False
        """
        # The pattern is written for forward text, so match the line as
        # given rather than its reversal.
        return self.QUOTE_HDR_REGEX.match(line) is not None

    def _finish_fragment(self):
        """Finalise the current fragment and append it to ``fragments``.

        Does nothing except reset ``self.fragment`` when there is no
        current fragment.
        """
        fragment = self.fragment
        if fragment:
            fragment.finish()
            if fragment.headers:
                # Regardless of what's been seen to this point, if we
                # encounter a headers fragment, all the previous fragments
                # should be marked hidden and found_visible set to False.
                self.found_visible = False
                for previous in self.fragments:
                    previous.hidden = True
            if not self.found_visible:
                if (fragment.quoted
                        or fragment.headers
                        or fragment.signature
                        or not fragment.content):
                    fragment.hidden = True
                else:
                    self.found_visible = True
            self.fragments.append(fragment)
        self.fragment = None


class Fragment(object):
    """One labelled part of an email message."""

    def __init__(self, quoted, first_line, headers=False):
        """Start a fragment from its first scanned line.

        quoted - whether the fragment is quoted text
        first_line - the first line collected for this fragment
        headers - whether the fragment is a header block
        """
        self.signature = False
        self.headers = headers
        self.hidden = False
        self.quoted = quoted
        self._content = None
        self.lines = [first_line]

    def finish(self):
        """Build the content from the collected lines and release them.

        The lines are reversed before joining because they were collected
        bottom-up.
        """
        self.lines.reverse()
        self._content = '\n'.join(self.lines)
        self.lines = None

    @property
    def content(self):
        """The fragment text with surrounding whitespace stripped.

        Only valid after ``finish`` has been called.
        """
        return self._content.strip()