diff --git a/export/emailreply.py b/export/emailreply.py index dced2c0..0904ea8 100644 --- a/export/emailreply.py +++ b/export/emailreply.py @@ -1,3 +1,9 @@ +''' +revised version of email reply parser +integration with nettime, crumb, spectre, etc. + +''' + import re @@ -28,16 +34,27 @@ class EmailReplyParser(object): class EmailMessage(object): """ An email message represents a parsed email body. + + ex: + + Em 2019-09-10 08:19, podinski escreveu: + El 11/07/2012, a las 11:44, Domenico Quaranta escribió: + """ - SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') - QUOTE_HDR_REGEX = re.compile('On.*wrote:$') + # SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)|(--\n)') + SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)') + QUOTE_HDR_REGEX = re.compile('(On.*wrote:$)|(Em.*escreveu:$)|(El.*escribió:$)|(-------- Forwarded Message --------)') + QUOTED_REGEX = re.compile(r'(>+)') HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL) + # nettime regex + NT_REGEX = r'(?!# distributed|# |# collaborative|# more info:|# archive:)' + def __init__(self, text): self.fragments = [] self.fragment = None @@ -87,21 +104,28 @@ class EmailMessage(object): """ Reviews each line in email message and determines fragment type line - a row of text from an email message - """ + """ + is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None is_quoted = self.QUOTED_REGEX.match(line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None - if self.fragment and len(line.strip()) == 0: - if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): - self.fragment.signature = True - self._finish_fragment() + if self.fragment and self.SIG_REGEX.match(line): # done stop + self.fragment.signature = True + self._finish_fragment() + return + + # if self.fragment and len(line.strip()) == 0: + # if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): + # self.fragment.signature = True + # self._finish_fragment() if self.fragment \ and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or (self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))): self.fragment.lines.append(line) + else: self._finish_fragment() self.fragment = Fragment(is_quoted, line, headers=is_header) @@ -158,6 +182,7 @@ class Fragment(object): belonging to fragment. """ self.lines.reverse() + # self._content = '\n'.join(self.lines) self._content = '\n'.join(self.lines) self.lines = None diff --git a/export/exportxml.py b/export/exportxml.py index f39c802..8f11715 100644 --- a/export/exportxml.py +++ b/export/exportxml.py @@ -1,8 +1,11 @@ import json, os, logging import xml.etree.ElementTree as et +from xml.sax.saxutils import unescape, escape import export.utils import config from datetime import datetime +from export import emailreply +import re nn = 0 @@ -14,39 +17,109 @@ def export_generate_path(tag): now = datetime.now() return os.path.join(config.export['path'], tag + "_[now].xml") -def emit_mail_xml(msg, li, xmlel): +def make_xml_element(el, val): + return "<" + el + ">" + escape(val) + "" + + +def emit_mail_xml(msg, li): global nn nn += 1 logging.info("export xml: " + li) - mail = et.SubElement(xmlel, 'mail') + subject = make_xml_element("subject", msg['subject']) + "\n" - subject = et.SubElement(mail, 'subject') - subject.text = export.utils.format_subject(msg['subject']) - - to = et.SubElement(mail, 'to') if 'to' in msg: - to.text = msg['to'] + to = make_xml_element("to", msg['to']) + "\n" else: - to.text = 'n/a' + to = make_xml_element("to", 'n/a') + "\n" - from_ = et.SubElement(mail, 'from') - from_.text = msg['from'] + from_ = make_xml_element("from", msg['from']) + "\n" - date = et.SubElement(mail, 'date') - date.text = msg['date'] + date = make_xml_element("date", msg['date']) + "\n" - content = et.SubElement(mail, 'content') - content.text = export.utils.format_content(msg['content']) + ''' + todo: + - filter reply + - unescape XML + ''' + e = emailreply.EmailMessage(export.utils.format_content(msg['content'])) + e.read() + + content_stripped = re.sub(r'(?\n" + subject + from_ + to + date + content + "\n" + + # content = et.SubElement(mail, 'content') + # content.text = e.reply # recursuve "follow-up" if 'follow-up' in msg: followups = export.utils.index_follow_up(msg) followups.sort(key=lambda tup: tup[0]) for d, f in followups: - emit_mail_xml(f, li, xmlel) + mail += emit_mail_xml(f, li) + + return mail + + +# def emit_mail_xml(msg, li, xmlel): + +# global nn +# nn += 1 + +# logging.info("export xml: " + li) + +# mail = et.SubElement(xmlel, 'mail') + +# subject = et.SubElement(mail, 'subject') +# subject.text = export.utils.format_subject(msg['subject']) + +# to = et.SubElement(mail, 'to') +# if 'to' in msg: +# to.text = msg['to'] +# else: +# to.text = 'n/a' + +# from_ = et.SubElement(mail, 'from') +# from_.text = msg['from'] + +# date = et.SubElement(mail, 'date') +# date.text = msg['date'] + +# ''' +# todo: +# - filter reply +# - unescape XML +# ''' +# e = emailreply.EmailMessage(export.utils.format_content(msg['content'])) +# e.read() + +# escape_table = { +# "&": "&", +# ">": ">", +# "<": "<" +# } + +# content_str = "" + escape(e.reply, escape_table) + "" + +# print(content_str) + +# content = et.fromstring(content_str) +# mail.append(content) + +# # content = et.SubElement(mail, 'content') +# # content.text = e.reply + +# # recursuve "follow-up" +# if 'follow-up' in msg: +# followups = export.utils.index_follow_up(msg) +# followups.sort(key=lambda tup: tup[0]) +# for d, f in followups: +# emit_mail_xml(f, li, xmlel) #------------------------------------------------------------ @@ -61,28 +134,79 @@ def export_single_tag(t, sel, fout): ch = sel[t] - chapter = et.Element('chapter') - chapter_title = et.SubElement(chapter, 'title') - chapter_title.text = t - chapter_desc = et.SubElement(chapter, 'desc') - chapter_desc.text = ch['desc'] + chapter = "\n" - chapter_mails = et.SubElement(chapter, 'mails') + chapter_title = make_xml_element("title", t) + "\n" + + chapter_desc = make_xml_element("desc", ch['desc']) + "\n" + + chapter_mails = "\n" for m in ch['lists']: - emit_mail_xml(m, m['list'], chapter_mails) + chapter_mails += emit_mail_xml(m, m['list']) - fout.write(export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore'))) + chapter_mails += "\n" + + chapter = "\n" + chapter_mails + "" + + fout.write(chapter.encode('utf-8')) + + + # # root = et.ElementTree(chapter) + # # root.write(fout, encoding="utf-8", xml_declaration=True) + + # # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore')) + # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore')) + # xml = '' + xml + + # print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore')) + + # fout.write(et.tostring(chapter).decode('utf-8', 'ignore')) return True + +# def export_single_tag(t, sel, fout): + +# if t not in list(sel.keys()): +# logging.error("Tag: " + t + " does not exists.") +# return False + +# ch = sel[t] + +# chapter = et.Element('chapter') +# chapter_title = et.SubElement(chapter, 'title') +# chapter_title.text = t + +# chapter_desc = et.SubElement(chapter, 'desc') +# chapter_desc.text = ch['desc'] + +# chapter_mails = et.SubElement(chapter, 'mails') + +# for m in ch['lists']: +# emit_mail_xml(m, m['list'], chapter_mails) + + +# # root = et.ElementTree(chapter) +# # root.write(fout, encoding="utf-8", xml_declaration=True) + +# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore')) +# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore')) +# xml = '' + xml + +# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore')) + +# fout.write(et.tostring(chapter).decode('utf-8', 'ignore')) + +# return True + def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump): with open(sel_dump) as fin: d = json.load(fin) - with open(xml_dump, 'w') as fout: + with open(xml_dump, 'wb') as fout: for k in d.keys(): if not export_single_tag(k, d, fout): logging.error("Error exporting: " + k) @@ -97,7 +221,7 @@ def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump): now = datetime.now() xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S")) - with open(xml_out, 'w') as fout: + with open(xml_out, 'wb') as fout: if not export_single_tag(tag, d, fout): logging.error("Error exporting: " + tag) return False