cleaning emails

This commit is contained in:
gauthiier 2019-12-28 15:58:48 +01:00
parent 94055e4bf4
commit 827fe48128
2 changed files with 181 additions and 32 deletions

View File

@ -1,3 +1,9 @@
'''
revised version of email reply parser
integration with nettime, crumb, spectre, etc.
'''
import re import re
@ -28,16 +34,27 @@ class EmailReplyParser(object):
class EmailMessage(object): class EmailMessage(object):
""" An email message represents a parsed email body. """ An email message represents a parsed email body.
ex:
Em 2019-09-10 08:19, podinski escreveu:
El 11/07/2012, a las 11:44, Domenico Quaranta escribió:
""" """
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') # SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)|(--\n)')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$') SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)')
QUOTE_HDR_REGEX = re.compile('(On.*wrote:$)|(Em.*escreveu:$)|(El.*escribió:$)|(-------- Forwarded Message --------)')
QUOTED_REGEX = re.compile(r'(>+)') QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+') HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL) MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
# nettime regex
NT_REGEX = r'(?!# distributed|# <nettime>|# collaborative|# more info:|# archive:)'
def __init__(self, text): def __init__(self, text):
self.fragments = [] self.fragments = []
self.fragment = None self.fragment = None
@ -87,21 +104,28 @@ class EmailMessage(object):
""" Reviews each line in email message and determines fragment type """ Reviews each line in email message and determines fragment type
line - a row of text from an email message line - a row of text from an email message
""" """
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
is_quoted = self.QUOTED_REGEX.match(line) is not None is_quoted = self.QUOTED_REGEX.match(line) is not None
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
if self.fragment and len(line.strip()) == 0: if self.fragment and self.SIG_REGEX.match(line): # done stop
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()): self.fragment.signature = True
self.fragment.signature = True self._finish_fragment()
self._finish_fragment() return
# if self.fragment and len(line.strip()) == 0:
# if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
# self.fragment.signature = True
# self._finish_fragment()
if self.fragment \ if self.fragment \
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))): (self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
self.fragment.lines.append(line) self.fragment.lines.append(line)
else: else:
self._finish_fragment() self._finish_fragment()
self.fragment = Fragment(is_quoted, line, headers=is_header) self.fragment = Fragment(is_quoted, line, headers=is_header)
@ -158,6 +182,7 @@ class Fragment(object):
belonging to fragment. belonging to fragment.
""" """
self.lines.reverse() self.lines.reverse()
# self._content = '\n'.join(self.lines)
self._content = '\n'.join(self.lines) self._content = '\n'.join(self.lines)
self.lines = None self.lines = None

View File

@ -1,8 +1,11 @@
import json, os, logging import json, os, logging
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
from xml.sax.saxutils import unescape, escape
import export.utils import export.utils
import config import config
from datetime import datetime from datetime import datetime
from export import emailreply
import re
nn = 0 nn = 0
@ -14,39 +17,109 @@ def export_generate_path(tag):
now = datetime.now() now = datetime.now()
return os.path.join(config.export['path'], tag + "_[now].xml") return os.path.join(config.export['path'], tag + "_[now].xml")
def emit_mail_xml(msg, li, xmlel): def make_xml_element(el, val):
return "<" + el + ">" + escape(val) + "</" + el + ">"
def emit_mail_xml(msg, li):
global nn global nn
nn += 1 nn += 1
logging.info("export xml: " + li) logging.info("export xml: " + li)
mail = et.SubElement(xmlel, 'mail') subject = make_xml_element("subject", msg['subject']) + "\n"
subject = et.SubElement(mail, 'subject')
subject.text = export.utils.format_subject(msg['subject'])
to = et.SubElement(mail, 'to')
if 'to' in msg: if 'to' in msg:
to.text = msg['to'] to = make_xml_element("to", msg['to']) + "\n"
else: else:
to.text = 'n/a' to = make_xml_element("to", 'n/a') + "\n"
from_ = et.SubElement(mail, 'from') from_ = make_xml_element("from", msg['from']) + "\n"
from_.text = msg['from']
date = et.SubElement(mail, 'date') date = make_xml_element("date", msg['date']) + "\n"
date.text = msg['date']
content = et.SubElement(mail, 'content') '''
content.text = export.utils.format_content(msg['content']) todo:
- filter reply
- unescape XML
'''
e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
e.read()
content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
content = make_xml_element("content", content_stripped) + "\n"
mail = "<mail>\n" + subject + from_ + to + date + content + "</mail>\n"
# content = et.SubElement(mail, 'content')
# content.text = e.reply
# recursive "follow-up" # recursive "follow-up"
if 'follow-up' in msg: if 'follow-up' in msg:
followups = export.utils.index_follow_up(msg) followups = export.utils.index_follow_up(msg)
followups.sort(key=lambda tup: tup[0]) followups.sort(key=lambda tup: tup[0])
for d, f in followups: for d, f in followups:
emit_mail_xml(f, li, xmlel) mail += emit_mail_xml(f, li)
return mail
# def emit_mail_xml(msg, li, xmlel):
# global nn
# nn += 1
# logging.info("export xml: " + li)
# mail = et.SubElement(xmlel, 'mail')
# subject = et.SubElement(mail, 'subject')
# subject.text = export.utils.format_subject(msg['subject'])
# to = et.SubElement(mail, 'to')
# if 'to' in msg:
# to.text = msg['to']
# else:
# to.text = 'n/a'
# from_ = et.SubElement(mail, 'from')
# from_.text = msg['from']
# date = et.SubElement(mail, 'date')
# date.text = msg['date']
# '''
# todo:
# - filter reply
# - unescape XML
# '''
# e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
# e.read()
# escape_table = {
# "&": "&amp;",
# ">": "&gt;",
# "<": "&lt;"
# }
# content_str = "<content>" + escape(e.reply, escape_table) + "</content>"
# print(content_str)
# content = et.fromstring(content_str)
# mail.append(content)
# # content = et.SubElement(mail, 'content')
# # content.text = e.reply
# # recursive "follow-up"
# if 'follow-up' in msg:
# followups = export.utils.index_follow_up(msg)
# followups.sort(key=lambda tup: tup[0])
# for d, f in followups:
# emit_mail_xml(f, li, xmlel)
#------------------------------------------------------------ #------------------------------------------------------------
@ -61,28 +134,79 @@ def export_single_tag(t, sel, fout):
ch = sel[t] ch = sel[t]
chapter = et.Element('chapter')
chapter_title = et.SubElement(chapter, 'title')
chapter_title.text = t
chapter_desc = et.SubElement(chapter, 'desc') chapter = "<chapter>\n"
chapter_desc.text = ch['desc']
chapter_mails = et.SubElement(chapter, 'mails') chapter_title = make_xml_element("title", t) + "\n"
chapter_desc = make_xml_element("desc", ch['desc']) + "\n"
chapter_mails = "<mails>\n"
for m in ch['lists']: for m in ch['lists']:
emit_mail_xml(m, m['list'], chapter_mails) chapter_mails += emit_mail_xml(m, m['list'])
fout.write(export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore'))) chapter_mails += "</mails>\n"
chapter = "<chapter>\n" + chapter_mails + "</chapter>"
fout.write(chapter.encode('utf-8'))
# # root = et.ElementTree(chapter)
# # root.write(fout, encoding="utf-8", xml_declaration=True)
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
return True return True
# def export_single_tag(t, sel, fout):
# if t not in list(sel.keys()):
# logging.error("Tag: " + t + " does not exists.")
# return False
# ch = sel[t]
# chapter = et.Element('chapter')
# chapter_title = et.SubElement(chapter, 'title')
# chapter_title.text = t
# chapter_desc = et.SubElement(chapter, 'desc')
# chapter_desc.text = ch['desc']
# chapter_mails = et.SubElement(chapter, 'mails')
# for m in ch['lists']:
# emit_mail_xml(m, m['list'], chapter_mails)
# # root = et.ElementTree(chapter)
# # root.write(fout, encoding="utf-8", xml_declaration=True)
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
# return True
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump): def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
with open(sel_dump) as fin: with open(sel_dump) as fin:
d = json.load(fin) d = json.load(fin)
with open(xml_dump, 'w') as fout: with open(xml_dump, 'wb') as fout:
for k in d.keys(): for k in d.keys():
if not export_single_tag(k, d, fout): if not export_single_tag(k, d, fout):
logging.error("Error exporting: " + k) logging.error("Error exporting: " + k)
@ -97,7 +221,7 @@ def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
now = datetime.now() now = datetime.now()
xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S")) xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
with open(xml_out, 'w') as fout: with open(xml_out, 'wb') as fout:
if not export_single_tag(tag, d, fout): if not export_single_tag(tag, d, fout):
logging.error("Error exporting: " + tag) logging.error("Error exporting: " + tag)
return False return False