cleaning emails

This commit is contained in:
gauthiier 2019-12-28 15:58:48 +01:00
parent 94055e4bf4
commit 827fe48128
2 changed files with 181 additions and 32 deletions

View File

@ -1,3 +1,9 @@
'''
revised version of email reply parser
integration with nettime, crumb, spectre, etc.
'''
import re
@ -28,16 +34,27 @@ class EmailReplyParser(object):
class EmailMessage(object):
""" An email message represents a parsed email body.
ex:
Em 2019-09-10 08:19, podinski escreveu:
El 11/07/2012, a las 11:44, Domenico Quaranta escribió:
"""
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
# SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)|(--\n)')
SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)')
QUOTE_HDR_REGEX = re.compile('(On.*wrote:$)|(Em.*escreveu:$)|(El.*escribió:$)|(-------- Forwarded Message --------)')
QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
# nettime regex
NT_REGEX = r'(?!# distributed|# <nettime>|# collaborative|# more info:|# archive:)'
def __init__(self, text):
self.fragments = []
self.fragment = None
@ -88,20 +105,27 @@ class EmailMessage(object):
line - a row of text from an email message
"""
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
is_quoted = self.QUOTED_REGEX.match(line) is not None
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
if self.fragment and len(line.strip()) == 0:
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
if self.fragment and self.SIG_REGEX.match(line): # done stop
self.fragment.signature = True
self._finish_fragment()
return
# if self.fragment and len(line.strip()) == 0:
# if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
# self.fragment.signature = True
# self._finish_fragment()
if self.fragment \
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
self.fragment.lines.append(line)
else:
self._finish_fragment()
self.fragment = Fragment(is_quoted, line, headers=is_header)
@ -158,6 +182,7 @@ class Fragment(object):
belonging to fragment.
"""
self.lines.reverse()
# self._content = '\n'.join(self.lines)
self._content = '\n'.join(self.lines)
self.lines = None

View File

@ -1,8 +1,11 @@
import json, os, logging
import xml.etree.ElementTree as et
from xml.sax.saxutils import unescape, escape
import export.utils
import config
from datetime import datetime
from export import emailreply
import re
nn = 0
@ -14,39 +17,109 @@ def export_generate_path(tag):
now = datetime.now()
return os.path.join(config.export['path'], tag + "_[now].xml")
def make_xml_element(el, val):
    """Return ``val`` wrapped in an ``<el>...</el>`` element, as a string.

    The text is passed through ``xml.sax.saxutils.escape`` so that ``&``,
    ``<`` and ``>`` cannot break the surrounding markup.

    Args:
        el: element name; inserted verbatim (not validated or escaped).
        val: element text. ``None`` yields an empty element and any other
            non-string value is converted with ``str()``, because
            ``escape()`` only accepts strings.

    Returns:
        str: the serialised element, without a trailing newline.
    """
    text = "" if val is None else str(val)
    return "<" + el + ">" + escape(text) + "</" + el + ">"
def emit_mail_xml(msg, li):
    """Serialise one mail, followed by all of its follow-ups, as XML text.

    Args:
        msg: mapping describing the mail. Reads ``subject``, ``from``,
            ``date`` and ``content``. ``to`` is optional ("n/a" is emitted
            when it is absent). When ``follow-up`` is present, the messages
            returned by ``export.utils.index_follow_up(msg)`` are emitted
            recursively.
        li: list name; used in the log line and passed on to follow-ups.

    Returns:
        str: a ``<mail>...</mail>`` element for ``msg`` immediately followed
        by the elements of its follow-ups (siblings, not nested).
    """
    global nn
    nn += 1  # module-level counter, incremented once per call

    logging.info("export xml: " + li)

    subject = make_xml_element("subject", msg['subject']) + "\n"
    to = make_xml_element("to", msg.get('to', 'n/a')) + "\n"
    from_ = make_xml_element("from", msg['from']) + "\n"
    date = make_xml_element("date", msg['date']) + "\n"

    # TODO:
    #   - filter reply
    #   - unescape XML
    parsed = emailreply.EmailMessage(export.utils.format_content(msg['content']))
    parsed.read()

    # Join hard-wrapped lines: a lone newline becomes a space, while runs of
    # two or more newlines (paragraph breaks) are left untouched.
    content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', parsed.reply)
    content = make_xml_element("content", content_stripped) + "\n"

    parts = ["<mail>\n" + subject + from_ + to + date + content + "</mail>\n"]

    # recursive "follow-up"
    if 'follow-up' in msg:
        followups = export.utils.index_follow_up(msg)
        # Sorted on the first tuple item -- presumably a date; confirm
        # against export.utils.index_follow_up.
        followups.sort(key=lambda tup: tup[0])
        parts.extend(emit_mail_xml(f, li) for _, f in followups)

    return "".join(parts)
# def emit_mail_xml(msg, li, xmlel):
# global nn
# nn += 1
# logging.info("export xml: " + li)
# mail = et.SubElement(xmlel, 'mail')
# subject = et.SubElement(mail, 'subject')
# subject.text = export.utils.format_subject(msg['subject'])
# to = et.SubElement(mail, 'to')
# if 'to' in msg:
# to.text = msg['to']
# else:
# to.text = 'n/a'
# from_ = et.SubElement(mail, 'from')
# from_.text = msg['from']
# date = et.SubElement(mail, 'date')
# date.text = msg['date']
# '''
# todo:
# - filter reply
# - unescape XML
# '''
# e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
# e.read()
# escape_table = {
# "&": "&amp;",
# ">": "&gt;",
# "<": "&lt;"
# }
# content_str = "<content>" + escape(e.reply, escape_table) + "</content>"
# print(content_str)
# content = et.fromstring(content_str)
# mail.append(content)
# # content = et.SubElement(mail, 'content')
# # content.text = e.reply
# # recursive "follow-up"
# if 'follow-up' in msg:
# followups = export.utils.index_follow_up(msg)
# followups.sort(key=lambda tup: tup[0])
# for d, f in followups:
# emit_mail_xml(f, li, xmlel)
#------------------------------------------------------------
def export_single_tag(t, sel, fout):
    """Write the chapter for tag ``t`` to ``fout`` as UTF-8 encoded XML.

    The chapter consists of a ``<title>`` (the tag itself), a ``<desc>``
    and a ``<mails>`` element holding the output of ``emit_mail_xml`` for
    every entry in the chapter's ``lists``.

    Args:
        t: tag name; used as the chapter title and as the key into ``sel``.
        sel: mapping of tag -> chapter data. The ``desc`` and ``lists`` keys
            of the selected entry are read, and each item of ``lists`` must
            carry a ``list`` key.
        fout: file object opened for binary writing (it receives ``bytes``).

    Returns:
        bool: ``False`` (after logging an error) when ``t`` is not in
        ``sel``, ``True`` once the chapter has been written.
    """
    if t not in sel:
        logging.error("Tag: " + t + " does not exists.")
        return False

    ch = sel[t]

    parts = [
        "<chapter>\n",
        # Title and description belong inside the chapter element.
        make_xml_element("title", t) + "\n",
        make_xml_element("desc", ch['desc']) + "\n",
        "<mails>\n",
    ]
    for m in ch['lists']:
        parts.append(emit_mail_xml(m, m['list']))
    parts.append("</mails>\n")
    parts.append("</chapter>")

    fout.write("".join(parts).encode('utf-8'))

    return True
# def export_single_tag(t, sel, fout):
# if t not in list(sel.keys()):
# logging.error("Tag: " + t + " does not exists.")
# return False
# ch = sel[t]
# chapter = et.Element('chapter')
# chapter_title = et.SubElement(chapter, 'title')
# chapter_title.text = t
# chapter_desc = et.SubElement(chapter, 'desc')
# chapter_desc.text = ch['desc']
# chapter_mails = et.SubElement(chapter, 'mails')
# for m in ch['lists']:
# emit_mail_xml(m, m['list'], chapter_mails)
# # root = et.ElementTree(chapter)
# # root.write(fout, encoding="utf-8", xml_declaration=True)
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
# return True
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
with open(sel_dump) as fin:
d = json.load(fin)
with open(xml_dump, 'w') as fout:
with open(xml_dump, 'wb') as fout:
for k in d.keys():
if not export_single_tag(k, d, fout):
logging.error("Error exporting: " + k)
@ -97,7 +221,7 @@ def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
now = datetime.now()
xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
with open(xml_out, 'w') as fout:
with open(xml_out, 'wb') as fout:
if not export_single_tag(tag, d, fout):
logging.error("Error exporting: " + tag)
return False