cleaning emails
This commit is contained in:
parent
94055e4bf4
commit
827fe48128
@ -1,3 +1,9 @@
|
||||
'''
|
||||
revised version of email reply parser
|
||||
integration with nettime, crumb, spectre, etc.
|
||||
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
|
||||
@ -28,16 +34,27 @@ class EmailReplyParser(object):
|
||||
|
||||
class EmailMessage(object):
|
||||
""" An email message represents a parsed email body.
|
||||
|
||||
ex:
|
||||
|
||||
Em 2019-09-10 08:19, podinski escreveu:
|
||||
El 11/07/2012, a las 11:44, Domenico Quaranta escribió:
|
||||
|
||||
"""
|
||||
|
||||
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
|
||||
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
|
||||
# SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)|(--\n)')
|
||||
SIG_REGEX = re.compile(r'(^(#|\*)\s{2,}distributed)')
|
||||
QUOTE_HDR_REGEX = re.compile('(On.*wrote:$)|(Em.*escreveu:$)|(El.*escribió:$)|(-------- Forwarded Message --------)')
|
||||
|
||||
QUOTED_REGEX = re.compile(r'(>+)')
|
||||
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
|
||||
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
|
||||
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
|
||||
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
|
||||
|
||||
# nettime regex
|
||||
NT_REGEX = r'(?!# distributed|# <nettime>|# collaborative|# more info:|# archive:)'
|
||||
|
||||
def __init__(self, text):
|
||||
self.fragments = []
|
||||
self.fragment = None
|
||||
@ -88,20 +105,27 @@ class EmailMessage(object):
|
||||
|
||||
line - a row of text from an email message
|
||||
"""
|
||||
|
||||
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
|
||||
is_quoted = self.QUOTED_REGEX.match(line) is not None
|
||||
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
|
||||
|
||||
if self.fragment and len(line.strip()) == 0:
|
||||
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
|
||||
if self.fragment and self.SIG_REGEX.match(line): # done stop
|
||||
self.fragment.signature = True
|
||||
self._finish_fragment()
|
||||
return
|
||||
|
||||
# if self.fragment and len(line.strip()) == 0:
|
||||
# if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
|
||||
# self.fragment.signature = True
|
||||
# self._finish_fragment()
|
||||
|
||||
if self.fragment \
|
||||
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
|
||||
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
|
||||
|
||||
self.fragment.lines.append(line)
|
||||
|
||||
else:
|
||||
self._finish_fragment()
|
||||
self.fragment = Fragment(is_quoted, line, headers=is_header)
|
||||
@ -158,6 +182,7 @@ class Fragment(object):
|
||||
belonging to fragment.
|
||||
"""
|
||||
self.lines.reverse()
|
||||
# self._content = '\n'.join(self.lines)
|
||||
self._content = '\n'.join(self.lines)
|
||||
self.lines = None
|
||||
|
||||
|
||||
@ -1,8 +1,11 @@
|
||||
import json, os, logging
|
||||
import xml.etree.ElementTree as et
|
||||
from xml.sax.saxutils import unescape, escape
|
||||
import export.utils
|
||||
import config
|
||||
from datetime import datetime
|
||||
from export import emailreply
|
||||
import re
|
||||
|
||||
nn = 0
|
||||
|
||||
@ -14,39 +17,109 @@ def export_generate_path(tag):
|
||||
now = datetime.now()
|
||||
return os.path.join(config.export['path'], tag + "_[now].xml")
|
||||
|
||||
def emit_mail_xml(msg, li, xmlel):
|
||||
def make_xml_element(el, val):
    """Return ``val`` wrapped in an ``<el>...</el>`` element, as a string.

    el  - tag name; inserted verbatim into the opening and closing tags
          (it is not validated or escaped here).
    val - text content of the element; passed through
          xml.sax.saxutils.escape so that '&', '<' and '>' are replaced
          by their XML entities.
    """
    return "<" + el + ">" + escape(val) + "</" + el + ">"
|
||||
|
||||
|
||||
def emit_mail_xml(msg, li):
|
||||
|
||||
global nn
|
||||
nn += 1
|
||||
|
||||
logging.info("export xml: " + li)
|
||||
|
||||
mail = et.SubElement(xmlel, 'mail')
|
||||
subject = make_xml_element("subject", msg['subject']) + "\n"
|
||||
|
||||
subject = et.SubElement(mail, 'subject')
|
||||
subject.text = export.utils.format_subject(msg['subject'])
|
||||
|
||||
to = et.SubElement(mail, 'to')
|
||||
if 'to' in msg:
|
||||
to.text = msg['to']
|
||||
to = make_xml_element("to", msg['to']) + "\n"
|
||||
else:
|
||||
to.text = 'n/a'
|
||||
to = make_xml_element("to", 'n/a') + "\n"
|
||||
|
||||
from_ = et.SubElement(mail, 'from')
|
||||
from_.text = msg['from']
|
||||
from_ = make_xml_element("from", msg['from']) + "\n"
|
||||
|
||||
date = et.SubElement(mail, 'date')
|
||||
date.text = msg['date']
|
||||
date = make_xml_element("date", msg['date']) + "\n"
|
||||
|
||||
content = et.SubElement(mail, 'content')
|
||||
content.text = export.utils.format_content(msg['content'])
|
||||
'''
|
||||
todo:
|
||||
- filter reply
|
||||
- unescape XML
|
||||
'''
|
||||
e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
|
||||
e.read()
|
||||
|
||||
content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
|
||||
|
||||
content = make_xml_element("content", content_stripped) + "\n"
|
||||
|
||||
mail = "<mail>\n" + subject + from_ + to + date + content + "</mail>\n"
|
||||
|
||||
# content = et.SubElement(mail, 'content')
|
||||
# content.text = e.reply
|
||||
|
||||
# recursive "follow-up"
|
||||
if 'follow-up' in msg:
|
||||
followups = export.utils.index_follow_up(msg)
|
||||
followups.sort(key=lambda tup: tup[0])
|
||||
for d, f in followups:
|
||||
emit_mail_xml(f, li, xmlel)
|
||||
mail += emit_mail_xml(f, li)
|
||||
|
||||
return mail
|
||||
|
||||
|
||||
# def emit_mail_xml(msg, li, xmlel):
|
||||
|
||||
# global nn
|
||||
# nn += 1
|
||||
|
||||
# logging.info("export xml: " + li)
|
||||
|
||||
# mail = et.SubElement(xmlel, 'mail')
|
||||
|
||||
# subject = et.SubElement(mail, 'subject')
|
||||
# subject.text = export.utils.format_subject(msg['subject'])
|
||||
|
||||
# to = et.SubElement(mail, 'to')
|
||||
# if 'to' in msg:
|
||||
# to.text = msg['to']
|
||||
# else:
|
||||
# to.text = 'n/a'
|
||||
|
||||
# from_ = et.SubElement(mail, 'from')
|
||||
# from_.text = msg['from']
|
||||
|
||||
# date = et.SubElement(mail, 'date')
|
||||
# date.text = msg['date']
|
||||
|
||||
# '''
|
||||
# todo:
|
||||
# - filter reply
|
||||
# - unescape XML
|
||||
# '''
|
||||
# e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
|
||||
# e.read()
|
||||
|
||||
# escape_table = {
|
||||
# "&": "&",
|
||||
# ">": ">",
|
||||
# "<": "<"
|
||||
# }
|
||||
|
||||
# content_str = "<content>" + escape(e.reply, escape_table) + "</content>"
|
||||
|
||||
# print(content_str)
|
||||
|
||||
# content = et.fromstring(content_str)
|
||||
# mail.append(content)
|
||||
|
||||
# # content = et.SubElement(mail, 'content')
|
||||
# # content.text = e.reply
|
||||
|
||||
# # recursive "follow-up"
|
||||
# if 'follow-up' in msg:
|
||||
# followups = export.utils.index_follow_up(msg)
|
||||
# followups.sort(key=lambda tup: tup[0])
|
||||
# for d, f in followups:
|
||||
# emit_mail_xml(f, li, xmlel)
|
||||
|
||||
|
||||
#------------------------------------------------------------
|
||||
@ -61,28 +134,79 @@ def export_single_tag(t, sel, fout):
|
||||
|
||||
ch = sel[t]
|
||||
|
||||
chapter = et.Element('chapter')
|
||||
chapter_title = et.SubElement(chapter, 'title')
|
||||
chapter_title.text = t
|
||||
|
||||
chapter_desc = et.SubElement(chapter, 'desc')
|
||||
chapter_desc.text = ch['desc']
|
||||
chapter = "<chapter>\n"
|
||||
|
||||
chapter_mails = et.SubElement(chapter, 'mails')
|
||||
chapter_title = make_xml_element("title", t) + "\n"
|
||||
|
||||
chapter_desc = make_xml_element("desc", ch['desc']) + "\n"
|
||||
|
||||
chapter_mails = "<mails>\n"
|
||||
|
||||
for m in ch['lists']:
|
||||
emit_mail_xml(m, m['list'], chapter_mails)
|
||||
chapter_mails += emit_mail_xml(m, m['list'])
|
||||
|
||||
fout.write(export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
|
||||
chapter_mails += "</mails>\n"
|
||||
|
||||
chapter = "<chapter>\n" + chapter_mails + "</chapter>"
|
||||
|
||||
fout.write(chapter.encode('utf-8'))
|
||||
|
||||
|
||||
# # root = et.ElementTree(chapter)
|
||||
# # root.write(fout, encoding="utf-8", xml_declaration=True)
|
||||
|
||||
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
|
||||
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
||||
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
|
||||
|
||||
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
||||
|
||||
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
||||
|
||||
return True
|
||||
|
||||
|
||||
# def export_single_tag(t, sel, fout):
|
||||
|
||||
# if t not in list(sel.keys()):
|
||||
# logging.error("Tag: " + t + " does not exists.")
|
||||
# return False
|
||||
|
||||
# ch = sel[t]
|
||||
|
||||
# chapter = et.Element('chapter')
|
||||
# chapter_title = et.SubElement(chapter, 'title')
|
||||
# chapter_title.text = t
|
||||
|
||||
# chapter_desc = et.SubElement(chapter, 'desc')
|
||||
# chapter_desc.text = ch['desc']
|
||||
|
||||
# chapter_mails = et.SubElement(chapter, 'mails')
|
||||
|
||||
# for m in ch['lists']:
|
||||
# emit_mail_xml(m, m['list'], chapter_mails)
|
||||
|
||||
|
||||
# # root = et.ElementTree(chapter)
|
||||
# # root.write(fout, encoding="utf-8", xml_declaration=True)
|
||||
|
||||
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
|
||||
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
||||
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
|
||||
|
||||
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
||||
|
||||
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
||||
|
||||
# return True
|
||||
|
||||
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
|
||||
|
||||
with open(sel_dump) as fin:
|
||||
d = json.load(fin)
|
||||
|
||||
with open(xml_dump, 'w') as fout:
|
||||
with open(xml_dump, 'wb') as fout:
|
||||
for k in d.keys():
|
||||
if not export_single_tag(k, d, fout):
|
||||
logging.error("Error exporting: " + k)
|
||||
@ -97,7 +221,7 @@ def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
|
||||
now = datetime.now()
|
||||
xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
|
||||
|
||||
with open(xml_out, 'w') as fout:
|
||||
with open(xml_out, 'wb') as fout:
|
||||
if not export_single_tag(tag, d, fout):
|
||||
logging.error("Error exporting: " + tag)
|
||||
return False
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user