2019-12-26 18:12:49 +01:00
|
|
|
from datetime import datetime
|
|
|
|
|
import regex as re
|
|
|
|
|
import email.utils, logging
|
|
|
|
|
|
|
|
|
|
# Matches any single character that is NOT allowed by the XML 1.0 `Char`
# production: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
# [#x10000-#x10FFFF].  The supplementary-plane range is included so that
# valid characters above U+FFFF (emoji, rare CJK, ...) are not stripped.
xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')

# Matches exactly three consecutive newlines (used to compact blank lines).
nl_re = re.compile('\n\n\n')

# Matches the long dash ruler that shows up in some message bodies.
ind_re = re.compile('--------------------------------------------------------------------------')
|
|
|
|
|
|
|
|
|
|
def format_subject(s):
    """Collapse every run of whitespace in *s* into a single space.

    Leading and trailing whitespace is dropped as a side effect of
    splitting on whitespace.
    """
    words = s.split()
    return ' '.join(words)
|
|
|
|
|
|
|
|
|
|
def format_content(c):
    """Clean up a message body and return the result.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. replace each match of ``nl_re`` with a single newline;
      3. delete each match of ``ind_re``.
    """
    stripped = c.strip()

    # new lines
    compacted = nl_re.sub('\n', stripped)

    ## weird stuff
    # 1. dash ruler, noted by the original author as an "indesign automatic
    #    overset" artefact (origin unclear)
    # ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
    return ind_re.sub('', compacted)
|
|
|
|
|
|
|
|
|
|
# returns utc timestamp
|
|
|
|
|
def parse_date_msg(msg):
    """Return the ``date`` field of *msg* as a UTC POSIX timestamp.

    Args:
        msg: any object supporting ``msg['date']`` that yields an
            RFC 2822 style date string (or ``None``).

    Returns:
        The timestamp computed by ``email.utils.mktime_tz``, or ``None``
        when the date is missing or cannot be parsed.  Parse failures are
        logged as warnings and never propagate to the caller.
    """
    date_str = msg['date']
    try:
        # parsedate_tz() returns None for unparseable input, which makes
        # mktime_tz() raise TypeError -- handled below.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except (TypeError, ValueError) as err:
        logging.warning("Format Date %s", type(err).__name__)
        # %s formatting keeps this safe when date_str is None.
        logging.warning(" > %s", date_str)
    except Exception as err:
        # Best-effort parser: keep swallowing unexpected errors, but leave
        # a trace in the log instead of printing to stdout.
        logging.warning("Format Date unexpected %s: %s", type(err).__name__, err)
        logging.warning(" > %s", date_str)
    return None
|
|
|
|
|
|
|
|
|
|
# recursive
|
|
|
|
|
def index_follow_up(msg):
    """Flatten the follow-up tree below *msg* into ``(timestamp, message)`` pairs.

    Each entry of ``msg['follow-up']`` is visited depth-first: the entry
    itself is emitted, then its own follow-ups.  When an entry's date
    cannot be parsed, the parent's parsed date is used instead.

    Returns an empty list when *msg* has no ``'follow-up'`` key.
    """
    if 'follow-up' not in msg:
        return []

    collected = []
    for child in msg['follow-up']:
        stamp = parse_date_msg(child)
        if stamp is None:
            stamp = parse_date_msg(msg)  # same as parent
        collected.append((stamp, child))
        collected.extend(index_follow_up(child))
    return collected
|
|
|
|
|
|
|
|
|
|
# See for Nevejan's research
|
|
|
|
|
def remove_invalid_xml_characters(s):
    """Return *s* with every character matched by ``xml_re`` deleted."""
    cleaned = xml_re.sub('', s)
    return cleaned
|
|
|
|
|
# return re.sub(r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]', '', s)
|
|
|
|
|
# return re.sub(r'\p{Cc}-[\r\n\t]+', '', s)
|