2020-01-12 12:16:10 +01:00

63 lines
1.5 KiB
Python

from datetime import datetime
import regex as re
import email.utils, logging
# Matches any single character that is NOT one of: tab (U+0009), line feed
# (U+000A), carriage return (U+000D), or a code point in U+0020-U+D7FF or
# U+E000-U+FFFD.
# NOTE(review): XML 1.0 also permits U+10000-U+10FFFF; those characters are
# matched (and therefore removed) by this pattern -- confirm this is intended.
xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]')
# Matches three consecutive newline characters.
nl_re = re.compile('\n\n\n')
# Matches one specific, fixed-length run of dashes (a horizontal rule).
ind_re = re.compile('--------------------------------------------------------------------------')
def format_subject(s):
    """Return ``s`` with each run of whitespace collapsed to a single space.

    Leading and trailing whitespace is dropped as well, because the string
    is split on whitespace and the resulting words are re-joined.
    """
    words = s.split()
    return ' '.join(words)
def format_content(c):
    """Return message body ``c`` trimmed and cleaned of known layout noise.

    Steps, in order:
    1. strip leading/trailing whitespace;
    2. replace every (non-overlapping) match of ``nl_re`` -- three
       consecutive newlines -- with a single newline;
    3. delete every match of ``ind_re`` (a fixed run of dashes).
    """
    text = c.strip()

    # Squeeze blank-line padding.
    text = nl_re.sub('\n', text)

    # Oddities found in the archive:
    # 1. A dashed rule that the original author attributed to InDesign's
    #    automatic overset handling (cause not confirmed).
    #    Example message: Sat, 22 Nov 1997 18:23:59 -0500
    #    (The Mattel Crackdown -- Nettime)
    return ind_re.sub('', text)
def parse_date_msg(msg):
    """Return the UTC timestamp of a message's ``date`` field.

    Args:
        msg: mapping-like message object; ``msg['date']`` is read and is
            expected to hold an RFC 2822 style date string.

    Returns:
        Seconds since the Epoch (UTC) as produced by
        ``email.utils.mktime_tz``, or ``None`` when the date cannot be
        converted. Failures are logged as warnings, never raised.
    """
    date_str = msg['date']
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        # parsedate_tz() returns None for input it cannot parse, and
        # mktime_tz(None) then fails with TypeError.
        logging.warning("Format Date TypeError")
    except ValueError:
        logging.warning("Format Date ValueError")
    except Exception as ex:
        # Best effort: any other conversion failure is reported, not raised.
        logging.warning("Format Date %s: %s", type(ex).__name__, ex)
    # Lazy %-formatting so a missing (None) date cannot break the log call.
    logging.warning(" > %s", date_str)
    return None
def index_follow_up(msg):
    """Flatten the reply tree under ``msg`` into ``(timestamp, reply)`` pairs.

    Walks ``msg['follow-up']`` recursively, depth-first: each reply is
    emitted, immediately followed by the pairs for its own follow-ups.
    The timestamp comes from ``parse_date_msg(reply)``; when that yields
    ``None`` the date of ``msg`` (the parent) is used instead, which may
    itself be ``None``.

    Returns an empty list when ``msg`` has no ``'follow-up'`` key.
    """
    if 'follow-up' not in msg:
        return []

    indexed = []
    for reply in msg['follow-up']:
        stamp = parse_date_msg(reply)
        if stamp is None:
            # Fall back to the parent's date.
            stamp = parse_date_msg(msg)
        indexed.append((stamp, reply))
        indexed.extend(index_follow_up(reply))
    return indexed
# See for Nevejan's research
def remove_invalid_xml_characters(s):
    """Return ``s`` with every character matched by ``xml_re`` deleted."""
    # Alternative formulations kept from the original for reference:
    #   re.sub(r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]', '', s)
    #   re.sub(r'\p{Cc}-[\r\n\t]+', '', s)
    cleaned = xml_re.sub('', s)
    return cleaned