"""Helpers for cleaning up archived mail messages.

Provides subject/content normalisation, parsing of a message's ``date``
field into a UTC timestamp, flattening of nested ``follow-up`` threads and
removal of characters that are not allowed in XML 1.0 documents.
"""

from datetime import datetime
import email.utils
import logging

try:
    import regex as re
except ImportError:
    # `regex` is a third-party package; every pattern used in this module
    # is also valid for the standard-library `re` module.
    import re

# Matches every character that is NOT allowed by the XML 1.0 `Char`
# production: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
# | [#x10000-#x10FFFF].
xml_re = re.compile(
    r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]'
)

# Three consecutive newlines.
nl_re = re.compile('\n\n\n')

# Long dash rule found in some message bodies (see format_content).
ind_re = re.compile('--------------------------------------------------------------------------')


def format_subject(s):
    """Return *s* with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed as well.
    """
    return ' '.join(s.split())


def format_content(c):
    """Return the message body *c* tidied up.

    The text is stripped, each run of three newlines is replaced by a
    single newline (one non-overlapping pass, so four newlines become two)
    and the long dash rule matched by ``ind_re`` is removed.
    """
    c = c.strip()

    # new lines
    c = nl_re.sub('\n', c)

    # weird stuff
    # 1. indesign automatic overset... (? dunno why ?)
    #    ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
    c = ind_re.sub('', c)

    return c


def parse_date_msg(msg):
    """Return the UTC timestamp of ``msg['date']``, or ``None``.

    ``msg['date']`` is parsed as an RFC 2822 date string.  When it is
    missing a usable value or cannot be parsed, a warning is logged and
    ``None`` is returned instead of raising.

    Raises whatever ``msg['date']`` itself raises (e.g. ``KeyError`` for a
    plain dict without a ``'date'`` key); the lookup is deliberately left
    outside the ``try`` block.
    """
    date_str = msg['date']
    try:
        # parsedate_tz returns None for unparsable input, which makes
        # mktime_tz raise TypeError -- handled below.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        logging.warning("Format Date TypeError")
        # %-style args: date_str may be None, so never concatenate it.
        logging.warning(" > %s", date_str)
    except ValueError:
        logging.warning("Format Date ValueError")
        logging.warning(" > %s", date_str)
    except Exception as ex:
        # Best effort: an unparsable date must not abort the caller.
        logging.warning("Format Date %s: %s", type(ex).__name__, ex)
    return None


def index_follow_up(msg):
    """Return a flat list of ``(timestamp, message)`` for all follow-ups.

    Walks ``msg['follow-up']`` recursively, depth first: each follow-up is
    immediately followed by its own follow-ups.  A follow-up whose date
    cannot be parsed gets the timestamp of its direct parent *msg*.
    Messages without a ``'follow-up'`` key yield an empty list.
    """
    indexed = []
    if 'follow-up' not in msg:
        return indexed

    for child in msg['follow-up']:
        stamp = parse_date_msg(child)
        if stamp is None:
            stamp = parse_date_msg(msg)  # same as parent
        indexed.append((stamp, child))
        indexed.extend(index_follow_up(child))
    return indexed


# NOTE(review): the original comment here read "See for Nevejan?s research"
# (mis-encoded apostrophe) -- presumably a message whose text contains such
# a character; confirm against the archive.
def remove_invalid_xml_characters(s):
    """Return *s* without the characters that XML 1.0 does not allow.

    Characters outside the XML 1.0 ``Char`` production (most C0 control
    characters, surrogates, U+FFFE and U+FFFF) are dropped; everything
    else, including supplementary-plane characters, is kept.
    """
    # An alternative based on the Unicode category \p{Cc} was considered;
    # it would require the third-party `regex` module.
    return xml_re.sub('', s)