"""Helpers that normalise individual fields of archived mailing-list messages.

Every ``format_*`` function takes ``msg``, a mapping-like object that supports
``in`` and ``[]`` lookups by field name, and returns a cleaned-up value for
one field.
"""

import datetime
import email
import email.utils  # `import email` alone does not load the `utils` submodule
import hashlib
import sys

try:
    import dateparser
except ImportError:
    # Only format_date() needs dateparser; the remaining helpers are
    # stdlib-only and stay importable without it.
    dateparser = None

# Placeholder written in place of the address separator.
_AT = '{AT}'

# Earliest accepted message date per archive, formatted as '%d/%m/%Y'.
_MIN_DATES = {
    "nettime_l": '01/10/1995',
    "spectre": '01/08/2001',
    "empyre": '01/01/2002',
    "crumb": '01/02/2001',
    "oldboys": '01/03/2001',
    "nettime_bold": '01/01/2000',
    "syndicate": '01/01/1996',
}


def _field(msg, key):
    """Return ``msg[key]``, or None when the key is absent."""
    return msg[key] if key in msg else None


def format_content(msg):
    """Return the 'content' field of *msg* unchanged."""
    return msg['content']


def format_url(msg):
    """Return the 'url' field of *msg* unchanged."""
    return msg['url']


def format_author(msg):
    """Return a cleaned-up author name taken from ``msg['author_name']``.

    Double quotes are removed, a "by way of ..." suffix is dropped, and a
    parenthesised or angle-bracketed part is reduced to the display name.
    Returns None when the field is missing or None, or when the name has to
    be derived from the 'from' field and that is unavailable.
    """
    if 'author_name' not in msg or msg['author_name'] is None:
        return None

    author_str = msg['author_name'].replace('"', '')

    if "by way of" in author_str:
        head = author_str.split("by way of")[0]
        if head == "":
            # Nothing precedes "by way of": fall back to the sender address.
            author_str = format_from(msg)
            if author_str is None:
                return None
        elif head[-1] == "(":
            author_str = head[:-1].strip()
        else:
            author_str = head

    if "(" in author_str or "<" in author_str:
        # ex. zx {AT} xyz.net (Michel Foucault)
        #  OR Michel Foucault (c'estcommeca.com)
        #  OR Michel Foucault <address>
        normalised = author_str.lower().replace('{at}', '@').replace(' at ', '@')
        if '@' not in normalised:
            author_str = author_str.split('(')[0].strip()
        else:
            author_str = email.utils.parseaddr(author_str)[0]

    if " ," in author_str:
        # ex. nettime's_roving_reporter , thing.net {AT} bbs.thing.net
        author_str = author_str.split(' ,')[0]

    return author_str


def format_from_token(from_str, sep):
    """Return the address in *from_str* with *sep* replaced by '{AT}'.

    The result is lower-cased (so the placeholder itself comes out as
    '{at}') and all whitespace is removed.  Prints a diagnostic and returns
    None when *sep* cannot be located between two tokens of *from_str*.
    """
    from_addr = email.utils.parseaddr(from_str)[1]

    if sep.isalpha():
        # A word separator such as 'at' must match a whole token; a substring
        # match would also rewrite the 'at' inside e.g. 'natalie'.
        addr_tokens = from_addr.split()
        found = sep in addr_tokens
        if found:
            from_addr = ''.join(_AT if t == sep else t for t in addr_tokens)
    else:
        found = sep in from_addr
        if found:
            from_addr = from_addr.replace(sep, _AT)

    if not found:
        # The parsed address is unusable: rebuild it from the raw tokens
        # on either side of the separator.
        tok = from_str.split()
        try:
            at = tok.index(sep)
        except ValueError:
            at = -1
        if at < 1 or at + 1 >= len(tok):
            print(tok)
            print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
            return None
        from_addr = ''.join([tok[at - 1], _AT, tok[at + 1]])
        if from_addr.startswith('<') or from_addr.endswith('>'):
            from_addr = from_addr.strip('<').strip('>')

    return "".join(from_addr.lower().split())


def format_from(msg):
    """Return the normalised sender address from ``msg['from']``.

    Recognises ' {AT} ', ' at ' and '@' as separators (checked in that
    order).  Without any of them the field is returned with whitespace
    removed.  Returns None when the field is missing or None.
    """
    if 'from' not in msg or msg['from'] is None:
        return None

    from_str = msg['from']

    if " {AT} " in from_str:
        return format_from_token(from_str, '{AT}')
    if " at " in from_str:
        return format_from_token(from_str, 'at')
    if "@" in from_str:
        return format_from_token(from_str, '@')
    return "".join(from_str.split())


def format_to(msg):
    """Return the lower-cased, whitespace-free address from ``msg['to']``.

    Returns the string "n/a" when the field is missing or None.
    """
    if "to" not in msg or msg["to"] is None:
        return "n/a"

    # parseaddr always yields a (realname, address) pair; keep the address.
    to_addr = email.utils.parseaddr(msg["to"])[1]
    return "".join(to_addr.lower().split())


# returns utc timestamp --- old...
def format_date_utc(msg, archive_name):
    """Return ``msg['date']`` as a UTC timestamp, or None if unusable.

    Dots are removed from the date string before parsing.  *archive_name*
    is accepted but not used.  A diagnostic is printed when parsing fails.
    """
    if 'date' not in msg or msg['date'] is None:
        return None

    date_str = msg['date'].replace('.', '')

    try:
        # parsedate_tz returns None for unparseable input, which makes
        # mktime_tz raise TypeError.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except (TypeError, ValueError, OverflowError) as err:
        print("Format Date " + type(err).__name__)
        print(" > " + date_str)
        return None


def _parse_date(date_str):
    """Parse *date_str* with dateparser; return a datetime or None."""
    if dateparser is None:
        raise ImportError("dateparser is required to parse message dates")
    return dateparser.parse(date_str)


def _fix_offset_and_weekday(date_str, toks):
    """Repair a bogus trailing UTC offset and misspelt weekday names.

    *toks* is the whitespace-split form of *date_str* (non-empty).
    Returns ``(date_str, fixed)`` where *fixed* tells whether anything
    changed.
    """
    # NOTE(review): the nesting of these checks was reconstructed from a
    # whitespace-flattened original -- confirm against upstream history.
    fixed = False
    last = toks[-1]

    if len(last) in (4, 5):
        if last in ('+0000', '-0000', '0000'):
            # ex. Thu, 24 Jan 2002 15:21:31 -0000
            date_str = date_str[:-5]
            fixed = True
        elif last[-2] == '5':
            # ex. Fri, 25 Jan 2002 13:21:49 +1050
            chars = list(date_str)
            chars[-2] = '3'
            date_str = "".join(chars)
            fixed = True

        if last[-1] != '0':
            # ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
            date_str = date_str[:-5]
            fixed = True

    if 'Fru' in toks[0]:
        date_str = date_str.replace('Fru', 'Fri')
        fixed = True
    elif 'Thur' in toks[0]:
        date_str = date_str.replace('Thur', 'Thu')
        fixed = True

    return date_str, fixed


def _fix_zone_names(date_str, toks):
    """Drop a 'GMT...' suffix and rewrite 'METDST' as 'MET'.

    Returns ``(date_str, fixed)`` like _fix_offset_and_weekday.
    """
    fixed = False

    if 'GMT' in date_str:
        # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
        date_str = date_str.split('GMT')[0].rstrip()
        fixed = True

    if 'METDST' in toks[-1]:
        # ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
        date_str = date_str.replace('METDST', 'MET')
        fixed = True

    return date_str, fixed


def _in_archive_range(date_time, archive_name):
    """Tell whether *date_time* lies between the archive's start and now."""
    naive = date_time.replace(tzinfo=None)
    if naive > datetime.datetime.now():
        return False

    earliest = min_date(archive_name)
    if earliest is None:
        # Unknown archive: no lower bound to enforce.
        return True
    return naive >= datetime.datetime.strptime(earliest, "%d/%m/%Y")


def format_date(msg, archive_name):
    """Return ``msg['date']`` parsed into a datetime, or None.

    A trailing parenthesised part is dropped before parsing.  When the
    string does not parse as-is, a series of fix-ups for known malformed
    dates is applied and parsing is retried.  None is returned when the
    field is missing, None or blank, when it cannot be parsed, or when the
    parsed date is in the future or earlier than ``min_date(archive_name)``.

    Raises ImportError when a date has to be parsed and dateparser is not
    installed.
    """
    if 'date' not in msg or msg['date'] is None:
        return None

    date_str = msg['date']

    # fix Thu, 01 Aug 2002 17:33:08 +0900 (JST)
    if '(' in date_str:
        date_str = date_str.split('(')[0].rstrip()

    toks = date_str.split()
    if not toks:
        # Nothing to parse; the fix-ups below index into the tokens.
        return None

    date_time = _parse_date(date_str)

    if date_time is None:
        date_str, fixed = _fix_offset_and_weekday(date_str, toks)
        if fixed:
            date_time = _parse_date(date_str)

    if date_time is None:
        # Zone-name fix-ups are tried even if the first stage changed
        # nothing, so that their own example inputs are reachable.
        date_str, fixed = _fix_zone_names(date_str, toks)
        if not fixed:
            return None
        date_time = _parse_date(date_str)
        if date_time is None:
            return None

    if not _in_archive_range(date_time, archive_name):
        return None
    return date_time


def format_subject(msg, archive_name):
    """Return ``msg['subject']``, or None when it is missing.

    *archive_name* is accepted but not used.
    """
    if 'subject' not in msg or msg['subject'] is None:
        return None
    return msg['subject']


def format_id(msg, archive_name):
    """Return an identifier for *msg*.

    Uses ``msg['message-id']`` when present and not None; otherwise the
    SHA-1 hex digest of author name + date.  Returns None when neither an
    author name nor a date is available to hash.  *archive_name* is
    accepted but not used.
    """
    message_id = _field(msg, 'message-id')
    if message_id is not None:
        return message_id

    # create hash with author_name + date
    author = _field(msg, 'author_name') or ''
    date = _field(msg, 'date') or ''
    if not author and not date:
        return None
    return hashlib.sha1((author + date).encode('utf-8')).hexdigest()


# format='%d/%m/%Y'
def min_date(archive_name):
    """Return the earliest accepted date for *archive_name* as 'dd/mm/YYYY'.

    Returns None for an archive name that is not listed.
    """
    return _MIN_DATES.get(archive_name)