2019-07-11 13:21:42 +02:00
|
|
|
import email, datetime, sys
|
|
|
|
|
import hashlib
|
|
|
|
|
import dateparser
|
|
|
|
|
|
|
|
|
|
def format_content(msg):
    """Return the value stored under the ``'content'`` key of *msg*.

    No fallback is applied: a mapping without that key raises whatever
    error its ``[]`` lookup raises.
    """
    content = msg['content']
    return content
|
|
|
|
|
|
|
|
|
|
def format_url(msg):
    """Return the value stored under the ``'url'`` key of *msg*.

    No fallback is applied: a mapping without that key raises whatever
    error its ``[]`` lookup raises.
    """
    url = msg['url']
    return url
|
|
|
|
|
|
|
|
|
|
def format_author(msg):
    """Return a cleaned-up display name for the author of *msg*.

    The raw ``msg['author_name']`` value is reduced in several steps:

    1. double quotes are removed;
    2. for "X (by way of Y)" only the part before "by way of" is kept; when
       nothing precedes it, the sender address from ``format_from(msg)`` is
       used instead;
    3. when the name carries a parenthesised or angle-bracketed part, e.g.
       ``zx {AT} xyz.net (Michel Foucault)``,
       ``Michel Foucault (c'estcommeca.com)`` or
       ``Michel Foucault <zx {AT} xyz.net>``, the real name is extracted;
    4. anything after a `` ,`` is dropped, e.g.
       ``nettime's_roving_reporter , thing.net {AT} bbs.thing.net``.

    Args:
        msg: mapping holding the message fields; ``'author_name'`` is read
            here and ``'from'`` indirectly through ``format_from``.

    Returns:
        The cleaned author string, or None when ``'author_name'`` is missing
        or None, or when the "by way of" fallback finds no sender either.
    """
    # Imported here so the function does not depend on ``email.utils`` having
    # been loaded as a side effect of a bare ``import email`` elsewhere.
    from email.utils import parseaddr

    if 'author_name' not in msg or msg['author_name'] is None:
        return None

    author_str = msg['author_name'].replace('"', '')

    if "by way of" in author_str:
        head = author_str.split("by way of")[0]
        if head == "":
            author_str = format_from(msg)
            if author_str is None:
                # No sender to fall back on; the string tests below would
                # otherwise fail with a TypeError on None.
                return None
        elif head[-1] == "(":
            author_str = head[:-1].strip()
        else:
            author_str = head

    if "(" in author_str or "<" in author_str:
        # Treat the obfuscated spellings "{at}" and " at " like a real "@"
        # when deciding whether the string contains an address.
        normalized = author_str.lower().replace('{at}', '@').replace(' at ', '@')
        if '@' not in normalized:
            # No address involved: the parenthesis holds extra info, keep
            # what precedes it.
            author_str = author_str.split('(')[0].strip()
        else:
            author_str = parseaddr(author_str)[0]

    if " ," in author_str:
        author_str = author_str.split(' ,')[0]

    return author_str
|
|
|
|
|
|
|
|
|
|
def format_from_token(from_str, sep):
    """Extract the address from *from_str* and rewrite *sep* as ``{AT}``.

    The address part found by ``email.utils.parseaddr`` is used when it
    contains the separator. Otherwise the header is split on whitespace and
    the tokens on either side of a standalone *sep* token are glued together
    around ``{AT}``.

    Args:
        from_str: raw "From" header value.
        sep: the spelling of the at-sign used in the header, e.g. ``'{AT}'``,
            ``'at'`` or ``'@'``.

    Returns:
        The address, lower-cased and with all whitespace removed (so the
        marker ends up as ``{at}``), or None when no usable separator token
        is found. In that case a diagnostic is printed.
    """
    # Imported here so the function does not depend on ``email.utils`` having
    # been loaded as a side effect of a bare ``import email`` elsewhere.
    from email.utils import parseaddr

    from_addr = parseaddr(from_str)[1]

    # A word separator such as 'at' may only match as a whole token: a plain
    # substring test would also hit the "at" inside e.g. "nathan" and the
    # replacement would corrupt the address.
    sep_is_word = sep.isalnum()
    if sep_is_word:
        sep_in_addr = sep in from_addr.split()
    else:
        sep_in_addr = sep in from_addr

    if sep_in_addr:
        if sep_is_word:
            from_addr = ' '.join(
                '{AT}' if piece == sep else piece for piece in from_addr.split()
            )
        else:
            from_addr = from_addr.replace(sep, '{AT}')
    else:
        tok = from_str.split()
        try:
            at = tok.index(sep)
        except ValueError:
            at = -1
        # The separator needs a token on both sides; without this check
        # index 0 would wrap around to the last token and a trailing
        # separator would raise IndexError.
        if at < 1 or at + 1 >= len(tok):
            print(tok)
            print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
            return None
        from_addr = ''.join([tok[at - 1], '{AT}', tok[at + 1]])
        if from_addr.startswith('<') or from_addr.endswith('>'):
            from_addr = from_addr.strip('<').strip('>')

    return "".join(from_addr.lower().split())
|
|
|
|
|
|
|
|
|
|
def format_from(msg):
    """Return the normalised sender of *msg*, or None when there is none.

    The ``'from'`` value is inspected for one of the at-sign spellings
    `` {AT} ``, `` at `` or ``@`` (checked in that order) and handed to
    ``format_from_token`` together with the matching separator. When none of
    them occurs, the value is returned with all whitespace removed.
    """
    if 'from' not in msg:
        return None

    sender = msg['from']
    if sender is None:
        return None

    # (text looked for in the header, separator passed to format_from_token)
    spellings = (
        (" {AT} ", '{AT}'),
        (" at ", 'at'),
        ("@", '@'),
    )
    for marker, sep in spellings:
        if marker in sender:
            return format_from_token(sender, sep)

    return "".join(sender.split())
|
|
|
|
|
|
|
|
|
|
def format_to(msg):
    """Return the recipient address of *msg*, lower-cased and without spaces.

    Args:
        msg: mapping holding the message fields; only ``'to'`` is read.

    Returns:
        ``"n/a"`` when ``'to'`` is missing or None. Otherwise the address
        part found by ``email.utils.parseaddr``, or the raw header value
        when no address can be parsed from it, lower-cased and with all
        whitespace removed.
    """
    # Imported here so the function does not depend on ``email.utils`` having
    # been loaded as a side effect of a bare ``import email`` elsewhere.
    from email.utils import parseaddr

    if "to" not in msg or msg["to"] is None:
        return "n/a"

    to_str = msg["to"]

    # parseaddr always returns a 2-tuple and signals failure with empty
    # strings, so test the address itself rather than the tuple length;
    # otherwise the raw header is never used as a fallback.
    addr = parseaddr(to_str)[1]
    if addr:
        to_str = addr

    return "".join(to_str.lower().split())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Older implementation: returns a UTC timestamp.
|
|
|
|
|
def format_date_utc(msg, archive_name):
    """Return the date of *msg* as a UTC timestamp, or None.

    Args:
        msg: mapping holding the message fields; only ``'date'`` is read.
        archive_name: unused; kept so the signature stays unchanged.

    Returns:
        The timestamp computed by ``email.utils.mktime_tz``, or None when
        ``'date'`` is missing or None, or the value cannot be converted.
        In the latter case a diagnostic is printed.
    """
    # Imported here so the function does not depend on ``email.utils`` having
    # been loaded as a side effect of a bare ``import email`` elsewhere.
    from email.utils import mktime_tz, parsedate_tz

    if 'date' not in msg or msg['date'] is None:
        return None

    # Dots are removed from the header before it is parsed.
    date_str = msg['date'].replace('.', '')

    # No ``return`` inside ``finally`` here: that form silently discards
    # every exception, including ones unrelated to date parsing.
    try:
        date_tz = parsedate_tz(date_str)
        # parsedate_tz yields None for unparseable input, which mktime_tz
        # rejects with a TypeError handled below.
        return mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        print("Format Date TypeError")
        print(" > " + date_str)
    except ValueError:
        print("Format Date ValueError")
        print(" > " + date_str)
    except OverflowError:
        # Date fields outside the range the platform can represent.
        print("Format Date OverflowError")
        print(" > " + date_str)
    return None
|
|
|
|
|
|
|
|
|
|
def format_date(msg, archive_name):
    """Parse the date header of *msg* into a ``datetime``, or return None.

    ``msg['date']`` is handed to ``dateparser.parse``. When that fails, a few
    known malformations are repaired and the string is parsed again:

    * a ``+0000`` / ``-0000`` / ``0000`` offset is dropped;
    * an offset whose second-to-last digit is ``5`` (``+1050``) gets ``3``
      there instead;
    * an offset that does not end in ``0`` (``-0005``) is dropped;
    * ``Fru`` / ``Thur`` in the first token become ``Fri`` / ``Thu``;
    * if that is still not parseable, everything from ``GMT`` on is dropped
      (``GMT+0100``) and ``METDST`` is replaced by ``MET``.

    NOTE(review): the indentation of the original was lost, so the nesting
    of the offset repairs is reconstructed; in particular the ``-0005``
    repair is assumed to apply only to 4- or 5-character last tokens.
    Confirm against the upstream file.

    Args:
        msg: mapping holding the message fields; only ``'date'`` is read.
        archive_name: name passed to ``min_date`` to obtain the earliest
            acceptable date for the archive.

    Returns:
        The parsed ``datetime`` (as returned by ``dateparser``), or None when
        ``'date'`` is missing, None, empty or unparseable, or when the
        result is earlier than the archive's minimum date or later than the
        current local time.
    """
    if 'date' not in msg or msg['date'] is None:
        return None

    date_str = msg['date']

    # fix Thu, 01 Aug 2002 17:33:08 +0900 (JST)
    if '(' in date_str:
        date_str = date_str.split('(')[0].rstrip()

    date_time = dateparser.parse(date_str)

    if date_time is None:
        # The repairs below edit the end of the string by position, so make
        # sure it really ends with the last token.
        date_str = date_str.strip()
        toks = date_str.split()
        if not toks:
            # Empty or whitespace-only header: nothing to repair.
            return None

        first, last = toks[0], toks[-1]
        fix = False

        if len(last) in (4, 5):
            # ex. Thu, 24 Jan 2002 15:21:31 -0000
            if last in ('+0000', '-0000', '0000'):
                date_str = date_str[:-len(last)].rstrip()
                fix = True
            # ex. Fri, 25 Jan 2002 13:21:49 +1050
            elif last[-2] == '5':
                date_str = date_str[:-2] + '3' + date_str[-1]
                fix = True

            # ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
            if last[-1] != '0':
                date_str = date_str[:-len(last)].rstrip()
                fix = True

        if 'Fru' in first:
            date_str = date_str.replace('Fru', 'Fri')
            fix = True
        elif 'Thur' in first:
            date_str = date_str.replace('Thur', 'Thu')
            fix = True

        if fix:
            date_time = dateparser.parse(date_str)

        if date_time is None:
            # Second round of repairs. It is also tried when none of the
            # repairs above applied; otherwise the two examples below could
            # never reach it.
            fix = False

            if 'GMT' in date_str:
                # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
                date_str = date_str.split('GMT')[0].rstrip()
                fix = True

            if 'METDST' in last:
                # ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
                date_str = date_str.replace('METDST', 'MET')
                fix = True

            if not fix:
                return None

            date_time = dateparser.parse(date_str)
            if date_time is None:
                return None

    # Range check, applied to every successfully parsed date. The comparison
    # is made on naive values: the offset is dropped, not converted.
    date_time_naive = date_time.replace(tzinfo=None)

    # min_date returns None for an archive it does not know; in that case
    # there is no lower bound instead of a TypeError from strptime.
    earliest = min_date(archive_name)
    if earliest is not None:
        min_d = datetime.datetime.strptime(earliest, "%d/%m/%Y")
        if date_time_naive < min_d:
            return None

    if date_time_naive > datetime.datetime.now():
        return None

    return date_time
|
|
|
|
|
|
|
|
|
|
def format_subject(msg, archive_name):
    """Return ``msg['subject']``, or None when it is missing or None.

    *archive_name* is accepted but not used.
    """
    has_subject = 'subject' in msg and msg['subject'] is not None
    return msg['subject'] if has_subject else None
|
|
|
|
|
|
|
|
|
|
def format_id(msg, archive_name):
    """Return an identifier for *msg*.

    The ``'message-id'`` value is used when present. Otherwise the id is the
    SHA-1 hex digest of ``author_name + date`` (UTF-8 encoded).

    Args:
        msg: mapping holding the message fields; ``'message-id'``,
            ``'author_name'`` and ``'date'`` are read.
        archive_name: unused; kept so the signature stays unchanged.

    Returns:
        The message id or the 40-character hex digest. A missing or None
        ``'author_name'`` / ``'date'`` counts as an empty string; when both
        are unavailable there is nothing to derive an id from and None is
        returned.
    """
    def field(key):
        # Value of *key*, or None when the key is absent or holds None.
        if key in msg and msg[key] is not None:
            return msg[key]
        return None

    message_id = field('message-id')
    if message_id is not None:
        return message_id

    # create hash with author_name + date
    author = field('author_name')
    date = field('date')
    if author is None and date is None:
        return None

    s = (author or '') + (date or '')
    return hashlib.sha1(s.encode('utf-8')).hexdigest()
|
|
|
|
|
|
|
|
|
|
# Dates below are written as day/month/year, i.e. strptime format '%d/%m/%Y'.
|
|
|
|
|
def min_date(archive_name):
    """Return the earliest date of an archive as a ``dd/mm/YYYY`` string.

    Any name containing ``"nettime"`` matches the nettime entry; the other
    archives must match by exact name. None is returned for a name that
    matches nothing.
    """
    if "nettime" in archive_name:
        return '01/10/1995'

    exact_names = (
        ("spectre", '01/08/2001'),
        ("empyre", '01/01/2002'),
        ("crumb", '01/02/2001'),
    )
    for name, first_day in exact_names:
        if archive_name == name:
            return first_day

    return None
|