93 lines
2.1 KiB
Python
93 lines
2.1 KiB
Python
import email
import email.utils
import hashlib
|
|
|
|
def format_content(msg, archive_name):
    """Return the ``content`` entry of *msg*.

    ``archive_name`` is accepted but not used by this function.
    """
    content = msg['content']
    return content
|
|
|
|
def format_url(msg, archive_name):
    """Return the ``url`` entry of *msg*.

    ``archive_name`` is accepted but not used by this function.
    """
    url = msg['url']
    return url
|
|
|
|
def format_author(msg, archive_name):
    """Return the ``author_name`` entry of *msg*.

    ``archive_name`` is accepted but not used by this function.
    """
    author = msg['author_name']
    return author
|
|
|
|
def format_from_token(from_str, sep):
    """Normalise the address found in *from_str* to ``local{at}domain``.

    *sep* is the marker that separates the local part from the domain in
    *from_str* (for example ``'@'``, ``'at'`` or ``'{AT}'``).

    Two forms are recognised, in this order:

    1. *sep* appears as a standalone, whitespace-delimited token with a
       token on each side (``"kate at foo.org (Kate)"``).  The two
       neighbouring tokens are joined around ``{AT}`` and surrounding
       angle brackets are stripped.
    2. The address extracted by ``email.utils.parseaddr`` contains *sep*
       (``"Kate <kate@foo.org>"``); every occurrence of *sep* in that
       address is replaced by ``{AT}``.

    Returns the resulting address lowercased (so the marker itself comes
    out as ``{at}``), or ``None`` after printing a diagnostic when neither
    form matches.
    """
    tokens = from_str.split()

    # Standalone-token form first: a plain substring test on the parsed
    # address would also match letters inside a name ("nathan", "kate")
    # when the separator is a word such as 'at'.
    # Walking (left, middle, right) triples guarantees both neighbours
    # exist, so a leading or trailing separator can neither wrap around
    # to the other end of the list nor raise IndexError.
    for left, middle, right in zip(tokens, tokens[1:], tokens[2:]):
        if middle == sep:
            from_addr = ''.join([left, '{AT}', right])
            if from_addr.startswith('<') or from_addr.endswith('>'):
                from_addr = from_addr.strip('<').strip('>')
            return from_addr.lower()

    # Otherwise rely on the RFC-style parser and swap the separator.
    from_addr = email.utils.parseaddr(from_str)[1]
    if sep in from_addr:
        return from_addr.replace(sep, '{AT}').lower()

    print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
    return None
|
|
|
|
def format_from(msg, archive_name):
    """Return the ``from`` entry of *msg* with its address separator normalised.

    The raw value is checked, in order, for ``" {AT} "``, ``" at "`` and
    ``"@"``; the first marker found decides which separator is handed to
    ``format_from_token``.  When none of them occurs the raw value is
    returned unchanged.  ``archive_name`` is accepted but not used.
    """
    from_str = msg['from']

    # (substring to look for, separator passed on) -- order matters.
    markers = (
        (" {AT} ", '{AT}'),
        (" at ", 'at'),
        ("@", '@'),
    )
    for needle, sep in markers:
        if needle in from_str:
            return format_from_token(from_str, sep)

    return from_str
|
|
|
|
# returns utc timestamp
|
|
def format_date(msg, archive_name):
    """Convert the ``date`` entry of *msg* to a UTC timestamp.

    The value is parsed with ``email.utils.parsedate_tz`` and converted
    with ``email.utils.mktime_tz``.  ``archive_name`` is accepted but not
    used.

    Returns the timestamp, or ``None`` (after printing a short diagnostic)
    when the value cannot be parsed or converted.
    """
    date_str = msg['date']
    try:
        # parsedate_tz() yields None for input it cannot parse; mktime_tz()
        # then raises TypeError, which is reported below.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except (TypeError, ValueError, OverflowError, AttributeError) as err:
        # Explicit handler instead of a `return` inside `finally`, which
        # silently discarded every exception, including ones raised while
        # reporting the failure (e.g. when the date value is not a str).
        print("Format Date " + type(err).__name__)
        print(" > " + str(date_str))
        return None
|
|
|
|
def format_subject(msg, archive_name):
    """Return the ``subject`` entry of *msg*.

    ``archive_name`` is accepted but not used by this function.
    """
    subject = msg['subject']
    return subject
|
|
|
|
def format_id(msg, archive_name):
    """Return an identifier for *msg*.

    The ``message-id`` entry is returned when *msg* has one.  Otherwise
    the identifier is the SHA-1 hex digest of the ``author_name`` and
    ``date`` entries concatenated and encoded as UTF-8.
    ``archive_name`` is accepted but not used.
    """
    if "message-id" not in msg:
        # No message id: derive one by hashing author_name + date.
        seed = msg['author_name'] + msg['date']
        return hashlib.sha1(seed.encode('utf-8')).hexdigest()

    return msg['message-id']
|
|
|
|
# format='%d/%m/%Y'
|
|
def min_date(archive_name):
    """Return the date string associated with *archive_name*.

    Any name containing ``"nettime"`` maps to ``'01/10/1995'``; the names
    ``"spectre"``, ``"empyre"`` and ``"crumb"`` must match exactly.
    Returns ``None`` for any other name.
    """
    if "nettime" in archive_name:
        return '01/10/1995'

    # Exact-match archives and their dates.
    exact_matches = (
        ("spectre", '01/08/2001'),
        ("empyre", '01/01/2002'),
        ("crumb", '01/02/2001'),
    )
    for name, date in exact_matches:
        if archive_name == name:
            return date

    return None
|