nettime/nettime/archive.py
2016-07-22 10:17:44 +02:00

159 lines
4.1 KiB
Python

import numpy as np
import pandas as pd
import email, email.parser
import os, datetime, json, gzip, re
from random import randint
import query
def format_from(from_str):
    """Extract a lower-cased sender address from a raw ``From`` header.

    Addresses are expected to carry the literal marker ``{AT}``
    (presumably the archive's stand-in for ``@`` -- confirm against the
    scraper). The header is first parsed with ``email.utils.parseaddr``;
    if the parsed address does not contain the marker, the header is
    split on whitespace and the tokens immediately around a stand-alone
    ``{AT}`` token are glued together.

    Args:
        from_str: raw header text; may be None or empty.

    Returns:
        The address lower-cased as a whole (so the marker comes back as
        ``{at}``), or None when the header is missing or no usable
        ``{AT}`` token is found.
    """
    if not from_str:
        # Missing/empty header: the original crashed on None in .split().
        return None

    from_addr = email.utils.parseaddr(from_str)[1]
    if '{AT}' not in from_addr:
        # Fallback: "user {AT} host" -> "user{AT}host".
        tok = from_str.split()
        try:
            at = tok.index('{AT}')
        except ValueError:
            return None
        if at == 0:
            # Nothing precedes the marker; tok[at-1:at+2] would use a
            # negative start index and pick up unrelated tokens.
            return None
        from_addr = ''.join(tok[at - 1:at + 2])
        if from_addr.startswith('<') or from_addr.endswith('>'):
            from_addr = from_addr.strip('<').strip('>')
    return from_addr.lower()
def _report_bad_date(error_name, date_str):
    """Print the two-line diagnostic for a date header that failed to parse."""
    print("Format Date " + error_name)
    # %-formatting rather than concatenation: the header may be None, and
    # concatenating would raise a second TypeError inside the error path.
    print(" > %s" % (date_str,))


def format_date(date_str):
    """Convert an RFC 2822 style date header into a pandas timestamp.

    Args:
        date_str: raw ``Date`` header text; may be None or empty.

    Returns:
        The value of ``pd.to_datetime`` applied to the parsed date, or
        None when the header is missing, cannot be parsed, or falls
        outside the representable range. A diagnostic is printed in every
        None case.
    """
    if not date_str:
        # Same outcome the parser gives for empty input, reached without
        # depending on how parsedate_tz reacts to None.
        _report_bad_date("TypeError", date_str)
        return None

    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        # parsedate_tz returns None for unparseable text, which mktime_tz
        # cannot index.
        _report_bad_date("TypeError", date_str)
        return None
    except ValueError:
        _report_bad_date("ValueError", date_str)
        return None

    try:
        # NOTE(review): fromtimestamp() yields a naive *local-time* datetime,
        # so results depend on the machine's timezone. Kept as-is because the
        # caller's date-range filter compares against datetime.now().
        dt = datetime.datetime.fromtimestamp(time_tz)
    except (ValueError, OverflowError, OSError):
        # Timestamp outside what the platform can convert.
        print('time out of bound')
        print(time_tz)
        return None

    try:
        return pd.to_datetime(dt)
    except ValueError:
        # pandas' OutOfBoundsDatetime derives from ValueError; catching the
        # base class avoids depending on where pandas exposes the subclass.
        print('time out of bound')
        print(dt)
        return None
# Lower bound of the accepted date range (1 October 1995); computed once
# instead of once per message.
_NETTIME_MIN_DATE = pd.to_datetime('01/10/1995', format='%d/%m/%Y')


def message_to_tuple_record(msg, records, references=None):
    """Append one tuple for ``msg`` and, recursively, for its follow-ups.

    Each appended tuple is
    ``(message-id, from, author_name, subject, date, url,
    len(content), number of follow-ups, references)``.

    A message is skipped when its date cannot be parsed, when the date is
    before 1 October 1995 or later than the current time, or when no
    sender address can be extracted.
    NOTE(review): a skipped message also drops all of its follow-ups,
    because the recursion happens only after the message is accepted --
    confirm this is intended.

    Args:
        msg: mapping with keys 'date', 'from', 'message-id',
            'author_name', 'subject', 'url', 'content' and optionally
            'follow-up' (an iterable of mappings of the same shape).
        records: list that accepted tuples are appended to (mutated in
            place).
        references: message-id of the parent message, or None for a
            top-level message.

    Returns:
        None; results are delivered through ``records``.
    """
    date_time = format_date(msg['date'])
    if date_time is None:
        return

    # Reject dates outside the plausible archive range.
    nettime_max_date = pd.to_datetime(datetime.datetime.now())
    if date_time < _NETTIME_MIN_DATE or date_time > nettime_max_date:
        return

    from_addr = format_from(msg['from'])
    if not from_addr:
        return

    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    has_follow_ups = 'follow-up' in msg

    records.append((msg['message-id'],
                    from_addr,
                    msg['author_name'],
                    msg['subject'],
                    date_time,
                    msg['url'],
                    len(msg['content']),
                    len(msg['follow-up']) if has_follow_ups else 0,
                    references))

    if has_follow_ups:
        for f in msg['follow-up']:
            message_to_tuple_record(f, records, references=msg['message-id'])
def json_data_to_pd_dataframe(json_data):
    """Build a date-indexed dataframe from archive JSON.

    Args:
        json_data: iterable of mappings, each holding its top-level
            messages under the key 'threads'.

    Returns:
        A ``pd.DataFrame`` indexed by 'date' with one row per tuple that
        ``message_to_tuple_record`` produced.
    """
    column_names = ['message-id',
                    'from',
                    'author',
                    'subject',
                    'date',
                    'url',
                    'content-length',
                    'nbr-references',
                    'references']

    rows = []
    top_level_messages = (message
                          for entry in json_data
                          for message in entry['threads'])
    for message in top_level_messages:
        # Appends the message and, recursively, its follow-ups to rows.
        message_to_tuple_record(message, rows)

    frame = pd.DataFrame.from_records(rows,
                                      index='date',
                                      columns=column_names)
    frame.index.name = 'date'
    return frame
def _load_gzipped_threads(file_path):
    """Read one gzip-compressed JSON archive and convert its 'threads' entry."""
    with gzip.open(file_path, 'r') as fp:
        json_data = json.load(fp)
    return json_data_to_pd_dataframe(json_data['threads'])


def load_from_file(filename, archive_dir, json_data=None):
    """Load an archive from ``archive_dir`` into a dataframe.

    Lookup order:
      1. ``archive_dir/<filename>.json.gz`` (the suffix is added only
         when ``filename`` does not already end with it);
      2. otherwise the alphabetically last ``<filename>*.json.gz`` file
         in ``archive_dir``;
      3. otherwise every ``*.json`` file in the directory
         ``archive_dir/<filename>``, loaded in sorted order.

    Args:
        filename: archive name, with or without the '.json.gz' suffix.
        archive_dir: directory that holds the archives.
        json_data: unused; kept so existing callers keep working.

    Returns:
        The dataframe built by ``json_data_to_pd_dataframe``, or None
        when nothing matching is found (including when ``archive_dir``
        itself cannot be listed).
    """
    if filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename)
    else:
        file_path = os.path.join(archive_dir, filename + '.json.gz')
    if os.path.isfile(file_path):
        return _load_gzipped_threads(file_path)

    try:
        entries = os.listdir(archive_dir)
    except OSError:
        # Missing or unreadable archive directory: report "not found" the
        # same way as every other miss instead of raising.
        return None

    # All "filename[...].json.gz" files in archive_dir.
    candidates = sorted(
        f for f in entries
        if f.startswith(filename)
        and f.endswith('.json.gz')
        and os.path.isfile(os.path.join(archive_dir, f))
    )
    if candidates:
        # Take the most recent (listed alpha-chronological).
        file_path = os.path.join(archive_dir, candidates[-1])
        if os.path.isfile(file_path):
            return _load_gzipped_threads(file_path)
        return None

    # All json files in archive_dir/filename.
    dir_path = os.path.join(archive_dir, filename)
    if not os.path.isdir(dir_path):
        return None
    # Sorted so the load order does not depend on the filesystem.
    files = sorted(
        os.path.join(dir_path, f) for f in os.listdir(dir_path)
        if f.endswith('.json') and os.path.isfile(os.path.join(dir_path, f))
    )
    if not files:
        return None

    threads = []
    for file_path in files:
        with open(file_path, 'r') as fp:
            threads.append(json.load(fp))
    return json_data_to_pd_dataframe(threads)
try:
    _STRING_TYPES = basestring  # Python 2: accept both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3


class Archive(object):
    """A mailing-list archive held as a pandas dataframe.

    Args:
        data: either an existing dataframe, which is copied, or an
            archive name, which is loaded from ``archive_dir`` via
            ``load_from_file``. Any other type leaves ``dataframe`` as
            None.
        archive_dir: directory searched when ``data`` is a name.
    """

    data = None       # "raw" json data (never assigned in this class)
    dataframe = None  # main pd dataframe

    def __init__(self, data="nettime-l", archive_dir="archives"):
        if isinstance(data, pd.core.frame.DataFrame):
            self.dataframe = data.copy()
        elif isinstance(data, _STRING_TYPES):
            # self.data is still the class-level None at this point.
            self.dataframe = load_from_file(data, archive_dir, self.data)

    def query(self):
        """Return a new ``query.Query`` bound to this archive."""
        # `query` here is the imported module, not this method: a method
        # body does not look names up in the class scope.
        return query.Query(self)