renamed lib to nettime

This commit is contained in:
gauthiier
2016-07-21 16:04:43 +02:00
parent 63181b37f3
commit fd71309119
4 changed files with 0 additions and 0 deletions
View File
+403
View File
@@ -0,0 +1,403 @@
import numpy as np
import pandas as pd
import email, email.parser
import os, datetime, json, gzip, re
from random import randint
def format_from(from_str):
    """Return the lower-cased sender address found in a From header.

    Addresses in the archive are written with ``{AT}`` in place of ``@``.
    The address part reported by ``email.utils.parseaddr`` is used when it
    contains that marker; otherwise the header is split on whitespace and
    the tokens surrounding a standalone ``{AT}`` token are glued together.

    Returns None when the header is missing/empty or no ``{AT}`` marker can
    be located.
    """
    if not from_str:
        # Header absent (None) or empty: nothing to parse.
        return None

    from_addr = email.utils.parseaddr(from_str)[1]
    if '{AT}' not in from_addr:
        tok = from_str.split()
        try:
            at = tok.index('{AT}')
        except ValueError:
            return None
        if at == 0:
            # No local part in front of the marker; tok[at - 1] would wrap
            # around to the end of the list and produce a bogus address.
            return None
        from_addr = ''.join(tok[at - 1:at + 2])
        if from_addr.startswith('<') or from_addr.endswith('>'):
            from_addr = from_addr.strip('<').strip('>')
    return from_addr.lower()
def format_date(date_str):
    """Parse an RFC 2822 style date header into a pandas timestamp.

    The header is converted to a UTC epoch value with ``email.utils`` and
    then to a naive datetime via ``datetime.fromtimestamp``.

    Returns None (after printing a short diagnostic) when the header is
    missing, cannot be parsed, or falls outside the range that datetime or
    pandas can represent.
    """
    if not date_str:
        # Missing header: nothing to parse and nothing useful to print.
        return None

    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        # parsedate_tz() yields None for unparseable input, which
        # mktime_tz() then rejects with a TypeError.
        print("Format Date TypeError")
        print(" > " + date_str)
        return None
    except (ValueError, OverflowError):
        print("Format Date ValueError")
        print(" > " + date_str)
        return None

    try:
        # NOTE(review): fromtimestamp() converts to *local* time although
        # time_tz is a UTC epoch value -- confirm this is intended.
        dt = datetime.datetime.fromtimestamp(time_tz)
    except (ValueError, OverflowError, OSError):
        print('time out of bound')
        print(time_tz)
        return None

    try:
        return pd.to_datetime(dt)
    except ValueError:
        # pandas' OutOfBoundsDatetime derives from ValueError; catching the
        # base class avoids depending on where pandas exposes the subclass.
        print('time out of bound')
        print(dt)
        return None
def message_to_tuple_record(msg, records, references=None):
    """Append one tuple per usable message of a thread to *records*.

    A message is skipped -- together with its follow-ups -- when its date
    cannot be parsed, when the date lies before 1 October 1995 or in the
    future, or when no sender address can be extracted.

    msg        -- message dict as produced by the crawler
    records    -- list that receives the tuples (mutated in place)
    references -- message-id of the parent message, None for a thread root

    Each tuple is (message-id, from, author_name, subject, date, url,
    content length, number of follow-ups, references). Follow-ups are
    processed recursively with this message's id as their reference.
    Always returns None.
    """
    date_time = format_date(msg['date'])
    if date_time is None:
        return

    # Discard dates outside the plausible lifetime of the list.
    nettime_min_date = pd.to_datetime('01/10/1995', format='%d/%m/%Y')
    nettime_max_date = pd.to_datetime(datetime.datetime.now())
    if date_time < nettime_min_date or date_time > nettime_max_date:
        return

    from_addr = format_from(msg['from'])
    if not from_addr:
        return

    # `in`/`get` instead of dict.has_key(), which Python 3 removed; a
    # missing or empty follow-up list counts as zero replies.
    follow_ups = msg.get('follow-up') or []

    records.append((msg['message-id'],
                    from_addr,
                    msg['author_name'],
                    msg['subject'],
                    date_time,
                    msg['url'],
                    len(msg['content']),
                    len(follow_ups),
                    references))

    for follow_up in follow_ups:
        message_to_tuple_record(follow_up, records,
                                references=msg['message-id'])
def json_data_to_pd_dataframe(json_data):
    """Flatten crawled thread data into a dataframe indexed by date.

    json_data is an iterable of dicts that each carry a 'threads' list of
    message dicts. Every message is handed to message_to_tuple_record(),
    which appends the rows that pass its filters.
    """
    column_names = ['message-id',
                    'from',
                    'author',
                    'subject',
                    'date',
                    'url',
                    'content-length',
                    'nbr-references',
                    'references']

    rows = []
    for thread_group in json_data:
        for message in thread_group['threads']:
            message_to_tuple_record(message, rows)

    frame = pd.DataFrame.from_records(rows,
                                      index='date',
                                      columns=column_names)
    frame.index.name = 'date'
    return frame
def _load_gz_archive(file_path):
    """Read one gzip-compressed JSON archive and convert it to a dataframe."""
    with gzip.open(file_path, 'r') as fp:
        json_data = json.load(fp)
    return json_data_to_pd_dataframe(json_data['threads'])


def load_from_file(filename, archive_dir):
    """Load an archive called *filename* from *archive_dir* as a dataframe.

    Lookup order:
      1. ``archive_dir/filename`` (``.json.gz`` is appended when missing);
      2. the alphabetically last ``filename*.json.gz`` in archive_dir;
      3. every ``*.json`` file inside the directory ``archive_dir/filename``.

    Returns None when nothing matches, including when archive_dir itself
    does not exist.
    """
    if filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename)
    else:
        file_path = os.path.join(archive_dir, filename + '.json.gz')
    if os.path.isfile(file_path):
        return _load_gz_archive(file_path)

    if not os.path.isdir(archive_dir):
        # os.listdir() below would raise; report "not found" like the
        # other misses do.
        return None

    # all "filename[...].json.gz" in archive_dir
    candidates = sorted(
        f for f in os.listdir(archive_dir)
        if f.startswith(filename) and f.endswith('.json.gz')
        and os.path.isfile(os.path.join(archive_dir, f)))
    if candidates:
        # take the most recent (listed alpha-chronological)
        return _load_gz_archive(os.path.join(archive_dir, candidates[-1]))

    # all json files in archive_dir/filename
    dir_path = os.path.join(archive_dir, filename)
    if not os.path.isdir(dir_path):
        return None
    # Sorted so the load order does not depend on the filesystem.
    json_files = sorted(
        os.path.join(dir_path, f) for f in os.listdir(dir_path)
        if f.endswith('.json') and os.path.isfile(os.path.join(dir_path, f)))
    if not json_files:
        return None

    threads = []
    for json_path in json_files:
        with open(json_path, 'r') as fp:
            threads.append(json.load(fp))
    return json_data_to_pd_dataframe(threads)
class Archive(object):
    """Statistics over a mailing-list archive held in a pandas dataframe.

    The dataframe is indexed by message date and is expected to carry the
    columns 'from', 'content-length', 'nbr-references', 'subject', 'url'
    and 'message-id'. Derived tables are computed on first use and cached
    on the instance.
    """

    data = None            # "raw" json data
    dataframe = None       # main pd dataframe
    activity = None        # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    content_length = None  # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    threads = None         # messages that received follow-ups, most replied first

    # Pattern used to drop list-owned senders from the rankings.
    _NETTIME_FILTER = r'^((?!nettime*).)*$'

    # Resolution letter -> pandas frequency alias.
    _FREQUENCIES = {'m': 'M', 'y': 'AS'}

    def __init__(self, data="nettime-l", archive_dir="archives"):
        """Wrap a copy of a dataframe, or load the archive named *data*."""
        if isinstance(data, pd.DataFrame):
            self.dataframe = data.copy()
        elif isinstance(data, str):
            self.dataframe = load_from_file(data, archive_dir)

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _address_key(email_address):
        """Translate a plain address into the archive's '{at}' form."""
        return email_address.replace('@', '{at}').lower()

    @staticmethod
    def _at_resolution(series, resolution):
        """Return a monthly series as is ('m'), summed per year ('y'), else None."""
        res = resolution.lower()
        if res == 'm':
            return series
        if res == 'y':
            yearly = series.resample('AS').sum()
            yearly.index = yearly.index.year
            return yearly
        return None

    def _ranking(self, totals, rank, filter_nettime):
        """Return the *rank* largest entries of *totals*, largest first."""
        # sort_values() replaces Series.order(), which pandas removed.
        ranked = totals.sort_values(ascending=False)
        if filter_nettime:
            ranked = ranked[ranked.index.str.contains(self._NETTIME_FILTER)]
        return ranked.iloc[:rank]

    @staticmethod
    def _plot_ranking(series, figsize):
        """Plot the given per-sender series side by side in random colours."""
        df = pd.concat(series, axis=1)
        # One random RGB colour per plotted column (the original sized the
        # palette by row count and used the dropped `colors` keyword).
        colors = [tuple(rgb) for rgb in np.random.rand(len(df.columns), 3)]
        df.plot(color=colors, figsize=figsize or None)

    # ------------------------------------------------------------------
    # activity
    # ------------------------------------------------------------------

    def _activity(self):
        """Build (once) the messages-per-month table, one column per sender."""
        if self.activity is None:
            senders = self.dataframe.reindex(columns=['from'])
            # pd.Grouper replaces the removed pd.TimeGrouper.
            counts = senders.groupby([pd.Grouper(freq='M'), 'from']).size()
            self.activity = counts.unstack('from').fillna(0)
        return self.activity

    def activity_from(self, email_address, resolution='M'):
        """Message counts of one sender per month ('M') or year ('Y').

        Returns None for an unknown sender or resolution.
        """
        eaddr = self._address_key(email_address)
        self._activity()
        try:
            return self._at_resolution(self.activity[eaddr], resolution)
        except KeyError:
            return None

    def activity_overall(self, resolution='M'):
        """Message counts of all senders per month ('M') or year ('Y').

        Returns None for an unknown resolution or when aggregation fails.
        """
        self._activity()
        try:
            return self._at_resolution(self.activity.sum(axis=1), resolution)
        except Exception:
            # Best effort, as before -- but no longer swallowing
            # KeyboardInterrupt / SystemExit.
            return None

    def activity_from_ranking(self, resolution='M', rank=5, filter_nettime=True):
        """Top *rank* senders by number of messages.

        *resolution* is accepted but not used yet.
        """
        # finish this -- re resolution AND filtering
        self._activity()
        return self._ranking(self.activity.sum(axis=0), rank, filter_nettime)

    def plot_activity_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
        """Plot the activity of the top *rank* senders."""
        activity_rank = self.activity_from_ranking(rank=rank).keys()
        series = [self.activity_from(k, resolution) for k in activity_rank]
        self._plot_ranking(series, figsize)

    # ------------------------------------------------------------------
    # content length
    # ------------------------------------------------------------------

    def _content_length(self):
        """Build (once) the content-length-per-month table, one column per sender."""
        if self.content_length is None:
            from_content_index = self.dataframe.reindex(
                columns=['from', 'content-length'])
            summed = from_content_index.groupby(
                [pd.Grouper(freq='M'), 'from']).sum()
            self.content_length = summed.reset_index().pivot(
                columns='from', index='date',
                values='content-length').fillna(0)
        return self.content_length

    def content_length_from(self, email_address, resolution='M'):
        """Content length of one sender per month ('M') or year ('Y').

        Returns None for an unknown sender or resolution.
        """
        eaddr = self._address_key(email_address)
        self._content_length()
        try:
            return self._at_resolution(self.content_length[eaddr], resolution)
        except KeyError:
            return None

    def content_length_overall(self, resolution='M'):
        """Content length of all senders per month ('M') or year ('Y').

        *resolution* used to be read without ever being defined, so the
        method always returned None; it is now a parameter with the same
        default as activity_overall().
        """
        self._content_length()
        try:
            return self._at_resolution(self.content_length.sum(axis=1),
                                       resolution)
        except Exception:
            return None

    def content_length_from_ranking(self, resolution='M', rank=5, filter_nettime=True):
        """Top *rank* senders by total content length.

        *resolution* is accepted but not used yet.
        """
        # finish this -- re resolution
        self._content_length()
        return self._ranking(self.content_length.sum(axis=0), rank,
                             filter_nettime)

    def plot_content_length_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
        """Plot the content length of the top *rank* senders."""
        content_rank = self.content_length_from_ranking(rank=rank).keys()
        series = [self.content_length_from(k, resolution)
                  for k in content_rank]
        self._plot_ranking(series, figsize)

    # ------------------------------------------------------------------
    # threads
    # ------------------------------------------------------------------

    def _select_threads(self, thresh):
        """Messages with more than *thresh* follow-ups, most replied first."""
        replied = self.dataframe[self.dataframe['nbr-references'] > thresh]
        columns = ['from', 'nbr-references', 'subject', 'url', 'message-id']
        return replied.reindex(columns=columns).sort_values(
            'nbr-references', ascending=False)

    def _threads(self, thresh=0):
        """Build (once) and return the cached thread table."""
        if self.threads is None:
            self.threads = self._select_threads(thresh)
        return self.threads

    def _replies_per_sender(self, freq):
        """Follow-up counts per period (*freq*), one column per sender."""
        self._threads()
        per_sender = self.threads.reindex(columns=['from', 'nbr-references'])
        summed = per_sender.groupby([pd.Grouper(freq=freq), 'from']).sum()
        return summed.reset_index().pivot(
            columns='from', index='date',
            values='nbr-references').fillna(0)

    def threads_ranking(self, rank=5):
        """The *rank* messages with the most follow-ups."""
        self._threads()
        return self.threads.drop('message-id', axis=1).iloc[:rank]

    def threads_from(self, email_address, resolution='y'):
        """Follow-ups received by one sender per year ('y') or month ('m').

        Returns None for an unknown resolution; raises KeyError for a
        sender that started no replied-to thread.
        """
        freq = self._FREQUENCIES.get(resolution.lower())
        if freq is None:
            return None
        eaddr = self._address_key(email_address)
        return self._replies_per_sender(freq)[eaddr]

    def threads_from_ranking(self, rank=5, filter_nettime=True):
        """Top *rank* senders by number of follow-ups received."""
        totals = self._replies_per_sender('AS').sum(axis=0)
        return self._ranking(totals, rank, filter_nettime)

    def plot_threads_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
        """Plot the follow-ups received by the top *rank* senders."""
        threads_rank = self.threads_from_ranking(rank=rank).keys()
        series = [self.threads_from(k, resolution) for k in threads_rank]
        self._plot_ranking(series, figsize)

    def threads_overall(self, resolution='y', aggregate='sum', tresh=0):
        """Sum or mean of follow-up counts per year ('y') or month ('m').

        Only messages with more than *tresh* follow-ups are considered.
        Returns None for an unknown resolution or aggregate.
        """
        freq = self._FREQUENCIES.get(resolution.lower())
        if freq is None:
            return None
        agg = aggregate.lower()
        if agg not in ('sum', 'mean'):
            return None

        # Computed without touching self.threads, so a non-default tresh
        # does not leak into the cached table used by the other methods.
        # Restricting to the numeric column keeps sum()/mean() away from
        # the text columns.
        replies = self._select_threads(tresh)[['nbr-references']]
        grouped = replies.groupby(pd.Grouper(freq=freq))
        y = grouped.sum() if agg == 'sum' else grouped.mean()
        if freq == 'AS':
            y.index = y.index.year
        return y
+252
View File
@@ -0,0 +1,252 @@
import urllib2, urllib, urlparse
import logging
from bs4 import BeautifulSoup
import email, email.parser
from email.mime.text import MIMEText
import mailbox
import time, dateutil, string
from pprint import pprint as pp
import sys, os, re, json, gzip
import traceback
# Pause, in seconds, passed to time.sleep() after each top-level thread is
# archived in collect_threads_from_url().
DELAY = 0.2
# hack for the mailbox module (re: force mbox.add() encoding to utf8)
# NOTE(review): reload() as a builtin and sys.setdefaultencoding() exist on
# Python 2 only, and this changes the default encoding process-wide at import
# time -- confirm before porting or importing this module elsewhere.
reload(sys)
sys.setdefaultencoding('utf8')
def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
    """Crawl the archive index at *url* for the sub-list *sublist_name*.

    The index page is scanned for a list entry whose bold label equals
    sublist_name (case-insensitive). Every thread-index link under that
    entry is passed to collect_threads_from_url().

    Returns the list of results, or None when no entry matches.
    """
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, "html.parser")

    # base url and display name of the whole list
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
    list_name = soup.select('body p:nth-of-type(2) base title')[0].string
    logging.info("Getting " + list_name + " list archive for " + sublist_name)

    for entry in soup.select('ul:nth-of-type(2) li'):
        if entry.strong is None:
            continue
        if entry.strong.string.lower() != sublist_name.lower():
            continue

        thread_urls = [urlparse.urljoin(base_url, link.get('href'))
                       for link in entry.select('ul li a')]
        total = str(len(thread_urls))

        collected = []
        for position, thread_url in enumerate(thread_urls, 1):
            logging.info("## " + str(position) + " / " + total + " ##")
            collected.append(
                collect_threads_from_url(thread_url, base_arch_dir, mbox))
        return collected

    return None
def _crawl_thread_index(soup, base_url):
    """Archive every top-level thread listed on an index page; return them as a list."""
    items = soup.select('ul:nth-of-type(1) > li')
    total = str(len(items))
    archived = []
    for n, item in enumerate(items, 1):
        logging.info("> " + str(n) + " / " + total)
        try:
            archived.append(archive_thread(item, base_url, None))
        except Exception:
            # Keep crawling when one thread fails, but let
            # KeyboardInterrupt / SystemExit stop a long run.
            logging.exception("could not archive thread " + str(n) + " / " + total)
        # Pause after failures as well, so a struggling server is not
        # hit again immediately.
        time.sleep(DELAY)
    return archived


def _write_mbox(threads, base_arch_dir):
    """Write the threads to '<name>.txt' in mbox format plus a gzipped copy, unless the mbox exists."""
    mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
    mbox_path_gz = mbox_path + ".gz"
    logging.info("writing mbox " + mbox_path)
    if os.path.isfile(mbox_path):
        logging.info("mbox " + mbox_path + " already exists.")
        return

    box = mailbox.mbox(mbox_path)
    box.lock()
    try:
        for t in threads['threads']:
            write_mbox_message(t, box)
        box.flush()
    except Exception:
        logging.exception("could not write mbox " + mbox_path)
    finally:
        box.unlock()

    # Copy bytes to bytes: the gzip stream is opened in binary mode.
    with open(mbox_path, 'rb') as fpin, gzip.open(mbox_path_gz, 'wb') as fpout:
        fpout.writelines(fpin)


def collect_threads_from_url(url, base_arch_dir, mbox):
    """Collect all threads listed on the thread-index page at *url*.

    The page title (spaces replaced by underscores) names the archive.
    When '<base_arch_dir>/<name>.json' already exists it is loaded instead
    of crawling; otherwise every thread is crawled and the result is
    written to that file. With *mbox* true an mbox file and a gzipped copy
    are written as well, unless the mbox already exists.

    Returns a dict with the keys 'name', 'url' and 'threads'.
    """
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")

    base_url = url

    threads_name = soup.select('p:nth-of-type(1) title')[0].string
    threads_name = threads_name.replace(' ', '_')

    threads = {'name': threads_name, 'url': base_url, 'threads': []}
    logging.info("Collecting Threads of: " + threads_name)

    # check if archive already exists
    file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
        threads['threads'] = _crawl_thread_index(soup, base_url)
        logging.info("writing archive to file " + file_path)
        with open(file_path, 'w') as fp:
            json.dump(threads, fp, indent=4)

    if mbox:
        _write_mbox(threads, base_arch_dir)

    logging.info("done. ")
    return threads
def archive_thread(li, base_url, parent_thread_data):
    """Archive the message behind a thread list item, then its replies.

    li                 -- list item holding a 'strong a' link and an 'em' author
    base_url           -- url the message link is resolved against
    parent_thread_data -- message dict of the parent, or None for a root

    The message is fetched with collect_message(). Unless parent_thread_data
    is None, the message is appended to the parent's u'follow-up' list,
    which is created on demand. Returns the message dict.
    """
    anchor = li.select('strong a')[0]
    url = urlparse.urljoin(base_url, anchor.get('href'))
    message = {
        u'id': anchor.get('name'),
        u'subject': anchor.string,
        u'url': url,
        u'author_name': li.select('em')[0].string,
    }

    collect_message(url, message)

    # NOTE(review): 'ul > li' looks like it also matches list items nested
    # deeper than one level, which would attach a reply to several
    # ancestors -- confirm whether that is intended.
    for reply in li.select('ul > li'):
        if reply.select('strong a'):
            archive_thread(reply, base_url, message)  # recursion

    if parent_thread_data is not None:
        parent_thread_data.setdefault(u'follow-up', []).append(message)
    return message
def _strip_label(text, field):
    """Return *text* without a leading '<field>:' label and surrounding whitespace."""
    value = text.strip()
    if value.startswith(field):
        value = value[len(field):].lstrip()
        if value.startswith(':'):
            value = value[1:]
    return value.strip()


def collect_message(url, message):
    """Fetch the message page at *url* and fill *message* in place.

    'subject', 'date', 'from', 'message-id' and 'content-type' are first
    read from the page's X-comments. Any of 'to', 'subject', 'from',
    'date', 'message-id' and 'content-type' shown in the first list on the
    page then overrides or adds to them. 'content' is the text of the
    second <pre> element.
    """
    print(url)
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")

    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page
    for item in soup.select('ul:nth-of-type(1) > li'):
        if item.em is None:
            continue
        field = item.em.string
        if field is None:
            # <em> holds nested markup instead of a plain label
            continue
        label = field.lower()
        if label in message_labels:
            # str.strip(field + ": ") removes a *set of characters* from
            # both ends and so ate into the value; drop the label as a
            # prefix instead.
            message[label] = _strip_label(item.text, field)

    ## -- content --
    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
    """Return the value of the MHonArc X-comment named *xcom*, or None.

    The first text node of *soup* matching *xcom* is used. Comment
    delimiters, the leading '<xcom>:' label and surrounding whitespace are
    removed from it.
    """
    com = soup.find(text=re.compile(re.escape(xcom)))
    if com is None:
        return None

    # str.strip(chars) removes any of the given characters from both ends,
    # which truncated values (e.g. the closing '>' of a message-id), so the
    # delimiters and the label are removed as whole prefixes/suffixes.
    value = com.strip()
    if value.startswith('<!--'):
        value = value[len('<!--'):]
    if value.endswith('-->'):
        value = value[:-len('-->')]
    value = value.strip()

    label = xcom + ":"
    if value.startswith(label):
        value = value[len(label):]
    return value.strip()
def to_mbox_message(msg):
    """Build a mailbox.mboxMessage from a crawled message dict.

    Reads the keys 'from', 'subject', 'message-id', 'date' and 'content'.
    The mbox "From " line is built from the sender and the parsed date.
    """
    mime = MIMEText('', 'plain', _charset='utf8')
    mime['From'] = msg['from']
    mime['Subject'] = msg['subject']
    mime['Message-Id'] = msg['message-id']
    mime['Date'] = msg['date']
    mime.set_payload(msg['content'], charset='utf8')
    # The constructor already declared a base64 transfer encoding for the
    # empty body, and a second set_payload() does not re-encode, so the
    # text is stored as is. Declare what is actually stored so readers do
    # not try to base64-decode plain text.
    mime.replace_header('Content-Transfer-Encoding', '8bit')

    mbox_message = mailbox.mboxMessage(mime)
    mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
    return mbox_message
# throws exception
def write_mbox_message(msg, mbox):
    """Add *msg* and, depth first, all of its follow-ups to *mbox*.

    Nothing is caught here: any exception raised while converting or
    adding a message propagates to the caller.
    """
    mbox.add(to_mbox_message(msg))
    for reply in msg.get(u'follow-up', ()):
        write_mbox_message(reply, mbox)
+26
View File
@@ -0,0 +1,26 @@
import urllib2, urllib, urlparse
import os, re, json, gzip
import mhonarccrawl
import datetime
def archive_from_url(url, sublist_name="nettime-l", archive_dir="archives"):
    """Crawl one sub-list and store it as a dated, gzipped JSON archive.

    Per-thread files go to '<archive_dir>/<sublist_name>/'. The combined
    archive is written to '<archive_dir>/<name>_<YYYY-MM-DD>.json.gz',
    where <name> is the lower-cased sub-list name. Returns None.
    """
    url = url.rstrip()
    list_dir = check_dir(archive_dir, sublist_name)

    name = sublist_name.lower()
    today = datetime.datetime.now().strftime('%Y-%m-%d')

    crawled = mhonarccrawl.collect_from_url(url, sublist_name, list_dir, mbox=True)
    archive = {'name': name, 'url': url, 'date': today, 'threads': crawled}

    target = os.path.join(archive_dir, name + "_" + today + ".json.gz")
    with gzip.open(target, 'w') as fp:
        json.dump(archive, fp, indent=4)
def check_dir(base_dir, list_name):
    """Make sure the directory base_dir/list_name exists and return its path.

    Missing parent directories are created too. Raises OSError when the
    directory cannot be created, including when the path exists but is not
    a directory.
    """
    arc_dir = os.path.join(base_dir, list_name)
    try:
        os.makedirs(arc_dir)
    except OSError:
        # Creating first and inspecting afterwards avoids the race between
        # an existence check and makedirs(); an already existing directory
        # is fine, anything else is a real error.
        if not os.path.isdir(arc_dir):
            raise
    return arc_dir