many many things...
This commit is contained in:
@@ -0,0 +1,152 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import email, email.parser
|
||||
import os, datetime, json, gzip, re
|
||||
import analysis.util
|
||||
import analysis.query
|
||||
|
||||
|
||||
def filter_date(msg, archive_name):
    """Return the message date as a pandas Timestamp, or None if unusable.

    The timestamp produced by ``analysis.util.format_date`` is converted
    with ``datetime.fromtimestamp`` and then ``pd.to_datetime``.  The date
    is rejected (None is returned) when it is missing, cannot be
    represented, lies before the archive's ``analysis.util.min_date`` or
    lies after the current time.
    """
    time_tz = analysis.util.format_date(msg, archive_name)
    if not time_tz:
        return None

    # fromtimestamp() itself can fail on absurd timestamps, so guard it too.
    try:
        dt = datetime.datetime.fromtimestamp(time_tz)
    except (OverflowError, OSError, ValueError):
        print('time out of bound')
        print(time_tz)
        return None

    # pandas' OutOfBoundsDatetime is a ValueError subclass; catching the
    # base class avoids depending on where pandas exposes the exception.
    try:
        date_time = pd.to_datetime(dt)
    except ValueError:
        print('time out of bound')
        print(dt)
        return None

    # min_date() yields None for archives it does not know; in that case
    # there is simply no lower bound to enforce.
    earliest = analysis.util.min_date(archive_name)
    if earliest is not None:
        if date_time < pd.to_datetime(earliest, format='%d/%m/%Y'):
            return None

    if date_time > pd.to_datetime(datetime.datetime.now()):
        return None

    return date_time
|
||||
|
||||
|
||||
def message_to_tuple_record(msg, records, archive_name, references='X'):
    """Append one tuple per message (and per follow-up) to ``records``.

    Messages whose date or sender cannot be resolved are skipped, together
    with their follow-ups.  ``references`` is the message-id of the parent
    message; the default 'X' marks a message without a parent.
    """
    # date is checked first
    date = filter_date(msg, archive_name)
    if not date:
        print("Archive::filter_date returned None. Skip.")
        return

    # sender address is checked second
    from_addr = analysis.util.format_from(msg, archive_name)
    if not from_addr:
        print("Archive::analysis.util.format_from returned None. Skip.")
        return

    util = analysis.util
    url = util.format_url(msg, archive_name)
    author = util.format_author(msg, archive_name)
    subject = util.format_subject(msg, archive_name)
    message_id = util.format_id(msg, archive_name)
    content = util.format_content(msg, archive_name)

    has_follow_ups = 'follow-up' in msg
    nbr_follow_ups = len(msg['follow-up']) if has_follow_ups else 0

    record = (
        message_id,
        from_addr,
        author,
        subject,
        date,
        url,
        len(content),
        nbr_follow_ups,
        references,
    )
    records.append(record)

    # Recurse into the follow-ups; each one only records its direct parent.
    if has_follow_ups:
        for follow_up in msg['follow-up']:
            message_to_tuple_record(follow_up, records, archive_name,
                                    references=message_id)

    return
|
||||
|
||||
def json_data_to_pd_dataframe(json_data, archive_name):
    """Flatten archive JSON into a DataFrame indexed by message date.

    ``json_data`` is iterated, and every entry of each item's 'threads'
    list is passed to ``message_to_tuple_record``.
    """
    column_names = [
        'message-id',
        'from',
        'author',
        'subject',
        'date',
        'url',
        'content-length',
        'nbr-references',
        'references',
    ]

    rows = []
    for chunk in json_data:
        for thread in chunk['threads']:
            message_to_tuple_record(thread, rows, archive_name)

    print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(rows)))

    frame = pd.DataFrame.from_records(rows, index='date', columns=column_names)
    frame.index.name = 'date'
    return frame
|
||||
|
||||
def load_from_file(filename, archive_name, archive_dir, json_data=None):
    """Load an archive from disk and return it as a DataFrame (or None).

    Lookup order inside ``archive_dir``:

    1. ``<filename>.json.gz`` (or ``filename`` itself if it already has
       that suffix);
    2. otherwise the alphabetically last ``<filename>*.json.gz`` file;
    3. otherwise every ``*.json`` file inside the directory ``<filename>``.

    Returns None when nothing matches.  ``json_data`` is accepted for
    backward compatibility only; its incoming value is not used.
    """
    if filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename)
    else:
        file_path = os.path.join(archive_dir, filename + '.json.gz')

    if not os.path.isfile(file_path):
        # all "filename[...].json.gz" in archive_dir; the last one in
        # alphabetical order is taken as the most recent.
        candidates = sorted(
            f for f in os.listdir(archive_dir)
            if f.startswith(filename)
            and f.endswith('.json.gz')
            and os.path.isfile(os.path.join(archive_dir, f))
        )
        if candidates:
            file_path = os.path.join(archive_dir, candidates[-1])

    if os.path.isfile(file_path):
        # JSON is decoded explicitly as UTF-8 rather than with the
        # platform's default encoding.
        with gzip.open(file_path, 'rt', encoding='utf-8') as fp:
            json_data = json.load(fp)
        return json_data_to_pd_dataframe(json_data['threads'], archive_name)

    # fall back to a directory of plain json files: archive_dir/filename
    dir_path = os.path.join(archive_dir, filename)
    if not os.path.isdir(dir_path):
        return None

    # sorted so that the load order does not depend on os.listdir()
    files = sorted(
        os.path.join(dir_path, f) for f in os.listdir(dir_path)
        if f.endswith('.json') and os.path.isfile(os.path.join(dir_path, f))
    )
    if not files:
        return None

    threads = []
    for path in files:
        with open(path, 'r', encoding='utf-8') as fp:
            threads.append(json.load(fp))

    print('---> ' + archive_name)
    return json_data_to_pd_dataframe(threads, archive_name)
|
||||
|
||||
|
||||
class Archive:
    """A mailing-list archive held as a pandas DataFrame."""

    data = None  # "raw" json data
    dataframe = None  # main pd dataframe

    def __init__(self, archive_name, archive_dir="archives"):
        """Build the archive from a DataFrame (copied) or from a name.

        A string is treated as a file or directory name and resolved by
        ``load_from_file`` inside ``archive_dir``.  Any other type leaves
        ``dataframe`` as None.
        """
        source = archive_name
        if isinstance(source, pd.DataFrame):
            self.dataframe = source.copy()
        elif isinstance(source, str):
            # need a filename or a dir name....
            self.dataframe = load_from_file(source, source, archive_dir, self.data)

    def query(self):
        """Return an ``analysis.query.Query`` bound to this archive."""
        return analysis.query.Query(self)
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
import analysis.query
|
||||
import logging, html, numpy
|
||||
from tabulate import tabulate
|
||||
|
||||
def makeurl(text, url):
    """Return an HTML anchor linking ``text`` to ``url``.

    Both arguments are inserted verbatim; callers are responsible for
    escaping them.
    """
    return ''.join(['<a href="', url, '">', text, '</a>'])


def table_threads_ranking(ranking_dataframe):
    """Render a threads ranking DataFrame as an HTML table string.

    The frame's index is shown in the date column; the 'subject', 'url',
    'from' and 'nbr-references' columns are read from each row.  Cell
    contents are HTML-escaped because they are built from message data.
    """
    # Local import: the module-level ``html`` name is used as ``html.HTML``
    # elsewhere in this file, so the stdlib escaping helper is taken from
    # xml.sax.saxutils to avoid any ambiguity.
    from xml.sax.saxutils import escape

    def esc(value):
        return escape(str(value), {'"': '&quot;'})

    parts = [
        '<table class="threads_ranking">',
        '<tr>',
        '<td class="td_date_t">date</td>',
        '<td class="td_subject_t">subject</td>',
        '<td class="td_from_t">from</td>',
        '<td class="td_rep_t">replies</td>',
        '</tr>',
    ]

    for date, row in ranking_dataframe.iterrows():
        link = makeurl(esc(row['subject']), esc(row['url']))
        parts.extend([
            '<tr>',
            '<td class="td_date">' + esc(date) + '</td>',
            '<td class="td_subject">' + link + '</td>',
            '<td class="td_from">' + esc(row['from']) + '</td>',
            '<td class="td_rep">' + esc(row['nbr-references']) + '</td>',
            '</tr>',
        ])

    parts.append("</table>")
    return ''.join(parts)
|
||||
|
||||
|
||||
|
||||
|
||||
class Html:
    """Render query results as HTML tables.

    NOTE(review): the table builders rely on ``html.HTML``; the standard
    library ``html`` module has no such attribute, so this presumably
    expects the third-party ``html`` package -- confirm.
    """

    query = None  # analysis.query.Query the tables are built from

    def __init__(self, q=None):
        """Bind the formatter to ``q``.

        Raises:
            TypeError: if ``q`` is not an ``analysis.query.Query``.
        """
        if not isinstance(q, analysis.query.Query):
            message = ("HtmlFormat constructor Error: "
                       "query must be of type analysis.query.Query")
            logging.error(message)
            raise TypeError(message)

        self.query = q

    def threads_ranking(self, rank=5, resolution=None):
        """Return the overall top-``rank`` threads as an HTML table.

        ``resolution`` is accepted for interface compatibility but not
        forwarded: with a resolution the query returns a dict of frames,
        which this single flat table cannot render.
        """
        data = self.query.threads_ranking(rank=rank, resolution=None)

        h = html.HTML()
        t = h.table()

        r = t.tr
        r.td('date', klass='td_date_t')
        r.td('from', klass='td_from_t')
        r.td('replies', klass='td_rep_t')
        r.td('subject', klass='td_subject_t')

        # The date is the frame's index, not one of its columns.
        for date, row in data.iterrows():
            r = t.tr
            r.td(str(date), klass='td_date')
            r.td(row['from'], klass='td_from')
            r.td(str(row['nbr-references']), klass='td_rep')
            anchor = h.a(row['subject'], href=row['url'])
            r.td('', klass='td_subject').text(str(anchor), escape=False)

        return str(t)

    @staticmethod
    def from_dataframe(data_frame, table_name=None, name_map=None, url_map=None):
        """Render ``data_frame`` (index first, then columns) as an HTML table.

        Args:
            data_frame: frame to render.
            table_name: optional; used as the table id and, with a '_t'
                suffix, as its class.
            name_map: optional mapping from index/column name to the
                header text to display.
            url_map: optional mapping ``{text_header: url_header}`` using
                the displayed header names; the text column is rendered
                as a link to the url column's value and the url column
                itself is not shown.

        Returns:
            The table as a string.
        """
        name_map = {} if name_map is None else name_map
        url_map = {} if url_map is None else url_map

        labels = [data_frame.index.name] + list(data_frame.columns)
        header = [name_map[label] if label in name_map else label
                  for label in labels]

        # CSS classes are tied to the position in the full header so they
        # stay aligned with the cells when url columns are hidden.
        css_header = ['td_' + str(title) + '_t' for title in header]
        css_element = ['td_' + str(title) for title in header]

        h = html.HTML()
        if table_name:
            t = h.table(id=table_name, klass=table_name + '_t')
        else:
            t = h.table()

        # url map: position of text column -> position of its url column.
        # Positions always refer to the full header (0 is the index).
        link_target = {}
        hidden = set()
        for text_name, url_name in url_map.items():
            if text_name in header and url_name in header:
                url_pos = header.index(url_name)
                link_target[header.index(text_name)] = url_pos
                if url_pos > 0:  # the index cell is always rendered
                    hidden.add(url_pos)

        # header
        r = t.tr
        for pos, title in enumerate(header):
            if pos not in hidden:
                r.td(str(title), klass=css_header[pos])

        # elements
        for key, row in data_frame.iterrows():
            cells = [key] + list(row)
            r = t.tr
            r.td(str(key), klass=css_element[0])

            for pos in range(1, len(cells)):
                if pos in hidden:
                    continue

                value = cells[pos]
                if isinstance(value, float):
                    # whole numbers print without decimals; NaN/inf are
                    # not integers and fall through to the float format
                    if value.is_integer():
                        value = int(value)
                    else:
                        value = '{0:.4f}'.format(value)

                if pos in link_target:
                    anchor = h.a(str(value), href=cells[link_target[pos]])
                    r.td('', klass=css_element[pos]).text(str(anchor), escape=False)
                else:
                    r.td(str(value), klass=css_element[pos])

        return str(t)
|
||||
|
||||
class Tab:
    """Render query results as plain-text tables via ``tabulate``."""

    @staticmethod
    def from_dataframe(data_frame, name_map=None, format=".0f"):
        """Return ``data_frame`` formatted by ``tabulate``.

        Args:
            data_frame: frame to render.
            name_map: optional mapping from column name to the header text
                to display; the index name is used unmapped.
            format: passed to ``tabulate`` as ``floatfmt``.
        """
        # None instead of a shared mutable ``{}`` default.
        name_map = {} if name_map is None else name_map

        header = [data_frame.index.name]
        header.extend(name_map[col] if col in name_map else col
                      for col in data_frame.columns)

        return tabulate(data_frame, headers=header, floatfmt=format)
|
||||
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import analysis.query
|
||||
|
||||
# for colormaps see:
|
||||
# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
|
||||
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
|
||||
# http://matplotlib.org/examples/color/colormaps_reference.html
|
||||
# for colors see:
|
||||
# http://matplotlib.org/examples/color/named_colors.html
|
||||
|
||||
# spectre: slategrey
|
||||
# nettime: red
|
||||
# crumb: purple
|
||||
# empyre: darkblue
|
||||
|
||||
def bar_plot_series(series, title, color='blueviolet', ylim=None):
    """Draw ``series`` as a bar plot and return whatever ``series.plot`` returns."""
    options = dict(
        kind='bar',
        title=title,
        color=color,
        alpha=0.8,
        stacked=True,
        ylim=ylim,
    )
    return series.plot(**options)
|
||||
|
||||
def save(plot, name):
    """Save the figure that owns ``plot`` under the file name ``name``."""
    plot.get_figure().savefig(name)
|
||||
|
||||
class Plot:
    """Area plots of the per-address series produced by a Query.

    NOTE(review): the default colormap name 'spectral' may not exist in
    recent matplotlib releases -- confirm against the installed version.
    """

    query = None  # analysis.query.Query the plots are built from

    def __init__(self, q=None):
        """Bind the plotter to ``q``.

        Raises:
            TypeError: if ``q`` is not an ``analysis.query.Query``.
        """
        if not isinstance(q, analysis.query.Query):
            # Imported here because this module does not import logging
            # at the top level.
            import logging
            message = ("Plot constructor Error: "
                       "query must be of type analysis.query.Query")
            logging.error(message)
            raise TypeError(message)

        self.query = q

    @staticmethod
    def _area(series, colormap, figsize):
        """Concatenate ``series`` column-wise and draw an unstacked area plot."""
        df = pd.concat(series, axis=1)
        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    # activity

    def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
        """Plot the activity over time of the ``rank`` most active addresses."""
        ranked = self.query.activity_from_ranking(rank=rank, series=True).keys()
        series = [self.query.activity_from(k, resolution, series=True)
                  for k in ranked]
        return self._area(series, colormap, figsize)

    # content length

    def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
        """Plot the content length over time of the ``rank`` top addresses."""
        ranked = self.query.content_length_from_ranking(rank=rank, series=True).keys()
        series = [self.query.content_length_from(k, resolution, series=True)
                  for k in ranked]
        return self._area(series, colormap, figsize)

    # threads

    def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
        """Plot the threads over time of the ``rank`` top addresses.

        NOTE(review): this calls ``Query.threads_from_ranking`` and
        ``Query.threads_from``; confirm that the Query class in use
        provides methods with these names.
        """
        ranked = self.query.threads_from_ranking(rank=rank, series=True).keys()
        series = [self.query.threads_from(k, resolution, series=True)
                  for k in ranked]
        return self._area(series, colormap, figsize)
|
||||
@@ -0,0 +1,573 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import analysis.archive
|
||||
import logging
|
||||
|
||||
class Query:
    """Aggregate queries (activity, content length, threads, replies).

    All queries read ``archive.dataframe``, which is expected to be
    indexed by a DatetimeIndex named 'date' and to carry the columns
    'from', 'url', 'subject', 'message-id', 'content-length',
    'nbr-references' and 'references' ('X' marking a message that is not
    a reply).

    ``resolution`` arguments accept 'y' (yearly bins, labelled '%Y') or
    'm' (monthly bins, labelled '%Y-%m'), case-insensitively; any other
    string makes the method return None.
    """

    archive = None  # analysis.archive.Archive object
    activity = None  # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    content_length = None  # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    threads = None  # messages with more than `thresh` replies, most replied first
    single_threads = None  # same, restricted to messages that are not replies
    replies = None  # messages that are replies
    _threads_thresh = None  # threshold the cached thread frames were built with

    # resolution key -> (binning offset class, label format, index name).
    # Offset objects are used instead of alias strings because the string
    # aliases have been renamed across pandas versions.
    _RESOLUTIONS = {
        'y': (pd.offsets.YearBegin, '%Y', 'year'),
        'm': (pd.offsets.MonthEnd, '%Y-%m', 'year-month'),
    }
    # Matches addresses that do not contain "nettim" (non-capturing group
    # so pandas does not warn about match groups).
    _NOT_NETTIME = r'^(?:(?!nettime*).)*$'
    _THREAD_COLUMNS = ['from', 'nbr-references', 'subject', 'url', 'message-id']
    _RANKING_COLUMNS = ['subject', 'from', 'nbr-references', 'url']

    def __init__(self, arch=None):
        """Bind the query to ``arch``.

        Raises:
            TypeError: if ``arch`` is not an ``analysis.archive.Archive``.
        """
        if not isinstance(arch, analysis.archive.Archive):
            message = ("Query constructor Error: "
                       "arch must be of type analysis.archive.Archive")
            logging.error(message)
            raise TypeError(message)

        self.archive = arch

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    @classmethod
    def _resolution(cls, resolution):
        """Return ``(offset, label_format, index_name)`` or None if unknown."""
        spec = cls._RESOLUTIONS.get(resolution.lower())
        if spec is None:
            return None
        offset_cls, fmt, name = spec
        return offset_cls(), fmt, name

    @staticmethod
    def _address_key(email_address):
        """Normalise an address to the '{at}' lower-case form used as column key."""
        return email_address.replace('@', '{at}').lower()

    @classmethod
    def _drop_nettime(cls, ranking):
        """Keep only the entries whose index label passes ``_NOT_NETTIME``."""
        return ranking[ranking.index.str.contains(cls._NOT_NETTIME)]

    @staticmethod
    def _finish(values, fmt, name, column, series):
        """Relabel a per-period Series and return it, or an int frame of it."""
        values.index = values.index.strftime(fmt)
        values.index.name = name
        if series:
            return values
        return values.to_frame(column).astype(int)

    # ------------------------------------------------------------------
    # activity
    # ------------------------------------------------------------------

    def _activity(self):
        """Build (once) and return the month x address message-count frame."""
        if self.activity is None:
            from_index = self.archive.dataframe.reindex(columns=['from'])
            monthly = pd.Grouper(freq=pd.offsets.MonthEnd())
            counts = from_index.groupby([monthly, 'from']).size()
            self.activity = counts.unstack('from').fillna(0)

        return self.activity

    def activity_from(self, email_address, resolution='y', series=False):
        """Number of messages per period sent by ``email_address``.

        Returns None for an unknown resolution or address, a Series when
        ``series`` is true, otherwise a frame with an int 'nbr-messages'
        column.
        """
        eaddr = self._address_key(email_address)

        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        try:
            af = self._activity()[eaddr]
        except KeyError:
            return None

        per_period = af.groupby(pd.Grouper(freq=offset)).sum()
        return self._finish(per_period, fmt, name, 'nbr-messages', series)

    def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
        """The ``rank`` addresses with the most messages, most active first."""
        afr = self._activity().sum(axis=0).sort_values(ascending=False)
        if filter_nettime:
            afr = self._drop_nettime(afr)

        top = afr.iloc[:rank]
        if series:
            return top

        return top.to_frame('nbr-messages').astype(int)

    def activity_overall(self, resolution='y', series=False):
        """Number of messages per period over the whole archive."""
        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        urls = self.archive.dataframe['url']
        y = urls.groupby(pd.Grouper(freq=offset)).count()
        return self._finish(y, fmt, name, 'nbr-messages', series)

    def cohort(self, resolution='m', series=False):
        """Number of addresses per period, binned by their ``idxmax`` month.

        NOTE(review): the bin is the month in which each address has its
        highest monthly count (``activity.idxmax()``), while the output
        column is called 'first-messages' -- confirm which is intended.
        """
        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        peak_months = self._activity().idxmax().sort_values()
        marks = pd.Series(1, index=pd.DatetimeIndex(peak_months))
        cohort = marks.groupby(pd.Grouper(freq=offset)).size()
        return self._finish(cohort, fmt, name, 'first-messages', series)

    # ------------------------------------------------------------------
    # content length
    # ------------------------------------------------------------------

    def _content_length(self):
        """Build (once) and return the month x address content-length frame."""
        if self.content_length is None:
            from_content_index = self.archive.dataframe.reindex(
                columns=['from', 'content-length'])
            monthly = pd.Grouper(freq=pd.offsets.MonthEnd())
            totals = from_content_index.groupby([monthly, 'from']).sum()
            self.content_length = totals.reset_index().pivot(
                columns='from', index='date', values='content-length').fillna(0)

        return self.content_length

    def content_length_from(self, email_address, resolution='y', series=False):
        """Content length per period sent by ``email_address``.

        Returns None for an unknown resolution or address, a Series when
        ``series`` is true, otherwise a frame with an int 'nbr-bytes'
        column.
        """
        eaddr = self._address_key(email_address)

        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        try:
            af = self._content_length()[eaddr]
        except KeyError:
            return None

        per_period = af.groupby(pd.Grouper(freq=offset)).sum()
        return self._finish(per_period, fmt, name, 'nbr-bytes', series)

    def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
        """The ``rank`` addresses with the largest total content length.

        ``resolution`` is accepted but not used.
        """
        cfr = self._content_length().sum(axis=0).sort_values(ascending=False)
        if filter_nettime:
            cfr = self._drop_nettime(cfr)

        top = cfr.iloc[:rank]
        if series:
            return top

        return top.to_frame('nbr-bytes').astype(int)

    def content_length_overall(self, resolution='y', series=False):
        """Total content length per period over the whole archive."""
        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        y = self._content_length().sum(axis=1)
        y = y.groupby(pd.Grouper(freq=offset)).sum()
        return self._finish(y, fmt, name, 'nbr-bytes', series)

    # ------------------------------------------------------------------
    # threads
    # ------------------------------------------------------------------

    def _threads(self, thresh=0):
        """Build and return the frame of messages with more than ``thresh`` replies.

        Both ``threads`` and ``single_threads`` are cached together with
        the threshold they were built for, and rebuilt when a different
        threshold is requested, so one caller's threshold never leaks
        into another caller's result.
        """
        logging.debug("doing threads")

        stale = self._threads_thresh != thresh
        df = self.archive.dataframe

        if stale or self.threads is None:
            replied = df[df['nbr-references'] > thresh]
            self.threads = replied.reindex(
                columns=self._THREAD_COLUMNS).sort_values(
                    'nbr-references', ascending=False)

        if stale or self.single_threads is None:
            roots = df[(df['references'] == 'X') & (df['nbr-references'] > thresh)]
            self.single_threads = roots.reindex(
                columns=self._THREAD_COLUMNS).sort_values(
                    'nbr-references', ascending=False)

        self._threads_thresh = thresh
        return self.threads

    def threads_ranking(self, rank=5, resolution='y'):
        """The ``rank`` most replied-to messages.

        With ``resolution=None`` a single frame is returned; otherwise a
        dict mapping each period label to that period's top ``rank``.
        """
        ranked = self._threads().drop('message-id', axis=1)

        if resolution is None:
            return ranked.iloc[:rank].reindex(columns=self._RANKING_COLUMNS)

        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, _ = spec

        # get the threads ranking per time resolution
        r = {}
        for period, group in ranked.groupby(pd.Grouper(freq=offset)):
            # Time-based grouping hands the rows back in date order, so
            # each group is re-ranked before the top rows are taken.
            top = group.sort_values(
                'nbr-references', ascending=False, kind='mergesort').iloc[:rank]
            r[period.strftime(fmt)] = top.reindex(columns=self._RANKING_COLUMNS)
        return r

    def threads_replies_to(self, email_address, resolution='y', series=False):
        """Sum of 'nbr-references' per period for messages from ``email_address``.

        Returns None for an unknown resolution or address.  With
        ``series`` true the Series keeps its date index; otherwise a
        relabelled frame with an int 'nbr-threads' column is returned.
        """
        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        eaddr = self._address_key(email_address)

        threads_from = self._threads().reindex(columns=['from', 'nbr-references'])
        # sum = adding up nbr references
        per_author = threads_from.groupby([pd.Grouper(freq=offset), 'from']).sum()
        per_author = per_author.reset_index().pivot(
            columns='from', index='date', values='nbr-references').fillna(0)

        try:
            received = per_author[eaddr]
        except KeyError:
            return None

        if series:
            return received

        return self._finish(received, fmt, name, 'nbr-threads', False)

    def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
        """The ``rank`` addresses whose messages gathered the most replies."""
        tfr = self._threads().reindex(columns=['from', 'nbr-references'])
        tfr = tfr.groupby('from').sum().sort_values('nbr-references', ascending=False)

        if filter_nettime:
            tfr = self._drop_nettime(tfr)

        return tfr.iloc[:rank].astype(int)

    def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
        """The ``rank`` addresses with the most replied-to messages."""
        tir = self._threads().reindex(columns=['from'])
        tir = tir.groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            tir = self._drop_nettime(tir)

        top = tir.iloc[:rank]
        if series:
            return top

        return top.to_frame('nbr-initiated-threads').astype(int)

    def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
        """Rank addresses by replied-to messages per message sent.

        Only the 25 addresses with the most replied-to messages are
        considered.
        """
        # activity
        afr = self._activity().sum(axis=0).astype(int)
        if filter_nettime:
            afr = self._drop_nettime(afr)

        # initiated threads [top 25]
        tir = self._threads().reindex(columns=['from'])
        tir = tir.groupby('from').size().sort_values(ascending=False).iloc[:25]
        if filter_nettime:
            tir = self._drop_nettime(tir)

        inter = afr.index.intersection(tir.index)
        avg = tir.loc[inter] / afr.loc[inter]

        labels = ['messages', 'threads', 'avg.threads']
        table = pd.concat([afr.loc[avg.index], tir.loc[avg.index], avg],
                          axis=1, keys=labels)
        return table.sort_values('avg.threads', ascending=False).iloc[:rank]

    def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
        """Rank addresses by replies received per replied-to message.

        Only the 25 addresses with the most replies are considered.
        """
        threads = self._threads()

        # initiated
        tir = threads.reindex(columns=['from'])
        tir = tir.groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            tir = self._drop_nettime(tir)

        # replies [top 25]
        tfr = threads.reindex(columns=['from', 'nbr-references'])
        tfr = tfr.groupby('from').sum().sort_values(
            'nbr-references', ascending=False).iloc[:25]
        if filter_nettime:
            tfr = self._drop_nettime(tfr)
        tfr = tfr['nbr-references']  # dataframe to series

        inter = tir.index.intersection(tfr.index)
        avg = tfr.loc[inter] / tir.loc[inter]

        labels = ['threads', 'replies', 'avg.replies']
        table = pd.concat([tir.loc[avg.index], tfr.loc[avg.index], avg],
                          axis=1, keys=labels)
        return table.sort_values('avg.replies', ascending=False).iloc[:rank]

    def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):
        """Per-period aggregate of 'nbr-references' over replied-to messages.

        ``aggregate`` is 'count' (number of messages with more than
        ``tresh`` replies), 'sum' (total replies) or 'mean'; any other
        value returns None.  The frame form is cast to int except for
        'mean', which stays float.
        """
        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        agg = aggregate.lower()
        if agg not in ('sum', 'mean', 'count'):
            return None

        # Only the numeric column is aggregated; the frame also holds
        # string columns that cannot be summed or averaged meaningfully.
        replies = self._threads(tresh)['nbr-references']
        y = replies.groupby(pd.Grouper(freq=offset)).agg(agg)

        y.index = y.index.strftime(fmt)
        y.index.name = name

        if series:
            return y

        frame = y.to_frame('nbr-threads')
        return frame if agg == 'mean' else frame.astype(int)

    def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):
        """Per-period count of non-reply messages with more than ``tresh`` replies.

        ``aggregate`` is validated ('sum', 'mean' or 'count', otherwise
        None is returned) but the result is always a count.
        """
        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        agg = aggregate.lower()
        if agg not in ('sum', 'mean', 'count'):
            return None

        self._threads(tresh)

        y = self.single_threads['nbr-references'].groupby(
            pd.Grouper(freq=offset)).count()
        return self._finish(y, fmt, name, 'nbr-threads', series)

    # ------------------------------------------------------------------
    # replies
    # ------------------------------------------------------------------

    def _replies(self):
        """Build (once) the reply / non-reply frames and return the replies."""
        if self.replies is None:
            df = self.archive.dataframe
            is_root = df['references'] == 'X'
            self.replies = df[~is_root].reindex(columns=['from', 'references'])
            self.non_replies = df[is_root].reindex(columns=['from', 'references'])
        return self.replies

    def replies_ranking(self, rank=5, resolution=None):
        """The ``rank`` addresses that wrote the most replies.

        With ``resolution=None`` a single frame is returned; otherwise a
        dict mapping each period label to that period's ranking.

        NOTE(review): the overall frame names its column 'nbr_replies'
        while the per-period frames use 'nbr-replies'; kept as is for
        compatibility.
        """
        replies = self._replies()

        if resolution is None:
            data = replies.groupby('from').size().sort_values(ascending=False)
            return data.iloc[:rank].to_frame('nbr_replies')

        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, _ = spec

        # get the replies ranking per time resolution
        r = {}
        for period, group in replies.groupby(pd.Grouper(freq=offset)):
            frame = group.groupby('from').size().sort_values(ascending=False)
            r[period.strftime(fmt)] = frame.iloc[:rank].to_frame('nbr-replies')
        return r

    def replies_avg_ranking(self, rank=5, filter_nettime=True):
        """Rank addresses by replies written per message sent.

        Only the 25 addresses with the most replies are considered.
        """
        # activity
        afr = self._activity().sum(axis=0)
        if filter_nettime:
            afr = self._drop_nettime(afr)

        # replies in thread [top 25]
        rpl = self._replies().groupby('from').size()
        rpl = rpl.sort_values(ascending=False).iloc[:25]

        inter = afr.index.intersection(rpl.index)
        avg = rpl.loc[inter] / afr.loc[inter]

        labels = ['messages', 'replies', 'avg.replies']
        table = pd.concat([afr.loc[avg.index], rpl.loc[avg.index], avg],
                          axis=1, keys=labels)
        return table.sort_values('avg.replies', ascending=False).iloc[:rank]

    def replies_overall(self, resolution='y', series=False):
        """Number of replies per period over the whole archive."""
        spec = self._resolution(resolution)
        if spec is None:
            return None
        offset, fmt, name = spec

        y = self._replies()['references'].groupby(pd.Grouper(freq=offset)).count()
        return self._finish(y, fmt, name, 'nbr-replies', series)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
import email
|
||||
import hashlib
|
||||
|
||||
def format_content(msg, archive_name):
    """Return the message's 'content' entry; ``archive_name`` is unused."""
    content = msg['content']
    return content
|
||||
|
||||
def format_url(msg, archive_name):
    """Return the message's 'url' entry; ``archive_name`` is unused."""
    url = msg['url']
    return url
|
||||
|
||||
def format_author(msg, archive_name):
    """Return the message's 'author_name' entry; ``archive_name`` is unused."""
    author = msg['author_name']
    return author
|
||||
|
||||
def format_from_token(from_str, sep):
    """Extract the sender address from ``from_str`` in 'user{at}host' form.

    ``sep`` is the separator used between user and host in ``from_str``
    ('@', 'at' or '{AT}').  Two layouts are handled:

    * ``sep`` is a standalone, whitespace-delimited token
      ("user at host"): the tokens on either side are joined, and
      surrounding angle brackets are dropped;
    * otherwise the address is taken from ``email.utils.parseaddr`` and
      ``sep`` is replaced inside it.

    Returns the lower-cased address, or None (after printing a
    diagnostic) when no address can be built.
    """
    # ``import email`` alone does not guarantee the submodule is loaded.
    import email.utils

    tokens = from_str.split()

    # The standalone-token form is checked first: substituting a word
    # separator such as 'at' inside a parsed address would also clobber
    # every "at" occurring in the user or host name.
    if sep in tokens:
        at = tokens.index(sep)
        if 0 < at < len(tokens) - 1:
            from_addr = ''.join([tokens[at - 1], '{AT}', tokens[at + 1]])
            return from_addr.strip('<>').lower()
    else:
        from_addr = email.utils.parseaddr(from_str)[1]
        if sep in from_addr:
            return from_addr.replace(sep, '{AT}').lower()

    print(tokens)
    print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
    return None
|
||||
|
||||
def format_from(msg, archive_name):
    """Normalise the message's 'from' entry via ``format_from_token``.

    The separator is picked from the first marker found in the string:
    ' {AT} ', then ' at ', then '@'.  A string with none of them is
    returned unchanged.  ``archive_name`` is unused.
    """
    from_str = msg['from']

    # (marker searched in the string, separator handed to the tokenizer)
    markers = (
        (" {AT} ", '{AT}'),
        (" at ", 'at'),
        ("@", '@'),
    )
    for marker, sep in markers:
        if marker in from_str:
            return format_from_token(from_str, sep)

    return from_str
|
||||
|
||||
# returns utc timestamp
|
||||
def format_date(msg, archive_name):
    """Return the message's 'date' entry as a UTC timestamp, or None.

    The string is parsed with ``email.utils.parsedate_tz`` and converted
    with ``email.utils.mktime_tz``.  None is returned (after printing a
    diagnostic) when the date cannot be parsed.  ``archive_name`` is
    unused.
    """
    # ``import email`` alone does not guarantee the submodule is loaded.
    import email.utils

    date_str = msg['date']
    try:
        # parsedate_tz() returns None for unparsable input, which makes
        # mktime_tz() raise TypeError.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except (TypeError, ValueError, OverflowError) as err:
        print("Format Date " + type(err).__name__)
        # str() so that a non-string date cannot break the diagnostic
        print(" > " + str(date_str))
        return None
|
||||
|
||||
def format_subject(msg, archive_name):
    """Return the message's 'subject' entry; ``archive_name`` is unused."""
    subject = msg['subject']
    return subject
|
||||
|
||||
def format_id(msg, archive_name):
    """Return the message's id, deriving one when it has none.

    When 'message-id' is absent, the id is the SHA-1 hex digest of the
    UTF-8 encoded 'author_name' followed by 'date'.  ``archive_name`` is
    unused.
    """
    if "message-id" not in msg:
        # create hash with author_name + date
        seed = (msg['author_name'] + msg['date']).encode('utf-8')
        return hashlib.sha1(seed).hexdigest()

    return msg['message-id']
|
||||
|
||||
# format='%d/%m/%Y'
|
||||
def min_date(archive_name):
    """Return the earliest accepted date ('%d/%m/%Y') for an archive.

    Any name containing "nettime" shares one date; the other archives
    are matched by exact name.  Unknown names yield None.
    """
    if "nettime" in archive_name:
        return '01/10/1995'

    exact_names = (
        ("spectre", '01/08/2001'),
        ("empyre", '01/01/2002'),
        ("crumb", '01/02/2001'),
    )
    for name, first_day in exact_names:
        if archive_name == name:
            return first_day

    return None
|
||||
Reference in New Issue
Block a user