display and report

This commit is contained in:
gauthiier 2016-09-09 15:15:10 +02:00
parent cb7e7825c0
commit 9a2badf32a
4 changed files with 425 additions and 125 deletions

101
nettime/format.py Normal file
View File

@ -0,0 +1,101 @@
import query
import logging, html
from tabulate import tabulate
class Html:
    """Render nettime query results as HTML tables.

    Every method returns the markup as a string.  Data cells carry a
    ``td_<column>`` CSS class and header cells ``td_<column>_t``.
    """

    # Query object the tables are built from; assigned in __init__.
    query = None

    def __init__(self, q=None):
        """Bind the formatter to a query.

        q -- the query.Query instance whose results will be rendered.

        Raises TypeError when q is not a query.Query.  TypeError derives
        from Exception, so callers catching the bare Exception raised
        previously keep working.
        """
        if not isinstance(q, query.Query):
            message = "HtmlFormat constructor Error: query must be of type nettime.query.Query"
            logging.error(message)
            raise TypeError(message)
        self.query = q

    def threads_ranking(self, rank=5):
        """Return an HTML table of the top `rank` threads.

        Columns are date, from, replies and subject; the subject cell is
        a link to the row's url.
        """
        data = self.query.threads_ranking(rank=rank)

        h = html.HTML()
        t = h.table()

        # header row
        r = t.tr
        r.td('date', klass='td_date_t')
        r.td('from', klass='td_from_t')
        r.td('replies', klass='td_rep_t')
        r.td('subject', klass='td_subject_t')

        for i, row in data.iterrows():
            # Use the 'date' column when the frame has one; otherwise
            # fall back to the row's index label instead of raising
            # KeyError.
            date = row['date'] if 'date' in row.index else i

            r = t.tr
            r.td(str(date), klass='td_date')
            r.td(row['from'], klass='td_from')
            r.td(str(row['nbr-references']), klass='td_rep')
            # The anchor is rendered to a string first and inserted
            # unescaped so its markup survives inside the cell.
            r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)

        return str(t)

    def from_dataframe(self, data_frame, table_name=None, name_map=None):
        """Return an HTML table rendering of `data_frame`.

        data_frame -- the pandas DataFrame to render; its index becomes
                      the first column.
        table_name -- optional; used as the table id and, suffixed with
                      '_t', as the table CSS class.
        name_map   -- optional mapping from column label to the label to
                      display (and to derive CSS classes from).  The
                      index name is never remapped.
        """
        if name_map is None:
            name_map = {}

        # An unnamed index has name None; give it a printable label so
        # the CSS class names below can be built.
        index_label = data_frame.index.name
        header = ['index' if index_label is None else index_label]
        header.extend(name_map.get(column, column) for column in data_frame.columns)

        # str() keeps non-string labels (e.g. integer column names) from
        # raising TypeError on concatenation.
        css_element = ['td_' + str(label) for label in header]
        css_header = [css + '_t' for css in css_element]

        h = html.HTML()
        if table_name:
            t = h.table(id=table_name, klass=table_name + '_t')
        else:
            t = h.table()

        # header
        r = t.tr
        for label, css in zip(header, css_header):
            r.td(str(label), klass=css)

        # elements: index label first, then one cell per column
        for k, row in data_frame.iterrows():
            r = t.tr
            r.td(str(k), klass=css_element[0])
            for value, css in zip(row, css_element[1:]):
                r.td(str(value), klass=css)

        return str(t)
class Tab:
    """Render nettime query results as plain-text tables via tabulate."""

    # Query object the tables are built from; assigned in __init__.
    query = None

    def __init__(self, q=None):
        """Bind the formatter to a query.

        q -- the query.Query instance whose results will be rendered.

        Raises TypeError when q is not a query.Query.  TypeError derives
        from Exception, so callers catching the bare Exception raised
        previously keep working.
        """
        if not isinstance(q, query.Query):
            # The message used to name HtmlFormat, a copy-paste leftover
            # that pointed at the wrong class.
            message = "Tab constructor Error: query must be of type nettime.query.Query"
            logging.error(message)
            raise TypeError(message)
        self.query = q

    def from_dataframe(self, data_frame, name_map=None):
        """Return `data_frame` formatted as a text table.

        data_frame -- the pandas DataFrame to render.
        name_map   -- optional mapping from column label to the header
                      text to display.  The index name is never remapped.
        """
        if name_map is None:
            name_map = {}

        # First header entry labels the index column.
        header = [data_frame.index.name]
        header.extend(name_map.get(column, column) for column in data_frame.columns)

        return tabulate(data_frame, headers=header)

70
nettime/plot.py Normal file
View File

@ -0,0 +1,70 @@
import numpy as np
import pandas as pd
import query
# for colormaps see:
# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
# http://matplotlib.org/examples/color/colormaps_reference.html
# for colors see:
# http://matplotlib.org/examples/color/named_colors.html
def bar_plot_series(series, title, color='blueviolet', alpha=0.8, stacked=True):
    """Draw `series` as a bar chart and return whatever its plot() returns.

    series  -- object exposing a pandas-style plot(**kwargs) method
               (a Series or DataFrame in this project).
    title   -- chart title.
    color   -- bar color, or a list of colors for several columns.
    alpha   -- bar opacity; defaults to the previously hard-coded 0.8.
    stacked -- whether bars are stacked; defaults to the previously
               hard-coded True.
    """
    return series.plot(kind='bar', title=title, color=color, alpha=alpha, stacked=stacked)
class Plot:
    """Area plots comparing the top-ranked senders of a query over time."""

    # Query object the plots are built from; assigned in __init__.
    query = None

    def __init__(self, q=None):
        """Bind the plotter to a query.

        q -- the query.Query instance to plot from.

        Raises TypeError when q is not a query.Query.  TypeError derives
        from Exception, so callers catching Exception keep working.
        """
        if not isinstance(q, query.Query):
            # This module never imports logging at file level, so the
            # guard used to die with NameError; import it here.
            import logging
            message = "Plot constructor Error: query must be of type nettime.query.Query"
            logging.error(message)
            raise TypeError(message)
        self.query = q

    def _area_plot(self, addresses, fetch, resolution, colormap, figsize):
        """Plot one unstacked area per address from fetch(address, resolution, series=True)."""
        series = [fetch(address, resolution, series=True) for address in addresses]
        df = pd.concat(series, axis=1)
        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    # NOTE(review): the default colormap name 'spectral' may no longer
    # exist in recent matplotlib releases -- confirm against the
    # installed version before relying on the default.

    #
    # activity
    #
    def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
        """Plot the activity over time of the `rank` most active senders.

        resolution -- forwarded to Query.activity_from.
        rank       -- number of top senders to include.
        colormap   -- colormap for the areas.  It is now honoured; the
                      value used to be ignored in favour of a hard-coded
                      'spectral'.
        figsize    -- figure size passed to the plot call.

        Returns the result of DataFrame.plot.area.
        """
        ranking = self.query.activity_from_ranking(rank=rank, series=True)
        return self._area_plot(ranking.keys(), self.query.activity_from, resolution, colormap, figsize)

    #
    # content length
    #
    def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
        """Plot the content length over time of the `rank` top senders.

        Parameters and return value mirror activity_from_ranking; data
        comes from Query.content_length_from_ranking and
        Query.content_length_from.
        """
        ranking = self.query.content_length_from_ranking(rank=rank, series=True)
        return self._area_plot(ranking.keys(), self.query.content_length_from, resolution, colormap, figsize)

    #
    # threads
    #
    def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
        """Plot the thread figures over time of the `rank` top senders.

        Parameters and return value mirror activity_from_ranking; data
        comes from Query.threads_from_ranking and Query.threads_from.
        """
        ranking = self.query.threads_from_ranking(rank=rank, series=True)
        return self._area_plot(ranking.keys(), self.query.threads_from, resolution, colormap, figsize)

View File

@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
import archive
import logging, html
import logging
class Query:
@ -30,65 +30,78 @@ class Query:
return self.activity
def activity_from(self, email_address, resolution='y'):
def activity_from(self, email_address, resolution='y', series=False):
eaddr = email_address.replace('@', '{at}').lower()
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._activity()
try:
if resolution.lower() == 'm':
return self.activity[eaddr]
elif resolution.lower() == 'y':
y = self.activity[eaddr].resample('AS').sum()
y.index = y.index.year
return y
else:
return None
af = self.activity[eaddr]
except KeyError:
return None
def activity_overall(self, resolution='y'):
activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
self._activity()
try:
sum_activity_month = self.activity.sum(axis=1)
if resolution.lower() == 'm':
sum_activity_month.rename
return sum_activity_month
elif resolution.lower() == 'y':
y = sum_activity_month.resample('AS').sum()
y.index = y.index.year
return y
else:
return None
except:
return None
if freq == 'AS':
activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
activity_from.index.name = 'year'
else:
activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
activity_from.index.name = 'year-month'
def activity_from_ranking(self, resolution='y', rank=5, filter_nettime=True):
# finish this -- re resolution AND filtering
if series:
return activity_from
return activity_from.to_frame('nbr-messages').astype(int)
def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
self._activity()
afr = self.activity.sum(axis=0).order(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
afr = afr[afr.index.str.contains(p)]
return afr[:rank]
def plot_activity_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
if series:
return afr[:rank]
activity_rank = self.activity_from_ranking(rank=rank).keys()
series = []
for k in activity_rank:
series.append(self.activity_from(k, resolution))
df = pd.concat(series, axis=1)
colors = np.random.rand(len(df),3)
return afr[:rank].to_frame('nbr-messages').astype(int)
if figsize:
df.plot(colors=colors, figsize=figsize)
def activity_overall(self, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
df.plot(colors=colors)
return None
self._activity()
y = self.activity.sum(axis=1)
y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-messages').astype(int)
'''
content lenght
@ -103,63 +116,78 @@ class Query:
return self.content_length
def content_length_from(self, email_address, resolution='y'):
def content_length_from(self, email_address, resolution='y', series=False):
eaddr = email_address.replace('@', '{at}').lower()
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._content_length()
try:
if resolution.lower() == 'm':
return self.content_length[eaddr]
elif resolution.lower() == 'y':
y = self.content_length[eaddr].resample('AS').sum()
y.index = y.index.year
return y
else:
return None
af = self.content_length[eaddr]
except KeyError:
return None
def content_length_overall(self):
content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
self._content_length()
try:
sum_content_length_month = self.content_length.sum(axis=1)
if resolution.lower() == 'm':
return sum_content_length_month
elif resolution.lower() == 'y':
y = sum_content_length_month.resample('AS').sum()
y.index = y.index.year
return y
else:
return None
except:
return None
if freq == 'AS':
content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
content_length_from.index.name = 'year'
else:
content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
content_length_from.index.name = 'year-month'
def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True):
# finish this -- re resolution
if series:
return content_length_from
return content_length_from.to_frame('nbr-bytes').astype(int)
def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
self._content_length()
cfr = self.content_length.sum(axis=0).order(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
cfr = cfr[cfr.index.str.contains(p)]
return cfr[:rank]
def plot_content_length_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
if series:
return cfr[:rank]
content_rank = self.content_length_from_ranking(rank=rank).keys()
series = []
for k in content_rank:
series.append(self.content_length_from(k, resolution))
df = pd.concat(series, axis=1)
colors = np.random.rand(len(df),3)
return cfr[:rank].to_frame('nbr-bytes').astype(int)
if figsize:
df.plot(colors=colors, figsize=figsize)
def content_length_overall(self, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
df.plot(colors=colors)
return None
self._content_length()
y = self.content_length.sum(axis=1)
y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-bytes').astype(int)
'''
threads
@ -171,37 +199,39 @@ class Query:
self.threads = self.netarchive.dataframe[self.netarchive.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
return self.threads;
def threads_ranking(self, rank=5, output=None):
def threads_ranking(self, rank=5, resolution=None):
self._threads()
data = self.threads.drop('message-id', axis=1)[:rank]
data['date'] = data.index
if output is None:
return data
elif output == 'string':
return data.to_string()
elif output == 'html':
h = html.HTML()
t = h.table()
r = t.tr
r.td('date', klass='td_date_t')
r.td('from', klass='td_from_t')
r.td('replies', klass='td_rep_t')
r.td('subject', klass='td_subject_t')
if resolution == None:
data = self.threads.drop('message-id', axis=1)[:rank]
return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
for i, row in data.iterrows():
r = t.tr
r.td(str(row['date']), klass='td_date')
r.td(row['from'], klass='td_from')
r.td(str(row['nbr-references']), klass='td_rep')
r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)
return str(t)
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
def threads_from(self, email_address, resolution='y'):
# get the threads ranking per time resolution
#
data = self.threads.drop('message-id', axis=1)
data = data.groupby([pd.TimeGrouper(freq=freq)])
r = {}
for k, v in data:
if freq == 'AS':
time_key = k.strftime('%Y')
else:
time_key = k.strftime('%Y-%m')
frame = v[:rank]
frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
r[time_key] = frame
return r
def threads_from(self, email_address, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
@ -219,9 +249,22 @@ class Query:
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum()
threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
return threads_from_ranking[eaddr]
def threads_from_ranking(self, rank=5, filter_nettime=True):
if series:
return threads_from_ranking[eaddr]
threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)
if freq == 'AS':
threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
threads_from_ranking.index.name = 'year'
else:
threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
threads_from_ranking.index.name = 'year-month'
return threads_from_ranking
def threads_from_ranking(self, rank=5, filter_nettime=True, series=False):
self._threads()
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
@ -233,24 +276,11 @@ class Query:
p = r'^((?!nettime*).)*$'
tfr = tfr[tfr.index.str.contains(p)]
return tfr[:rank]
def plot_threads_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
threads_rank = self.threads_from_ranking(rank=rank).keys()
series = []
for k in threads_rank:
series.append(self.threads_from(k, resolution))
df = pd.concat(series, axis=1)
colors = np.random.rand(len(df),3)
if figsize:
df.plot(colors=colors, figsize=figsize)
else:
df.plot(colors=colors)
if series:
return tfr[:rank]
tfr = tfr[:rank].to_frame('nbr-threads').astype(int)
return tfr
def threads_overall(self, resolution='y', aggregate='sum', tresh=0):
@ -263,7 +293,7 @@ class Query:
return None
agg = aggregate.lower()
if not agg in ['sum', 'mean']:
if not agg in ['sum', 'mean', 'count']:
return None
if not self.threads is None:
@ -273,11 +303,20 @@ class Query:
self._threads(tresh)
if agg == 'sum':
# number of replies total (re: sum all the replies)
y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
else:
elif agg == 'mean':
y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
else:
# number of threads (re: msgs with at least one reply)
y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
y = y.to_frame('nbr-threads')
if freq == 'AS':
y.index = y.index.year
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
return y
return y

90
nettime_report.py Normal file
View File

@ -0,0 +1,90 @@
import nettime.query
import nettime.format
import nettime.plot
class Report:
    """Summary statistics and bar charts for a nettime query."""

    # Query object the report is built from; assigned in __init__.
    query = None
    # Cached statistics frame, built on first use by matrix_msgs_threads.
    matrix = None

    def __init__(self, q=None):
        """Bind the report to a query.

        q -- the nettime.query.Query instance to report on.

        Raises TypeError when q is not a nettime.query.Query.  TypeError
        derives from Exception, so callers catching Exception keep
        working.
        """
        if not isinstance(q, nettime.query.Query):
            # This module never imports logging at file level, so the
            # guard used to die with NameError; import it here.
            import logging
            message = "Report constructor Error: query must be of type nettime.query.Query"
            logging.error(message)
            raise TypeError(message)
        self.query = q

    #
    # (basic) stats
    #
    def matrix_msgs_threads(self):
        """Return the statistics frame, computing and caching it on first call.

        Starts from Query.activity_overall() and adds the columns
        nbr-threads, nbr-replies, nbr-single-messages, avg-rep-per-msg
        and avg-rep-per-thrd.
        """
        if self.matrix is None:
            # nbr messages
            mat = self.query.activity_overall()

            # nbr threads
            mat['nbr-threads'] = self.query.threads_overall(aggregate='count')['nbr-threads']

            # nbr replies
            mat['nbr-replies'] = self.query.threads_overall(aggregate='sum')['nbr-references']

            # nbr non-replies (aka. non-threads)
            mat['nbr-single-messages'] = mat['nbr-messages'] - mat['nbr-replies'] - mat['nbr-threads']

            # avg. rep per message
            mat['avg-rep-per-msg'] = mat['nbr-replies'] / mat['nbr-messages']

            # avg. rep per thread
            mat['avg-rep-per-thrd'] = mat['nbr-replies'] / mat['nbr-threads']
            # same as:
            # mat['avg-rep-per-thrd'] = q.threads_overall(aggregate='mean')['nbr-references']

            self.matrix = mat

        return self.matrix

    #
    # plots
    #
    def _bar(self, column, label, title, color):
        """Bar-plot one matrix column, relabelled as `label`, and return the plot result."""
        matrix = self.matrix_msgs_threads()
        return nettime.plot.bar_plot_series(matrix[column].to_frame(label), title=title, color=color)

    def plot_nbr_msgs(self, title='Nbr. Messages', label='messages', color='mediumblue'):
        """Bar chart of the nbr-messages column; returns the plot result."""
        return self._bar('nbr-messages', label, title, color)

    def plot_nbr_threads(self, title='Nbr. Threads', label='threads', color='crimson'):
        """Bar chart of the nbr-threads column; returns the plot result."""
        return self._bar('nbr-threads', label, title, color)

    def plot_nbr_replies(self, title='Nbr. Replies in Threads', label='replies', color='dimgray'):
        """Bar chart of the nbr-replies column; returns the plot result."""
        return self._bar('nbr-replies', label, title, color)

    # NOTE(review): the default label below is misspelled ('messasges');
    # it is kept as-is because it is part of the public signature.
    def plot_avg_rep_p_msg(self, title='Avg. Replies per Messages', label='replies-per-messasges', color='limegreen'):
        """Bar chart of the avg-rep-per-msg column; returns the plot result."""
        return self._bar('avg-rep-per-msg', label, title, color)

    def plot_avg_rep_p_thrd(self, title='Avg. Replies per Thread', label='replies-per-thread', color='blueviolet'):
        """Bar chart of the avg-rep-per-thrd column; returns the plot result."""
        return self._bar('avg-rep-per-thrd', label, title, color)

    def plot_msgs_replies(self, title='Nbr. Messages segments (individual messages vs thread replies)'):
        """Bar chart of single messages, threads and replies together; returns the plot result."""
        matrix = self.matrix_msgs_threads()
        segments = matrix[['nbr-single-messages', 'nbr-threads', 'nbr-replies']]
        return nettime.plot.bar_plot_series(segments, color=['mediumblue', 'red', 'dimgray'], title=title)