rehaul -- replies vs threads

This commit is contained in:
gauthiier
2017-01-02 19:10:00 +01:00
parent 70d5181311
commit 9309643a50
4 changed files with 287 additions and 83 deletions
+10 -6
View File
@@ -15,11 +15,11 @@ class Html:
self.query = q
def threads_ranking(self, rank=5, resolution=None):
#if resolution is None:
data = self.query.threads_ranking(rank=rank)
print data
h = html.HTML()
t = h.table()
@@ -31,6 +31,9 @@ class Html:
for i, row in data.iterrows():
r = t.tr
print row.index
r.td(str(row['date']), klass='td_date')
r.td(row['from'], klass='td_from')
r.td(str(row['nbr-references']), klass='td_rep')
@@ -89,13 +92,14 @@ class Html:
if n in url_skip:
continue
if type(l) == numpy.float64:
l = '{0:.4f}'.format(l)
if isinstance(l, float):
if l % 1 > 0:
l = '{0:.4f}'.format(l)
else:
l = int(l)
if n in url_hash.keys():
url = row[url_hash[n] - 1]
print '---->' + l
print '<<<<<>' + url
r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
else:
+130 -11
View File
@@ -9,6 +9,7 @@ class Query:
activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
threads = None # ...
replies = None # ...
def __init__(self, arch=None):
@@ -231,7 +232,7 @@ class Query:
return r
def threads_from(self, email_address, resolution='y', series=False):
def threads_replies_to(self, email_address, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
@@ -247,7 +248,7 @@ class Query:
self._threads()
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum()
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
if series:
@@ -264,24 +265,79 @@ class Query:
return threads_from_ranking
def threads_from_ranking(self, rank=5, filter_nettime=True, series=False):
def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
self._threads()
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq='AS'), 'from']).sum()
threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
tfr = threads_from_ranking.sum(axis=0).order(ascending=False)
tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
tfr = tfr[tfr.index.str.contains(p)]
if series:
return tfr[:rank]
tfr = tfr[:rank].to_frame('nbr-threads').astype(int)
tfr = tfr[:rank].astype(int)
return tfr
def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
    """Rank senders by how many rows of ``self.threads`` carry their address.

    rank           -- number of entries kept from the top of the ranking
    filter_nettime -- when true, drop senders whose address contains "nettim"
    series         -- when true, return the raw Series of counts

    Returns a Series (``series=True``) or a one-column DataFrame named
    'nbr-initiated-threads' holding integer counts, largest first.
    """
    self._threads()

    per_sender = self.threads.reindex(columns=['from']).groupby('from').size()
    ranking = per_sender.sort_values(ascending=False)

    if filter_nettime:
        # negative look-ahead: only addresses without "nettim(e)" match
        keep = ranking.index.str.contains(r'^((?!nettime*).)*$')
        ranking = ranking[keep]

    head = ranking[:rank]
    if series:
        return head
    return head.to_frame('nbr-initiated-threads').astype(int)
def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True, top=25):
    """Rank senders by initiated threads per message sent.

    rank           -- number of rows returned
    filter_nettime -- when true, drop senders whose address contains "nettim"
    top            -- size of the candidate pool: only the ``top`` senders with
                      the most initiated threads are considered (default 25)

    Returns a DataFrame indexed by sender with the columns 'messages'
    (column totals of ``self.activity``), 'threads' (row count per sender in
    ``self.threads``) and 'avg.threads' (threads / messages), sorted by
    'avg.threads' in descending order and cut to ``rank`` rows.
    """
    # negative look-ahead: only addresses without "nettim(e)" match
    nettime_free = r'^((?!nettime*).)*$'

    # activity: total over the whole archive, one value per sender
    self._activity()
    afr = self.activity.sum(axis=0).astype(int)
    if filter_nettime:
        afr = afr[afr.index.str.contains(nettime_free)]

    # initiated threads per sender
    self._threads()
    tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
    if filter_nettime:
        tir = tir[tir.index.str.contains(nettime_free)]
    # truncate AFTER filtering so that filtered addresses do not use up
    # slots of the candidate pool
    tir = tir[:top]

    # only senders present in both tables can be averaged
    inter = afr.index.intersection(tir.index)
    avg = tir[inter] / afr[inter]

    labels = ['messages', 'threads', 'avg.threads']
    table = pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels)
    return table.sort_values('avg.threads', ascending=False)[:rank]
def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True, top=25):
    """Rank senders by the average 'nbr-references' of their threads.

    rank           -- number of rows returned
    filter_nettime -- when true, drop senders whose address contains "nettim"
    top            -- size of the candidate pool: only the ``top`` senders with
                      the largest summed 'nbr-references' are considered
                      (default 25)

    Returns a DataFrame indexed by sender with the columns 'threads' (row
    count per sender in ``self.threads``), 'replies' (sum of
    'nbr-references' per sender) and 'avg.replies' (replies / threads),
    sorted by 'avg.replies' in descending order and cut to ``rank`` rows.
    """
    self._threads()

    # negative look-ahead: only addresses without "nettim(e)" match
    nettime_free = r'^((?!nettime*).)*$'

    # initiated threads per sender
    tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
    if filter_nettime:
        tir = tir[tir.index.str.contains(nettime_free)]

    # summed references per sender
    tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum()
    tfr = tfr.sort_values('nbr-references', ascending=False)
    if filter_nettime:
        tfr = tfr[tfr.index.str.contains(nettime_free)]
    # dataframe to series; truncate AFTER filtering so that filtered
    # addresses do not use up slots of the candidate pool
    tfr = tfr['nbr-references'][:top]

    # only senders present in both tables can be averaged
    inter = tir.index.intersection(tfr.index)
    avg = tfr[inter] / tir[inter]

    labels = ['threads', 'replies', 'avg.replies']
    table = pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels)
    return table.sort_values('avg.replies', ascending=False)[:rank]
def threads_overall(self, resolution='y', aggregate='sum', tresh=0):
freq = 'M'
@@ -320,3 +376,66 @@ class Query:
y.index.name = 'year-month'
return y
'''
replies
'''
def _replies(self):
    """Return the replies frame, building and caching it on first use.

    The frame keeps the rows of ``self.netarchive.dataframe`` whose
    'references' value compares greater than 0, reduced to the columns
    'from' and 'references'.
    """
    # NOTE(review): the dtype of 'references' is not visible here; the
    # `> 0` comparison presumably selects messages that reference another
    # message -- confirm against the archive loader.
    if self.replies is not None:
        return self.replies

    has_references = self.netarchive.dataframe['references'] > 0
    selected = self.netarchive.dataframe[has_references]
    self.replies = selected.reindex(columns=['from', 'references'])
    return self.replies
def replies_ranking(self, rank=5, resolution=None):
    """Rank senders by their number of rows in the replies frame.

    rank       -- number of entries kept per ranking
    resolution -- None for one ranking over the whole archive, 'y' for one
                  ranking per year, 'm' for one ranking per month
                  (case-insensitive)

    Returns a one-column DataFrame ('nbr_replies') when ``resolution`` is
    None, a dict mapping '%Y' or '%Y-%m' keys to such DataFrames for
    'y' / 'm', and None for any other resolution string.
    """
    self._replies()

    if resolution is None:
        data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
        return data.to_frame('nbr_replies')

    # resolution -> (pandas frequency alias, format of the dict key)
    periods = {'y': ('AS', '%Y'), 'm': ('M', '%Y-%m')}
    try:
        freq, key_format = periods[resolution.lower()]
    except KeyError:
        return None

    # get the replies ranking per time resolution
    #
    # pd.Grouper supersedes the deprecated pd.TimeGrouper; it is passed
    # directly (not wrapped in a list) so the group keys stay plain
    # timestamps that support strftime.
    # assumes self.replies is indexed by date -- TODO confirm
    grouped = self.replies.groupby(pd.Grouper(freq=freq))

    r = {}
    for k, v in grouped:
        frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
        r[k.strftime(key_format)] = frame.to_frame('nbr_replies')
    return r
def replies_avg_ranking(self, rank=5, filter_nettime=True, top=25):
    """Rank senders by replies per message sent.

    rank           -- number of rows returned
    filter_nettime -- when true, drop senders whose address contains "nettim"
    top            -- size of the candidate pool: only the ``top`` senders with
                      the most rows in the replies frame are considered
                      (default 25)

    Returns a DataFrame indexed by sender with the columns 'messages'
    (column totals of ``self.activity``), 'replies' (row count per sender in
    ``self.replies``) and 'avg.replies' (replies / messages), sorted by
    'avg.replies' in descending order and cut to ``rank`` rows.
    """
    # negative look-ahead: only addresses without "nettim(e)" match
    nettime_free = r'^((?!nettime*).)*$'

    # activity: total over the whole archive, one value per sender
    self._activity()
    afr = self.activity.sum(axis=0)
    if filter_nettime:
        afr = afr[afr.index.str.contains(nettime_free)]

    # replies per sender
    self._replies()
    rpl = self.replies.groupby('from').size().sort_values(ascending=False)
    if filter_nettime:
        rpl = rpl[rpl.index.str.contains(nettime_free)]
    # truncate AFTER filtering so that filtered addresses do not use up
    # slots of the candidate pool
    rpl = rpl[:top]

    # only senders present in both tables can be averaged
    inter = afr.index.intersection(rpl.index)
    avg = rpl[inter] / afr[inter]

    labels = ['messages', 'replies', 'avg.replies']
    table = pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels)
    return table.sort_values('avg.replies', ascending=False)[:rank]
+77 -24
View File
@@ -101,10 +101,35 @@ class Report:
return format.Tab.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']],
name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'})
def html_msgs_threads_replies_avg_rep_msg_thrd(self):
self.matrix_msgs_threads()
return format.Tab.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies', 'avg--per-msg', 'avg-rep-per-thrd']],
name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads', 'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'})
def tab_activity_from_ranking(self, rank=5):
d = self.query.activity_from_ranking(rank=rank)
return format.Tab.from_dataframe(d, name_map={'nbr-messages': 'messages'})
#
def tab_threads_replies_to_ranking(self, rank=10):
d = self.query.threads_replies_to_ranking(rank=rank)
return format.Tab.from_dataframe(d)
#
def tab_threads_initiated_from_ranking(self, rank=10):
d = self.query.threads_initiated_from_ranking(rank=rank)
return format.Tab.from_dataframe(d)
#
def tab_threads_activity_threads_initiated_avg_ranking(self, rank=10):
d = self.query.threads_activity_threads_initiated_avg_ranking(rank=rank)
return format.Tab.from_dataframe(d)
#
def tab_threads_initiated_replies_avg_ranking(self, rank=10):
d = self.query.threads_initiated_replies_avg_ranking(rank=rank)
return format.Tab.from_dataframe(d)
def tab_content_length_from_ranking(self, rank=5):
d = self.query.activity_from_ranking(rank=rank)
return format.Tab.from_dataframe(d, name_map={'nbr-bytes': 'bytes'})
@@ -123,46 +148,64 @@ class Report:
s += format.Tab.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}) + nl
return s + nl
'''
html
'''
def tab_replies_ranking(self, rank=10):
    """Return the overall replies ranking formatted by ``format.Tab``.

    rank -- number of senders requested from ``self.query.replies_ranking``
    """
    d = self.query.replies_ranking(rank=rank)
    # replies_ranking names its single column 'nbr_replies' (underscore);
    # the former key 'nbr-replies' never matched, so no label was applied.
    return format.Tab.from_dataframe(d, name_map={'nbr_replies': 'nbr. replies'})
def tab_replies_avg_ranking(self, rank=10):
d = self.query.replies_avg_ranking(rank=rank)
return format.Tab.from_dataframe(d, name_map={'nbr-replies': 'nbr. replies'})
'''
m-t-r
html
'''
def html_msgs_threads_replies(self):
self.matrix_msgs_threads()
return format.Html.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies']],
name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads'})
'''
a-r-m-t
'''
def html_avg_rep_msg_thrd(self):
self.matrix_msgs_threads()
return format.Html.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']],
name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'})
'''
a-f-r
'''
def html_activity_from_ranking(self, rank=5):
html = format.Html(self.query)
return html.threads_ranking(rank=rank)
'''
c-l-f-r
'''
def html_content_length_from_ranking(self, rank=5):
def html_msgs_threads_replies_avg_rep_msg_thrd(self):
self.matrix_msgs_threads()
return format.Html.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies', 'avg--per-msg', 'avg-rep-per-thrd']],
name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads', 'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'})
def html_activity_from_ranking(self, rank=10):
d = self.query.activity_from_ranking(rank=rank)
return format.Html.from_dataframe(d, name_map={'nbr-bytes': 'bytes'})
#
def html_threads_replies_to_ranking(self, rank=10):
d = self.query.threads_replies_to_ranking(rank=rank)
return format.Html.from_dataframe(d)
#
def html_threads_initiated_from_ranking(self, rank=10):
d = self.query.threads_initiated_from_ranking(rank=rank)
return format.Html.from_dataframe(d)
#
def html_threads_activity_threads_initiated_avg_ranking(self, rank=10):
d = self.query.threads_activity_threads_initiated_avg_ranking(rank=rank)
return format.Html.from_dataframe(d)
#
def html_threads_initiated_replies_avg_ranking(self, rank=10):
d = self.query.threads_initiated_replies_avg_ranking(rank=rank)
return format.Html.from_dataframe(d)
def html_content_length_from_ranking(self, rank=10):
d = self.query.activity_from_ranking(rank=rank)
return format.Html.from_dataframe(d, name_map={'nbr-bytes': 'bytes'})
'''
t-r
'''
def html_threads_ranking(self, rank=5):
def html_threads_ranking(self, rank=10):
d = self.query.threads_ranking(rank=rank)
return format.Html.from_dataframe(d, name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'})
'''
t-r-y
'''
def html_threads_ranking_year(self, rank=5, resolution='y'):
d = self.query.threads_ranking(rank=rank, resolution=resolution)
years = sorted(d)
@@ -172,3 +215,13 @@ class Report:
s += '<div class="year_t">' + i + '</div>' + nl
s += format.Html.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) + nl
return s + nl
def html_replies_ranking(self, rank=10):
    """Return the overall replies ranking formatted by ``format.Html``.

    rank -- number of senders requested from ``self.query.replies_ranking``
    """
    d = self.query.replies_ranking(rank=rank)
    # replies_ranking names its single column 'nbr_replies' (underscore);
    # the former key 'nbr-replies' never matched, so no label was applied.
    return format.Html.from_dataframe(d, name_map={'nbr_replies': 'nbr. replies'})
def html_replies_avg_ranking(self, rank=10):
d = self.query.replies_avg_ranking(rank=rank)
return format.Html.from_dataframe(d, name_map={'nbr-replies': 'nbr. replies'})