rehaul -- replies vs threads
This commit is contained in:
+10
-6
@@ -15,11 +15,11 @@ class Html:
|
||||
self.query = q
|
||||
|
||||
def threads_ranking(self, rank=5, resolution=None):
|
||||
|
||||
#if resolution is None:
|
||||
|
||||
data = self.query.threads_ranking(rank=rank)
|
||||
|
||||
print data
|
||||
|
||||
h = html.HTML()
|
||||
t = h.table()
|
||||
|
||||
@@ -31,6 +31,9 @@ class Html:
|
||||
|
||||
for i, row in data.iterrows():
|
||||
r = t.tr
|
||||
|
||||
print row.index
|
||||
|
||||
r.td(str(row['date']), klass='td_date')
|
||||
r.td(row['from'], klass='td_from')
|
||||
r.td(str(row['nbr-references']), klass='td_rep')
|
||||
@@ -89,13 +92,14 @@ class Html:
|
||||
if n in url_skip:
|
||||
continue
|
||||
|
||||
if type(l) == numpy.float64:
|
||||
l = '{0:.4f}'.format(l)
|
||||
if isinstance(l, float):
|
||||
if l % 1 > 0:
|
||||
l = '{0:.4f}'.format(l)
|
||||
else:
|
||||
l = int(l)
|
||||
|
||||
if n in url_hash.keys():
|
||||
url = row[url_hash[n] - 1]
|
||||
print '---->' + l
|
||||
print '<<<<<>' + url
|
||||
r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
|
||||
|
||||
else:
|
||||
|
||||
+130
-11
@@ -9,6 +9,7 @@ class Query:
|
||||
activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
|
||||
content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
|
||||
threads = None # ...
|
||||
replies = None # ...
|
||||
|
||||
def __init__(self, arch=None):
|
||||
|
||||
@@ -231,7 +232,7 @@ class Query:
|
||||
return r
|
||||
|
||||
|
||||
def threads_from(self, email_address, resolution='y', series=False):
|
||||
def threads_replies_to(self, email_address, resolution='y', series=False):
|
||||
|
||||
freq = 'M'
|
||||
if resolution.lower() == 'y':
|
||||
@@ -247,7 +248,7 @@ class Query:
|
||||
|
||||
self._threads()
|
||||
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
|
||||
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum()
|
||||
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
|
||||
threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
|
||||
|
||||
if series:
|
||||
@@ -264,24 +265,79 @@ class Query:
|
||||
|
||||
return threads_from_ranking
|
||||
|
||||
def threads_from_ranking(self, rank=5, filter_nettime=True, series=False):
|
||||
def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
|
||||
|
||||
self._threads()
|
||||
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
|
||||
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq='AS'), 'from']).sum()
|
||||
threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
|
||||
tfr = threads_from_ranking.sum(axis=0).order(ascending=False)
|
||||
|
||||
tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)
|
||||
|
||||
if filter_nettime:
|
||||
p = r'^((?!nettime*).)*$'
|
||||
tfr = tfr[tfr.index.str.contains(p)]
|
||||
|
||||
if series:
|
||||
return tfr[:rank]
|
||||
|
||||
tfr = tfr[:rank].to_frame('nbr-threads').astype(int)
|
||||
tfr = tfr[:rank].astype(int)
|
||||
return tfr
|
||||
|
||||
def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
    '''Rank senders by their number of rows in the threads table.

    Counts the rows of ``self.threads`` per ``from`` value (presumably one
    row per initiated thread -- confirm against ``_threads``), orders the
    counts from largest to smallest and keeps the first ``rank`` entries.

    rank           -- how many leading entries to keep.
    filter_nettime -- when true, drop senders rejected by the pattern below.
    series         -- when true, return the counts as a pandas Series;
                      otherwise return a one-column integer DataFrame
                      named 'nbr-initiated-threads'.
    '''
    self._threads()

    per_sender = self.threads.reindex(columns=['from']).groupby('from').size()
    ranking = per_sender.sort_values(ascending=False)

    if filter_nettime:
        # Negative look-ahead: only senders in which "nettim" never occurs
        # survive (the trailing ``e*`` makes the final "e" optional).
        keep = ranking.index.str.contains(r'^((?!nettime*).)*$')
        ranking = ranking[keep]

    top = ranking[:rank]
    if series:
        return top
    return top.to_frame('nbr-initiated-threads').astype(int)
|
||||
|
||||
def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
    '''Rank senders by initiated threads relative to their total activity.

    Returns a DataFrame with columns 'messages' (column sums of
    ``self.activity``), 'threads' (rows per sender in ``self.threads``) and
    'avg.threads' (threads / messages), sorted by 'avg.threads' in
    descending order and cut to ``rank`` rows.

    Only the 25 largest thread counts are considered, and that cut is taken
    before the nettime filter is applied.
    '''
    nettime_free = r'^((?!nettime*).)*$'

    # activity: one total per sender
    self._activity()
    messages = self.activity.sum(axis=0).astype(int)
    if filter_nettime:
        messages = messages[messages.index.str.contains(nettime_free)]

    # initiated threads, limited to the 25 largest counts
    self._threads()
    counts = self.threads.reindex(columns=['from']).groupby('from').size()
    initiated = counts.sort_values(ascending=False)[:25]
    if filter_nettime:
        initiated = initiated[initiated.index.str.contains(nettime_free)]

    # only senders present on both sides can get a ratio
    common = messages.index.intersection(initiated.index)
    ratio = initiated[common] / messages[common]

    table = pd.concat(
        [messages[ratio.index], initiated[ratio.index], ratio],
        axis=1,
        keys=['messages', 'threads', 'avg.threads']
    )
    return table.sort_values('avg.threads', ascending=False)[:rank]
|
||||
|
||||
def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
    '''Rank senders by replies received per row in the threads table.

    Returns a DataFrame with columns 'threads' (rows per sender in
    ``self.threads``), 'replies' (sum of 'nbr-references' per sender) and
    'avg.replies' (replies / threads), sorted by 'avg.replies' in
    descending order and cut to ``rank`` rows.

    Only the 25 largest reply totals are considered, and that cut is taken
    before the nettime filter is applied.
    '''
    self._threads()
    nettime_free = r'^((?!nettime*).)*$'

    # initiated: rows per sender
    counts = self.threads.reindex(columns=['from']).groupby('from').size()
    initiated = counts.sort_values(ascending=False)
    if filter_nettime:
        initiated = initiated[initiated.index.str.contains(nettime_free)]

    # replies: summed references per sender, 25 largest totals
    summed = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum()
    replies = summed.sort_values('nbr-references', ascending=False)[:25]
    if filter_nettime:
        replies = replies[replies.index.str.contains(nettime_free)]
    replies = replies['nbr-references']  # one-column frame -> series

    common = initiated.index.intersection(replies.index)
    ratio = replies[common] / initiated[common]

    table = pd.concat(
        [initiated[ratio.index], replies[ratio.index], ratio],
        axis=1,
        keys=['threads', 'replies', 'avg.replies']
    )
    return table.sort_values('avg.replies', ascending=False)[:rank]
|
||||
|
||||
|
||||
def threads_overall(self, resolution='y', aggregate='sum', tresh=0):
|
||||
|
||||
freq = 'M'
|
||||
@@ -320,3 +376,66 @@ class Query:
|
||||
y.index.name = 'year-month'
|
||||
|
||||
return y
|
||||
|
||||
|
||||
'''
|
||||
replies
|
||||
'''
|
||||
|
||||
def _replies(self):
    '''Build the replies table on first use and return the cached copy.

    The table holds the rows of ``self.netarchive.dataframe`` whose
    'references' value is greater than zero, reduced to the 'from' and
    'references' columns. It is stored on ``self.replies`` and reused by
    later calls.
    '''
    if self.replies is not None:
        return self.replies

    has_references = self.netarchive.dataframe['references'] > 0
    selected = self.netarchive.dataframe[has_references]
    self.replies = selected.reindex(columns=['from', 'references'])
    return self.replies
|
||||
|
||||
def replies_ranking(self, rank=5, resolution=None):
    '''Rank senders by their number of rows in the replies table.

    rank       -- how many leading entries to keep (per period when a
                  resolution is given).
    resolution -- None for one overall ranking, 'y' for one ranking per
                  year, 'm' for one ranking per month (case-insensitive).

    Returns a one-column DataFrame ('nbr_replies') when ``resolution`` is
    None; otherwise a dict mapping a period key ('%Y' for years, '%Y-%m'
    for months) to such a DataFrame. Any other resolution returns None.
    '''
    self._replies()

    if resolution is None:
        overall = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
        return overall.to_frame('nbr_replies')

    # resolution -> (bin frequency, dict-key format). Offset objects stand
    # in for the 'AS' / 'M' alias strings (year start / month end) so the
    # code does not depend on alias spellings that differ between pandas
    # versions.
    periods = {
        'y': (pd.offsets.YearBegin(), '%Y'),
        'm': (pd.offsets.MonthEnd(), '%Y-%m'),
    }
    try:
        freq, key_format = periods[resolution.lower()]
    except KeyError:
        return None

    # pd.Grouper replaces pd.TimeGrouper, which newer pandas no longer ships.
    ranking = {}
    for period, rows in self.replies.groupby(pd.Grouper(freq=freq)):
        top = rows.groupby('from').size().sort_values(ascending=False)[:rank]
        ranking[period.strftime(key_format)] = top.to_frame('nbr_replies')
    return ranking
|
||||
|
||||
def replies_avg_ranking(self, rank=5, filter_nettime=True):
    '''Rank senders by replies relative to their total activity.

    Returns a DataFrame with columns 'messages' (column sums of
    ``self.activity``), 'replies' (rows per sender in ``self.replies``) and
    'avg.replies' (replies / messages), sorted by 'avg.replies' in
    descending order and cut to ``rank`` rows. Only the 25 largest reply
    counts are considered.

    The nettime filter is applied to the activity side only; filtered
    senders then drop out through the index intersection.
    '''
    # activity: one total per sender
    self._activity()
    messages = self.activity.sum(axis=0)
    if filter_nettime:
        keep = messages.index.str.contains(r'^((?!nettime*).)*$')
        messages = messages[keep]

    # replies, limited to the 25 largest counts
    self._replies()
    counts = self.replies.groupby('from').size()
    replies = counts.sort_values(ascending=False)[:25]

    common = messages.index.intersection(replies.index)
    ratio = replies[common] / messages[common]

    table = pd.concat(
        [messages[ratio.index], replies[ratio.index], ratio],
        axis=1,
        keys=['messages', 'replies', 'avg.replies']
    )
    return table.sort_values('avg.replies', ascending=False)[:rank]
|
||||
|
||||
|
||||
|
||||
|
||||
+77
-24
@@ -101,10 +101,35 @@ class Report:
|
||||
return format.Tab.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']],
|
||||
name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'})
|
||||
|
||||
def tab_msgs_threads_replies_avg_rep_msg_thrd(self):
    '''Render the combined counts-and-averages matrix via ``format.Tab``.

    Renamed from ``html_msgs_threads_replies_avg_rep_msg_thrd``: the body
    produces ``format.Tab`` output like the other ``tab_*`` methods, and a
    later method of this class reuses the ``html_...`` name for the
    ``format.Html`` rendering, which made this definition unreachable.
    Calls through the ``html_...`` name keep resolving to that later method.
    '''
    self.matrix_msgs_threads()
    columns = ['nbr-messages', 'nbr-threads', 'nbr-replies', 'avg--per-msg', 'avg-rep-per-thrd']
    labels = {
        'nbr-messages': 'messages',
        'nbr-threads': 'threads',
        'nbr-replies': 'replies in threads',
        'avg--per-msg': 'avg. thread per message',
        'avg-rep-per-thrd': 'avg. replies per thread',
    }
    return format.Tab.from_dataframe(self.matrix[columns], name_map=labels)
|
||||
|
||||
def tab_activity_from_ranking(self, rank=5):
    '''Pass the activity ranking to ``format.Tab.from_dataframe``.

    rank -- forwarded to ``self.query.activity_from_ranking``.
    '''
    labels = {'nbr-messages': 'messages'}
    return format.Tab.from_dataframe(self.query.activity_from_ranking(rank=rank), name_map=labels)
|
||||
|
||||
#
|
||||
def tab_threads_replies_to_ranking(self, rank=10):
    '''Pass the replies-to ranking to ``format.Tab.from_dataframe``.

    rank -- forwarded to ``self.query.threads_replies_to_ranking``.
    '''
    return format.Tab.from_dataframe(self.query.threads_replies_to_ranking(rank=rank))
|
||||
|
||||
#
|
||||
def tab_threads_initiated_from_ranking(self, rank=10):
    '''Pass the initiated-threads ranking to ``format.Tab.from_dataframe``.

    rank -- forwarded to ``self.query.threads_initiated_from_ranking``.
    '''
    return format.Tab.from_dataframe(self.query.threads_initiated_from_ranking(rank=rank))
|
||||
|
||||
#
|
||||
def tab_threads_activity_threads_initiated_avg_ranking(self, rank=10):
    '''Pass the threads-per-message ranking to ``format.Tab.from_dataframe``.

    rank -- forwarded to
            ``self.query.threads_activity_threads_initiated_avg_ranking``.
    '''
    ranking = self.query.threads_activity_threads_initiated_avg_ranking(rank=rank)
    return format.Tab.from_dataframe(ranking)
|
||||
|
||||
#
|
||||
def tab_threads_initiated_replies_avg_ranking(self, rank=10):
    '''Pass the replies-per-thread ranking to ``format.Tab.from_dataframe``.

    rank -- forwarded to ``self.query.threads_initiated_replies_avg_ranking``.
    '''
    ranking = self.query.threads_initiated_replies_avg_ranking(rank=rank)
    return format.Tab.from_dataframe(ranking)
|
||||
|
||||
def tab_content_length_from_ranking(self, rank=5):
    '''Pass a ranking to ``format.Tab.from_dataframe`` with a 'bytes' label.

    rank -- forwarded to the query.
    '''
    # NOTE(review): despite the method name this queries
    # activity_from_ranking, while the label map targets 'nbr-bytes' --
    # looks like a copy/paste leftover; confirm which query is intended.
    labels = {'nbr-bytes': 'bytes'}
    return format.Tab.from_dataframe(self.query.activity_from_ranking(rank=rank), name_map=labels)
|
||||
@@ -123,46 +148,64 @@ class Report:
|
||||
s += format.Tab.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}) + nl
|
||||
return s + nl
|
||||
|
||||
'''
|
||||
html
|
||||
'''
|
||||
def tab_replies_ranking(self, rank=10):
    '''Pass the overall replies ranking to ``format.Tab.from_dataframe``.

    rank -- forwarded to ``self.query.replies_ranking``.
    '''
    d = self.query.replies_ranking(rank=rank)
    # Query.replies_ranking names its column 'nbr_replies' (underscore);
    # the former 'nbr-replies' key could never match that column.
    return format.Tab.from_dataframe(d, name_map={'nbr_replies': 'nbr. replies'})
|
||||
|
||||
def tab_replies_avg_ranking(self, rank=10):
    '''Pass the replies-per-message ranking to ``format.Tab.from_dataframe``.

    rank -- forwarded to ``self.query.replies_avg_ranking``.
    '''
    # NOTE(review): Query.replies_avg_ranking labels its columns
    # 'messages', 'replies' and 'avg.replies', so the 'nbr-replies' key
    # below presumably matches nothing -- confirm and drop or adjust.
    labels = {'nbr-replies': 'nbr. replies'}
    return format.Tab.from_dataframe(self.query.replies_avg_ranking(rank=rank), name_map=labels)
|
||||
|
||||
'''
|
||||
m-t-r
|
||||
html
|
||||
'''
|
||||
def html_msgs_threads_replies(self):
    '''Pass the message, thread and reply counts to ``format.Html.from_dataframe``.

    Calls ``self.matrix_msgs_threads()`` first (presumably to populate
    ``self.matrix`` -- confirm), then selects the three count columns.
    '''
    self.matrix_msgs_threads()
    columns = ['nbr-messages', 'nbr-threads', 'nbr-replies']
    labels = {
        'nbr-messages': 'messages',
        'nbr-threads': 'threads',
        'nbr-replies': 'replies in threads',
    }
    return format.Html.from_dataframe(self.matrix[columns], name_map=labels)
|
||||
'''
|
||||
a-r-m-t
|
||||
'''
|
||||
|
||||
def html_avg_rep_msg_thrd(self):
    '''Pass the two average columns of the matrix to ``format.Html.from_dataframe``.

    Calls ``self.matrix_msgs_threads()`` first (presumably to populate
    ``self.matrix`` -- confirm).
    '''
    self.matrix_msgs_threads()
    columns = ['avg--per-msg', 'avg-rep-per-thrd']
    labels = {
        'avg--per-msg': 'avg. thread per message',
        'avg-rep-per-thrd': 'avg. replies per thread',
    }
    return format.Html.from_dataframe(self.matrix[columns], name_map=labels)
|
||||
'''
|
||||
a-f-r
|
||||
'''
|
||||
def html_activity_from_ranking(self, rank=5):
|
||||
html = format.Html(self.query)
|
||||
return html.threads_ranking(rank=rank)
|
||||
'''
|
||||
c-l-f-r
|
||||
'''
|
||||
def html_content_length_from_ranking(self, rank=5):
|
||||
|
||||
def html_msgs_threads_replies_avg_rep_msg_thrd(self):
    '''Pass the counts and averages of the matrix to ``format.Html.from_dataframe``.

    Calls ``self.matrix_msgs_threads()`` first (presumably to populate
    ``self.matrix`` -- confirm), then selects the three count columns and
    the two average columns.
    '''
    self.matrix_msgs_threads()
    columns = ['nbr-messages', 'nbr-threads', 'nbr-replies', 'avg--per-msg', 'avg-rep-per-thrd']
    labels = {
        'nbr-messages': 'messages',
        'nbr-threads': 'threads',
        'nbr-replies': 'replies in threads',
        'avg--per-msg': 'avg. thread per message',
        'avg-rep-per-thrd': 'avg. replies per thread',
    }
    return format.Html.from_dataframe(self.matrix[columns], name_map=labels)
|
||||
|
||||
def html_activity_from_ranking(self, rank=10):
    '''Pass the activity ranking to ``format.Html.from_dataframe``.

    rank -- forwarded to ``self.query.activity_from_ranking``.
    '''
    d = self.query.activity_from_ranking(rank=rank)
    # Same label map as tab_activity_from_ranking, which relabels
    # 'nbr-messages' for this query; the former 'nbr-bytes' key belongs to
    # the content-length ranking.
    return format.Html.from_dataframe(d, name_map={'nbr-messages': 'messages'})
|
||||
|
||||
#
|
||||
def html_threads_replies_to_ranking(self, rank=10):
    '''Pass the replies-to ranking to ``format.Html.from_dataframe``.

    rank -- forwarded to ``self.query.threads_replies_to_ranking``.
    '''
    return format.Html.from_dataframe(self.query.threads_replies_to_ranking(rank=rank))
|
||||
|
||||
#
|
||||
def html_threads_initiated_from_ranking(self, rank=10):
    '''Pass the initiated-threads ranking to ``format.Html.from_dataframe``.

    rank -- forwarded to ``self.query.threads_initiated_from_ranking``.
    '''
    return format.Html.from_dataframe(self.query.threads_initiated_from_ranking(rank=rank))
|
||||
|
||||
#
|
||||
def html_threads_activity_threads_initiated_avg_ranking(self, rank=10):
    '''Pass the threads-per-message ranking to ``format.Html.from_dataframe``.

    rank -- forwarded to
            ``self.query.threads_activity_threads_initiated_avg_ranking``.
    '''
    ranking = self.query.threads_activity_threads_initiated_avg_ranking(rank=rank)
    return format.Html.from_dataframe(ranking)
|
||||
|
||||
#
|
||||
def html_threads_initiated_replies_avg_ranking(self, rank=10):
    '''Pass the replies-per-thread ranking to ``format.Html.from_dataframe``.

    rank -- forwarded to ``self.query.threads_initiated_replies_avg_ranking``.
    '''
    ranking = self.query.threads_initiated_replies_avg_ranking(rank=rank)
    return format.Html.from_dataframe(ranking)
|
||||
|
||||
def html_content_length_from_ranking(self, rank=10):
    '''Pass a ranking to ``format.Html.from_dataframe`` with a 'bytes' label.

    rank -- forwarded to the query.
    '''
    # NOTE(review): despite the method name this queries
    # activity_from_ranking, while the label map targets 'nbr-bytes' --
    # looks like a copy/paste leftover; confirm which query is intended.
    labels = {'nbr-bytes': 'bytes'}
    return format.Html.from_dataframe(self.query.activity_from_ranking(rank=rank), name_map=labels)
|
||||
'''
|
||||
t-r
|
||||
'''
|
||||
def html_threads_ranking(self, rank=5):
|
||||
|
||||
def html_threads_ranking(self, rank=10):
    '''Pass the threads ranking to ``format.Html.from_dataframe``.

    Relabels 'nbr-references' and passes a url map pairing 'subject' with
    'url' (presumably to link subjects to their urls -- confirm against
    ``format.Html.from_dataframe``).

    rank -- forwarded to ``self.query.threads_ranking``.
    '''
    labels = {'nbr-references': 'nbr. replies'}
    links = {'subject': 'url'}
    return format.Html.from_dataframe(self.query.threads_ranking(rank=rank), name_map=labels, url_map=links)
|
||||
|
||||
'''
|
||||
t-r-y
|
||||
'''
|
||||
def html_threads_ranking_year(self, rank=5, resolution='y'):
|
||||
d = self.query.threads_ranking(rank=rank, resolution=resolution)
|
||||
years = sorted(d)
|
||||
@@ -172,3 +215,13 @@ class Report:
|
||||
s += '<div class="year_t">' + i + '</div>' + nl
|
||||
s += format.Html.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) + nl
|
||||
return s + nl
|
||||
|
||||
def html_replies_ranking(self, rank=10):
    '''Pass the overall replies ranking to ``format.Html.from_dataframe``.

    rank -- forwarded to ``self.query.replies_ranking``.
    '''
    d = self.query.replies_ranking(rank=rank)
    # Query.replies_ranking names its column 'nbr_replies' (underscore);
    # the former 'nbr-replies' key could never match that column.
    return format.Html.from_dataframe(d, name_map={'nbr_replies': 'nbr. replies'})
|
||||
|
||||
def html_replies_avg_ranking(self, rank=10):
    '''Pass the replies-per-message ranking to ``format.Html.from_dataframe``.

    rank -- forwarded to ``self.query.replies_avg_ranking``.
    '''
    # NOTE(review): Query.replies_avg_ranking labels its columns
    # 'messages', 'replies' and 'avg.replies', so the 'nbr-replies' key
    # below presumably matches nothing -- confirm and drop or adjust.
    labels = {'nbr-replies': 'nbr. replies'}
    return format.Html.from_dataframe(self.query.replies_avg_ranking(rank=rank), name_map=labels)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user