From 9309643a507e1c677156c0dc888f06e21d66ae58 Mon Sep 17 00:00:00 2001 From: gauthiier Date: Mon, 2 Jan 2017 19:10:00 +0100 Subject: [PATCH] rehaul -- replies vs threads --- nettime/format.py | 16 ++++-- nettime/query.py | 141 ++++++++++++++++++++++++++++++++++++++++++---- nettime/report.py | 101 +++++++++++++++++++++++++-------- report.py | 112 ++++++++++++++++++++++-------------- 4 files changed, 287 insertions(+), 83 deletions(-) diff --git a/nettime/format.py b/nettime/format.py index 4f50b85..cba9d75 100644 --- a/nettime/format.py +++ b/nettime/format.py @@ -15,11 +15,11 @@ class Html: self.query = q def threads_ranking(self, rank=5, resolution=None): - - #if resolution is None: data = self.query.threads_ranking(rank=rank) + print data + h = html.HTML() t = h.table() @@ -31,6 +31,9 @@ class Html: for i, row in data.iterrows(): r = t.tr + + print row.index + r.td(str(row['date']), klass='td_date') r.td(row['from'], klass='td_from') r.td(str(row['nbr-references']), klass='td_rep') @@ -89,13 +92,14 @@ class Html: if n in url_skip: continue - if type(l) == numpy.float64: - l = '{0:.4f}'.format(l) + if isinstance(l, float): + if l % 1 > 0: + l = '{0:.4f}'.format(l) + else: + l = int(l) if n in url_hash.keys(): url = row[url_hash[n] - 1] - print '---->' + l - print '<<<<<>' + url r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False) else: diff --git a/nettime/query.py b/nettime/query.py index 122a5ee..0ad3257 100644 --- a/nettime/query.py +++ b/nettime/query.py @@ -9,6 +9,7 @@ class Query: activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month)) content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes)) threads = None # ... + replies = None # ... def __init__(self, arch=None): @@ -231,7 +232,7 @@ class Query: return r - def threads_from(self, email_address, resolution='y', series=False): + def threads_replies_to(self, email_address, resolution='y', series=False): freq = 'M' if resolution.lower() == 'y': @@ -247,7 +248,7 @@ class Query: self._threads() threads_from = self.threads.reindex(columns=['from', 'nbr-references']) - threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() + threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0) if series: @@ -264,24 +265,79 @@ class Query: return threads_from_ranking - def threads_from_ranking(self, rank=5, filter_nettime=True, series=False): + def threads_replies_to_ranking(self, rank=5, filter_nettime=True): self._threads() - threads_from = self.threads.reindex(columns=['from', 'nbr-references']) - threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq='AS'), 'from']).sum() - threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0) - tfr = threads_from_ranking.sum(axis=0).order(ascending=False) + + tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False) if filter_nettime: p = r'^((?!nettime*).)*$' tfr = tfr[tfr.index.str.contains(p)] - if series: - return tfr[:rank] - - tfr = tfr[:rank].to_frame('nbr-threads').astype(int) + tfr = tfr[:rank].astype(int) return tfr + def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False): + + self._threads() + tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False) + if filter_nettime: + p = r'^((?!nettime*).)*$' + tir = tir[tir.index.str.contains(p)] + + if series: + return tir[:rank] + + return tir[:rank].to_frame('nbr-initiated-threads').astype(int) + + def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True): + + # activity + self._activity() + afr = self.activity.sum(axis=0).astype(int) + if filter_nettime: + p = r'^((?!nettime*).)*$' + afr = afr[afr.index.str.contains(p)] + + # initiated threads [top 25] + self._threads() + tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25 + if filter_nettime: + p = r'^((?!nettime*).)*$' + tir = tir[tir.index.str.contains(p)] + + inter = afr.index.intersection(tir.index) + avg = tir[inter] / afr[inter] + + labels = ['messages', 'threads', 'avg.threads'] + return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank] + + def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True): + + self._threads() + + #initiated + tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False) + if filter_nettime: + p = r'^((?!nettime*).)*$' + tir = tir[tir.index.str.contains(p)] + + #replies [top 25] + tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25 + if filter_nettime: + p = r'^((?!nettime*).)*$' + tfr = tfr[tfr.index.str.contains(p)] + tfr = tfr['nbr-references'] # dataframe to series + + + inter = tir.index.intersection(tfr.index) + avg = tfr[inter] / tir[inter] + + labels = ['threads', 'replies', 'avg.replies'] + return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank] + + def threads_overall(self, resolution='y', aggregate='sum', tresh=0): freq = 'M' @@ -320,3 +376,66 @@ class Query: y.index.name = 'year-month' return y + + + ''' + replies + ''' + + def _replies(self): + + if self.replies is None: + self.replies = self.netarchive.dataframe[self.netarchive.dataframe['references'] > 0].reindex(columns=['from','references']) + return self.replies; + + def replies_ranking(self, rank=5, resolution=None): + + self._replies() + + if resolution == None: + data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank] + return data.to_frame('nbr_replies') + + freq = 'M' + if resolution.lower() == 'y': + freq = 'AS' + elif resolution.lower() == 'm': + freq = 'M' + else: + return None + + # get the threads ranking per time resolution + # + data = self.replies.groupby([pd.TimeGrouper(freq=freq)]) + r = {} + for k, v in data: + if freq == 'AS': + time_key = k.strftime('%Y') + else: + time_key = k.strftime('%Y-%m') + frame = v.groupby('from').size().sort_values(ascending=False)[:rank] + r[time_key] = frame.to_frame('nbr_replies') + return r + + def replies_avg_ranking(self, rank=5, filter_nettime=True): + + # activity + self._activity() + afr = self.activity.sum(axis=0) + if filter_nettime: + p = r'^((?!nettime*).)*$' + afr = afr[afr.index.str.contains(p)] + + # replies in thread [top 25] + + self._replies() + rpl = data = self.replies.groupby('from').size().sort_values(ascending=False)[:25] + + inter = afr.index.intersection(rpl.index) + avg = rpl[inter] / afr[inter] + + labels = ['messages', 'replies', 'avg.replies'] + return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank] + + + diff --git a/nettime/report.py b/nettime/report.py index c69e15d..448b176 100644 --- a/nettime/report.py +++ b/nettime/report.py @@ -101,10 +101,35 @@ class Report: return format.Tab.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']], name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) + def html_msgs_threads_replies_avg_rep_msg_thrd(self): + self.matrix_msgs_threads() + return format.Tab.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies', 'avg--per-msg', 'avg-rep-per-thrd']], + name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads', 'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) + def tab_activity_from_ranking(self, rank=5): d = self.query.activity_from_ranking(rank=rank) return format.Tab.from_dataframe(d, name_map={'nbr-messages': 'messages'}) + # + def tab_threads_replies_to_ranking(self, rank=10): + d = self.query.threads_replies_to_ranking(rank=rank) + return format.Tab.from_dataframe(d) + + # + def tab_threads_initiated_from_ranking(self, rank=10): + d = self.query.threads_initiated_from_ranking(rank=rank) + return format.Tab.from_dataframe(d) + + # + def tab_threads_activity_threads_initiated_avg_ranking(self, rank=10): + d = self.query.threads_activity_threads_initiated_avg_ranking(rank=rank) + return format.Tab.from_dataframe(d) + + # + def tab_threads_initiated_replies_avg_ranking(self, rank=10): + d = self.query.threads_initiated_replies_avg_ranking(rank=rank) + return format.Tab.from_dataframe(d) + def tab_content_length_from_ranking(self, rank=5): d = self.query.activity_from_ranking(rank=rank) return format.Tab.from_dataframe(d, name_map={'nbr-bytes': 'bytes'}) @@ -123,46 +148,64 @@ class Report: s += format.Tab.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}) + nl return s + nl - ''' - html - ''' + def tab_replies_ranking(self, rank=10): + d = self.query.replies_ranking(rank=rank) + return format.Tab.from_dataframe(d, name_map={'nbr-replies': 'nbr. replies'}) + + def tab_replies_avg_ranking(self, rank=10): + d = self.query.replies_avg_ranking(rank=rank) + return format.Tab.from_dataframe(d, name_map={'nbr-replies': 'nbr. replies'}) ''' - m-t-r + html ''' def html_msgs_threads_replies(self): self.matrix_msgs_threads() return format.Html.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies']], name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads'}) - ''' - a-r-m-t - ''' + def html_avg_rep_msg_thrd(self): self.matrix_msgs_threads() return format.Html.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']], name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) - ''' - a-f-r - ''' - def html_activity_from_ranking(self, rank=5): - html = format.Html(self.query) - return html.threads_ranking(rank=rank) - ''' - c-l-f-r - ''' - def html_content_length_from_ranking(self, rank=5): + + def html_msgs_threads_replies_avg_rep_msg_thrd(self): + self.matrix_msgs_threads() + return format.Html.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies', 'avg--per-msg', 'avg-rep-per-thrd']], + name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads', 'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) + + def html_activity_from_ranking(self, rank=10): + d = self.query.activity_from_ranking(rank=rank) + return format.Html.from_dataframe(d, name_map={'nbr-bytes': 'bytes'}) + + # + def html_threads_replies_to_ranking(self, rank=10): + d = self.query.threads_replies_to_ranking(rank=rank) + return format.Html.from_dataframe(d) + + # + def html_threads_initiated_from_ranking(self, rank=10): + d = self.query.threads_initiated_from_ranking(rank=rank) + return format.Html.from_dataframe(d) + + # + def html_threads_activity_threads_initiated_avg_ranking(self, rank=10): + d = self.query.threads_activity_threads_initiated_avg_ranking(rank=rank) + return format.Html.from_dataframe(d) + + # + def html_threads_initiated_replies_avg_ranking(self, rank=10): + d = self.query.threads_initiated_replies_avg_ranking(rank=rank) + return format.Html.from_dataframe(d) + + def html_content_length_from_ranking(self, rank=10): d = self.query.activity_from_ranking(rank=rank) return format.Html.from_dataframe(d, name_map={'nbr-bytes': 'bytes'}) - ''' - t-r - ''' - def html_threads_ranking(self, rank=5): + + def html_threads_ranking(self, rank=10): d = self.query.threads_ranking(rank=rank) return format.Html.from_dataframe(d, name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) - ''' - t-r-y - ''' def html_threads_ranking_year(self, rank=5, resolution='y'): d = self.query.threads_ranking(rank=rank, resolution=resolution) years = sorted(d) @@ -172,3 +215,13 @@ class Report: s += '
' + i + '
' + nl s += format.Html.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) + nl return s + nl + + def html_replies_ranking(self, rank=10): + d = self.query.replies_ranking(rank=rank) + return format.Html.from_dataframe(d, name_map={'nbr-replies': 'nbr. replies'}) + + def html_replies_avg_ranking(self, rank=10): + d = self.query.replies_avg_ranking(rank=rank) + return format.Html.from_dataframe(d, name_map={'nbr-replies': 'nbr. replies'}) + + diff --git a/report.py b/report.py index 0aee3ae..a407bae 100644 --- a/report.py +++ b/report.py @@ -4,85 +4,99 @@ from optparse import OptionParser reload(sys) sys.setdefaultencoding('utf8') -logging.info('1/4 setting up matplotlib') # matplot view/windows import matplotlib -import matplotlib.pyplot as plt matplotlib.interactive(True) -logging.info('2/4 setting up pandas') # pd display import pandas as pd pd.set_option('display.max_colwidth', 100) -logging.info('3/4 loading nettime archive') import nettime.archive import nettime.query import nettime.report -a = nettime.archive.Archive('nettime-l_2016-12-31.json.gz') -q = nettime.query.Query(a) -r = nettime.report.Report(q) +class ReportDispatch: -logging.info('4/4 reporting') + def __init__(self, r=None): -def text(command, params=None): + if not isinstance(r, nettime.report.Report): + logging.error("Rep constructor Error: r be of type nettime.report.Report") + raise Exception() - print command + self.r = r - func = { - "tab_msgs_threads_replies": r.tab_msgs_threads_replies, - "tab_avg_rep_msg_thrd": r.tab_avg_rep_msg_thrd, - "tab_activity_from_ranking": r.tab_activity_from_ranking, - "tab_content_length_from_ranking": r.tab_content_length_from_ranking, - "tab_threads_ranking": r.tab_threads_ranking, - "tab_threads_ranking_year": r.tab_threads_ranking_year - } + def text(self, command, params=None): - print func[command] + func = { + "tab_msgs_threads_replies": self.r.tab_msgs_threads_replies, + "tab_avg_rep_msg_thrd": self.r.tab_avg_rep_msg_thrd, + "tab_activity_from_ranking": self.r.tab_activity_from_ranking, + "tab_threads_replies_to_ranking": self.r.tab_threads_replies_to_ranking, + "tab_threads_initiated_from_ranking": self.r.tab_threads_initiated_from_ranking, + "tab_threads_activity_threads_initiated_avg_ranking": self.r.tab_threads_activity_threads_initiated_avg_ranking, + "tab_threads_initiated_replies_avg_ranking": self.r.tab_threads_initiated_replies_avg_ranking, + "tab_content_length_from_ranking": self.r.tab_content_length_from_ranking, + "tab_threads_ranking": self.r.tab_threads_ranking, + "tab_threads_ranking_year": self.r.tab_threads_ranking_year, + "tab_msgs_threads_replies_avg_rep_msg_thrd": self.r.tab_msgs_threads_replies_avg_rep_msg_thrd, + "tab_replies_ranking": self.r.tab_replies_ranking, + "tab_replies_avg_ranking": self.r.tab_replies_avg_ranking + } - return func[command]() + return func[command]() -def html(command, params=None): + def html(self, command, params=None): - func = { - "html_msgs_threads_replies": r.html_msgs_threads_replies, - "html_avg_rep_msg_thrd": r.html_avg_rep_msg_thrd, - "html_activity_from_ranking": r.html_activity_from_ranking, - "html_content_length_from_ranking": r.html_content_length_from_ranking, - "html_threads_ranking": r.html_threads_ranking, - "html_threads_ranking_year": r.html_threads_ranking_year - } + func = { + "html_msgs_threads_replies": self.r.html_msgs_threads_replies, + "html_avg_rep_msg_thrd": self.r.html_avg_rep_msg_thrd, + "html_activity_from_ranking": self.r.html_activity_from_ranking, + "html_threads_replies_to_ranking": self.r.html_threads_replies_to_ranking, + "html_threads_initiated_from_ranking": self.r.html_threads_initiated_from_ranking, + "html_threads_activity_threads_initiated_avg_ranking": self.r.html_threads_activity_threads_initiated_avg_ranking, + "html_threads_initiated_replies_avg_ranking": self.r.html_threads_initiated_replies_avg_ranking, + "html_content_length_from_ranking": self.r.html_content_length_from_ranking, + "html_threads_ranking": self.r.html_threads_ranking, + "html_threads_ranking_year": self.r.html_threads_ranking_year, + "html_msgs_threads_replies_avg_rep_msg_thrd": self.r.html_msgs_threads_replies_avg_rep_msg_thrd, + "html_replies_ranking": self.r.html_replies_ranking, + "html_replies_avg_ranking": self.r.html_replies_avg_ranking + } - return func[command]() + return func[command]() def run(options): - if options.output_file and os.path.isfile(options.output_file): - with open(options.output_file, 'r') as fp: - out = fp.read() # not optimal but will do - else: - print 'No output-file. Nothing to do.' - return - if options.input_script and os.path.isfile(options.input_script): with open(options.input_script, 'r') as fp: input_script = json.load(fp) else: - print 'No input-script. Nothing to do.' + print 'No input script. Nothing to do.' return + if options.template_file and os.path.isfile(options.template_file): + with open(options.template_file, 'r') as fp: + out = fp.read() # not optimal but will do + else: + print 'No template file. Nothing to do.' + return + + a = nettime.archive.Archive(options.archive) + q = nettime.query.Query(a) + r = nettime.report.Report(q) + + rep = ReportDispatch(r) + for cmd in input_script: if cmd['format'] == 'html': - func = html + res = rep.html(cmd['command']) elif cmd['format'] == 'text': - func = text + res = rep.text(cmd['command']) else: continue - res = func(cmd['command']) - if res is not None: out = out.replace(cmd['replace'], res) @@ -95,9 +109,23 @@ if __name__ == "__main__": p = OptionParser(); p.add_option('-i', '--input-script', action="store", help="..") p.add_option('-o', '--output-file', action="store", help="..") + p.add_option('-t', '--template-file', action="store", help="..") + p.add_option('-a', '--archive', action="store", help="..", default="nettime-l_2016-12-31.json.gz") options, args = p.parse_args() + if options.input_script is None: + p.print_help() + p.error('No input file specified.') + + if options.output_file is None: + p.print_help() + p.error('No output file specified.') + + if options.template_file is None: + p.print_help() + p.error('No template file specified.') + run(options)