From 70d51813116ba3b7ec5b8180781aa957ad875618 Mon Sep 17 00:00:00 2001 From: gauthiier Date: Sat, 31 Dec 2016 17:56:37 +0100 Subject: [PATCH] report handle --- README | 13 ++- nettime/mhonarccrawl.py | 2 - nettime/report.py | 174 ++++++++++++++++++++++++++++++ report.py | 233 +++++++++++++--------------------------- 4 files changed, 260 insertions(+), 162 deletions(-) create mode 100644 nettime/report.py diff --git a/README b/README index d9e8043..203677f 100644 --- a/README +++ b/README @@ -1,9 +1,14 @@ -Usage: archive_nettime.py [options] +Usage: archive.py [options] Options: -h, --help show this help message and exit -u URL, --url=URL nettime url - -l LIST, --list=LIST nettime's list name (ex: nettime-l) - -a ARCH, --arch=ARCH path to archive directory + (default='http://www.nettime.org/archives.php') + -l LIST, --list=LIST nettime's list name (default=nettime-l) + -a ARCH, --arch=ARCH path to archives directory (default='archives') - Dependencies: bs4 \ No newline at end of file + Dependencies: bs4 + + --- + + \ No newline at end of file diff --git a/nettime/mhonarccrawl.py b/nettime/mhonarccrawl.py index 7500425..192a0b2 100644 --- a/nettime/mhonarccrawl.py +++ b/nettime/mhonarccrawl.py @@ -248,5 +248,3 @@ def write_mbox_message(msg, mbox): for f in msg['follow-up']: write_mbox_message(f, mbox) - - diff --git a/nettime/report.py b/nettime/report.py new file mode 100644 index 0000000..c69e15d --- /dev/null +++ b/nettime/report.py @@ -0,0 +1,174 @@ +import query +import format +import plot + +class Report: + + query = None + matrix = None + + def __init__(self, q=None): + + if not isinstance(q, query.Query): + logging.error("HtmlFormat constructor Error: query must be of type nettime.query.Query") + raise Exception() + + self.query = q + + ''' + (basic) stats + ''' + + def matrix_msgs_threads(self): + + if self.matrix is None: + + # nbr messages + mat = self.query.activity_overall() + + # nbr threads + mat['nbr-threads'] = self.query.threads_overall(aggregate='count')['nbr-threads'] + + # nbr replies + mat['nbr-replies'] = self.query.threads_overall(aggregate='sum')['nbr-references'] + + # nbr non-replies (aka. non-threads) + mat['nbr-single-messages'] = mat['nbr-messages'] - mat['nbr-replies'] - mat['nbr-threads'] + + # avg. rep per message + mat['avg--per-msg'] = mat['nbr-threads'] / mat['nbr-messages'] + + # avg. rep per thread + mat['avg-rep-per-thrd'] = mat['nbr-replies'] / mat['nbr-threads'] + # same as: + # mat['avg-rep-per-thrd'] = q.threads_overall(aggregate='mean')['nbr-references'] + + self.matrix = mat + + return self.matrix + + ''' + plots + ''' + + def plot_nbr_msgs(self, title='Nbr. Messages', label='messages', color='mediumblue'): + + self.matrix_msgs_threads() + + return plot.bar_plot_series(self.matrix['nbr-messages'].to_frame(label), title=title, color=color) + + def plot_nbr_threads(self, title='Nbr. Threads', label='threads', color='crimson'): + + self.matrix_msgs_threads() + + return plot.bar_plot_series(self.matrix['nbr-threads'].to_frame(label), title=title, color=color) + + def plot_nbr_replies(self, title='Nbr. Replies in Threads', label='replies', color='dimgray'): + + self.matrix_msgs_threads() + + return plot.bar_plot_series(self.matrix['nbr-replies'].to_frame(label), title=title, color=color) + + def plot_avg_rep_p_msg(self, title='Avg. Thread per Message', label='replies-per-messasges', color='limegreen'): + + self.matrix_msgs_threads() + + return plot.bar_plot_series(self.matrix['avg--per-msg'].to_frame(label), title=title, color=color) + + def plot_avg_rep_p_thrd(self, title='Avg. Replies per Thread', label='replies-per-thread', color='blueviolet'): + + self.matrix_msgs_threads() + + return plot.bar_plot_series(self.matrix['avg-rep-per-thrd'].to_frame(label), title=title, color=color) + + def plot_msgs_replies(self, title='Nbr. Messages segments (individual messages vs thread replies)'): + + self.matrix_msgs_threads() + + return plot.bar_plot_series(self.matrix[['nbr-single-messages', 'nbr-threads', 'nbr-replies']], color=['mediumblue', 'red', 'dimgray'], title=title) + + ''' + text (tabular) + ''' + + def tab_msgs_threads_replies(self): + self.matrix_msgs_threads() + return format.Tab.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies']], + name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads'}) + + def tab_avg_rep_msg_thrd(self): + self.matrix_msgs_threads() + return format.Tab.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']], + name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) + + def tab_activity_from_ranking(self, rank=5): + d = self.query.activity_from_ranking(rank=rank) + return format.Tab.from_dataframe(d, name_map={'nbr-messages': 'messages'}) + + def tab_content_length_from_ranking(self, rank=5): + d = self.query.activity_from_ranking(rank=rank) + return format.Tab.from_dataframe(d, name_map={'nbr-bytes': 'bytes'}) + + def tab_threads_ranking(self, rank=5): + d = self.query.threads_ranking(rank=rank) + return format.Tab.from_dataframe(d, name_map={'nbr-references': 'nbr. replies'}) + + def tab_threads_ranking_year(self, rank=5, resolution='y'): + d = self.query.threads_ranking(rank=rank, resolution=resolution) + years = sorted(d) + nl = '\n' + s = "" + for i in years: + s += 'year: ' + i + nl + s += format.Tab.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}) + nl + return s + nl + + ''' + html + ''' + + ''' + m-t-r + ''' + def html_msgs_threads_replies(self): + self.matrix_msgs_threads() + return format.Html.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies']], + name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads'}) + ''' + a-r-m-t + ''' + def html_avg_rep_msg_thrd(self): + self.matrix_msgs_threads() + return format.Html.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']], + name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) + ''' + a-f-r + ''' + def html_activity_from_ranking(self, rank=5): + html = format.Html(self.query) + return html.threads_ranking(rank=rank) + ''' + c-l-f-r + ''' + def html_content_length_from_ranking(self, rank=5): + d = self.query.activity_from_ranking(rank=rank) + return format.Html.from_dataframe(d, name_map={'nbr-bytes': 'bytes'}) + ''' + t-r + ''' + def html_threads_ranking(self, rank=5): + d = self.query.threads_ranking(rank=rank) + return format.Html.from_dataframe(d, name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) + + ''' + t-r-y + ''' + def html_threads_ranking_year(self, rank=5, resolution='y'): + d = self.query.threads_ranking(rank=rank, resolution=resolution) + years = sorted(d) + nl = '\n' + s = "" + for i in years: + s += '
' + i + '
' + nl + s += format.Html.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) + nl + return s + nl diff --git a/report.py b/report.py index 0a2baac..0aee3ae 100644 --- a/report.py +++ b/report.py @@ -1,182 +1,103 @@ +import sys, os, json, logging +from optparse import OptionParser + +reload(sys) +sys.setdefaultencoding('utf8') + +logging.info('1/4 setting up matplotlib') +# matplot view/windows +import matplotlib +import matplotlib.pyplot as plt +matplotlib.interactive(True) + +logging.info('2/4 setting up pandas') +# pd display +import pandas as pd +pd.set_option('display.max_colwidth', 100) + +logging.info('3/4 loading nettime archive') +import nettime.archive import nettime.query -import nettime.format -import nettime.plot +import nettime.report -class Report: +a = nettime.archive.Archive('nettime-l_2016-12-31.json.gz') +q = nettime.query.Query(a) +r = nettime.report.Report(q) - query = None - matrix = None +logging.info('4/4 reporting') - def __init__(self, q=None): - - if not isinstance(q, nettime.query.Query): - logging.error("HtmlFormat constructor Error: query must be of type nettime.query.Query") - raise Exception() - - self.query = q - - ''' - (basic) stats - ''' - - def matrix_msgs_threads(self): - - if self.matrix is None: - - # nbr messages - mat = self.query.activity_overall() - - # nbr threads - mat['nbr-threads'] = self.query.threads_overall(aggregate='count')['nbr-threads'] - - # nbr replies - mat['nbr-replies'] = self.query.threads_overall(aggregate='sum')['nbr-references'] - - # nbr non-replies (aka. non-threads) - mat['nbr-single-messages'] = mat['nbr-messages'] - mat['nbr-replies'] - mat['nbr-threads'] - - # avg. rep per message - mat['avg--per-msg'] = mat['nbr-threads'] / mat['nbr-messages'] - - # avg. rep per thread - mat['avg-rep-per-thrd'] = mat['nbr-replies'] / mat['nbr-threads'] - # same as: - # mat['avg-rep-per-thrd'] = q.threads_overall(aggregate='mean')['nbr-references'] - - self.matrix = mat - - return self.matrix - - ''' - plots - ''' - - def plot_nbr_msgs(self, title='Nbr. Messages', label='messages', color='mediumblue'): - - self.matrix_msgs_threads() - - return nettime.plot.bar_plot_series(self.matrix['nbr-messages'].to_frame(label), title=title, color=color) - - def plot_nbr_threads(self, title='Nbr. Threads', label='threads', color='crimson'): - - self.matrix_msgs_threads() - - return nettime.plot.bar_plot_series(self.matrix['nbr-threads'].to_frame(label), title=title, color=color) - - def plot_nbr_replies(self, title='Nbr. Replies in Threads', label='replies', color='dimgray'): - - self.matrix_msgs_threads() - - return nettime.plot.bar_plot_series(self.matrix['nbr-replies'].to_frame(label), title=title, color=color) - - def plot_avg_rep_p_msg(self, title='Avg. Thread per Message', label='replies-per-messasges', color='limegreen'): - - self.matrix_msgs_threads() - - return nettime.plot.bar_plot_series(self.matrix['avg--per-msg'].to_frame(label), title=title, color=color) - - def plot_avg_rep_p_thrd(self, title='Avg. Replies per Thread', label='replies-per-thread', color='blueviolet'): - - self.matrix_msgs_threads() - - return nettime.plot.bar_plot_series(self.matrix['avg-rep-per-thrd'].to_frame(label), title=title, color=color) - - def plot_msgs_replies(self, title='Nbr. Messages segments (individual messages vs thread replies)'): - - self.matrix_msgs_threads() - - return nettime.plot.bar_plot_series(self.matrix[['nbr-single-messages', 'nbr-threads', 'nbr-replies']], color=['mediumblue', 'red', 'dimgray'], title=title) - - ''' - text (tabular) - ''' - - def tab_msgs_threads_replies(self): - self.matrix_msgs_threads() - return nettime.format.Tab.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies']], - name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads'}) - - def tab_avg_rep_msg_thrd(self): - self.matrix_msgs_threads() - return nettime.format.Tab.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']], - name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) - - def tab_activity_from_ranking(self, rank=5): - d = self.query.activity_from_ranking(rank=rank) - return nettime.format.Tab.from_dataframe(d, name_map={'nbr-messages': 'messages'}) - - def tab_content_length_from_ranking(self, rank=5): - d = self.query.activity_from_ranking(rank=rank) - return nettime.format.Tab.from_dataframe(d, name_map={'nbr-bytes': 'bytes'}) - - def tab_threads_ranking(self, rank=5): - d = self.query.threads_ranking(rank=rank) - return nettime.format.Tab.from_dataframe(d, name_map={'nbr-references': 'nbr. replies'}) - - def tab_threads_ranking_year(self, rank=5, resolution='y'): - d = self.query.threads_ranking(rank=rank, resolution=resolution) - years = sorted(d) - nl = '\n' - s = "" - for i in years: - s += 'year: ' + i + nl - s += nettime.format.Tab.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}) + nl - return s + nl - - ''' - html - ''' - - def html_msgs_threads_replies(self): - self.matrix_msgs_threads() - return nettime.format.Html.from_dataframe(self.matrix[['nbr-messages', 'nbr-threads', 'nbr-replies']], - name_map={'nbr-messages': 'messages', 'nbr-threads': 'threads', 'nbr-replies': 'replies in threads'}) - - def html_avg_rep_msg_thrd(self): - self.matrix_msgs_threads() - return nettime.format.Html.from_dataframe(self.matrix[['avg--per-msg', 'avg-rep-per-thrd']], - name_map={'avg--per-msg': 'avg. thread per message', 'avg-rep-per-thrd': 'avg. replies per thread'}) - - def html_activity_from_ranking(self, rank=5): - html = nettime.format.Html(self.query) - return html.threads_ranking(rank=rank) - - def html_content_length_from_ranking(self, rank=5): - d = self.query.activity_from_ranking(rank=rank) - return nettime.format.Html.from_dataframe(d, name_map={'nbr-bytes': 'bytes'}) - - def html_threads_ranking(self, rank=5): - d = self.query.threads_ranking(rank=rank) - return nettime.format.Html.from_dataframe(d, name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) - - def html_threads_ranking_year(self, rank=5, resolution='y'): - d = self.query.threads_ranking(rank=rank, resolution=resolution) - years = sorted(d) - nl = '\n' - s = "" - for i in years: - s += '
' + i + '
' + nl - s += nettime.format.Html.from_dataframe(d[i], name_map={'nbr-references': 'nbr. replies'}, url_map={'subject': 'url'}) + nl - return s + nl +def text(command, params=None): + print command + func = { + "tab_msgs_threads_replies": r.tab_msgs_threads_replies, + "tab_avg_rep_msg_thrd": r.tab_avg_rep_msg_thrd, + "tab_activity_from_ranking": r.tab_activity_from_ranking, + "tab_content_length_from_ranking": r.tab_content_length_from_ranking, + "tab_threads_ranking": r.tab_threads_ranking, + "tab_threads_ranking_year": r.tab_threads_ranking_year + } + print func[command] + return func[command]() +def html(command, params=None): + func = { + "html_msgs_threads_replies": r.html_msgs_threads_replies, + "html_avg_rep_msg_thrd": r.html_avg_rep_msg_thrd, + "html_activity_from_ranking": r.html_activity_from_ranking, + "html_content_length_from_ranking": r.html_content_length_from_ranking, + "html_threads_ranking": r.html_threads_ranking, + "html_threads_ranking_year": r.html_threads_ranking_year + } + return func[command]() +def run(options): + if options.output_file and os.path.isfile(options.output_file): + with open(options.output_file, 'r') as fp: + out = fp.read() # not optimal but will do + else: + print 'No output-file. Nothing to do.' + return + if options.input_script and os.path.isfile(options.input_script): + with open(options.input_script, 'r') as fp: + input_script = json.load(fp) + else: + print 'No input-script. Nothing to do.' + return + for cmd in input_script: + if cmd['format'] == 'html': + func = html + elif cmd['format'] == 'text': + func = text + else: + continue + res = func(cmd['command']) + if res is not None: + out = out.replace(cmd['replace'], res) + with open(options.output_file, 'w') as fp: + fp.write(out) # not optimal but will do +if __name__ == "__main__": + p = OptionParser(); + p.add_option('-i', '--input-script', action="store", help="..") + p.add_option('-o', '--output-file', action="store", help="..") + options, args = p.parse_args() + run(options)