diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
index 951da93..18c05fa
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # mailinglists specific
 archives/
+figs/
 config.py
 
 # Byte-compiled / optimized / DLL files
diff --git a/analyse.py b/analyse.py
new file mode 100644
index 0000000..bef4381
--- /dev/null
+++ b/analyse.py
@@ -0,0 +1,230 @@
+import os
+
+# matplot view/windows
+import matplotlib
+matplotlib.interactive(True)
+
+# pd display
+import pandas as pd
+pd.set_option('display.max_colwidth', 100)
+
+from analysis.archive import Archive
+from analysis.query import Query
+from analysis.plot import Plot
+
+import analysis.format
+
+# spectre: slategrey
+# nettime: red
+# crumb: purple
+# empyre: darkblue
+
+def save_fig_cohort(q, name, dir, color):
+    t = name + " - Cohorts"
+    pp = q.cohort().plot(color=color, title=t)
+    ts = name + "_cohorts.png"
+    filename = os.path.join(dir, ts)
+    pp.get_figure().savefig(filename)
+
+def save_fig_messages_total(q, name, dir, color):
+    t = name + " - Nbr. Messages"
+    pp = q.activity_overall().plot(kind='bar', color=color, title=t)
+    ts = name + "_messages.png"
+    filename = os.path.join(dir, ts)
+    pp.get_figure().savefig(filename)
+
+def save_fig_threads_total(q, name, dir, color):
+    t = name + " - Nbr. Threads"
+    pp = q.threads_overall().plot(kind='bar', color=color, title=t)
+    ts = name + "_threads.png"
+    filename = os.path.join(dir, ts)
+    pp.get_figure().savefig(filename)
+
+def save_fig_messages_constituency(q, name, dir):
+    t = name + " - Messages Constituency"
+    replies = pd.Series(q.replies_overall(series=True))
+    # threads = pd.Series(q.single_threads_overall(series=True))
+    threads = pd.Series(q.threads_overall(series=True))
+    messages = pd.Series(q.activity_overall(series=True))
+    single_messages = messages - (replies + threads)
+
+    # df = {'a': single_messages, 'b': threads, 'c': replies}
+    # df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
+    df = pd.concat([single_messages.to_frame('single-messages').astype(int),
+                    threads.to_frame('threads').astype(int),
+                    replies.to_frame('replies').astype(int)], axis=1)
+    pp = df.plot(kind='bar', stacked=True, title=t)
+
+    # pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)
+
+    ts = name + "_constituency.png"
+    filename = os.path.join(dir, ts)
+    pp.get_figure().savefig(filename)
+
+def save_fig_avg_threads_replies(q, name, dir, color):
+    t = name + " - Avg. Threads + Replies"
+    replies = pd.Series(q.replies_overall(series=True))
+    threads = pd.Series(q.threads_overall(series=True))
+    messages = pd.Series(q.activity_overall(series=True))
+
+    avg_threads_messages = (replies + threads) / messages
+
+    pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)
+
+    ts = name + "_avg_threads_replies.png"
+    filename = os.path.join(dir, ts)
+    pp.get_figure().savefig(filename)
+
+def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
+    t = name + " - Diff. Threads + Replies vs Single Messages"
+    replies = pd.Series(q.replies_overall(series=True))
+    threads = pd.Series(q.threads_overall(series=True))
+    rt = replies + threads
+    messages = pd.Series(q.activity_overall(series=True))
+
+    diff_threads_messages = (2 * rt) - messages
+
+    pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)
+
+    ts = name + "_diff_threads_replies_messages.png"
+    filename = os.path.join(dir, ts)
+    pp.get_figure().savefig(filename)
+
+def save_fig_ratio_replies_threads(q, name, dir, color):
+    t = name + " - Ratio Replies per Thread"
+    replies = pd.Series(q.replies_overall(series=True))
+    threads = pd.Series(q.threads_overall(series=True))
+
+    ratio_replies_threads = replies / threads
+
+    pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)
+
+    ts = name + "_ratio_replies_threads.png"
+    filename = os.path.join(dir, ts)
+    pp.get_figure().savefig(filename)
+
+def html_td_rank_year(year, data):
+    td_str = '<td>'
+    if year in data:
+        td_str += analysis.format.table_threads_ranking(data[year])
+    td_str += '</td>'
+    return td_str
+
+def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):
+
+    html_str = '<table>'
+
+    html_str += '<tr>'
+    html_str += '<td>year</td>'
+    html_str += '<td>nettime</td>'
+    html_str += '<td>crumb</td>'
+    html_str += '<td>spectre</td>'
+    html_str += '<td>empyre</td>'
+    html_str += '</tr>'
+
+    years = sorted(ranking_nettime.keys())
+
+    print(years)
+
+    for i in years:
+        html_str += '<tr>'
+        html_str += '<td>' + i + '</td>'
+        html_str += html_td_rank_year(i, ranking_nettime)
+        html_str += html_td_rank_year(i, ranking_crumb)
+        html_str += html_td_rank_year(i, ranking_spectre)
+        html_str += html_td_rank_year(i, ranking_empyre)
+        html_str += '</tr>'
+
+    html_str += '</table>'
+    return html_str
+
+
+print("nettime")
+#nettime
+nt = Archive('nettime-l')
+ntq = nt.query()
+ntp = Plot(ntq)
+
+
+
+# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
+# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
+# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
+# save_fig_messages_constituency(ntq, 'nettime', 'figs/')
+
+# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
+# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
+# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')
+
+ranking_nettime = ntq.threads_ranking(rank=15)
+
+# print(r['2000'])
+
+# print(analysis.format.table_threads_ranking(r['2000']))
+
+
+print("crumb")
+#crumb
+cr = Archive('crumb')
+crq = cr.query()
+crp = Plot(crq)
+
+# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
+# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
+# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
+# save_fig_messages_constituency(crq, 'crumb', 'figs/')
+
+# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
+# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
+# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')
+
+ranking_crumb = crq.threads_ranking(rank=15)
+
+
+print("empyre")
+#empyre
+em = Archive('empyre')
+emq = em.query()
+emp = Plot(emq)
+
+# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_messages_constituency(emq, 'empyre', 'figs/')
+
+# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')
+
+ranking_empyre = emq.threads_ranking(rank=15)
+
+print("spectre")
+#spectre
+sp = Archive('spectre')
+spq = sp.query()
+spp = Plot(spq)
+
+# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_messages_constituency(spq, 'spectre', 'figs/')
+
+# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')
+
+ranking_spectre = spq.threads_ranking(rank=15)
+
+
+## comparative ranking
+
+rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)
+
+html_template = 'figs/ranking/index_template.html'
+with open(html_template, 'r') as fp:
+    h = fp.read()
+
+html = h.replace("--table--", rankings)
+
+html_output = 'figs/ranking/index.html'
+with open(html_output, 'w+') as fp:
+    fp.write(html)
+
diff --git a/analysis/archive.py b/analysis/archive.py
new file mode 100644
index 0000000..597615a
--- /dev/null
+++ b/analysis/archive.py
@@ -0,0 +1,152 @@
+import numpy as np
+import pandas as pd
+import email, email.parser
+import os, datetime, json, gzip, re
+import analysis.util
+import analysis.query
+
+
+def filter_date(msg, archive_name):
+
+    time_tz = analysis.util.format_date(msg, archive_name)
+    if not time_tz:
+        return None
+
+    dt = datetime.datetime.fromtimestamp(time_tz)
+    try:
+        date_time = pd.to_datetime(dt)
+    except pd.tslib.OutOfBoundsDatetime:
+        print('time out of bound')
+        print(dt)
+        return None
+
+    min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
+    max_date = pd.to_datetime(datetime.datetime.now())
+    if date_time < min_date or date_time > max_date:
+        return None
+
+    return date_time
+
+
+def message_to_tuple_record(msg, records, archive_name, references='X'):
+
+    # check date first?
+    date = filter_date(msg, archive_name)
+    if not date:
+        print("Archive::filter_date returned None. Skip.")
+        return
+
+    # check / filter from email address second?
+    from_addr = analysis.util.format_from(msg, archive_name)
+    if not from_addr:
+        print("Archive::analysis.util.format_from returned None. Skip.")
+        return
+
+    url = analysis.util.format_url(msg, archive_name)
+    author = analysis.util.format_author(msg, archive_name)
+    subject = analysis.util.format_subject(msg, archive_name)
+    message_id = analysis.util.format_id(msg, archive_name)
+    content = analysis.util.format_content(msg, archive_name)
+
+    records.append((message_id,
+                    from_addr,
+                    author,
+                    subject,
+                    date,
+                    url,
+                    len(content),
+                    0 if not 'follow-up' in msg else len(msg['follow-up']),
+                    references))
+
+    # recursive follow up -- but references is not keeping track really...
+    if 'follow-up' in msg:
+        for f in msg['follow-up']:
+            message_to_tuple_record(f, records, archive_name, references=message_id)
+
+    return
+
+def json_data_to_pd_dataframe(json_data, archive_name):
+
+    records = []
+    for d in json_data:
+        for dd in d['threads']:
+            message_to_tuple_record(dd, records, archive_name)
+
+    print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records)))
+
+    df = pd.DataFrame.from_records(records,
+                                   index='date',
+                                   columns=['message-id',
+                                            'from',
+                                            'author',
+                                            'subject',
+                                            'date',
+                                            'url',
+                                            'content-length',
+                                            'nbr-references',
+                                            'references'])
+
+    df.index.name = 'date'
+
+    return df
+
+def load_from_file(filename, archive_name, archive_dir, json_data=None):
+
+    if not filename.endswith('.json.gz'):
+        file_path = os.path.join(archive_dir, filename + '.json.gz')
+    else:
+        file_path = os.path.join(archive_dir, filename)
+
+    if os.path.isfile(file_path):
+        with gzip.open(file_path, 'r') as fp:
+            json_data = json.load(fp)
+        return json_data_to_pd_dataframe(json_data['threads'], archive_name)
+    else:
+        # list of all "filename[...].json.gz" in archive_dir
+        files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
+        if files:
+            filename = files[-1]  # take the most recent (listed alpha-chronologically)
+            file_path = os.path.join(archive_dir, filename)
+            if os.path.isfile(file_path):
+                with gzip.open(file_path, 'r') as fp:
+                    json_data = json.load(fp)
+                return json_data_to_pd_dataframe(json_data['threads'], archive_name)
+        else:
+            # list of all json files in archive_dir/filename
+            dir_path = os.path.join(archive_dir, filename)
+            if not os.path.isdir(dir_path):
+                return None
+
+            files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
+            if not files:
+                return None
+
+            # load all json files
+            threads = []
+            for file_path in files:
+                with open(file_path, 'r') as fp:
+                    json_data = json.load(fp)
+                threads.append(json_data)
+
+            print('---> ' + archive_name)
+            return json_data_to_pd_dataframe(threads, archive_name)
+
+
+class Archive:
+
+    data = None       # "raw" json data
+    dataframe = None  # main pd dataframe
+
+    def __init__(self, archive_name, archive_dir="archives"):
+
+        if isinstance(archive_name, pd.core.frame.DataFrame):
+            self.dataframe = archive_name.copy()
+
+        if isinstance(archive_name, str):
+            # need a filename or a dir name....
+            self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)
+
+    def query(self):
+        q = analysis.query.Query(self)
+        return q
+
diff --git a/analysis/format.py b/analysis/format.py
new file mode 100644
index 0000000..4c8e8b0
--- /dev/null
+++ b/analysis/format.py
@@ -0,0 +1,158 @@
+import analysis.query
+import logging, html, numpy  # 'html' is the third-party HTML-builder package, not the stdlib module
+from tabulate import tabulate
+
+def makeurl(text, url):
+    return '<a href="' + url + '">' + text + "</a>"
+
+def table_threads_ranking(ranking_dataframe):
+
+    html_str = '<table>'
+
+    html_str += '<tr>'
+    html_str += '<td>date</td>'
+    html_str += '<td>subject</td>'
+    html_str += '<td>from</td>'
+    html_str += '<td>replies</td>'
+    html_str += '</tr>'
+
+    for i, row in ranking_dataframe.iterrows():
+
+        html_str += '<tr>'
+        html_str += '<td>' + str(i) + '</td>'
+        html_str += '<td>' + makeurl(row['subject'], row['url']) + '</td>'
+        html_str += '<td>' + row['from'] + '</td>'
+        html_str += '<td>' + str(row['nbr-references']) + '</td>'
+        html_str += '</tr>'
+
+    html_str += "</table>"
+
+    return html_str
+
+
+
+
+class Html:
+
+    query = None
+
+    def __init__(self, q=None):
+
+        if not isinstance(q, analysis.query.Query):
+            logging.error("Html constructor Error: query must be of type analysis.query.Query")
+            raise Exception()
+
+        self.query = q
+
+    def threads_ranking(self, rank=5, resolution=None):
+
+        data = self.query.threads_ranking(rank=rank)
+
+        h = html.HTML()
+        t = h.table()
+
+        r = t.tr
+        r.td('date', klass='td_date_t')
+        r.td('from', klass='td_from_t')
+        r.td('replies', klass='td_rep_t')
+        r.td('subject', klass='td_subject_t')
+
+        for i, row in data.iterrows():
+            r = t.tr
+
+            print(row.index)
+
+            r.td(str(row['date']), klass='td_date')
+            r.td(row['from'], klass='td_from')
+            r.td(str(row['nbr-references']), klass='td_rep')
+            r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)
+
+        return str(t)
+
+    @staticmethod
+    def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):
+
+        header = []
+        if data_frame.index.name in name_map:
+            header.append(name_map[data_frame.index.name])
+        else:
+            header.append(data_frame.index.name)
+        for h in data_frame.columns:
+            if h in name_map:
+                h = name_map[h]
+            header.append(h)
+
+        css_header = []
+        css_element = []
+        for i in header:
+            css_header.append('td_' + i + '_t')
+            css_element.append('td_' + i)
+
+        h = html.HTML()
+        if table_name:
+            t = h.table(id=table_name, klass=table_name + '_t')
+        else:
+            t = h.table()
+
+        # url map
+        url_hash = {}
+        url_skip = []
+        url_keys = url_map.keys()
+        for u in url_keys:
+            if u in header and url_map[u] in header:
+                url_indx = header.index(url_map[u])
+                url_hash[header.index(u)] = url_indx
+                url_skip.append(url_indx)
+                header.pop(url_indx)
+
+        # header
+        r = t.tr
+        n = 0
+        for j in header:
+            r.td(str(j), klass=css_header[n])
+            n += 1
+
+        # elements
+        for k, row in data_frame.iterrows():
+            r = t.tr
+            r.td(str(k), klass=css_element[0])
+            n = 1
+            for l in row:
+
+                if n in url_skip:
+                    n += 1  # keep the column counter in sync when skipping the url column
+                    continue
+
+                if isinstance(l, float):
+                    if l % 1 > 0:
+                        l = '{0:.4f}'.format(l)
+                    else:
+                        l = int(l)
+
+                if n in url_hash.keys():
+                    url = row[url_hash[n] - 1]
+                    r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
+
+                else:
+                    r.td(str(l), klass=css_element[n])
+                n += 1
+
+        return str(t)
+
+class Tab:
+
+    @staticmethod
+    def from_dataframe(data_frame, name_map={}, format=".0f"):
+
+        header = []
+        header.append(data_frame.index.name)
+        for h in data_frame.columns:
+            if h in name_map:
+                h = name_map[h]
+            header.append(h)
+
+        return tabulate(data_frame, headers=header, floatfmt=format)
+
diff --git a/analysis/plot.py b/analysis/plot.py
new file mode 100644
index 0000000..8f08430
--- /dev/null
+++ b/analysis/plot.py
@@ -0,0 +1,79 @@
+import logging
+import numpy as np
+import pandas as pd
+import analysis.query
+
+# for colormaps see:
+# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
+# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
+# http://matplotlib.org/examples/color/colormaps_reference.html
+# for colors see:
+# http://matplotlib.org/examples/color/named_colors.html
+
+# spectre: slategrey
+# nettime: red
+# crumb: purple
+# empyre: darkblue
+
+def bar_plot_series(series, title, color='blueviolet', ylim=None):
+    return series.plot(kind='bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)
+
+def save(plot, name):
+    fig = plot.get_figure()
+    fig.savefig(name)
+
+class Plot:
+
+    query = None
+
+    def __init__(self, q=None):
+
+        if not isinstance(q, analysis.query.Query):
logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query") + raise Exception() + + self.query = q + + ''' + activity + ''' + + def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)): + + activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys() + series = [] + for k in activity_rank: + series.append(self.query.activity_from(k, resolution, series=True)) + + df = pd.concat(series, axis=1) + + return df.plot.area(colormap='spectral', figsize=figsize, stacked=False) + + ''' + content lenght + ''' + + def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)): + + content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys() + series = [] + for k in content_rank: + series.append(self.query.content_length_from(k, resolution, series=True)) + + df = pd.concat(series, axis=1) + + return df.plot.area(colormap=colormap, figsize=figsize, stacked=False) + + ''' + threads + ''' + + def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)): + + threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys() + series = [] + for k in threads_rank: + series.append(self.query.threads_from(k, resolution, series=True)) + + df = pd.concat(series, axis=1) + + return df.plot.area(colormap=colormap, figsize=figsize, stacked=False) diff --git a/analysis/query.py b/analysis/query.py new file mode 100644 index 0000000..5b46488 --- /dev/null +++ b/analysis/query.py @@ -0,0 +1,573 @@ +import numpy as np +import pandas as pd +import analysis.archive +import logging + +class Query: + + archive = None # analysis.archive.Archive object + activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month)) + content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes)) + threads = None # ... + single_threads = None + replies = None # ... 
+ + def __init__(self, arch=None): + + if not isinstance(arch, analysis.archive.Archive): + logging.error("Query constructor Error: arch must be of type analysis.archive.Archive") + raise Exception() + + self.archive = arch + + ''' + activity + ''' + + def _activity(self): + + if self.activity is None: + from_index = self.archive.dataframe.reindex(columns=['from']) + self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0) + + return self.activity + + def activity_from(self, email_address, resolution='y', series=False): + + eaddr = email_address.replace('@', '{at}').lower() + + freq = 'M' + if resolution.lower() == 'y': + freq = 'AS' + elif resolution.lower() == 'm': + freq = 'M' + else: + return None + + self._activity() + try: + af = self.activity[eaddr] + except KeyError: + return None + + activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum() + + if freq == 'AS': + activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y')) + activity_from.index.name = 'year' + else: + activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m')) + activity_from.index.name = 'year-month' + + if series: + return activity_from + + return activity_from.to_frame('nbr-messages').astype(int) + + def activity_from_ranking(self, rank=5, filter_nettime=True, series=False): + + self._activity() + afr = self.activity.sum(axis=0).order(ascending=False) + if filter_nettime: + p = r'^((?!nettime*).)*$' + afr = afr[afr.index.str.contains(p)] + + if series: + return afr[:rank] + + return afr[:rank].to_frame('nbr-messages').astype(int) + + + # def activity_overall(self, resolution='y', series=False): + + # freq = 'M' + # if resolution.lower() == 'y': + # freq = 'AS' + # elif resolution.lower() == 'm': + # freq = 'M' + # else: + # return None + + # self._activity() + + # y = self.activity.sum(axis=1) + # y = y.groupby([pd.TimeGrouper(freq=freq)]).sum() + + # if freq == 'AS': + # y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) + # y.index.name = 'year' + # else: + # y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) + # y.index.name = 'year-month' + + # if series: + # return y + + # return y.to_frame('nbr-messages').astype(int) + + def activity_overall(self, resolution='y', series=False): + + a = self.archive.dataframe['url'] + + freq = 'M' + if resolution.lower() == 'y': + freq = 'AS' + elif resolution.lower() == 'm': + freq = 'M' + else: + return None + + y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count() + + if freq == 'AS': + y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) + y.index.name = 'year' + else: + y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) + y.index.name = 'year-month' + + if series: + return y + + return y.to_frame('nbr-messages').astype(int) + + def cohort(self, resolution='m', series=False): + + freq = 'M' + if resolution.lower() == 'y': + freq = 'AS' + elif resolution.lower() == 'm': + freq = 'M' + else: + return None + + self._activity() + + c = self.activity.idxmax().order().to_frame('date') + c.index = c['date'] + + cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size() + + if freq == 'AS': + cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y')) + cohort.index.name = 'year' + else: + cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m')) + cohort.index.name = 'year-month' + + if series: + return cohort + + return cohort.to_frame('first-messages').astype(int) + 
+    '''
+    content length
+    '''
+
+    def _content_length(self):
+
+        if self.content_length is None:
+            from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
+            self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
+            self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)
+
+        return self.content_length
+
+    def content_length_from(self, email_address, resolution='y', series=False):
+
+        eaddr = email_address.replace('@', '{at}').lower()
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        self._content_length()
+        try:
+            af = self.content_length[eaddr]
+        except KeyError:
+            return None
+
+        content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
+
+        if freq == 'AS':
+            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
+            content_length_from.index.name = 'year'
+        else:
+            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+            content_length_from.index.name = 'year-month'
+
+        if series:
+            return content_length_from
+
+        return content_length_from.to_frame('nbr-bytes').astype(int)
+
+    def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
+
+        self._content_length()
+        cfr = self.content_length.sum(axis=0).order(ascending=False)
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            cfr = cfr[cfr.index.str.contains(p)]
+
+        if series:
+            return cfr[:rank]
+
+        return cfr[:rank].to_frame('nbr-bytes').astype(int)
+
+    def content_length_overall(self, resolution='y', series=False):
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        self._content_length()
+
+        y = self.content_length.sum(axis=1)
+        y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
+
+        if freq == 'AS':
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+            y.index.name = 'year'
+        else:
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+            y.index.name = 'year-month'
+
+        if series:
+            return y
+
+        return y.to_frame('nbr-bytes').astype(int)
+
+
+    '''
+    threads
+    '''
+
+    def _threads(self, thresh=0):
+
+        print("doing threads")
+
+        if self.threads is None:
+            self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
+
+        if self.single_threads is None:
+            self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
+
+        return self.threads
+
+    def threads_ranking(self, rank=5, resolution='y'):
+
+        self._threads()
+
+        if resolution is None:
+            data = self.threads.drop('message-id', axis=1)[:rank]
+            return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        # get the threads ranking per time resolution
+        #
+        data = self.threads.drop('message-id', axis=1)
+        data = data.groupby([pd.TimeGrouper(freq=freq)])
+        r = {}
+        for k, v in data:
+            if freq == 'AS':
+                time_key = k.strftime('%Y')
+            else:
+                time_key = k.strftime('%Y-%m')
+            frame = v[:rank]
+            frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
+            r[time_key] = frame
+        return r
+
+    def threads_replies_to(self, email_address, resolution='y', series=False):
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        self._threads()
+
+        eaddr = email_address.replace('@', '{at}').lower()
+
+        threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
+        threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum()  # <-- sum = adding up nbr references
+        threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
+
+        if series:
+            return threads_from_ranking[eaddr]
+
+        threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)
+
+        if freq == 'AS':
+            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
+            threads_from_ranking.index.name = 'year'
+        else:
+            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+            threads_from_ranking.index.name = 'year-month'
+
+        return threads_from_ranking
+
+    def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
+
+        self._threads()
+
+        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)
+
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            tfr = tfr[tfr.index.str.contains(p)]
+
+        tfr = tfr[:rank].astype(int)
+        return tfr
+
+    def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
+
+        self._threads()
+        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            tir = tir[tir.index.str.contains(p)]
+
+        if series:
+            return tir[:rank]
+
+        return tir[:rank].to_frame('nbr-initiated-threads').astype(int)
+
+    def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
+
+        # activity
+        self._activity()
+        afr = self.activity.sum(axis=0).astype(int)
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            afr = afr[afr.index.str.contains(p)]
+
+        # initiated threads [top 25]
+        self._threads()
+        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25]  # <-- top 25
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            tir = tir[tir.index.str.contains(p)]
+
+        inter = afr.index.intersection(tir.index)
+        avg = tir[inter] / afr[inter]
+
+        labels = ['messages', 'threads', 'avg.threads']
+        return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]
+
+    def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
+
+        self._threads()
+
+        # initiated
+        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            tir = tir[tir.index.str.contains(p)]
+
+        # replies [top 25]
+        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25]  # <-- top 25
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            tfr = tfr[tfr.index.str.contains(p)]
+        tfr = tfr['nbr-references']  # dataframe to series
+
+        inter = tir.index.intersection(tfr.index)
+        avg = tfr[inter] / tir[inter]
+
+        labels = ['threads', 'replies', 'avg.replies']
+        return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
+
+
+    def threads_overall(self, resolution='y', aggregate='count', series=False, thresh=0):
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        agg = aggregate.lower()
+        if agg not in ['sum', 'mean', 'count']:
+            return None
+
+        if self.threads is not None:
+            del self.threads
+            self.threads = None
+
+        self._threads(thresh)
+
+        if agg == 'sum':
+            # number of replies total (re: sum all the replies)
+            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
+        elif agg == 'mean':
+            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
+        else:
+            # number of threads (re: msgs with at least one reply)
+            y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
+
+        if freq == 'AS':
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+            y.index.name = 'year'
+        else:
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+            y.index.name = 'year-month'
+
+        if series:
+            return y
+
+        return y.to_frame('nbr-threads').astype(int)
+
+    def single_threads_overall(self, resolution='y', aggregate='sum', series=False, thresh=1):
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        agg = aggregate.lower()
+        if agg not in ['sum', 'mean', 'count']:
+            return None
+
+        if self.single_threads is not None:
+            del self.single_threads
+            self.single_threads = None
+
+        self._threads(thresh)
+
+        y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
+
+        if freq == 'AS':
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+            y.index.name = 'year'
+        else:
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+            y.index.name = 'year-month'
+
+        if series:
+            return y
+
+        return y.to_frame('nbr-threads').astype(int)
+
+
+    '''
+    replies
+    '''
+
+    def _replies(self):
+
+        if self.replies is None:
+            self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from', 'references'])
+            self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from', 'references'])
+        return self.replies
+
+    def replies_ranking(self, rank=5, resolution=None):
+
+        self._replies()
+
+        if resolution is None:
+            data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
+            return data.to_frame('nbr-replies')
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        # get the replies ranking per time resolution
+        #
+        data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
+        r = {}
+        for k, v in data:
+            if freq == 'AS':
+                time_key = k.strftime('%Y')
+            else:
+                time_key = k.strftime('%Y-%m')
+            frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
+            r[time_key] = frame.to_frame('nbr-replies')
+        return r
+
+    def replies_avg_ranking(self, rank=5, filter_nettime=True):
+
+        # activity
+        self._activity()
+        afr = self.activity.sum(axis=0)
+        if filter_nettime:
+            p = r'^((?!nettime*).)*$'
+            afr = afr[afr.index.str.contains(p)]
+
+        # replies in thread [top 25]
+        self._replies()
+        rpl = self.replies.groupby('from').size().sort_values(ascending=False)[:25]
+
+        inter = afr.index.intersection(rpl.index)
+        avg = rpl[inter] / afr[inter]
+
+        labels = ['messages', 'replies', 'avg.replies']
+        return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
+
+    def replies_overall(self, resolution='y', series=False):
+
+        freq = 'M'
+        if resolution.lower() == 'y':
+            freq = 'AS'
+        elif resolution.lower() == 'm':
+            freq = 'M'
+        else:
+            return None
+
+        if self.replies is not None:
+            del self.replies
+            self.replies = None
+
+        self._replies()
+
+        y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()
+
+        if freq == 'AS':
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+            y.index.name = 'year'
+        else:
+            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+            y.index.name = 'year-month'
+
+        if series:
+            return y
+
+        return y.to_frame('nbr-replies').astype(int)
+
diff --git a/analysis/util.py b/analysis/util.py
new file mode 100644
index 0000000..cd39f54
--- /dev/null
+++ b/analysis/util.py
@@ -0,0 +1,81 @@
+import email
+import hashlib
+
+def format_content(msg, archive_name):
+    return msg['content']
+
+def format_url(msg, archive_name):
+    return msg['url']
+
+def format_author(msg, archive_name):
+    return msg['author_name']
+
+def format_from_token(from_str, sep):
+    from_addr = email.utils.parseaddr(from_str)[1]
+    if sep not in from_addr:
+        tok = from_str.split()
+        try:
+            at = tok.index(sep)
+            from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
+            if from_addr.startswith('<') or from_addr.endswith('>'):
+                from_addr = from_addr.strip('<').strip('>')
+        except ValueError:
+            print(tok)
+            print("error formatting 'from' " + from_str + " -- expecting sep: " + sep)
+            return None
+    else:
+        from_addr = from_addr.replace(sep, '{AT}')
+    return from_addr.lower()
+
+def format_from(msg, archive_name):
+    from_str = msg['from']
+
+    if " {AT} " in from_str:
+        return format_from_token(from_str, '{AT}')
+    elif " at " in from_str:
+        return format_from_token(from_str, 'at')
+    elif "@" in from_str:
+        return format_from_token(from_str, '@')
+    else:
+        return from_str
+
+# returns utc timestamp
+def format_date(msg, archive_name):
+    date_str = msg['date']
+    time_tz = None
+    try:
+        date_tz = email.utils.parsedate_tz(date_str)
+        time_tz = email.utils.mktime_tz(date_tz)  # utc timestamp
+    except TypeError:
+        print("Format Date TypeError")
+        print(" > " + date_str)
+        return None
+    except ValueError:
+        print("Format Date ValueError")
+        print(" > " + date_str)
+        return None
+    return time_tz
+
+def format_subject(msg, archive_name):
+    return msg['subject']
+
+def format_id(msg, archive_name):
+    if "message-id" in msg:
+        return msg['message-id']
+    else:
+        # create hash with author_name + date
+        s = msg['author_name'] + msg['date']
+        sha = hashlib.sha1(s.encode('utf-8'))
+        return sha.hexdigest()
+
+# format='%d/%m/%Y'
+def min_date(archive_name):
+    if "nettime" in archive_name:
+        return '01/10/1995'
+    elif archive_name == "spectre":
+        return '01/08/2001'
+    elif archive_name == "empyre":
+        return '01/01/2002'
+    elif archive_name == "crumb":
+        return '01/02/2001'
diff --git a/lists/crawl.py b/lists/crawl.py
index 5bcb38b..73529e2 100644
--- a/lists/crawl.py
+++ b/lists/crawl.py
@@ -1,10 +1,12 @@
 from urllib.parse import urlparse
 import lists.pipermail as pipermail
 import lists.listserv as listserv
+import lists.mhonarc as mhonarc
+import lists.mhonarc_nettime as mhonarc_nettime
 
 DELAY = 0.2
 
-def crawl(url, name, archive_dir):
+def crawl(url, name, sublist_name=None, archive_dir="archives"):
 
 	u = urlparse(url)
 
 	# the following type 'tests' are very weak...
@@ -21,6 +23,11 @@ def crawl(url, name, archive_dir):
 	elif 'cgi-bin' in u.path:
 		listserv.collect_from_url(url, name, archive_dir)
 
+	# special case -- nettime.
+	# the name should be the sublist_name (i.e. nettime-l)
+	elif "nettime" in name:
+		mhonarc_nettime.collect_from_url(url, name, name, archive_dir)
+
 	else:
 		print('mhonarc?')
 
diff --git a/lists/listserv.py b/lists/listserv.py
index df3230d..c17287b 100644
--- a/lists/listserv.py
+++ b/lists/listserv.py
@@ -43,6 +43,17 @@ def collect_from_url(url, name, base_archive_dir):
 			del tb
 			continue
 
+	# archive['name'] = name
+	# archive['list'] = threads
+
+	# file_path = os.path.join(base_arch_dir, name + '.json')
+
+	# with open(file_path, 'w') as fp:
+	# 	json.dump(archive, fp, indent=4)
+
+	# logging.info("done.")
+
+
 def collect_threads_from_url(url, name, base_arch_dir):
 
 	threads = {'name' : name, 'url' : url, 'threads' : []}
diff --git a/lists/mhonarc.py b/lists/mhonarc.py
index 68398df..1ba9c35 100644
--- a/lists/mhonarc.py
+++ b/lists/mhonarc.py
@@ -4,22 +4,27 @@ from bs4 import BeautifulSoup
 
 DELAY = 0.2
 
-def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
+def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
 
     response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
+    html = response.read()
     soup = BeautifulSoup(html, "html5lib")
 
     # base url
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
 
     #collect name
-    list_name = soup.select('body p:nth-of-type(2) base title')[0].string
+    list_name = soup.select('body p:nth-of-type(2) title')[0].string
     logging.info("Getting " + list_name + " list archive for " + sublist_name)
 
-    lists = soup.select('ul:nth-of-type(2) li')
+    # create (main) directory
+    # this is where all temp files will be created
+    d = os.path.join(base_archive_dir, name)
+    if not os.path.exists(d):
+        os.makedirs(d)
 
     threads = []
+    lists = soup.select('ul:nth-of-type(2) li')
 
     for l in lists:
 
@@ -33,31 +38,41 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
         threads_url_list = []
         threads_links = l.select('ul li a')
         for t in threads_links:
-            thread_url = urlparse.urljoin(base_url, t.get('href'))
+            thread_url = urllib.parse.urljoin(base_url, t.get('href'))
             threads_url_list.append(thread_url)
 
         nbr_threads = str(len(threads_url_list))
         n = 0
 
         for u in threads_url_list:
+            time.sleep(DELAY)
             n += 1
-            logging.info("## " + str(n) + " / " + nbr_threads + " ##")
-            threads.append(collect_threads_from_url(u, base_arch_dir, mbox))
+            logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+            try:
+                threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
+            except KeyboardInterrupt:
+                sys.exit(0)
+            except:
+                logging.warning("Error archiving: " + u + "... Continuing.")
+                ex_t, ex, tb = sys.exc_info()
+                print(ex_t)
+                traceback.print_tb(tb)
+                del tb
+                continue
 
         return threads
 
         # for u in threads_url_list[0:10]:
         #     print "---------------------------------------"
-        #     tt = collect_threads_from_url(u, base_arch_dir, mbox)
-        #     threads.append(tt)
-
+        #     tt = collect_threads_from_url(u, base_archive_dir, mbox)
+        #     threads.append(tt)
 
     return None
 
-def collect_threads_from_url(url, base_arch_dir, mbox):
+def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
     response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
+    html = response.read()
     soup = BeautifulSoup(html, "html5lib")
 
     # base url
@@ -73,7 +88,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
     logging.info("Collecting Threads of: " + threads_name)
 
     # check if archive already exists
-    file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
+    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
     if os.path.isfile(file_path):
         logging.info("archive already exists. loading from file " + file_path)
         with open(file_path, 'r') as fpin:
@@ -114,7 +129,7 @@ def archive_thread(li, base_url, parent_thread_data):
 
     thread_link = li.select('strong a')[0]
-    thread_url = urlparse.urljoin(base_url, thread_link.get('href'))
+    thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
     thread_id = thread_link.get('name')
     thread_title = thread_link.string
     thread_author_name = li.select('em')[0].string
@@ -145,6 +160,7 @@ def collect_message(url, message):
 
     response = urllib.request.urlopen(url)
     html = response.read().decode(encoding="utf-8")
+    # html = response.read()
     soup = BeautifulSoup(html, "html5lib")
 
     #note: this should follow an RFC header standard -- MHonArc has header info in the 1st <ul>
@@ -184,6 +200,8 @@ def collect_message(url, message):
     else:
         message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
+    # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
 # mhonarc xcomments
 # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
 def parse_xcomment(soup, xcom):
diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
new file mode 100644
index 0000000..5476f37
--- /dev/null
+++ b/lists/mhonarc_nettime.py
@@ -0,0 +1,214 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip
+from bs4 import BeautifulSoup
+
+DELAY = 0.2
+
+def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
+
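+    # Hedged usage sketch -- crawl.py invokes this for nettime with the list
+    # name doubled as the sublist name; the index URL here is a hypothetical
+    # placeholder, not a verified address:
+    #   collect_from_url('http://example.org/nettime/index.html', 'nettime-l', 'nettime-l')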
+    response = urllib.request.urlopen(url)
+    html = response.read()
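+    # raw bytes on purpose: BeautifulSoup sniffs the document encoding itself,
+    # while an up-front .decode("utf-8") can raise on non-UTF-8 archive pages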
+    soup = BeautifulSoup(html, "html5lib")
+
+    # base url 
+    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+
+    #collect name
+    list_name = soup.select('body p:nth-of-type(2) title')[0].string
+    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+
+    # create (main) directory 
+    # this is where all temp files will be created
+    d = os.path.join(base_archive_dir, name)
+    if not os.path.exists(d):
+        os.makedirs(d)
+
+    threads = []
+    lists = soup.select('ul:nth-of-type(2) li')    
+
+    for l in lists:
+
+        if l.strong is None:
+            continue
+
+        item_name = l.strong.string
+
+        if item_name.lower() == sublist_name.lower():
+
+            threads_url_list = []
+            threads_links = l.select('ul li a')
+            for t in threads_links:
+                thread_url = urllib.parse.urljoin(base_url, t.get('href'))
+                threads_url_list.append(thread_url)
+
+            nbr_threads = str(len(threads_url_list))
+            n = 0
+
+            for u in threads_url_list:
+                time.sleep(DELAY)
+                n += 1
+                logging.info("## " + str(n) + " / " + nbr_threads + " ##")                
+                try:
+                    threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))   
+                except KeyboardInterrupt:
+                    sys.exit(0)
+                except:
+                    logging.warning("Error archiving: " + u + "... Continuing.")
+                    ex_t, ex, tb = sys.exc_info()
+                    print(ex_t)
+                    traceback.print_tb(tb)
+                    del tb
+                    continue                   
+
+            return threads
+
+            # for u in threads_url_list[0:10]:
+            #     print "---------------------------------------"
+            #     tt = collect_threads_from_url(u, base_archive_dir, mbox)
+            #     threads.append(tt)                
+
+    return None
+
+def collect_threads_from_url(url, base_archive_dir, mbox=False):
+
+    response = urllib.request.urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, "html5lib")
+
+    # base url 
+    base_url = url
+
+    # collect name
+    threads_name = soup.select('p:nth-of-type(1) title')[0].string
+    threads_name = threads_name.replace(' ', '_')
+
+    # thread data struct
+    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
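+    # Shape of the structure written below and read back by analysis/archive.py:
+    #   {"name": ..., "url": ..., "threads": [
+    #       {"id": ..., "subject": ..., "url": ..., "author_name": ...,
+    #        "from": ..., "date": ..., "content": ...,
+    #        "follow-up": [ ...nested replies, same shape... ]}, ...]}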
+
+    logging.info("Collecting Threads of: " + threads_name)
+
+    # check if archive already exists
+    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
+    if os.path.isfile(file_path):
+        logging.info("archive already exists. loading from file " + file_path)
+        with open(file_path, 'r') as fpin:
+            threads = json.load(fpin)
+    else:
+        lists = soup.select('ul:nth-of-type(1) > li')
+
+        nbr_threads = str(len(lists))
+        n = 0
+
+        for l in lists:
+            n += 1
+            logging.info("> " + str(n) + " / " + nbr_threads)
+
+            try:
+                thread = archive_thread(l, base_url, None)
+                threads['threads'].append(thread)
+            except:
+                ex_type, ex, tb = sys.exc_info()
+                traceback.print_tb(tb)
+                del tb                
+                continue
+
+            time.sleep(DELAY)
+
+        # write 
+        logging.info("writing archive to file " + file_path)
+
+        with open(file_path, 'w') as fp:
+            json.dump(threads, fp, indent=4)
+
+        logging.info("done. ")
+
+    return threads
+
+    
+
+def archive_thread(li, base_url, parent_thread_data):
+
+	thread_link = li.select('strong a')[0]
+	thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
+	thread_id = thread_link.get('name')
+	thread_title = thread_link.string
+	thread_author_name = li.select('em')[0].string
+
+	message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+	collect_message(thread_url, message)
+
+	follow = li.select('ul > li')
+	if len(follow) > 0:
+		for f in follow:
+			follow_link = f.select('strong a')
+			if len (follow_link) > 0:
+				archive_thread(f, base_url, message)  ## recursion
+	
+	if parent_thread_data is None:
+		return message
+
+	if u'follow-up' not in parent_thread_data:
+		parent_thread_data[u'follow-up'] = []
+
+	parent_thread_data[u'follow-up'].append(message)
+
+	return message
+
+
+def collect_message(url, message):
+
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    soup = BeautifulSoup(html, "html5lib")    
+
+    #note: this should follow an RFC header standard -- MHonArc has header info in the 1st <ul>
+
+    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')    
+
+    # mhonarc xcomments
+    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+    message['subject'] = parse_xcomment(soup, "X-Subject")
+    message['date'] = parse_xcomment(soup, "X-Date")
+    message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
+    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
+    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')
+
+    # parse what is displayed on the page
+
+    info = soup.select('ul:nth-of-type(1) > li')
+
+    for i in info:
+        if i.em is None:
+            continue
+        field = i.em.string
+        if field.lower() in message_labels:
+            message[field.lower()] = i.text.split(field + ": ", 1)[-1].strip()
+
+    ## reformat from -- [author_name, email_addr]
+
+    # from_addr = email.utils.parseaddr(message['from'])
+    # message['author_name'] = from_addr[0]
+    # message['from'] = from_addr[1]
+
+    ## -- content --
+    # test
+    # c1 = soup.select('pre:nth-of-type(1)')
+    # if len(c1) > 0:
+    #     message['content'] = c1[0].text
+    # else:
+    #     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
+    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
+# mhonarc xcomments
+# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+def parse_xcomment(soup, xcom):
+    com = soup.find(text=re.compile(xcom))
+    if com is not None:
+        return com.split(xcom + ":", 1)[-1].strip()
+    return com
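+# MHonArc pages carry the raw header values as HTML comments (see the
+# printxcomments reference above), e.g. <!--X-Subject: Re: <nettime> ... -->;
+# soup.find(text=...) matches that comment string and the prefix is cut off.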
+
+def test_xcomment(soup):
+    return soup.find(text=re.compile('X-Message-Id')) is not None
diff --git a/lists/pipermail.py b/lists/pipermail.py
index dcdf757..c08c68f 100644
--- a/lists/pipermail.py
+++ b/lists/pipermail.py
@@ -8,7 +8,8 @@ DELAY = 0.2
 def collect_from_url(url, name, base_archive_dir):
 
 	response = urllib.request.urlopen(url)
-	html = response.read().decode(encoding="utf-8")
+	# html = response.read().decode(encoding="utf-8")
+	html = response.read()
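+	# raw bytes: let BeautifulSoup detect the encoding rather than assume UTF-8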
 	soup = BeautifulSoup(html, "html5lib")
 
 	threads_list = soup.find_all('tr')
@@ -195,7 +196,8 @@ def collect_message(url, message):
 	# logging.info("	+ " + url)
 
 	response = urllib.request.urlopen(url)
-	html = response.read().decode(encoding="utf-8")
+	# html = response.read().decode(encoding="utf-8")
+	html = response.read()
 	soup = BeautifulSoup(html, "html5lib")
 
 	if lists.mhonarc.test_xcomment(soup):
diff --git a/search/archive.py b/search/archive.py
index 3dff99f..9f333d9 100644
--- a/search/archive.py
+++ b/search/archive.py
@@ -69,6 +69,10 @@ class Archive():
 				i += 1
 
 			if nbr_hits > 0:
+				# nettime-l - fix (rename the thread from e.g. 'nettime-l_Jan_01' to 'January_2001')
+				if k.startswith("nettime-l_"):
+					dt = datetime.strptime(k, "nettime-l_%b_%y")
+					k = dt.strftime("%B_%Y")
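+					# e.g. "nettime-l_Jan_01" -> datetime(2001, 1, 1) -> "January_2001"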
 				search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
 
 		return search_results
@@ -97,6 +101,12 @@ def get_key(kv_tuple):
 	except Exception:
 		pass
 
+	# nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
+	try:
+		return datetime.strptime(k, "nettime-l_%b_%y")
+	except Exception:
+		pass
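+	# e.g. "nettime-l_Jan_01" parses to datetime(2001, 1, 1), so keys sort chronologically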
+
 	print("--------------")
 	print(k)
 
diff --git a/www/routes.py b/www/routes.py
index 163bb9f..9212258 100644
--- a/www/routes.py
+++ b/www/routes.py
@@ -118,7 +118,7 @@ def searh():
 
 	################################
 	##
-	##	need to chache all the below
+	##	need to cache all the below??
 	##
 	################################
 
@@ -128,7 +128,13 @@ def searh():
 		a.load(l)
 		results.append(a.search(k_arg))
 
-	return jsonify(result=results)
+	## -- sort results by archive name so the response order is stable
+	search_results = sorted(results, key=get_result_key)
+
+	return jsonify(result=search_results)
+
+def get_result_key(r):
+	return r['archive']
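+	# each r is one archive's result dict from Archive.search() (see search/archive.py);
+	# its 'archive' field holds the list name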
 
 	
 
diff --git a/www/static/c3.min.css b/www/static/c3.min.css
old mode 100755
new mode 100644
diff --git a/www/static/c3.min.js b/www/static/c3.min.js
old mode 100755
new mode 100644
diff --git a/www/static/search.js b/www/static/search.js
index 1435edb..437ca59 100644
--- a/www/static/search.js
+++ b/www/static/search.js
@@ -1,18 +1,26 @@
 
 $(document).ready(function(){
-	$('#search').on('submit', function(e) {
+	$('#loading').hide();
+
+	$('#search').submit(function(e) {
 		e.preventDefault();
 		args = $(this).serialize();	
+		$('#graph').empty();
+		$('#results').empty();		
+
+		$('#loading').show();
 		$.get('/search?'+args, function(data) {
+			$('#loading').hide();
 			console.log(data);
-			$('#graph').empty();
-			$('#results').empty();
+			// $('#graph').empty();
+			// $('#results').empty();
 			$.each(data.result, function(i, item) {
 				search_result_archive(item);
 			});
-			graph(data);
+			graph(data);		
 		});
 	});
+
 });
 
 function search_result_archive(a) {	
diff --git a/www/templates/search.html b/www/templates/search.html
index 5ccee0a..472ba68 100644
--- a/www/templates/search.html
+++ b/www/templates/search.html
@@ -16,6 +16,7 @@
 			 {% endfor %}
 		
 		
+		<div id="loading">Loading...</div>