diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
index 951da93..18c05fa
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
# mailinglists specific
archives/
+figs/
config.py
# Byte-compiled / optimized / DLL files
diff --git a/analyse.py b/analyse.py
new file mode 100644
index 0000000..bef4381
--- /dev/null
+++ b/analyse.py
@@ -0,0 +1,230 @@
+import os
+
+# matplot view/windows
+import matplotlib
+matplotlib.interactive(True)
+
+# pd display
+import pandas as pd
+pd.set_option('display.max_colwidth', 100)
+
+from analysis.archive import Archive
+from analysis.query import Query
+from analysis.plot import Plot
+
+import analysis.format
+
+# spectre: slategrey
+# nettime: red
+# crumb: purple
+# empyre: darkblue
+
+def save_fig_cohort(q, name, dir, color):
+ t = name + " - Cohorts"
+ pp = q.cohort().plot(color=color, title=t)
+ ts = name + "_cohorts.png"
+ filename = os.path.join(dir, ts)
+ pp.get_figure().savefig(filename)
+
+def save_fig_messages_total(q, name, dir, color):
+ t = name + " - Nbr. Messages"
+ pp = q.activity_overall().plot(kind='bar', color=color, title=t)
+ ts = name + "_messages.png"
+ filename = os.path.join(dir, ts)
+ pp.get_figure().savefig(filename)
+
+def save_fig_threads_total(q, name, dir, color):
+ t = name + " - Nbr. Threads"
+ pp = q.threads_overall().plot(kind='bar', color=color, title=t)
+ ts = name + "_threads.png"
+ filename = os.path.join(dir, ts)
+ pp.get_figure().savefig(filename)
+
+def save_fig_messages_constituency(q, name, dir):
+ t = name + " - Messages Constituency"
+ replies = pd.Series(q.replies_overall(series=True))
+ # threads = pd.Series(q.single_threads_overall(series=True))
+ threads = pd.Series(q.threads_overall(series=True))
+ messages = pd.Series(q.activity_overall(series=True))
+ single_messages = messages - (replies + threads)
+
+ # df = {'a': single_messages, 'b': threads, 'c': replies}
+ # df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
+ df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1)
+ pp = df.plot(kind='bar', stacked=True, title=t)
+
+ # pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)
+
+ ts = name + "_constituency.png"
+ filename = os.path.join(dir, ts)
+ pp.get_figure().savefig(filename)
+
+def save_fig_avg_threads_replies(q, name, dir, color):
+ t = name + " - Avg. Threads + Replies"
+ replies = pd.Series(q.replies_overall(series=True))
+ threads = pd.Series(q.threads_overall(series=True))
+ messages = pd.Series(q.activity_overall(series=True))
+
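+    # (threads + replies) as a fraction of all messages per period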
+ avg_threads_messages = (replies + threads) / messages
+
+ pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)
+
+ ts = name + "_avg_threads_replies.png"
+ filename = os.path.join(dir, ts)
+ pp.get_figure().savefig(filename)
+
+def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
+ t = name + " - Diff. Threads + Replies vs Single Messages"
+ replies = pd.Series(q.replies_overall(series=True))
+ threads = pd.Series(q.threads_overall(series=True))
+ rt = replies + threads
+ messages = pd.Series(q.activity_overall(series=True))
+
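+    # (threads + replies) minus single messages: 2*rt - messages == rt - (messages - rt)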
+ diff_threads_messages = (2 * rt) - messages
+
+ pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)
+
+ ts = name + "_diff_threads_replies_messages.png"
+ filename = os.path.join(dir, ts)
+ pp.get_figure().savefig(filename)
+
+def save_fig_ratio_replies_threads(q, name, dir, color):
+ t = name + " - Ratio Replies per Thread"
+ replies = pd.Series(q.replies_overall(series=True))
+ threads = pd.Series(q.threads_overall(series=True))
+
+ ratio_replies_threads = replies / threads
+
+ pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)
+
+ ts = name + "_ratio_replies_threads.png"
+ filename = os.path.join(dir, ts)
+ pp.get_figure().savefig(filename)
+
+def html_td_rank_year(year, data):
+    td_str = '<td>'
+    if year in data:
+        td_str += analysis.format.table_threads_ranking(data[year])
+    td_str += '</td>'
+    return td_str
+
+def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):
+
+    html_str = '<table>'
+
+    html_str += '<tr>'
+    html_str += '<td>year</td>'
+    html_str += '<td>nettime</td>'
+    html_str += '<td>crumb</td>'
+    html_str += '<td>spectre</td>'
+    html_str += '<td>empyre</td>'
+    html_str += '</tr>'
+
+    years = sorted(ranking_nettime.keys())
+
+    print(years)
+
+    for i in years:
+        html_str += '<tr>'
+        html_str += '<td>' + i + '</td>'
+        html_str += html_td_rank_year(i, ranking_nettime)
+        html_str += html_td_rank_year(i, ranking_crumb)
+        html_str += html_td_rank_year(i, ranking_spectre)
+        html_str += html_td_rank_year(i, ranking_empyre)
+        html_str += '</tr>'
+
+    html_str += '</table>'
+    return html_str
+
+
+print("nettime")
+#nettime
+nt = Archive('nettime-l')
+ntq = nt.query()
+ntp = Plot(ntq)
+
+
+
+# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
+# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
+# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
+# save_fig_messages_constituency(ntq, 'nettime', 'figs/')
+
+# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
+# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
+# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')
+
+ranking_nettime = ntq.threads_ranking(rank=15)
+
+# print(r['2000'])
+
+# print(analysis.format.table_threads_ranking(r['2000']))
+
+
+print("crumb")
+#crumb
+cr = Archive('crumb')
+crq = cr.query()
+crp = Plot(crq)
+
+# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
+# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
+# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
+# save_fig_messages_constituency(crq, 'crumb', 'figs/')
+
+# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
+# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
+# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')
+
+ranking_crumb = crq.threads_ranking(rank=15)
+
+
+print("empyre")
+#empyre
+em = Archive('empyre')
+emq = em.query()
+emp = Plot(emq)
+
+# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_messages_constituency(emq, 'empyre', 'figs/')
+
+# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
+# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')
+
+ranking_empyre = emq.threads_ranking(rank=15)
+
+print("spectre")
+#spectre
+sp = Archive('spectre')
+spq = sp.query()
+spp = Plot(spq)
+
+# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_messages_constituency(spq, 'spectre', 'figs/')
+
+# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
+# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')
+
+ranking_spectre = spq.threads_ranking(rank=15)
+
+
+## comparative ranking
+
+rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)
+
+html_template = 'figs/ranking/index_template.html'
+with open(html_template, 'r') as fp:
+ h = fp.read()
+
+html = h.replace("--table--", rankings)
+
+html_output = 'figs/ranking/index.html'
+with open(html_output, 'w+') as fp:
+ fp.write(html)
+
diff --git a/analysis/archive.py b/analysis/archive.py
new file mode 100644
index 0000000..597615a
--- /dev/null
+++ b/analysis/archive.py
@@ -0,0 +1,152 @@
+import numpy as np
+import pandas as pd
+import email, email.parser
+import os, datetime, json, gzip, re
+import analysis.util
+import analysis.query
+
+
+def filter_date(msg, archive_name):
+
+ time_tz = analysis.util.format_date(msg, archive_name)
+ if not time_tz:
+ return None
+
+ dt = datetime.datetime.fromtimestamp(time_tz)
+ try:
+ date_time = pd.to_datetime(dt)
+ except pd.tslib.OutOfBoundsDatetime:
+ print('time out of bound')
+ print(dt)
+ return None
+
+ min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
+ max_date = pd.to_datetime(datetime.datetime.now())
+ if date_time < min_date or date_time > max_date:
+ return None
+
+ return date_time
+
+
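+# flatten one message (and, recursively, its follow-ups) into a tuple record;
+# references='X' marks a top-level message, i.e. one that is not itself a reply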
+def message_to_tuple_record(msg, records, archive_name, references='X'):
+
+ # check date first?
+ date = filter_date(msg, archive_name)
+ if not date:
+ print("Archive::filter_date returned None. Skip.")
+ return
+
+ # check / filter from email address second?
+ from_addr = analysis.util.format_from(msg, archive_name)
+ if not from_addr:
+ print("Archive::analysis.util.format_from returned None. Skip.")
+ return
+
+ url = analysis.util.format_url(msg, archive_name)
+ author = analysis.util.format_author(msg, archive_name)
+ subject = analysis.util.format_subject(msg, archive_name)
+ message_id = analysis.util.format_id(msg, archive_name)
+ content = analysis.util.format_content(msg, archive_name)
+
+ records.append((message_id,
+ from_addr,
+ author,
+ subject,
+ date,
+ url,
+ len(content),
+ 0 if not 'follow-up' in msg else len(msg['follow-up']),
+ references))
+
+ # recursive follow up -- but references is not keeping track really...
+ if 'follow-up' in msg:
+ for f in msg['follow-up']:
+ message_to_tuple_record(f, records, archive_name, references=message_id)
+
+ return
+
+def json_data_to_pd_dataframe(json_data, archive_name):
+
+ records = []
+ for d in json_data:
+ for dd in d['threads']:
+ message_to_tuple_record(dd, records, archive_name)
+
+    print(archive_name + ": " + str(len(records)) + " records")
+
+ df = pd.DataFrame.from_records(records,
+ index='date',
+ columns=['message-id',
+ 'from',
+ 'author',
+ 'subject',
+ 'date',
+ 'url',
+ 'content-length',
+ 'nbr-references',
+ 'references'])
+
+ df.index.name = 'date'
+
+ return df
+
+def load_from_file(filename, archive_name, archive_dir, json_data=None):
+
+ if not filename.endswith('.json.gz'):
+ file_path = os.path.join(archive_dir, filename + '.json.gz')
+ else:
+ file_path = os.path.join(archive_dir, filename)
+
+ if os.path.isfile(file_path):
+ with gzip.open(file_path, 'r') as fp:
+ json_data = json.load(fp)
+ return json_data_to_pd_dataframe(json_data['threads'], archive_name)
+ else:
+ #list of all "filename[...].json.gz" in archive_dir
+ files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
+ if files:
+ filename = files[-1] # take the most recent (listed alpha-chronological)
+ file_path = os.path.join(archive_dir, filename)
+ if os.path.isfile(file_path):
+ with gzip.open(file_path, 'r') as fp:
+ json_data = json.load(fp)
+ return json_data_to_pd_dataframe(json_data['threads'], archive_name)
+ else:
+ #list of all json files in archive_dir/filename
+ dir_path = os.path.join(archive_dir, filename)
+ if not os.path.isdir(dir_path):
+ return None
+
+ files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
+ if not files:
+ return None
+
+ # load all json files
+ threads = []
+ for file_path in files:
+ with open(file_path, 'r') as fp:
+ json_data = json.load(fp)
+ threads.append(json_data)
+
+ print('---> ' + archive_name)
+ return json_data_to_pd_dataframe(threads, archive_name)
+
+
+class Archive:
+
+ data = None # "raw" json data
+ dataframe = None # main pd dataframe
+
+ def __init__(self, archive_name, archive_dir="archives"):
+
+ if isinstance(archive_name, pd.core.frame.DataFrame):
+ self.dataframe = archive_name.copy()
+
+ if isinstance(archive_name, str):
+ # need a filename or a dir name....
+ self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)
+
+ def query(self):
+ q = analysis.query.Query(self)
+ return q
+
diff --git a/analysis/format.py b/analysis/format.py
new file mode 100644
index 0000000..4c8e8b0
--- /dev/null
+++ b/analysis/format.py
@@ -0,0 +1,158 @@
+import analysis.query
+import logging, html, numpy
+from tabulate import tabulate
+
+def makeurl(text, url):
+    return '<a href="' + url + '">' + text + "</a>"
+
+def table_threads_ranking(ranking_dataframe):
+
+    html_str = '<table>'
+
+
+    html_str += '<tr>'
+    html_str += '<td>date</td>'
+    html_str += '<td>subject</td>'
+    html_str += '<td>from</td>'
+    html_str += '<td>replies</td>'
+    html_str += '</tr>'
+
+
+    for i, row in ranking_dataframe.iterrows():
+
+        html_str += '<tr>'
+        html_str += '<td>' + str(i) + '</td>'
+        html_str += '<td>' + makeurl(row['subject'], row['url']) + '</td>'
+        html_str += '<td>' + row['from'] + '</td>'
+        html_str += '<td>' + str(row['nbr-references']) + '</td>'
+        html_str += '</tr>'
+
+    html_str += "</table>"
+
+    return html_str
+
+
+
+
+class Html:
+
+ query = None
+
+ def __init__(self, q=None):
+
+        if not isinstance(q, analysis.query.Query):
+            logging.error("Html constructor Error: query must be of type analysis.query.Query")
+ raise Exception()
+
+ self.query = q
+
+ def threads_ranking(self, rank=5, resolution=None):
+
+ data = self.query.threads_ranking(rank=rank)
+
+ h = html.HTML()
+ t = h.table()
+
+ r = t.tr
+ r.td('date', klass='td_date_t')
+ r.td('from', klass='td_from_t')
+ r.td('replies', klass='td_rep_t')
+ r.td('subject', klass='td_subject_t')
+
+ for i, row in data.iterrows():
+ r = t.tr
+
+ print(row.index)
+
+ r.td(str(row['date']), klass='td_date')
+ r.td(row['from'], klass='td_from')
+ r.td(str(row['nbr-references']), klass='td_rep')
+ r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)
+
+ return str(t)
+
+ @staticmethod
+ def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):
+
+ header = []
+ if data_frame.index.name in name_map:
+ header.append(name_map[data_frame.index.name])
+ else:
+ header.append(data_frame.index.name)
+ for h in data_frame.columns:
+ if h in name_map:
+ h = name_map[h]
+ header.append(h)
+
+ css_header = []
+ css_element = []
+ for i in header:
+ css_header.append('td_' + i + '_t')
+ css_element.append('td_' + i)
+
+ h = html.HTML()
+ if table_name:
+ t = h.table(id=table_name, klass=table_name + '_t')
+ else:
+ t = h.table()
+
+ # url map
+ url_hash = {}
+ url_skip = []
+ url_keys = url_map.keys()
+ for u in url_keys:
+ if u in header and url_map[u] in header:
+ url_indx = header.index(url_map[u])
+ url_hash[header.index(u)] = url_indx
+ url_skip.append(url_indx)
+ header.pop(url_indx)
+
+ #header
+ r = t.tr
+ n = 0
+ for j in header:
+ r.td(str(j), klass=css_header[n])
+ n += 1
+
+
+ #elements
+ for k, row in data_frame.iterrows():
+ r = t.tr
+ r.td(str(k), klass=css_element[0])
+ n = 1
+ for l in row:
+
+ if n in url_skip:
+ continue
+
+ if isinstance(l, float):
+ if l % 1 > 0:
+ l = '{0:.4f}'.format(l)
+ else:
+ l = int(l)
+
+ if n in url_hash.keys():
+ url = row[url_hash[n] - 1]
+ r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
+
+ else:
+ r.td(str(l), klass=css_element[n])
+ n += 1
+
+ return str(t)
+
+class Tab:
+
+ @staticmethod
+ def from_dataframe(data_frame, name_map={}, format=".0f"):
+
+ header = []
+ header.append(data_frame.index.name)
+ for h in data_frame.columns:
+ if h in name_map:
+ h = name_map[h]
+ header.append(h)
+
+ return tabulate(data_frame, headers=header, floatfmt=format)
+
+
diff --git a/analysis/plot.py b/analysis/plot.py
new file mode 100644
index 0000000..8f08430
--- /dev/null
+++ b/analysis/plot.py
@@ -0,0 +1,79 @@
+import numpy as np
+import pandas as pd
+import analysis.query
+import logging
+
+# for colormaps see:
+# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
+# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
+# http://matplotlib.org/examples/color/colormaps_reference.html
+# for colors see:
+# http://matplotlib.org/examples/color/named_colors.html
+
+# spectre: slategrey
+# nettime: red
+# crumb: purple
+# empyre: darkblue
+
+def bar_plot_series(series, title, color='blueviolet', ylim=None):
+ return series.plot(kind = 'bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)
+
+def save(plot, name):
+ fig = plot.get_figure()
+ fig.savefig(name)
+
+class Plot:
+
+ query = None
+
+ def __init__(self, q=None):
+
+ if not isinstance(q, analysis.query.Query):
+            logging.error("Plot constructor Error: query must be of type analysis.query.Query")
+ raise Exception()
+
+ self.query = q
+
+ '''
+ activity
+ '''
+
+ def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
+
+ activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys()
+ series = []
+ for k in activity_rank:
+ series.append(self.query.activity_from(k, resolution, series=True))
+
+ df = pd.concat(series, axis=1)
+
+        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
+
+ '''
+    content length
+ '''
+
+ def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
+
+ content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys()
+ series = []
+ for k in content_rank:
+ series.append(self.query.content_length_from(k, resolution, series=True))
+
+ df = pd.concat(series, axis=1)
+
+ return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
+
+ '''
+ threads
+ '''
+
+ def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
+
+ threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys()
+ series = []
+ for k in threads_rank:
+ series.append(self.query.threads_from(k, resolution, series=True))
+
+ df = pd.concat(series, axis=1)
+
+ return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
diff --git a/analysis/query.py b/analysis/query.py
new file mode 100644
index 0000000..5b46488
--- /dev/null
+++ b/analysis/query.py
@@ -0,0 +1,573 @@
+import numpy as np
+import pandas as pd
+import analysis.archive
+import logging
+
+class Query:
+
+ archive = None # analysis.archive.Archive object
+ activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
+ content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
+ threads = None # ...
+ single_threads = None
+ replies = None # ...
+
+ def __init__(self, arch=None):
+
+ if not isinstance(arch, analysis.archive.Archive):
+ logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
+ raise Exception()
+
+ self.archive = arch
+
+ '''
+ activity
+ '''
+
+ def _activity(self):
+
+ if self.activity is None:
+ from_index = self.archive.dataframe.reindex(columns=['from'])
+ self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)
+
+ return self.activity
+
+ def activity_from(self, email_address, resolution='y', series=False):
+
+ eaddr = email_address.replace('@', '{at}').lower()
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ self._activity()
+ try:
+ af = self.activity[eaddr]
+ except KeyError:
+ return None
+
+ activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
+
+ if freq == 'AS':
+ activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
+ activity_from.index.name = 'year'
+ else:
+ activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ activity_from.index.name = 'year-month'
+
+ if series:
+ return activity_from
+
+ return activity_from.to_frame('nbr-messages').astype(int)
+
+ def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
+
+ self._activity()
+ afr = self.activity.sum(axis=0).order(ascending=False)
+ if filter_nettime:
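+            # exclude from-addresses containing "nettime" (e.g. list-owner/admin posts)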
+ p = r'^((?!nettime*).)*$'
+ afr = afr[afr.index.str.contains(p)]
+
+ if series:
+ return afr[:rank]
+
+ return afr[:rank].to_frame('nbr-messages').astype(int)
+
+
+ # def activity_overall(self, resolution='y', series=False):
+
+ # freq = 'M'
+ # if resolution.lower() == 'y':
+ # freq = 'AS'
+ # elif resolution.lower() == 'm':
+ # freq = 'M'
+ # else:
+ # return None
+
+ # self._activity()
+
+ # y = self.activity.sum(axis=1)
+ # y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
+
+ # if freq == 'AS':
+ # y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+ # y.index.name = 'year'
+ # else:
+ # y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ # y.index.name = 'year-month'
+
+ # if series:
+ # return y
+
+ # return y.to_frame('nbr-messages').astype(int)
+
+ def activity_overall(self, resolution='y', series=False):
+
+ a = self.archive.dataframe['url']
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()
+
+ if freq == 'AS':
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+ y.index.name = 'year'
+ else:
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ y.index.name = 'year-month'
+
+ if series:
+ return y
+
+ return y.to_frame('nbr-messages').astype(int)
+
+ def cohort(self, resolution='m', series=False):
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ self._activity()
+
+ c = self.activity.idxmax().order().to_frame('date')
+ c.index = c['date']
+
+ cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()
+
+ if freq == 'AS':
+ cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))
+ cohort.index.name = 'year'
+ else:
+ cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ cohort.index.name = 'year-month'
+
+ if series:
+ return cohort
+
+ return cohort.to_frame('first-messages').astype(int)
+
+ '''
+    content length
+ '''
+
+ def _content_length(self):
+
+ if self.content_length is None:
+ from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
+ self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
+ self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)
+
+ return self.content_length
+
+ def content_length_from(self, email_address, resolution='y', series=False):
+
+ eaddr = email_address.replace('@', '{at}').lower()
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ self._content_length()
+ try:
+ af = self.content_length[eaddr]
+ except KeyError:
+ return None
+
+ content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
+
+ if freq == 'AS':
+ content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
+ content_length_from.index.name = 'year'
+ else:
+ content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ content_length_from.index.name = 'year-month'
+
+ if series:
+ return content_length_from
+
+ return content_length_from.to_frame('nbr-bytes').astype(int)
+
+ def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
+
+ self._content_length()
+ cfr = self.content_length.sum(axis=0).order(ascending=False)
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ cfr = cfr[cfr.index.str.contains(p)]
+
+ if series:
+ return cfr[:rank]
+
+ return cfr[:rank].to_frame('nbr-bytes').astype(int)
+
+ def content_length_overall(self, resolution='y', series=False):
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ self._content_length()
+
+ y = self.content_length.sum(axis=1)
+ y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
+
+ if freq == 'AS':
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+ y.index.name = 'year'
+ else:
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ y.index.name = 'year-month'
+
+ if series:
+ return y
+
+ return y.to_frame('nbr-bytes').astype(int)
+
+
+ '''
+ threads
+ '''
+
+ def _threads(self, thresh=0):
+
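+        # threads: messages with more than `thresh` replies;
+        # single_threads: the subset that are themselves top-level (references == 'X')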
+ print("doing threads")
+
+ if self.threads is None:
+ self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
+
+ if self.single_threads is None:
+ self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
+
+        return self.threads
+
+ def threads_ranking(self, rank=5, resolution='y'):
+
+ self._threads()
+
+        if resolution is None:
+ data = self.threads.drop('message-id', axis=1)[:rank]
+ return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ # get the threads ranking per time resolution
+ #
+ data = self.threads.drop('message-id', axis=1)
+ data = data.groupby([pd.TimeGrouper(freq=freq)])
+ r = {}
+ for k, v in data:
+ if freq == 'AS':
+ time_key = k.strftime('%Y')
+ else:
+ time_key = k.strftime('%Y-%m')
+ frame = v[:rank]
+ frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
+ r[time_key] = frame
+ return r
+
+ def threads_replies_to(self, email_address, resolution='y', series=False):
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ self._threads()
+
+ eaddr = email_address.replace('@', '{at}').lower()
+
+ self._threads()
+ threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
+ threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
+ threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
+
+ if series:
+ return threads_from_ranking[eaddr]
+
+ threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)
+
+ if freq == 'AS':
+ threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
+ threads_from_ranking.index.name = 'year'
+ else:
+ threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ threads_from_ranking.index.name = 'year-month'
+
+ return threads_from_ranking
+
+ def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
+
+ self._threads()
+
+ tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)
+
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ tfr = tfr[tfr.index.str.contains(p)]
+
+ tfr = tfr[:rank].astype(int)
+ return tfr
+
+ def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
+
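+        # counts messages with replies per sender; threads_replies_to_ranking sums the replies received instead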
+ self._threads()
+ tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ tir = tir[tir.index.str.contains(p)]
+
+ if series:
+ return tir[:rank]
+
+ return tir[:rank].to_frame('nbr-initiated-threads').astype(int)
+
+ def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
+
+ # activity
+ self._activity()
+ afr = self.activity.sum(axis=0).astype(int)
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ afr = afr[afr.index.str.contains(p)]
+
+ # initiated threads [top 25]
+ self._threads()
+ tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ tir = tir[tir.index.str.contains(p)]
+
+ inter = afr.index.intersection(tir.index)
+ avg = tir[inter] / afr[inter]
+
+ labels = ['messages', 'threads', 'avg.threads']
+ return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]
+
+ def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
+
+ self._threads()
+
+ #initiated
+ tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ tir = tir[tir.index.str.contains(p)]
+
+ #replies [top 25]
+ tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ tfr = tfr[tfr.index.str.contains(p)]
+ tfr = tfr['nbr-references'] # dataframe to series
+
+
+ inter = tir.index.intersection(tfr.index)
+ avg = tfr[inter] / tir[inter]
+
+ labels = ['threads', 'replies', 'avg.replies']
+ return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
+
+
+ def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ agg = aggregate.lower()
+ if not agg in ['sum', 'mean', 'count']:
+ return None
+
+ if not self.threads is None:
+ del self.threads
+ self.threads = None
+
+ self._threads(tresh)
+
+ if agg == 'sum':
+ # number of replies total (re: sum all the replies)
+ y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
+ elif agg == 'mean':
+ y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
+ else:
+ # number of threads (re: msgs with at least one reply)
+ y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
+
+ if freq == 'AS':
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+ y.index.name = 'year'
+ else:
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ y.index.name = 'year-month'
+
+ if series:
+ return y
+
+ return y.to_frame('nbr-threads').astype(int)
+
+ def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ agg = aggregate.lower()
+ if not agg in ['sum', 'mean', 'count']:
+ return None
+
+ if not self.single_threads is None:
+ del self.single_threads
+ self.single_threads = None
+
+ self._threads(tresh)
+
+
+ y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
+
+
+ if freq == 'AS':
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+ y.index.name = 'year'
+ else:
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ y.index.name = 'year-month'
+
+ if series:
+ return y
+
+ return y.to_frame('nbr-threads').astype(int)
+
+
+ '''
+ replies
+ '''
+
+ def _replies(self):
+
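+        # replies: messages that reference a parent (references != 'X'); non_replies: top-level posts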
+ if self.replies is None:
+ self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from','references'])
+ self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from','references'])
+        return self.replies
+
+ def replies_ranking(self, rank=5, resolution=None):
+
+ self._replies()
+
+        if resolution is None:
+ data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
+ return data.to_frame('nbr_replies')
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ # get the threads ranking per time resolution
+ #
+ data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
+ r = {}
+ for k, v in data:
+ if freq == 'AS':
+ time_key = k.strftime('%Y')
+ else:
+ time_key = k.strftime('%Y-%m')
+ frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
+ r[time_key] = frame.to_frame('nbr-replies')
+ return r
+
+ def replies_avg_ranking(self, rank=5, filter_nettime=True):
+
+ # activity
+ self._activity()
+ afr = self.activity.sum(axis=0)
+ if filter_nettime:
+ p = r'^((?!nettime*).)*$'
+ afr = afr[afr.index.str.contains(p)]
+
+ # replies in thread [top 25]
+
+ self._replies()
+        rpl = self.replies.groupby('from').size().sort_values(ascending=False)[:25]
+
+ inter = afr.index.intersection(rpl.index)
+ avg = rpl[inter] / afr[inter]
+
+ labels = ['messages', 'replies', 'avg.replies']
+ return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
+
+ def replies_overall(self, resolution='y', series=False):
+
+ freq = 'M'
+ if resolution.lower() == 'y':
+ freq = 'AS'
+ elif resolution.lower() == 'm':
+ freq = 'M'
+ else:
+ return None
+
+ if not self.replies is None:
+ del self.replies
+ self.replies = None
+
+ self._replies()
+
+ y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()
+
+
+ if freq == 'AS':
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
+ y.index.name = 'year'
+ else:
+ y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
+ y.index.name = 'year-month'
+
+ if series:
+ return y
+
+ return y.to_frame('nbr-replies').astype(int)
+
+
+
+
diff --git a/analysis/util.py b/analysis/util.py
new file mode 100644
index 0000000..cd39f54
--- /dev/null
+++ b/analysis/util.py
@@ -0,0 +1,81 @@
+import email
+import hashlib
+
+def format_content(msg, archive_name):
+ return msg['content']
+
+def format_url(msg, archive_name):
+ return msg['url']
+
+def format_author(msg, archive_name):
+ return msg['author_name']
+
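+# reduce an obfuscated From header (e.g. "user at example.com") to "user{at}example.com"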
+def format_from_token(from_str, sep):
+ from_addr = email.utils.parseaddr(from_str)[1]
+ if sep not in from_addr:
+ tok = from_str.split()
+ try:
+ at = tok.index(sep)
+ from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
+ if from_addr.startswith('<') or from_addr.endswith('>'):
+ from_addr = from_addr.strip('<').strip('>')
+ except ValueError:
+ print(tok)
+            print("error formatting 'from' " + from_str + " -- expecting sep: " + sep)
+ return None
+ else:
+ from_addr = from_addr.replace(sep, '{AT}')
+ return from_addr.lower()
+
+def format_from(msg, archive_name):
+ from_str = msg['from']
+
+ if " {AT} " in from_str:
+ return format_from_token(from_str, '{AT}')
+ elif " at " in from_str:
+ return format_from_token(from_str, 'at')
+ elif "@" in from_str:
+ return format_from_token(from_str, '@')
+ else:
+ return from_str
+
+# returns utc timestamp
+def format_date(msg, archive_name):
+ date_str = msg['date']
+ time_tz = None
+    try:
+        date_tz = email.utils.parsedate_tz(date_str)
+        time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
+    except TypeError:
+        print("Format Date TypeError")
+        print(" > " + date_str)
+    except ValueError:
+        print("Format Date ValueError")
+        print(" > " + date_str)
+    return time_tz
+
+def format_subject(msg, archive_name):
+ return msg['subject']
+
+def format_id(msg, archive_name):
+ if "message-id" in msg:
+ return msg['message-id']
+ else:
+ # create hash with author_name + date
+ s = msg['author_name'] + msg['date']
+ sha = hashlib.sha1(s.encode('utf-8'))
+ return sha.hexdigest()
+
+# format='%d/%m/%Y'
+def min_date(archive_name):
+ if "nettime" in archive_name:
+ return '01/10/1995'
+ elif archive_name == "spectre":
+ return '01/08/2001'
+ elif archive_name == "empyre":
+ return '01/01/2002'
+ elif archive_name == "crumb":
+ return '01/02/2001'
diff --git a/lists/crawl.py b/lists/crawl.py
index 5bcb38b..73529e2 100644
--- a/lists/crawl.py
+++ b/lists/crawl.py
@@ -1,10 +1,12 @@
from urllib.parse import urlparse
import lists.pipermail as pipermail
import lists.listserv as listserv
+import lists.mhonarc as mhonarc
+import lists.mhonarc_nettime as mhonarc_nettime
DELAY = 0.2
-def crawl(url, name, archive_dir):
+def crawl(url, name, sublist_name=None, archive_dir="archives"):
u = urlparse(url)
# the following type 'tests' are very weak...
@@ -21,6 +23,11 @@ def crawl(url, name, archive_dir):
elif 'cgi-bin' in u.path:
listserv.collect_from_url(url, name, archive_dir)
+    # special case -- nettime:
+    # the name doubles as the sublist_name (i.e. nettime-l)
+ elif "nettime" in name:
+ mhonarc_nettime.collect_from_url(url, name, name, archive_dir)
+
else:
print('mhonarc?')
diff --git a/lists/listserv.py b/lists/listserv.py
index df3230d..c17287b 100644
--- a/lists/listserv.py
+++ b/lists/listserv.py
@@ -43,6 +43,17 @@ def collect_from_url(url, name, base_archive_dir):
del tb
continue
+ # archive['name'] = name
+ # archive['list'] = threads
+
+ # file_path = os.path.join(base_arch_dir, name + '.json')
+
+ # with open(file_path, 'w') as fp:
+ # json.dump(archive, fp, indent=4)
+
+ # logging.info("done.")
+
+
def collect_threads_from_url(url, name, base_arch_dir):
threads = {'name' : name, 'url' : url, 'threads' : []}
diff --git a/lists/mhonarc.py b/lists/mhonarc.py
index 68398df..1ba9c35 100644
--- a/lists/mhonarc.py
+++ b/lists/mhonarc.py
@@ -4,22 +4,27 @@ from bs4 import BeautifulSoup
DELAY = 0.2
-def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
+def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
response = urllib.request.urlopen(url)
- html = response.read().decode(encoding="utf-8")
+ html = response.read()
soup = BeautifulSoup(html, "html5lib")
# base url
base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
#collect name
- list_name = soup.select('body p:nth-of-type(2) base title')[0].string
+ list_name = soup.select('body p:nth-of-type(2) title')[0].string
logging.info("Getting " + list_name + " list archive for " + sublist_name)
- lists = soup.select('ul:nth-of-type(2) li')
+ # create (main) directory
+ # this is where all temp files will be created
+ d = os.path.join(base_archive_dir, name)
+ if not os.path.exists(d):
+ os.makedirs(d)
threads = []
+ lists = soup.select('ul:nth-of-type(2) li')
for l in lists:
@@ -33,31 +38,41 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
threads_url_list = []
threads_links = l.select('ul li a')
for t in threads_links:
- thread_url = urlparse.urljoin(base_url, t.get('href'))
+ thread_url = urllib.parse.urljoin(base_url, t.get('href'))
threads_url_list.append(thread_url)
nbr_threads = str(len(threads_url_list))
n = 0
for u in threads_url_list:
+ time.sleep(DELAY)
n += 1
- logging.info("## " + str(n) + " / " + nbr_threads + " ##")
- threads.append(collect_threads_from_url(u, base_arch_dir, mbox))
+ logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+ try:
+ threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
+ except KeyboardInterrupt:
+ sys.exit(0)
+ except:
+                    logging.warning("Error archiving: " + u + "... Continuing.")
+ ex_t, ex, tb = sys.exc_info()
+ print(ex_t)
+ traceback.print_tb(tb)
+ del tb
+ continue
return threads
# for u in threads_url_list[0:10]:
# print "---------------------------------------"
- # tt = collect_threads_from_url(u, base_arch_dir, mbox)
- # threads.append(tt)
-
+ # tt = collect_threads_from_url(u, base_archive_dir, mbox)
+ # threads.append(tt)
return None
-def collect_threads_from_url(url, base_arch_dir, mbox):
+def collect_threads_from_url(url, base_archive_dir, mbox=False):
response = urllib.request.urlopen(url)
- html = response.read().decode(encoding="utf-8")
+ html = response.read()
soup = BeautifulSoup(html, "html5lib")
# base url
@@ -73,7 +88,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
logging.info("Collecting Threads of: " + threads_name)
# check if archive already exists
- file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
+ file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
if os.path.isfile(file_path):
logging.info("archive already exists. loading from file " + file_path)
with open(file_path, 'r') as fpin:
@@ -114,7 +129,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
def archive_thread(li, base_url, parent_thread_data):
thread_link = li.select('strong a')[0]
- thread_url = urlparse.urljoin(base_url, thread_link.get('href'))
+ thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
thread_id = thread_link.get('name')
thread_title = thread_link.string
thread_author_name = li.select('em')[0].string
@@ -145,6 +160,7 @@ def collect_message(url, message):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
+ # html = response.read()
soup = BeautifulSoup(html, "html5lib")
#note: this should follow an RFC header standard -- MHonArc puts the header info in the first <ul>
@@ -184,6 +200,8 @@ def collect_message(url, message):
else:
message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+ # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
new file mode 100644
index 0000000..5476f37
--- /dev/null
+++ b/lists/mhonarc_nettime.py
@@ -0,0 +1,214 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip
+from bs4 import BeautifulSoup
+
+DELAY = 0.2
+
+def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
+
+ response = urllib.request.urlopen(url)
+ html = response.read()
+ soup = BeautifulSoup(html, "html5lib")
+
+ # base url
+ base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+
+ #collect name
+ list_name = soup.select('body p:nth-of-type(2) title')[0].string
+ logging.info("Getting " + list_name + " list archive for " + sublist_name)
+
+ # create (main) directory
+ # this is where all temp files will be created
+ d = os.path.join(base_archive_dir, name)
+ if not os.path.exists(d):
+ os.makedirs(d)
+
+ threads = []
+ lists = soup.select('ul:nth-of-type(2) li')
+
+ for l in lists:
+
+ if l.strong is None:
+ continue
+
+ name = l.strong.string
+
+ if name.lower() == sublist_name.lower():
+
+ threads_url_list = []
+ threads_links = l.select('ul li a')
+ for t in threads_links:
+ thread_url = urllib.parse.urljoin(base_url, t.get('href'))
+ threads_url_list.append(thread_url)
+
+ nbr_threads = str(len(threads_url_list))
+ n = 0
+
+ for u in threads_url_list:
+ time.sleep(DELAY)
+ n += 1
+ logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+ try:
+ threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
+ except KeyboardInterrupt:
+ sys.exit(0)
+ except:
+                    logging.warning("Error archiving: " + u + "... Continuing.")
+ ex_t, ex, tb = sys.exc_info()
+ print(ex_t)
+ traceback.print_tb(tb)
+ del tb
+ continue
+
+ return threads
+
+ # for u in threads_url_list[0:10]:
+ # print "---------------------------------------"
+ # tt = collect_threads_from_url(u, base_archive_dir, mbox)
+ # threads.append(tt)
+
+ return None
+
+def collect_threads_from_url(url, base_archive_dir, mbox=False):
+
+ response = urllib.request.urlopen(url)
+ html = response.read()
+ soup = BeautifulSoup(html, "html5lib")
+
+ # base url
+ base_url = url
+
+ # collect name
+ threads_name = soup.select('p:nth-of-type(1) title')[0].string
+ threads_name = threads_name.replace(' ', '_')
+
+ # thread data struct
+ threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
+
+ logging.info("Collecting Threads of: " + threads_name)
+
+ # check if archive already exists
+ file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
+ if os.path.isfile(file_path):
+ logging.info("archive already exists. loading from file " + file_path)
+ with open(file_path, 'r') as fpin:
+ threads = json.load(fpin)
+ else:
+ lists = soup.select('ul:nth-of-type(1) > li')
+
+ nbr_threads = str(len(lists))
+ n = 0
+
+ for l in lists:
+ n += 1
+ logging.info("> " + str(n) + " / " + nbr_threads)
+
+ try:
+ thread = archive_thread(l, base_url, None)
+ threads['threads'].append(thread)
+ except:
+ ex_type, ex, tb = sys.exc_info()
+ traceback.print_tb(tb)
+ del tb
+ continue
+
+ time.sleep(DELAY)
+
+ # write
+ logging.info("writing archive to file " + file_path)
+
+ with open(file_path, 'w') as fp:
+ json.dump(threads, fp, indent=4)
+
+ logging.info("done. ")
+
+ return threads
+
+
+
+def archive_thread(li, base_url, parent_thread_data):
+
+ thread_link = li.select('strong a')[0]
+ thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
+ thread_id = thread_link.get('name')
+ thread_title = thread_link.string
+ thread_author_name = li.select('em')[0].string
+
+ message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+ collect_message(thread_url, message)
+
+ follow = li.select('ul > li')
+ if len(follow) > 0:
+ for f in follow:
+ follow_link = f.select('strong a')
+ if len (follow_link) > 0:
+ archive_thread(f, base_url, message) ## recursion
+
+ if parent_thread_data is None:
+ return message
+
+ if u'follow-up' not in parent_thread_data:
+ parent_thread_data[u'follow-up'] = []
+
+ parent_thread_data[u'follow-up'].append(message)
+
+ return message
+
+
+def collect_message(url, message):
+
+ response = urllib.request.urlopen(url)
+ html = response.read().decode(encoding="utf-8")
+ # html = response.read()
+ soup = BeautifulSoup(html, "html5lib")
+
+    #note: this should follow an RFC header standard -- MHonArc puts the header info in the first <ul>
+
+ message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')
+
+ # mhonarc xcomments
+ # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+ message['subject'] = parse_xcomment(soup, "X-Subject")
+ message['date'] = parse_xcomment(soup, "X-Date")
+ message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
+ message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
+ message['content-type'] = parse_xcomment(soup, 'X-Content-Type')
+
+ # parse what is displayed on the page
+
+ info = soup.select('ul:nth-of-type(1) > li')
+
+ for i in info:
+ if i.em == None:
+ continue
+ field = i.em.string
+ if field.lower() in message_labels:
+ message[field.lower()] = i.text.strip(field + ": ")
+
+ ## reformat from -- [author_name, email_addr]
+
+ # from_addr = email.utils.parseaddr(message['from'])
+ # message['author_name'] = from_addr[0]
+ # message['from'] = from_addr[1]
+
+ ## -- content --
+ # test
+ # c1 = soup.select('pre:nth-of-type(1)')
+ # if len(c1) > 0:
+ # message['content'] = c1[0].text
+ # else:
+ # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
+ message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
+# mhonarc xcomments
+# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+def parse_xcomment(soup, xcom):
+ com = soup.find(text=re.compile(xcom))
+ if com is not None:
+ return com.strip('').strip(xcom + ":").strip()
+ return com
+
+def test_xcomment(soup):
+ return soup.find(text=re.compile('X-Message-Id')) is not None
diff --git a/lists/pipermail.py b/lists/pipermail.py
index dcdf757..c08c68f 100644
--- a/lists/pipermail.py
+++ b/lists/pipermail.py
@@ -8,7 +8,8 @@ DELAY = 0.2
def collect_from_url(url, name, base_archive_dir):
response = urllib.request.urlopen(url)
- html = response.read().decode(encoding="utf-8")
+ # html = response.read().decode(encoding="utf-8")
+ html = response.read()
soup = BeautifulSoup(html, "html5lib")
threads_list = soup.find_all('tr')
@@ -195,7 +196,8 @@ def collect_message(url, message):
# logging.info(" + " + url)
response = urllib.request.urlopen(url)
- html = response.read().decode(encoding="utf-8")
+ # html = response.read().decode(encoding="utf-8")
+ html = response.read()
soup = BeautifulSoup(html, "html5lib")
if lists.mhonarc.test_xcomment(soup):
diff --git a/search/archive.py b/search/archive.py
index 3dff99f..9f333d9 100644
--- a/search/archive.py
+++ b/search/archive.py
@@ -69,6 +69,10 @@ class Archive():
i += 1
if nbr_hits > 0:
+ # nettime-l - fix (the name of the thread from ex. 'nettime-l_Jan_01' to 'January 2001')
+ if k.startswith("nettime-l_"):
+ dt = datetime.strptime(k, "nettime-l_%b_%y")
+ k = dt.strftime("%B_%Y")
search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
return search_results
@@ -97,6 +101,12 @@ def get_key(kv_tuple):
except Exception:
pass
+ # nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
+ try:
+ return datetime.strptime(k, "nettime-l_%b_%y")
+ except Exception:
+ pass
+
print("--------------")
print(k)
diff --git a/www/routes.py b/www/routes.py
index 163bb9f..9212258 100644
--- a/www/routes.py
+++ b/www/routes.py
@@ -118,7 +118,7 @@ def searh():
################################
##
- ## need to chache all the below
+ ## need to cache all the below??
##
################################
@@ -128,7 +128,13 @@ def searh():
a.load(l)
results.append(a.search(k_arg))
- return jsonify(result=results)
+ ## -- sort results?
+ search_results = sorted(results, key=get_result_key)
+
+ return jsonify(result=search_results)
+
+def get_result_key(r):
+ return r['archive']
diff --git a/www/static/c3.min.css b/www/static/c3.min.css
old mode 100755
new mode 100644
diff --git a/www/static/c3.min.js b/www/static/c3.min.js
old mode 100755
new mode 100644
diff --git a/www/static/search.js b/www/static/search.js
index 1435edb..437ca59 100644
--- a/www/static/search.js
+++ b/www/static/search.js
@@ -1,18 +1,26 @@
$(document).ready(function(){
- $('#search').on('submit', function(e) {
+ $('#loading').hide()
+
+ $('#search').submit(function(e) {
e.preventDefault();
args = $(this).serialize();
+ $('#graph').empty();
+ $('#results').empty();
+
+ $('#loading').show()
$.get('/search?'+args, function(data) {
+ $('#loading').hide()
console.log(data);
- $('#graph').empty();
- $('#results').empty();
+ // $('#graph').empty();
+ // $('#results').empty();
$.each(data.result, function(i, item) {
search_result_archive(item);
});
- graph(data);
+ graph(data);
});
});
+
});
function search_result_archive(a) {
diff --git a/www/templates/search.html b/www/templates/search.html
index 5ccee0a..472ba68 100644
--- a/www/templates/search.html
+++ b/www/templates/search.html
@@ -16,6 +16,7 @@
{% endfor %}
+    <div id="loading">Loading...</div>