many many things...
This commit is contained in:
parent f540b26e4e
commit 874a27a8c9
1
.gitignore
vendored
Normal file → Executable file
@@ -1,5 +1,6 @@
# mailinglists specific
archives/
figs/
config.py

# Byte-compiled / optimized / DLL files
230
analyse.py
Normal file
@@ -0,0 +1,230 @@
import os

# matplot view/windows
import matplotlib
matplotlib.interactive(True)

# pd display
import pandas as pd
pd.set_option('display.max_colwidth', 100)

from analysis.archive import Archive
from analysis.query import Query
from analysis.plot import Plot

import analysis.format

# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue


def save_fig_cohort(q, name, dir, color):
    t = name + " - Cohorts"
    pp = q.cohort().plot(color=color, title=t)
    ts = name + "_cohorts.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)


def save_fig_messages_total(q, name, dir, color):
    t = name + " - Nbr. Messages"
    pp = q.activity_overall().plot(kind='bar', color=color, title=t)
    ts = name + "_messages.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)


def save_fig_threads_total(q, name, dir, color):
    t = name + " - Nbr. Threads"
    pp = q.threads_overall().plot(kind='bar', color=color, title=t)
    ts = name + "_threads.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)


def save_fig_messages_constituency(q, name, dir):
    t = name + " - Messages Constituency"
    replies = pd.Series(q.replies_overall(series=True))
    # threads = pd.Series(q.single_threads_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    messages = pd.Series(q.activity_overall(series=True))
    single_messages = messages - (replies + threads)

    # df = {'a': single_messages, 'b': threads, 'c': replies}
    # df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
    df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1)
    pp = df.plot(kind='bar', stacked=True, title=t)

    # pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)

    ts = name + "_constituency.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)


def save_fig_avg_threads_replies(q, name, dir, color):
    t = name + " - Avg. Threads + Replies"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    messages = pd.Series(q.activity_overall(series=True))

    avg_threads_messages = (replies + threads) / messages

    pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)

    ts = name + "_avg_threads_replies.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)


def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
    t = name + " - Diff. Threads + Replies vs Single Messages"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    rt = replies + threads
    messages = pd.Series(q.activity_overall(series=True))

    diff_threads_messages = (2 * rt) - messages

    pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)

    ts = name + "_diff_threads_replies_messages.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)


def save_fig_ratio_replies_threads(q, name, dir, color):
    t = name + " - Ratio Replies per Thread"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))

    ratio_replies_threads = replies / threads

    pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)

    ts = name + "_ratio_replies_threads.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)


def html_td_rank_year(year, data):
    td_str = '<td class="td_list">'
    if year in data:
        td_str += analysis.format.table_threads_ranking(data[year])
    td_str += '</td>'
    return td_str


def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):

    html_str = '<table id="rankings">'

    html_str += '<tr>'
    html_str += '<td class="td_year_t">year</td>'
    html_str += '<td class="td_list_t">nettime</td>'
    html_str += '<td class="td_list_t">crumb</td>'
    html_str += '<td class="td_list_t">spectre</td>'
    html_str += '<td class="td_list_t">empyre</td>'
    html_str += '</tr>'

    years = sorted(ranking_nettime.keys())

    print(years)

    for i in years:
        html_str += '<tr>'
        html_str += '<td class="td_list">' + i + '</td>'
        html_str += html_td_rank_year(i, ranking_nettime)
        html_str += html_td_rank_year(i, ranking_crumb)
        html_str += html_td_rank_year(i, ranking_spectre)
        html_str += html_td_rank_year(i, ranking_empyre)
        html_str += '</tr>'

    html_str += '</table>'
    return html_str


print("nettime")
#nettime
nt = Archive('nettime-l')
ntq = nt.query()
ntp = Plot(ntq)

# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_constituency(ntq, 'nettime', 'figs/')

# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')

ranking_nettime = ntq.threads_ranking(rank=15)

# print(r['2000'])
# print(analysis.format.table_threads_ranking(r['2000']))


print("crumb")
#crumb
cr = Archive('crumb')
crq = cr.query()
crp = Plot(crq)

# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_constituency(crq, 'crumb', 'figs/')

# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')

ranking_crumb = crq.threads_ranking(rank=15)


print("empyre")
#empyre
em = Archive('empyre')
emq = em.query()
emp = Plot(emq)

# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_constituency(emq, 'empyre', 'figs/')

# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')

ranking_empyre = emq.threads_ranking(rank=15)


print("spectre")
#spectre
sp = Archive('spectre')
spq = sp.query()
spp = Plot(spq)

# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_constituency(spq, 'spectre', 'figs/')

# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')

ranking_spectre = spq.threads_ranking(rank=15)


## comparative ranking

rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)

html_template = 'figs/ranking/index_template.html'
with open(html_template, 'r') as fp:
    h = fp.read()

html = h.replace("--table--", rankings)

html_output = 'figs/ranking/index.html'
with open(html_output, 'w+') as fp:
    fp.write(html)
152
analysis/archive.py
Normal file
@@ -0,0 +1,152 @@
import numpy as np
import pandas as pd
import email, email.parser
import os, datetime, json, gzip, re
import analysis.util
import analysis.query


def filter_date(msg, archive_name):

    time_tz = analysis.util.format_date(msg, archive_name)
    if not time_tz:
        return None

    dt = datetime.datetime.fromtimestamp(time_tz)
    try:
        date_time = pd.to_datetime(dt)
    except pd.tslib.OutOfBoundsDatetime:
        print('time out of bound')
        print(dt)
        return None

    min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
    max_date = pd.to_datetime(datetime.datetime.now())
    if date_time < min_date or date_time > max_date:
        return None

    return date_time


def message_to_tuple_record(msg, records, archive_name, references='X'):

    # check date first?
    date = filter_date(msg, archive_name)
    if not date:
        print("Archive::filter_date returned None. Skip.")
        return

    # check / filter from email address second?
    from_addr = analysis.util.format_from(msg, archive_name)
    if not from_addr:
        print("Archive::analysis.util.format_from returned None. Skip.")
        return

    url = analysis.util.format_url(msg, archive_name)
    author = analysis.util.format_author(msg, archive_name)
    subject = analysis.util.format_subject(msg, archive_name)
    message_id = analysis.util.format_id(msg, archive_name)
    content = analysis.util.format_content(msg, archive_name)

    records.append((message_id,
                    from_addr,
                    author,
                    subject,
                    date,
                    url,
                    len(content),
                    0 if not 'follow-up' in msg else len(msg['follow-up']),
                    references))

    # recursive follow up -- but references is not keeping track really...
    if 'follow-up' in msg:
        for f in msg['follow-up']:
            message_to_tuple_record(f, records, archive_name, references=message_id)

    return


def json_data_to_pd_dataframe(json_data, archive_name):

    records = []
    for d in json_data:
        for dd in d['threads']:
            message_to_tuple_record(dd, records, archive_name)

    print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records)))

    df = pd.DataFrame.from_records(records,
                                   index='date',
                                   columns=['message-id',
                                            'from',
                                            'author',
                                            'subject',
                                            'date',
                                            'url',
                                            'content-length',
                                            'nbr-references',
                                            'references'])

    df.index.name = 'date'

    return df


def load_from_file(filename, archive_name, archive_dir, json_data=None):

    if not filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename + '.json.gz')
    else:
        file_path = os.path.join(archive_dir, filename)

    if os.path.isfile(file_path):
        with gzip.open(file_path, 'r') as fp:
            json_data = json.load(fp)
        return json_data_to_pd_dataframe(json_data['threads'], archive_name)
    else:
        #list of all "filename[...].json.gz" in archive_dir
        files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
        if files:
            filename = files[-1]  # take the most recent (listed alpha-chronological)
            file_path = os.path.join(archive_dir, filename)
            if os.path.isfile(file_path):
                with gzip.open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                return json_data_to_pd_dataframe(json_data['threads'], archive_name)
        else:
            #list of all json files in archive_dir/filename
            dir_path = os.path.join(archive_dir, filename)
            if not os.path.isdir(dir_path):
                return None

            files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
            if not files:
                return None

            # load all json files
            threads = []
            for file_path in files:
                with open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    threads.append(json_data)

            print('---> ' + archive_name)
            return json_data_to_pd_dataframe(threads, archive_name)


class Archive:

    data = None       # "raw" json data
    dataframe = None  # main pd dataframe

    def __init__(self, archive_name, archive_dir="archives"):

        if isinstance(archive_name, pd.core.frame.DataFrame):
            self.dataframe = archive_name.copy()

        if isinstance(archive_name, str):
            # need a filename or a dir name....
            self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)

    def query(self):
        q = analysis.query.Query(self)
        return q
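For reference, a hypothetical minimal input for the loader above (not part of the diff): the field names are exactly those read by message_to_tuple_record and analysis.util, but the values are invented.

# Hypothetical thread record; field names taken from the loader above, values invented.
example_thread = {
    "author_name": "Jane Doe",                          # read by format_author()
    "from": "jane {AT} example.com",                    # normalised by format_from()
    "subject": "welcome to the list",
    "date": "Mon, 2 Oct 1995 10:00:00 +0200",           # parsed by format_date()
    "url": "http://example.com/archive/msg00001.html",
    "message-id": "<msg00001@example.com>",
    "content": "hello world",
    "follow-up": [],                                    # nested replies, recursed into
}
records = []
message_to_tuple_record(example_thread, records, 'nettime-l')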
158
analysis/format.py
Normal file
@@ -0,0 +1,158 @@
import analysis.query
import logging, html, numpy
from tabulate import tabulate


def makeurl(text, url):
    return '<a href="' + url + '">' + text + "</a>"


def table_threads_ranking(ranking_dataframe):

    html_str = '<table class="threads_ranking">'

    html_str += '<tr>'
    html_str += '<td class="td_date_t">date</td>'
    html_str += '<td class="td_subject_t">subject</td>'
    html_str += '<td class="td_from_t">from</td>'
    html_str += '<td class="td_rep_t">replies</td>'
    html_str += '</tr>'

    for i, row in ranking_dataframe.iterrows():

        html_str += '<tr>'
        html_str += '<td class="td_date">' + str(i) + '</td>'
        html_str += '<td class="td_subject">' + makeurl(row['subject'], row['url']) + '</td>'
        html_str += '<td class="td_from">' + row['from'] + '</td>'
        html_str += '<td class="td_rep">' + str(row['nbr-references']) + '</td>'
        html_str += '</tr>'

    html_str += "</table>"

    return html_str


class Html:

    query = None

    def __init__(self, q=None):

        if not isinstance(q, analysis.query.Query):
            logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query")
            raise Exception()

        self.query = q

    def threads_ranking(self, rank=5, resolution=None):

        data = self.query.threads_ranking(rank=rank)

        h = html.HTML()
        t = h.table()

        r = t.tr
        r.td('date', klass='td_date_t')
        r.td('from', klass='td_from_t')
        r.td('replies', klass='td_rep_t')
        r.td('subject', klass='td_subject_t')

        for i, row in data.iterrows():
            r = t.tr

            print(row.index)

            r.td(str(row['date']), klass='td_date')
            r.td(row['from'], klass='td_from')
            r.td(str(row['nbr-references']), klass='td_rep')
            r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)

        return str(t)

    @staticmethod
    def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):

        header = []
        if data_frame.index.name in name_map:
            header.append(name_map[data_frame.index.name])
        else:
            header.append(data_frame.index.name)
        for h in data_frame.columns:
            if h in name_map:
                h = name_map[h]
            header.append(h)

        css_header = []
        css_element = []
        for i in header:
            css_header.append('td_' + i + '_t')
            css_element.append('td_' + i)

        h = html.HTML()
        if table_name:
            t = h.table(id=table_name, klass=table_name + '_t')
        else:
            t = h.table()

        # url map
        url_hash = {}
        url_skip = []
        url_keys = url_map.keys()
        for u in url_keys:
            if u in header and url_map[u] in header:
                url_indx = header.index(url_map[u])
                url_hash[header.index(u)] = url_indx
                url_skip.append(url_indx)
                header.pop(url_indx)

        #header
        r = t.tr
        n = 0
        for j in header:
            r.td(str(j), klass=css_header[n])
            n += 1

        #elements
        for k, row in data_frame.iterrows():
            r = t.tr
            r.td(str(k), klass=css_element[0])
            n = 1
            for l in row:

                if n in url_skip:
                    continue

                if isinstance(l, float):
                    if l % 1 > 0:
                        l = '{0:.4f}'.format(l)
                    else:
                        l = int(l)

                if n in url_hash.keys():
                    url = row[url_hash[n] - 1]
                    r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)

                else:
                    r.td(str(l), klass=css_element[n])
                n += 1

        return str(t)


class Tab:

    @staticmethod
    def from_dataframe(data_frame, name_map={}, format=".0f"):

        header = []
        header.append(data_frame.index.name)
        for h in data_frame.columns:
            if h in name_map:
                h = name_map[h]
            header.append(h)

        return tabulate(data_frame, headers=header, floatfmt=format)
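A hypothetical usage sketch of Tab.from_dataframe (not part of the diff; the archive name and column mapping are only illustrative):

# Hypothetical usage: render a yearly activity table as plain text via tabulate.
from analysis.archive import Archive
from analysis.format import Tab

q = Archive('nettime-l').query()
print(Tab.from_dataframe(q.activity_overall(resolution='y'),
                         name_map={'nbr-messages': 'messages'}))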
79
analysis/plot.py
Normal file
@@ -0,0 +1,79 @@
import logging
import numpy as np
import pandas as pd
import analysis.query

# for colormaps see:
# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
# http://matplotlib.org/examples/color/colormaps_reference.html
# for colors see:
# http://matplotlib.org/examples/color/named_colors.html

# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue


def bar_plot_series(series, title, color='blueviolet', ylim=None):
    return series.plot(kind='bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)


def save(plot, name):
    fig = plot.get_figure()
    fig.savefig(name)


class Plot:

    query = None

    def __init__(self, q=None):

        if not isinstance(q, analysis.query.Query):
            logging.error("Plot constructor Error: query must be of type analysis.query.Query")
            raise Exception()

        self.query = q

    '''
    activity
    '''

    def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in activity_rank:
            series.append(self.query.activity_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    '''
    content length
    '''

    def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in content_rank:
            series.append(self.query.content_length_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    '''
    threads
    '''

    def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in threads_rank:
            series.append(self.query.threads_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
573
analysis/query.py
Normal file
@@ -0,0 +1,573 @@
import numpy as np
import pandas as pd
import analysis.archive
import logging


class Query:

    archive = None         # analysis.archive.Archive object
    activity = None        # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    content_length = None  # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    threads = None         # ...
    single_threads = None
    replies = None         # ...

    def __init__(self, arch=None):

        if not isinstance(arch, analysis.archive.Archive):
            logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
            raise Exception()

        self.archive = arch

    '''
    activity
    '''

    def _activity(self):

        if self.activity is None:
            from_index = self.archive.dataframe.reindex(columns=['from'])
            self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)

        return self.activity

    def activity_from(self, email_address, resolution='y', series=False):

        eaddr = email_address.replace('@', '{at}').lower()

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._activity()
        try:
            af = self.activity[eaddr]
        except KeyError:
            return None

        activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
            activity_from.index.name = 'year'
        else:
            activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            activity_from.index.name = 'year-month'

        if series:
            return activity_from

        return activity_from.to_frame('nbr-messages').astype(int)

    def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):

        self._activity()
        afr = self.activity.sum(axis=0).order(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        if series:
            return afr[:rank]

        return afr[:rank].to_frame('nbr-messages').astype(int)


    # def activity_overall(self, resolution='y', series=False):

    #     freq = 'M'
    #     if resolution.lower() == 'y':
    #         freq = 'AS'
    #     elif resolution.lower() == 'm':
    #         freq = 'M'
    #     else:
    #         return None

    #     self._activity()

    #     y = self.activity.sum(axis=1)
    #     y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()

    #     if freq == 'AS':
    #         y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
    #         y.index.name = 'year'
    #     else:
    #         y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
    #         y.index.name = 'year-month'

    #     if series:
    #         return y

    #     return y.to_frame('nbr-messages').astype(int)

    def activity_overall(self, resolution='y', series=False):

        a = self.archive.dataframe['url']

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-messages').astype(int)

    def cohort(self, resolution='m', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._activity()

        c = self.activity.idxmax().order().to_frame('date')
        c.index = c['date']

        cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()

        if freq == 'AS':
            cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))
            cohort.index.name = 'year'
        else:
            cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            cohort.index.name = 'year-month'

        if series:
            return cohort

        return cohort.to_frame('first-messages').astype(int)

    '''
    content length
    '''

    def _content_length(self):

        if self.content_length is None:
            from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
            self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
            self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)

        return self.content_length

    def content_length_from(self, email_address, resolution='y', series=False):

        eaddr = email_address.replace('@', '{at}').lower()

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._content_length()
        try:
            af = self.content_length[eaddr]
        except KeyError:
            return None

        content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
            content_length_from.index.name = 'year'
        else:
            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            content_length_from.index.name = 'year-month'

        if series:
            return content_length_from

        return content_length_from.to_frame('nbr-bytes').astype(int)

    def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):

        self._content_length()
        cfr = self.content_length.sum(axis=0).order(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            cfr = cfr[cfr.index.str.contains(p)]

        if series:
            return cfr[:rank]

        return cfr[:rank].to_frame('nbr-bytes').astype(int)

    def content_length_overall(self, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._content_length()

        y = self.content_length.sum(axis=1)
        y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-bytes').astype(int)


    '''
    threads
    '''

    def _threads(self, thresh=0):

        print("doing threads")

        if self.threads is None:
            self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)

        if self.single_threads is None:
            self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)

        return self.threads

    def threads_ranking(self, rank=5, resolution='y'):

        self._threads()

        if resolution == None:
            data = self.threads.drop('message-id', axis=1)[:rank]
            return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        # get the threads ranking per time resolution
        #
        data = self.threads.drop('message-id', axis=1)
        data = data.groupby([pd.TimeGrouper(freq=freq)])
        r = {}
        for k, v in data:
            if freq == 'AS':
                time_key = k.strftime('%Y')
            else:
                time_key = k.strftime('%Y-%m')
            frame = v[:rank]
            frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
            r[time_key] = frame
        return r

    def threads_replies_to(self, email_address, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._threads()

        eaddr = email_address.replace('@', '{at}').lower()

        self._threads()
        threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
        threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum()  # <-- sum = adding up nbr references
        threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)

        if series:
            return threads_from_ranking[eaddr]

        threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)

        if freq == 'AS':
            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
            threads_from_ranking.index.name = 'year'
        else:
            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            threads_from_ranking.index.name = 'year-month'

        return threads_from_ranking

    def threads_replies_to_ranking(self, rank=5, filter_nettime=True):

        self._threads()

        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)

        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tfr = tfr[tfr.index.str.contains(p)]

        tfr = tfr[:rank].astype(int)
        return tfr

    def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):

        self._threads()
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        if series:
            return tir[:rank]

        return tir[:rank].to_frame('nbr-initiated-threads').astype(int)

    def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):

        # activity
        self._activity()
        afr = self.activity.sum(axis=0).astype(int)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        # initiated threads [top 25]
        self._threads()
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25]  # <-- top 25
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        inter = afr.index.intersection(tir.index)
        avg = tir[inter] / afr[inter]

        labels = ['messages', 'threads', 'avg.threads']
        return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]

    def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):

        self._threads()

        #initiated
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        #replies [top 25]
        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25]  # <-- top 25
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tfr = tfr[tfr.index.str.contains(p)]
        tfr = tfr['nbr-references']  # dataframe to series

        inter = tir.index.intersection(tfr.index)
        avg = tfr[inter] / tir[inter]

        labels = ['threads', 'replies', 'avg.replies']
        return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]


    def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        agg = aggregate.lower()
        if not agg in ['sum', 'mean', 'count']:
            return None

        if not self.threads is None:
            del self.threads
            self.threads = None

        self._threads(tresh)

        if agg == 'sum':
            # number of replies total (re: sum all the replies)
            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
        elif agg == 'mean':
            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
        else:
            # number of threads (re: msgs with at least one reply)
            y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-threads').astype(int)

    def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        agg = aggregate.lower()
        if not agg in ['sum', 'mean', 'count']:
            return None

        if not self.single_threads is None:
            del self.single_threads
            self.single_threads = None

        self._threads(tresh)

        y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-threads').astype(int)


    '''
    replies
    '''

    def _replies(self):

        if self.replies is None:
            self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from', 'references'])
            self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from', 'references'])
        return self.replies

    def replies_ranking(self, rank=5, resolution=None):

        self._replies()

        if resolution == None:
            data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
            return data.to_frame('nbr_replies')

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        # get the threads ranking per time resolution
        #
        data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
        r = {}
        for k, v in data:
            if freq == 'AS':
                time_key = k.strftime('%Y')
            else:
                time_key = k.strftime('%Y-%m')
            frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
            r[time_key] = frame.to_frame('nbr-replies')
        return r

    def replies_avg_ranking(self, rank=5, filter_nettime=True):

        # activity
        self._activity()
        afr = self.activity.sum(axis=0)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        # replies in thread [top 25]

        self._replies()
        rpl = data = self.replies.groupby('from').size().sort_values(ascending=False)[:25]

        inter = afr.index.intersection(rpl.index)
        avg = rpl[inter] / afr[inter]

        labels = ['messages', 'replies', 'avg.replies']
        return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]

    def replies_overall(self, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        if not self.replies is None:
            del self.replies
            self.replies = None

        self._replies()

        y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-replies').astype(int)
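To summarise the conventions used throughout Query, a hypothetical sketch (not part of the diff; the archive name is only an example): resolution 'y' groups by year ('AS' frequency), 'm' by month, anything else returns None; series=True returns a pandas Series, otherwise a one-column DataFrame.

from analysis.archive import Archive

q = Archive('nettime-l').query()
q.activity_overall(resolution='y')               # DataFrame indexed by 'year', column 'nbr-messages'
q.activity_overall(resolution='m', series=True)  # monthly pandas Series
q.threads_ranking(rank=15)                       # dict: year -> top-15 threads DataFrame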
81
analysis/util.py
Normal file
@@ -0,0 +1,81 @@
import email, email.utils
import hashlib


def format_content(msg, archive_name):
    return msg['content']


def format_url(msg, archive_name):
    return msg['url']


def format_author(msg, archive_name):
    return msg['author_name']


def format_from_token(from_str, sep):
    from_addr = email.utils.parseaddr(from_str)[1]
    if sep not in from_addr:
        tok = from_str.split()
        try:
            at = tok.index(sep)
            from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
            if from_addr.startswith('<') or from_addr.endswith('>'):
                from_addr = from_addr.strip('<').strip('>')
        except ValueError:
            print(tok)
            print("error formatting 'from' " + from_str + " -- expecting sep: " + sep)
            return None
    else:
        from_addr = from_addr.replace(sep, '{AT}')
    return from_addr.lower()


def format_from(msg, archive_name):
    from_str = msg['from']

    if " {AT} " in from_str:
        return format_from_token(from_str, '{AT}')
    elif " at " in from_str:
        return format_from_token(from_str, 'at')
    elif "@" in from_str:
        return format_from_token(from_str, '@')
    else:
        return from_str


# returns utc timestamp
def format_date(msg, archive_name):
    date_str = msg['date']
    time_tz = None
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz)  #utc timestamp
    except TypeError:
        print("Format Date TypeError")
        print(" > " + date_str)
        return None
    except ValueError:
        print("Format Date ValueError")
        print(" > " + date_str)
        return None
    finally:
        return time_tz


def format_subject(msg, archive_name):
    return msg['subject']


def format_id(msg, archive_name):
    if "message-id" in msg:
        return msg['message-id']
    else:
        # create hash with author_name + date
        s = msg['author_name'] + msg['date']
        sha = hashlib.sha1(s.encode('utf-8'))
        return sha.hexdigest()


# format='%d/%m/%Y'
def min_date(archive_name):
    if "nettime" in archive_name:
        return '01/10/1995'
    elif archive_name == "spectre":
        return '01/08/2001'
    elif archive_name == "empyre":
        return '01/01/2002'
    elif archive_name == "crumb":
        return '01/02/2001'
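A quick illustration of the address normalisation above (not part of the diff; the message dict is hypothetical):

from analysis.util import format_from

# A plain RFC 2822 address goes through the '@' branch and is obfuscated to '{at}'.
msg = {'from': 'Jane Doe <jane@example.com>'}
format_from(msg, 'nettime-l')  # -> 'jane{at}example.com'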
@@ -1,10 +1,12 @@
from urllib.parse import urlparse
import lists.pipermail as pipermail
import lists.listserv as listserv
import lists.mhonarc as mhonarc
import lists.mhonarc_nettime as mhonarc_nettime

DELAY = 0.2

def crawl(url, name, archive_dir):
def crawl(url, name, sublist_name=None, archive_dir="archives"):
    u = urlparse(url)

    # the following type 'tests' are very weak...
@@ -21,6 +23,11 @@ def crawl(url, name, archive_dir):
    elif 'cgi-bin' in u.path:
        listserv.collect_from_url(url, name, archive_dir)

    # special case -- nettime.
    # the name should be the sublist_name (i.e nettime-l)
    elif "nettime" in name:
        mhonarc_nettime.collect_from_url(url, name, name, archive_dir)

    else:
        print('mhonarc?')
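A hypothetical invocation of the updated dispatcher (not part of the diff; the URL is a placeholder and only the dispatch logic visible above is assumed). For a nettime list, the list name itself is passed as the sublist name:

# Hypothetical call; the URL is a placeholder, not a real archive location.
crawl('http://example.com/mail-archive/', 'nettime-l', archive_dir='archives')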
@@ -43,6 +43,17 @@ def collect_from_url(url, name, base_archive_dir):
            del tb
            continue

    # archive['name'] = name
    # archive['list'] = threads

    # file_path = os.path.join(base_arch_dir, name + '.json')

    # with open(file_path, 'w') as fp:
    #     json.dump(archive, fp, indent=4)

    # logging.info("done.")


def collect_threads_from_url(url, name, base_arch_dir):

    threads = {'name' : name, 'url' : url, 'threads' : []}
@@ -4,22 +4,27 @@ from bs4 import BeautifulSoup

DELAY = 0.2

def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):

    response = urllib.request.urlopen(url)
    html = response.read().decode(encoding="utf-8")
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')

    #collect name
    list_name = soup.select('body p:nth-of-type(2) base title')[0].string
    list_name = soup.select('body p:nth-of-type(2) title')[0].string
    logging.info("Getting " + list_name + " list archive for " + sublist_name)

    lists = soup.select('ul:nth-of-type(2) li')
    # create (main) directory
    # this is where all temp files will be created
    d = os.path.join(base_archive_dir, name)
    if not os.path.exists(d):
        os.makedirs(d)

    threads = []
    lists = soup.select('ul:nth-of-type(2) li')

    for l in lists:

@@ -33,31 +38,41 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
            threads_url_list = []
            threads_links = l.select('ul li a')
            for t in threads_links:
                thread_url = urlparse.urljoin(base_url, t.get('href'))
                thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                threads_url_list.append(thread_url)

            nbr_threads = str(len(threads_url_list))
            n = 0

            for u in threads_url_list:
                time.sleep(DELAY)
                n += 1
                logging.info("## " + str(n) + " / " + nbr_threads + " ##")
                threads.append(collect_threads_from_url(u, base_arch_dir, mbox))
                try:
                    threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
                except KeyboardInterrupt:
                    sys.exit(0)
                except:
                    logging.warning("Error archiving: " + l[1] + "... Continuing.")
                    ex_t, ex, tb = sys.exc_info()
                    print(ex_t)
                    traceback.print_tb(tb)
                    del tb
                    continue

            return threads

            # for u in threads_url_list[0:10]:
            #     print "---------------------------------------"
            #     tt = collect_threads_from_url(u, base_arch_dir, mbox)
            #     tt = collect_threads_from_url(u, base_archive_dir, mbox)
            #     threads.append(tt)

    return None

def collect_threads_from_url(url, base_arch_dir, mbox):
def collect_threads_from_url(url, base_archive_dir, mbox=False):

    response = urllib.request.urlopen(url)
    html = response.read().decode(encoding="utf-8")
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
@@ -73,7 +88,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
    logging.info("Collecting Threads of: " + threads_name)

    # check if archive already exists
    file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as fpin:
@@ -114,7 +129,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
def archive_thread(li, base_url, parent_thread_data):

    thread_link = li.select('strong a')[0]
    thread_url = urlparse.urljoin(base_url, thread_link.get('href'))
    thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
    thread_id = thread_link.get('name')
    thread_title = thread_link.string
    thread_author_name = li.select('em')[0].string
@@ -145,6 +160,7 @@ def collect_message(url, message):

    response = urllib.request.urlopen(url)
    html = response.read().decode(encoding="utf-8")
    # html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
@@ -184,6 +200,8 @@ def collect_message(url, message):
    else:
        message['content'] = soup.select('pre:nth-of-type(2)')[0].text

    # message['content'] = soup.select('pre:nth-of-type(2)')[0].text

# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
214
lists/mhonarc_nettime.py
Normal file
@@ -0,0 +1,214 @@
import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip
from bs4 import BeautifulSoup

DELAY = 0.2

def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):

    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')

    #collect name
    list_name = soup.select('body p:nth-of-type(2) title')[0].string
    logging.info("Getting " + list_name + " list archive for " + sublist_name)

    # create (main) directory
    # this is where all temp files will be created
    d = os.path.join(base_archive_dir, name)
    if not os.path.exists(d):
        os.makedirs(d)

    threads = []
    lists = soup.select('ul:nth-of-type(2) li')

    for l in lists:

        if l.strong is None:
            continue

        name = l.strong.string

        if name.lower() == sublist_name.lower():

            threads_url_list = []
            threads_links = l.select('ul li a')
            for t in threads_links:
                thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                threads_url_list.append(thread_url)

            nbr_threads = str(len(threads_url_list))
            n = 0

            for u in threads_url_list:
                time.sleep(DELAY)
                n += 1
                logging.info("## " + str(n) + " / " + nbr_threads + " ##")
                try:
                    threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
                except KeyboardInterrupt:
                    sys.exit(0)
                except:
                    logging.warning("Error archiving: " + l[1] + "... Continuing.")
                    ex_t, ex, tb = sys.exc_info()
                    print(ex_t)
                    traceback.print_tb(tb)
                    del tb
                    continue

            return threads

            # for u in threads_url_list[0:10]:
            #     print "---------------------------------------"
            #     tt = collect_threads_from_url(u, base_archive_dir, mbox)
            #     threads.append(tt)

    return None

def collect_threads_from_url(url, base_archive_dir, mbox=False):

    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
    base_url = url

    # collect name
    threads_name = soup.select('p:nth-of-type(1) title')[0].string
    threads_name = threads_name.replace(' ', '_')

    # thread data struct
    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}

    logging.info("Collecting Threads of: " + threads_name)

    # check if archive already exists
    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
        lists = soup.select('ul:nth-of-type(1) > li')

        nbr_threads = str(len(lists))
        n = 0

        for l in lists:
            n += 1
            logging.info("> " + str(n) + " / " + nbr_threads)

            try:
                thread = archive_thread(l, base_url, None)
                threads['threads'].append(thread)
            except:
                ex_type, ex, tb = sys.exc_info()
                traceback.print_tb(tb)
                del tb
                continue

            time.sleep(DELAY)

        # write
        logging.info("writing archive to file " + file_path)

        with open(file_path, 'w') as fp:
            json.dump(threads, fp, indent=4)

        logging.info("done. ")

    return threads


def archive_thread(li, base_url, parent_thread_data):

    thread_link = li.select('strong a')[0]
    thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
    thread_id = thread_link.get('name')
    thread_title = thread_link.string
    thread_author_name = li.select('em')[0].string

    message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}

    collect_message(thread_url, message)

    follow = li.select('ul > li')
    if len(follow) > 0:
        for f in follow:
            follow_link = f.select('strong a')
            if len(follow_link) > 0:
                archive_thread(f, base_url, message)  ## recursion

    if parent_thread_data is None:
        return message

    if u'follow-up' not in parent_thread_data:
        parent_thread_data[u'follow-up'] = []

    parent_thread_data[u'follow-up'].append(message)

    return message


def collect_message(url, message):

    response = urllib.request.urlopen(url)
    html = response.read().decode(encoding="utf-8")
    # html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>

    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    message['from'] = parse_xcomment(soup, "X-From-R13")  #useless...
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page

    info = soup.select('ul:nth-of-type(1) > li')

    for i in info:
        if i.em == None:
            continue
        field = i.em.string
        if field.lower() in message_labels:
            message[field.lower()] = i.text.strip(field + ": ")

    ## reformat from -- [author_name, email_addr]

    # from_addr = email.utils.parseaddr(message['from'])
    # message['author_name'] = from_addr[0]
    # message['from'] = from_addr[1]

    ## -- content --
    # test
    # c1 = soup.select('pre:nth-of-type(1)')
    # if len(c1) > 0:
    #     message['content'] = c1[0].text
    # else:
    #     message['content'] = soup.select('pre:nth-of-type(2)')[0].text

    message['content'] = soup.select('pre:nth-of-type(2)')[0].text


# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
    com = soup.find(text=re.compile(xcom))
    if com is not None:
        return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
    return com


def test_xcomment(soup):
    return soup.find(text=re.compile('X-Message-Id')) is not None
@@ -8,7 +8,8 @@ DELAY = 0.2
def collect_from_url(url, name, base_archive_dir):

    response = urllib.request.urlopen(url)
    html = response.read().decode(encoding="utf-8")
    # html = response.read().decode(encoding="utf-8")
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    threads_list = soup.find_all('tr')
@@ -195,7 +196,8 @@ def collect_message(url, message):
    # logging.info(" + " + url)

    response = urllib.request.urlopen(url)
    html = response.read().decode(encoding="utf-8")
    # html = response.read().decode(encoding="utf-8")
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    if lists.mhonarc.test_xcomment(soup):
@@ -69,6 +69,10 @@ class Archive():
            i += 1

        if nbr_hits > 0:
            # nettime-l - fix (the name of the thread from ex. 'nettime-l_Jan_01' to 'January 2001')
            if k.startswith("nettime-l_"):
                dt = datetime.strptime(k, "nettime-l_%b_%y")
                k = dt.strftime("%B_%Y")
            search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})

    return search_results
@@ -97,6 +101,12 @@ def get_key(kv_tuple):
    except Exception:
        pass

    # nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
    try:
        return datetime.strptime(k, "nettime-l_%b_%y")
    except Exception:
        pass

    print("--------------")
    print(k)
@@ -118,7 +118,7 @@ def searh():

    ################################
    ##
    ## need to chache all the below
    ## need to cache all the below??
    ##
    ################################

@@ -128,7 +128,13 @@ def searh():
        a.load(l)
        results.append(a.search(k_arg))

    return jsonify(result=results)
    ## -- sort results?
    search_results = sorted(results, key=get_result_key)

    return jsonify(result=search_results)


def get_result_key(r):
    return r['archive']
0
www/static/c3.min.css
vendored
Executable file → Normal file
0
www/static/c3.min.js
vendored
Executable file → Normal file
@@ -1,18 +1,26 @@

$(document).ready(function(){
    $('#search').on('submit', function(e) {
    $('#loading').hide()

    $('#search').submit(function(e) {
        e.preventDefault();
        args = $(this).serialize();
        $('#graph').empty();
        $('#results').empty();

        $('#loading').show()
        $.get('/search?'+args, function(data) {
            $('#loading').hide()
            console.log(data);
            $('#graph').empty();
            $('#results').empty();
            // $('#graph').empty();
            // $('#results').empty();
            $.each(data.result, function(i, item) {
                search_result_archive(item);
            });
            graph(data);
        });
    });

});

function search_result_archive(a) {
@@ -16,6 +16,7 @@
        {% endfor %}
    </select>
    <input type="submit" value="search" id="submit">
    <div id="loading">Loading...</div>
</form>
<div id="graph"></div>
<div id="results"></div>