MEGA -- DB

parent 3703dcc169
commit 4197cd4d32

.gitignore (vendored, 8 changed lines)
@@ -1,7 +1,11 @@
-# mailinglists specific
+# listservs specific
archives/
figs/
config/
config.py
test.py

#macos
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
README (6 changed lines)
@@ -1,3 +1,9 @@

TODO (July 2019):
- refactor archive.py and search.py
- test lists import with mariadb backend

usage: archive.py [-h] [--arch ARCH] url [url ...]

Mailinglists are dead. Long live mailinglists!
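For example, a hypothetical invocation of the scraper following the usage line above (the list URL is illustrative, not part of this commit):

    python archive.py --arch nettime-l https://example.org/pipermail/nettime-l/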
analyse.py (deleted, 230 lines)
@@ -1,230 +0,0 @@
import os

# matplot view/windows
import matplotlib
matplotlib.interactive(True)

# pd display
import pandas as pd
pd.set_option('display.max_colwidth', 100)

from analysis.archive import Archive
from analysis.query import Query
from analysis.plot import Plot

import analysis.format

# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue

def save_fig_cohort(q, name, dir, color):
    t = name + " - Cohorts"
    pp = q.cohort().plot(color=color, title=t)
    ts = name + "_cohorts.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_messages_total(q, name, dir, color):
    t = name + " - Nbr. Messages"
    pp = q.activity_overall().plot(kind='bar', color=color, title=t)
    ts = name + "_messages.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_threads_total(q, name, dir, color):
    t = name + " - Nbr. Threads"
    pp = q.threads_overall().plot(kind='bar', color=color, title=t)
    ts = name + "_threads.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_messages_constituency(q, name, dir):
    t = name + " - Messages Constituency"
    replies = pd.Series(q.replies_overall(series=True))
    # threads = pd.Series(q.single_threads_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    messages = pd.Series(q.activity_overall(series=True))
    single_messages = messages - (replies + threads)

    # df = {'a': single_messages, 'b': threads, 'c': replies}
    # df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
    df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1)
    pp = df.plot(kind='bar', stacked=True, title=t)

    # pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)

    ts = name + "_constituency.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_avg_threads_replies(q, name, dir, color):
    t = name + " - Avg. Threads + Replies"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    messages = pd.Series(q.activity_overall(series=True))

    avg_threads_messages = (replies + threads) / messages

    pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)

    ts = name + "_avg_threads_replies.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
    t = name + " - Diff. Threads + Replies vs Single Messages"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    rt = replies + threads
    messages = pd.Series(q.activity_overall(series=True))

    diff_threads_messages = (2 * rt) - messages

    pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)

    ts = name + "_diff_threads_replies_messages.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_ratio_replies_threads(q, name, dir, color):
    t = name + " - Ratio Replies per Thread"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))

    ratio_replies_threads = replies / threads

    pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)

    ts = name + "_ratio_replies_threads.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def html_td_rank_year(year, data):
    td_str = '<td class="td_list">'
    if year in data:
        td_str += analysis.format.table_threads_ranking(data[year])
    td_str += '</td>'
    return td_str

def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):

    html_str = '<table id="rankings">'

    html_str += '<tr>'
    html_str += '<td class="td_year_t">year</td>'
    html_str += '<td class="td_list_t">nettime</td>'
    html_str += '<td class="td_list_t">crumb</td>'
    html_str += '<td class="td_list_t">spectre</td>'
    html_str += '<td class="td_list_t">empyre</td>'
    html_str += '</tr>'

    years = sorted(ranking_nettime.keys())

    print(years)

    for i in years:
        html_str += '<tr>'
        html_str += '<td class="td_list">' + i + '</td>'
        html_str += html_td_rank_year(i, ranking_nettime)
        html_str += html_td_rank_year(i, ranking_crumb)
        html_str += html_td_rank_year(i, ranking_spectre)
        html_str += html_td_rank_year(i, ranking_empyre)
        html_str += '</tr>'

    html_str += '</table>'
    return html_str


print("nettime")
#nettime
nt = Archive('nettime-l')
ntq = nt.query()
ntp = Plot(ntq)


# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_constituency(ntq, 'nettime', 'figs/')

# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')

ranking_nettime = ntq.threads_ranking(rank=15)

# print(r['2000'])

# print(analysis.format.table_threads_ranking(r['2000']))


print("crumb")
#crumb
cr = Archive('crumb')
crq = cr.query()
crp = Plot(crq)

# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_constituency(crq, 'crumb', 'figs/')

# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')

ranking_crumb = crq.threads_ranking(rank=15)


print("empyre")
#empyre
em = Archive('empyre')
emq = em.query()
emp = Plot(emq)

# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_constituency(emq, 'empyre', 'figs/')

# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')

ranking_empyre = emq.threads_ranking(rank=15)

print("spectre")
#spectre
sp = Archive('spectre')
spq = sp.query()
spp = Plot(spq)

# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_constituency(spq, 'spectre', 'figs/')

# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')

ranking_spectre = spq.threads_ranking(rank=15)


## comparative ranking

rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)

html_template = 'figs/ranking/index_template.html'
with open(html_template, 'r') as fp:
    h = fp.read()

html = h.replace("--table--", rankings)

html_output = 'figs/ranking/index.html'
with open(html_output, 'w+') as fp:
    fp.write(html)
analysis/archive.py (deleted, 165 lines)
@@ -1,165 +0,0 @@
import numpy as np
import pandas as pd
import email, email.parser
import os, datetime, json, gzip, re
import analysis.util
import analysis.query

import search.archive ## circular...


def filter_date(msg, archive_name):

    time_tz = analysis.util.format_date(msg, archive_name)
    if not time_tz:
        return None

    dt = datetime.datetime.fromtimestamp(time_tz)
    try:
        date_time = pd.to_datetime(dt)
    except pd.tslib.OutOfBoundsDatetime:
        print('time out of bound')
        print(dt)
        return None

    min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
    max_date = pd.to_datetime(datetime.datetime.now())
    if date_time < min_date or date_time > max_date:
        return None

    return date_time


def message_to_tuple_record(msg, records, archive_name, references='X'):

    # check date first?
    date = filter_date(msg, archive_name)
    if not date:
        print("Archive::filter_date returned None. Skip.")
        return

    # check / filter from email address second?
    from_addr = analysis.util.format_from(msg, archive_name)
    if not from_addr:
        print("Archive::analysis.util.format_from returned None. Skip.")
        return

    url = analysis.util.format_url(msg, archive_name)
    author = analysis.util.format_author(msg, archive_name)
    subject = analysis.util.format_subject(msg, archive_name)
    message_id = analysis.util.format_id(msg, archive_name)
    content = analysis.util.format_content(msg, archive_name)

    records.append((message_id,
                    from_addr,
                    author,
                    subject,
                    date,
                    url,
                    len(content),
                    0 if not 'follow-up' in msg else len(msg['follow-up']),
                    references))

    # recursive follow up -- but references is not keeping track really...
    if 'follow-up' in msg:
        for f in msg['follow-up']:
            message_to_tuple_record(f, records, archive_name, references=message_id)

    return


def json_data_to_pd_dataframe(json_data, archive_name):

    records = []
    for d in json_data:
        for dd in d['threads']:
            message_to_tuple_record(dd, records, archive_name)

    print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records)))

    df = pd.DataFrame.from_records(records,
                                   index='date',
                                   columns=['message-id',
                                            'from',
                                            'author',
                                            'subject',
                                            'date',
                                            'url',
                                            'content-length',
                                            'nbr-references',
                                            'references'])

    df.index.name = 'date'

    return df


def load_from_file(filename, archive_name, archive_dir, json_data=None):

    if not filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename + '.json.gz')
    else:
        file_path = os.path.join(archive_dir, filename)

    if os.path.isfile(file_path):
        with gzip.open(file_path, 'r') as fp:
            json_data = json.load(fp)
            return json_data_to_pd_dataframe(json_data['threads'], archive_name)
    else:
        #list of all "filename[...].json.gz" in archive_dir
        files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
        if files:
            filename = files[-1] # take the most recent (listed alpha-chronological)
            file_path = os.path.join(archive_dir, filename)
            if os.path.isfile(file_path):
                with gzip.open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    return json_data_to_pd_dataframe(json_data['threads'], archive_name)
        else:
            #list of all json files in archive_dir/filename
            dir_path = os.path.join(archive_dir, filename)
            if not os.path.isdir(dir_path):
                return None

            files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
            if not files:
                return None

            # load all json files
            threads = []
            for file_path in files:
                with open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    threads.append(json_data)

            print('---> ' + archive_name)
            return json_data_to_pd_dataframe(threads, archive_name)


def load_from_search_archive(archive):
    threads = []
    for k, v in archive.archive.items():
        threads.append(v)
    return json_data_to_pd_dataframe(threads, archive.archive_name)


class Archive:

    data = None # "raw" json data
    dataframe = None # main pd dataframe

    def __init__(self, archive_name, archive_dir="archives"):

        if isinstance(archive_name, pd.core.frame.DataFrame):
            self.dataframe = archive_name ## no copies here

        if isinstance(archive_name, search.archive.Archive):
            self.dataframe = load_from_search_archive(archive_name)

        if isinstance(archive_name, str):
            # need a filename or a dir name....
            self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)

    def query(self):
        q = analysis.query.Query(self)
        return q
analysis/format.py (deleted, 165 lines)
@@ -1,165 +0,0 @@
import analysis.query
import logging, html, numpy
from tabulate import tabulate

def makeurl(text, url):
    return '<a href="' + url + '">' + text + "</a>"

def table_threads_ranking(ranking_dataframe):

    html_str = '<table class="threads_ranking">'

    html_str += '<tr>'
    html_str += '<td class="td_date_t">date</td>'
    html_str += '<td class="td_subject_t">subject</td>'
    html_str += '<td class="td_from_t">from</td>'
    html_str += '<td class="td_rep_t">replies</td>'
    html_str += '</tr>'

    for i, row in ranking_dataframe.iterrows():

        html_str += '<tr>'
        html_str += '<td class="td_date">' + str(i) + '</td>'
        html_str += '<td class="td_subject">' + makeurl(row['subject'], row['url']) + '</td>'
        html_str += '<td class="td_from">' + row['from'] + '</td>'
        html_str += '<td class="td_rep">' + str(row['nbr-references']) + '</td>'
        html_str += '</tr>'

    html_str += "</table>"

    return html_str

def frame_to_dictionary_threads_ranking(ranking_dataframe):

    results = []
    for i, row in ranking_dataframe.iterrows():
        d = {'date': str(i), 'subject': row['subject'], 'url': row['url'], 'from': row['from'], 'nbr-references': row['nbr-references']}
        results.append(d)
    return results


class Html:

    query = None

    def __init__(self, q=None):

        if not isinstance(q, analysis.query.Query):
            logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query")
            raise Exception()

        self.query = q

    def threads_ranking(self, rank=5, resolution=None):

        data = self.query.threads_ranking(rank=rank)

        h = html.HTML()
        t = h.table()

        r = t.tr
        r.td('date', klass='td_date_t')
        r.td('from', klass='td_from_t')
        r.td('replies', klass='td_rep_t')
        r.td('subject', klass='td_subject_t')

        for i, row in data.iterrows():
            r = t.tr

            print(row.index)

            r.td(str(row['date']), klass='td_date')
            r.td(row['from'], klass='td_from')
            r.td(str(row['nbr-references']), klass='td_rep')
            r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)

        return str(t)

    @staticmethod
    def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):

        header = []
        if data_frame.index.name in name_map:
            header.append(name_map[data_frame.index.name])
        else:
            header.append(data_frame.index.name)
        for h in data_frame.columns:
            if h in name_map:
                h = name_map[h]
            header.append(h)

        css_header = []
        css_element = []
        for i in header:
            css_header.append('td_' + i + '_t')
            css_element.append('td_' + i)

        h = html.HTML()
        if table_name:
            t = h.table(id=table_name, klass=table_name + '_t')
        else:
            t = h.table()

        # url map
        url_hash = {}
        url_skip = []
        url_keys = url_map.keys()
        for u in url_keys:
            if u in header and url_map[u] in header:
                url_indx = header.index(url_map[u])
                url_hash[header.index(u)] = url_indx
                url_skip.append(url_indx)
                header.pop(url_indx)

        #header
        r = t.tr
        n = 0
        for j in header:
            r.td(str(j), klass=css_header[n])
            n += 1

        #elements
        for k, row in data_frame.iterrows():
            r = t.tr
            r.td(str(k), klass=css_element[0])
            n = 1
            for l in row:

                if n in url_skip:
                    continue

                if isinstance(l, float):
                    if l % 1 > 0:
                        l = '{0:.4f}'.format(l)
                    else:
                        l = int(l)

                if n in url_hash.keys():
                    url = row[url_hash[n] - 1]
                    r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
                else:
                    r.td(str(l), klass=css_element[n])
                n += 1

        return str(t)

class Tab:

    @staticmethod
    def from_dataframe(data_frame, name_map={}, format=".0f"):

        header = []
        header.append(data_frame.index.name)
        for h in data_frame.columns:
            if h in name_map:
                h = name_map[h]
            header.append(h)

        return tabulate(data_frame, headers=header, floatfmt=format)
analysis/plot.py (deleted, 79 lines)
@@ -1,79 +0,0 @@
import numpy as np
import pandas as pd
import logging
import analysis.query

# for colormaps see:
# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
# http://matplotlib.org/examples/color/colormaps_reference.html
# for colors see:
# http://matplotlib.org/examples/color/named_colors.html

# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue

def bar_plot_series(series, title, color='blueviolet', ylim=None):
    return series.plot(kind='bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)

def save(plot, name):
    fig = plot.get_figure()
    fig.savefig(name)

class Plot:

    query = None

    def __init__(self, q=None):

        if not isinstance(q, analysis.query.Query):
            logging.error("Plot constructor Error: query must be of type analysis.query.Query")
            raise Exception()

        self.query = q

    '''
    activity
    '''

    def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in activity_rank:
            series.append(self.query.activity_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    '''
    content length
    '''

    def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in content_rank:
            series.append(self.query.content_length_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    '''
    threads
    '''

    def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in threads_rank:
            series.append(self.query.threads_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
analysis/query.py (deleted, 573 lines)
@@ -1,573 +0,0 @@
import numpy as np
import pandas as pd
import analysis.archive
import logging

class Query:

    archive = None # analysis.archive.Archive object
    activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    threads = None # ...
    single_threads = None
    replies = None # ...

    def __init__(self, arch=None):

        if not isinstance(arch, analysis.archive.Archive):
            logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
            raise Exception()

        self.archive = arch

    '''
    activity
    '''

    def _activity(self):

        if self.activity is None:
            from_index = self.archive.dataframe.reindex(columns=['from'])
            self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)

        return self.activity

    def activity_from(self, email_address, resolution='y', series=False):

        eaddr = email_address.replace('@', '{at}').lower()

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._activity()
        try:
            af = self.activity[eaddr]
        except KeyError:
            return None

        activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
            activity_from.index.name = 'year'
        else:
            activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            activity_from.index.name = 'year-month'

        if series:
            return activity_from

        return activity_from.to_frame('nbr-messages').astype(int)

    def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):

        self._activity()
        afr = self.activity.sum(axis=0).order(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        if series:
            return afr[:rank]

        return afr[:rank].to_frame('nbr-messages').astype(int)


    # def activity_overall(self, resolution='y', series=False):

    #     freq = 'M'
    #     if resolution.lower() == 'y':
    #         freq = 'AS'
    #     elif resolution.lower() == 'm':
    #         freq = 'M'
    #     else:
    #         return None

    #     self._activity()

    #     y = self.activity.sum(axis=1)
    #     y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()

    #     if freq == 'AS':
    #         y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
    #         y.index.name = 'year'
    #     else:
    #         y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
    #         y.index.name = 'year-month'

    #     if series:
    #         return y

    #     return y.to_frame('nbr-messages').astype(int)

    def activity_overall(self, resolution='y', series=False):

        a = self.archive.dataframe['url']

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-messages').astype(int)

    def cohort(self, resolution='m', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._activity()

        c = self.activity.idxmax().order().to_frame('date')
        c.index = c['date']

        cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()

        if freq == 'AS':
            cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))
            cohort.index.name = 'year'
        else:
            cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            cohort.index.name = 'year-month'

        if series:
            return cohort

        return cohort.to_frame('first-messages').astype(int)

    '''
    content length
    '''

    def _content_length(self):

        if self.content_length is None:
            from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
            self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
            self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)

        return self.content_length

    def content_length_from(self, email_address, resolution='y', series=False):

        eaddr = email_address.replace('@', '{at}').lower()

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._content_length()
        try:
            af = self.content_length[eaddr]
        except KeyError:
            return None

        content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
            content_length_from.index.name = 'year'
        else:
            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            content_length_from.index.name = 'year-month'

        if series:
            return content_length_from

        return content_length_from.to_frame('nbr-bytes').astype(int)

    def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):

        self._content_length()
        cfr = self.content_length.sum(axis=0).order(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            cfr = cfr[cfr.index.str.contains(p)]

        if series:
            return cfr[:rank]

        return cfr[:rank].to_frame('nbr-bytes').astype(int)

    def content_length_overall(self, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._content_length()

        y = self.content_length.sum(axis=1)
        y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-bytes').astype(int)


    '''
    threads
    '''

    def _threads(self, thresh=0):

        print("doing threads")

        if self.threads is None:
            self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)

        if self.single_threads is None:
            self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)

        return self.threads

    def threads_ranking(self, rank=5, resolution='y'):

        self._threads()

        if resolution is None:
            data = self.threads.drop('message-id', axis=1)[:rank]
            return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        # get the threads ranking per time resolution
        #
        data = self.threads.drop('message-id', axis=1)
        data = data.groupby([pd.TimeGrouper(freq=freq)])
        r = {}
        for k, v in data:
            if freq == 'AS':
                time_key = k.strftime('%Y')
            else:
                time_key = k.strftime('%Y-%m')
            frame = v[:rank]
            frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
            r[time_key] = frame
        return r

    def threads_replies_to(self, email_address, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._threads()

        eaddr = email_address.replace('@', '{at}').lower()

        self._threads()
        threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
        threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
        threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)

        if series:
            return threads_from_ranking[eaddr]

        threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)

        if freq == 'AS':
            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
            threads_from_ranking.index.name = 'year'
        else:
            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            threads_from_ranking.index.name = 'year-month'

        return threads_from_ranking

    def threads_replies_to_ranking(self, rank=5, filter_nettime=True):

        self._threads()

        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)

        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tfr = tfr[tfr.index.str.contains(p)]

        tfr = tfr[:rank].astype(int)
        return tfr

    def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):

        self._threads()
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        if series:
            return tir[:rank]

        return tir[:rank].to_frame('nbr-initiated-threads').astype(int)

    def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):

        # activity
        self._activity()
        afr = self.activity.sum(axis=0).astype(int)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        # initiated threads [top 25]
        self._threads()
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        inter = afr.index.intersection(tir.index)
        avg = tir[inter] / afr[inter]

        labels = ['messages', 'threads', 'avg.threads']
        return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]

    def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):

        self._threads()

        #initiated
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        #replies [top 25]
        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tfr = tfr[tfr.index.str.contains(p)]
        tfr = tfr['nbr-references'] # dataframe to series

        inter = tir.index.intersection(tfr.index)
        avg = tfr[inter] / tir[inter]

        labels = ['threads', 'replies', 'avg.replies']
        return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]


    def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        agg = aggregate.lower()
        if not agg in ['sum', 'mean', 'count']:
            return None

        if not self.threads is None:
            del self.threads
            self.threads = None

        self._threads(tresh)

        if agg == 'sum':
            # number of replies total (re: sum all the replies)
            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
        elif agg == 'mean':
            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
        else:
            # number of threads (re: msgs with at least one reply)
            y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-threads').astype(int)

    def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        agg = aggregate.lower()
        if not agg in ['sum', 'mean', 'count']:
            return None

        if not self.single_threads is None:
            del self.single_threads
            self.single_threads = None

        self._threads(tresh)

        y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-threads').astype(int)


    '''
    replies
    '''

    def _replies(self):

        if self.replies is None:
            self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from', 'references'])
            self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from', 'references'])
        return self.replies

    def replies_ranking(self, rank=5, resolution=None):

        self._replies()

        if resolution is None:
            data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
            return data.to_frame('nbr_replies')

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        # get the replies ranking per time resolution
        #
        data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
        r = {}
        for k, v in data:
            if freq == 'AS':
                time_key = k.strftime('%Y')
            else:
                time_key = k.strftime('%Y-%m')
            frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
            r[time_key] = frame.to_frame('nbr-replies')
        return r

    def replies_avg_ranking(self, rank=5, filter_nettime=True):

        # activity
        self._activity()
        afr = self.activity.sum(axis=0)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        # replies in thread [top 25]

        self._replies()
        rpl = data = self.replies.groupby('from').size().sort_values(ascending=False)[:25]

        inter = afr.index.intersection(rpl.index)
        avg = rpl[inter] / afr[inter]

        labels = ['messages', 'replies', 'avg.replies']
        return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]

    def replies_overall(self, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        if not self.replies is None:
            del self.replies
            self.replies = None

        self._replies()

        y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-replies').astype(int)
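For reference, a short sketch of how this (now deleted) Query API was driven -- it mirrors the calls made in analyse.py above:

    import analysis.archive

    q = analysis.archive.Archive('nettime-l').query()
    yearly = q.activity_overall(resolution='y')                # DataFrame, index 'year', column 'nbr-messages'
    monthly = q.activity_overall(resolution='m', series=True)  # plain pandas Series, index 'year-month'
    top = q.threads_ranking(rank=15)                           # dict mapping year -> ranked DataFrame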
analysis/util.py (deleted, 92 lines)
@@ -1,92 +0,0 @@
import email
import hashlib

def format_content(msg, archive_name):
    return msg['content']

def format_url(msg, archive_name):
    return msg['url']

def format_author(msg, archive_name):
    return msg['author_name']

def format_from_token(from_str, sep):

    fff = from_str

    from_addr = email.utils.parseaddr(from_str)[1]

    fffa = email.utils.parseaddr(from_str)

    if sep not in from_addr:
        tok = from_str.split()
        try:
            at = tok.index(sep)
            from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
            if from_addr.startswith('<') or from_addr.endswith('>'):
                from_addr = from_addr.strip('<').strip('>')
        except ValueError:
            print(tok)
            print("error formatting 'from' " + from_str + " -- expecting sep: " + sep)
            print("*** " + fff)
            print("+++")
            print(fffa)
            print("----")

            return None
    else:
        from_addr = from_addr.replace(sep, '{AT}')
    return from_addr.lower()

def format_from(msg, archive_name):
    from_str = msg['from']

    if " {AT} " in from_str:
        return format_from_token(from_str, '{AT}')
    elif " at " in from_str:
        return format_from_token(from_str, 'at')
    elif "@" in from_str:
        return format_from_token(from_str, '@')
    else:
        return from_str

# returns utc timestamp
def format_date(msg, archive_name):
    date_str = msg['date']
    time_tz = None
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
    except TypeError:
        print("Format Date TypeError")
        print(" > " + date_str)
        return None
    except ValueError:
        print("Format Date ValueError")
        print(" > " + date_str)
        return None
    finally:
        return time_tz

def format_subject(msg, archive_name):
    return msg['subject']

def format_id(msg, archive_name):
    if "message-id" in msg:
        return msg['message-id']
    else:
        # create hash with author_name + date
        s = msg['author_name'] + msg['date']
        sha = hashlib.sha1(s.encode('utf-8'))
        return sha.hexdigest()

# format='%d/%m/%Y'
def min_date(archive_name):
    if "nettime" in archive_name:
        return '01/10/1995'
    elif archive_name == "spectre":
        return '01/08/2001'
    elif archive_name == "empyre":
        return '01/01/2002'
    elif archive_name == "crumb":
        return '01/02/2001'
archive/archive.py (new file, 257 lines)
@@ -0,0 +1,257 @@
import email, email.parser
import os, json, gzip, re
import mysql.connector as mariadb
import archive.sql, archive.util
from datetime import date, datetime
from dateutil import parser
import terminal.progress

def load_from_file(filename, archive_name, archive_dir):

    if not filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename + '.json.gz')
    else:
        file_path = os.path.join(archive_dir, filename)

    if os.path.isfile(file_path):
        with gzip.open(file_path, 'r') as fp:
            json_data = json.load(fp)
            return (json_data, archive_name)
    else:
        #list of all "filename[...].json.gz" in archive_dir
        files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
        if files:
            filename = files[-1] # take the most recent (listed alpha-chronological)
            file_path = os.path.join(archive_dir, filename)
            if os.path.isfile(file_path):
                with gzip.open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    return (json_data, archive_name) # <--- this makes no sense....
        else:
            #list of all json files in archive_dir/filename
            dir_path = os.path.join(archive_dir, filename)
            if not os.path.isdir(dir_path):
                return None

            files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
            if not files:
                return None

            # load all json files
            threads = []
            for file_path in files:
                with open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    threads.append(json_data)

            return (threads, archive_name)

def connect_db(database, host, user, password):

    try:
        con = mariadb.connect(host=host, user=user, password=password, database=database)
    except mariadb.Error as error:
        print("Error: {}".format(error))
        if error.errno == 1049:
            # NOTE: 'util' and 'archive_name' are not defined in this scope,
            # and 'con' is unbound on this path, so the finally clause below
            # will raise NameError instead of returning None.
            if util.y_n_question("Table " + archive_name + " does not exist. Create it?"):
                print("creating")
            else:
                print("not creating")
        return None
    finally:
        return con


class Archive:

    data = None # "raw" json data
    db_con = None

    # NOTE: Python does not overload constructors -- only the last __init__
    # below is in effect; the first two definitions are shadowed.
    def __init__(self, archive_name, archive_dir):

        if isinstance(archive_name, str):
            # need a filename or a dir name....
            print("reading archive " + archive_name, end='')
            (self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
            print(" - done.")

    def __init__(self, archive_name, database, host, user, password):

        self.archive_name = archive_name
        self.db_con = connect_db(database, host, user, password)

    def __init__(self, archive_name, config):

        self.archive_name = archive_name
        self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.db_con is not None:
            self.db_con.close()


    def create_db(self, host, database, user, password):

        print("creating table: " + self.archive_name, end='')
        self.db_con = connect_db(database, host, user, password)
        if self.db_con is None:
            return

        try:
            cursor = self.db_con.cursor()
            cursor.execute(archive.sql.CREATE.format(self.archive_name))
        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

        print(" - done.")

    def insert_db(self, host, database, user, password):

        self.db_con = connect_db(database, host, user, password)

        if self.db_con is None:
            return

        try:
            cursor = self.db_con.cursor()

            progress = terminal.progress.ProgressBar(self.archive_name, len(self.data), fmt=terminal.progress.ProgressBar.FULL)

            for t in self.data:

                n_inserted = self.recursive_insert_db(cursor, t["threads"])
                # print(" - insert: " + str(n_inserted), end='')
                if n_inserted > 0:
                    self.db_con.commit()

                progress.current += 1
                progress()

            progress.done()
            self.db_con.commit()

        except mariadb.Error as error:
            pass
            # print("Error: {}".format(error))
        finally:
            cursor.close()

    def recursive_insert_db(self, cursor, thread):

        n_inserted = 0
        for m in thread:
            try:

                from_ = archive.util.format_from(m)
                author_name_ = archive.util.format_author(m)
                to_ = archive.util.format_to(m)
                date_ = archive.util.format_date(m, self.archive_name)

                if date_ is None or from_ is None:
                    # print("\nerrorororororo")
                    # print(m['from'] + " -- " + m['date'])
                    continue

                cursor.execute(archive.sql.INSERT, (from_, author_name_, to_, m["subject"], date_, m["content-type"], m["content"], m["url"]))
                n_inserted += 1

                if "follow-up" in m:
                    n_inserted += self.recursive_insert_db(cursor, m["follow-up"])

            except mariadb.Error as error:
                if error.errno == 1062:
                    # duplicate entry: skip and continue <-- look this up...
                    # print("\nError: {}".format(error))
                    continue

        return n_inserted

    def content_search(self, term, bool=True):

        if self.db_con is None:
            print("No connection to database...")
            return

        try:
            cursor = self.db_con.cursor(buffered=True)
            if bool:
                cursor.execute(archive.sql.CONTENT_QUERY_BOOLEAN.format(self.archive_name, term))
            else:
                cursor.execute(archive.sql.CONTENT_QUERY_NL.format(self.archive_name, term))

            # print(cursor.rowcount)
            results = []
            for (from_, author_name_, subject_, date_, url_) in cursor:
                results.append((from_, author_name_, subject_, date_, url_))
                # print("{} {} {}".format(from_, str(date_), url_))
            return results

        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

    def from_search(self, term, bool=True):

        if self.db_con is None:
            print("No connection to database...")
            return

        try:
            cursor = self.db_con.cursor(buffered=True)
            if bool:
                cursor.execute(archive.sql.FROM_QUERY_BOOLEAN.format(self.archive_name, term))
            else:
                cursor.execute(archive.sql.FROM_QUERY_NL.format(self.archive_name, term))

            # print(cursor.rowcount)
            results = []
            for (from_, author_name_, subject_, date_, url_) in cursor:
                results.append((from_, author_name_, subject_, date_, url_))
                # print("{} {} {}".format(from_, str(date_), url_))
            return results

        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

    # analysis
    def longest_field(self, field, thread, max_length=0):
        import archive.util
        for m in thread:
            if not field in m:
                if "threads" in m:
                    max_length = self.longest_field(field, m["threads"], max_length)
                continue
            if m[field] is None:
                continue
            if field == "from":
                m[field] = archive.util.format_from(m)
            elif field == "author_name":
                m[field] = archive.util.format_author(m)
            elif field == "to":
                m[field] = archive.util.format_to(m)
            elif field == "date":
                m[field] = str(archive.util.format_date(m, self.archive_name))

            if m[field] is None:
                continue

            l = len(m[field])
            if l > max_length:
                max_length = l
                print(">> " + m[field])
            if "follow-up" in m:
                max_length = self.longest_field(field, m["follow-up"], max_length)
        return max_length
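A minimal usage sketch of the new DB-backed Archive, assuming the config-dict constructor; the database name and credentials are hypothetical, not part of this commit:

    import archive.archive

    config = {'database': 'listservs', 'host': 'localhost',
              'user': 'user', 'password': 'secret'}  # hypothetical credentials

    with archive.archive.Archive('nettime_l', config) as arch:
        # full-text search over subject_/content_ (boolean mode by default)
        results = arch.content_search('tactical media')
        for (from_, author_name_, subject_, date_, url_) in results or []:
            print(subject_, url_)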
archive/sql.py (new file, 31 lines)
@@ -0,0 +1,31 @@
CREATE = "CREATE TABLE `{}` (" \
         "`from_` varchar(85) NOT NULL," \
         "`author_name_` varchar(200) NOT NULL," \
         "`to_` text(60)," \
         "`subject_` varchar(3500) NOT NULL," \
         "`date_` datetime NOT NULL," \
         "`content_type_` varchar(15) NOT NULL," \
         "`content_` mediumtext NOT NULL," \
         "`url_` varchar(100) NOT NULL," \
         "PRIMARY KEY(`from_`, `date_`)," \
         "FULLTEXT (`subject_`, `content_`)," \
         "FULLTEXT (`from_`, `author_name_`)" \
         ") ENGINE = InnoDB;"

# NOTE: unlike CREATE above, the table name here is hardcoded to nettime_l.
INSERT = ("INSERT INTO nettime_l"
          "(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
          "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")

CONTENT_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                         "WHERE MATCH(subject_, content_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")

CONTENT_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                    "WHERE MATCH(subject_, content_) AGAINST('{}') ORDER BY date_")

FROM_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                      "WHERE MATCH(from_, author_name_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")

FROM_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                 "WHERE MATCH(from_, author_name_) AGAINST('{}') ORDER BY date_")

# SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l WHERE MATCH(content_) AGAINST('%s' IN BOOLEAN MODE)
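To make the template mechanics concrete, here is how archive.py fills these statements (the table name and search term are illustrative):

    import archive.sql

    # str.format substitutes the table name and the raw search term:
    stmt = archive.sql.CONTENT_QUERY_BOOLEAN.format('nettime_l', 'tactical +media')
    # SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l
    # WHERE MATCH(subject_, content_) AGAINST('tactical +media' IN BOOLEAN MODE) ORDER BY date_

Note that the search term is interpolated into the SQL string rather than passed as a bound parameter, so quotes in the term will break (or inject into) the query.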
225
archive/util.py
Executable file
225
archive/util.py
Executable file
@ -0,0 +1,225 @@
|
||||
import email, datetime, sys
|
||||
import hashlib
|
||||
import dateparser
|
||||
|
||||
def format_content(msg):
|
||||
return msg['content']
|
||||
|
||||
def format_url(msg):
|
||||
return msg['url']
|
||||
|
||||
def format_author(msg):
|
||||
|
||||
if 'author_name' not in msg or msg['author_name'] is None:
|
||||
return None
|
||||
|
||||
author_str = msg['author_name'].replace('"', '')
|
||||
|
||||
if "by way of" in author_str:
|
||||
toks = author_str.split("by way of")
|
||||
if toks[0] == "":
|
||||
author_str = format_from(msg)
|
||||
elif toks[0][-1] == "(":
|
||||
author_str = toks[0][:-1].strip()
|
||||
else:
|
||||
author_str = toks[0]
|
||||
|
||||
if ("(" in author_str) or ("<" in author_str):
|
||||
# ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault <zx {AT} xyz.net>
|
||||
# print("±±±±±±")
|
||||
# print("name: " + author_str)
|
||||
# print("from: " + msg['from'])
|
||||
if not '@' in author_str.lower().replace('{at}', '@').replace(' at ', '@'):
|
||||
author_str = author_str.split('(')[0].strip()
|
||||
else:
|
||||
author_str = email.utils.parseaddr(author_str)[0]
|
||||
# print(" Name:" + author_str.replace('"', ''))
|
||||
# print(" From:" + format_from(msg))
|
||||
|
||||
if " ," in author_str:
|
||||
# nettime's_roving_reporter , thing.net {AT} bbs.thing.net
|
||||
author_str = author_str.split(' ,')[0]
|
||||
|
||||
|
||||
return author_str
|
||||
|
||||
def format_from_token(from_str, sep):
|
||||
from_addr = email.utils.parseaddr(from_str)[1]
|
||||
if sep not in from_addr:
|
||||
tok = from_str.split()
|
||||
try:
|
||||
at = tok.index(sep)
|
||||
from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
|
||||
if from_addr.startswith('<') or from_addr.endswith('>'):
|
||||
from_addr = from_addr.strip('<').strip('>')
|
||||
except ValueError:
|
||||
print(tok)
|
||||
print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
|
||||
return None
|
||||
else:
|
||||
from_addr = from_addr.replace(sep, '{AT}')
|
||||
return "".join(from_addr.lower().split())
|
||||
|
||||
def format_from(msg):
|
||||
|
||||
if 'from' not in msg or msg['from'] is None:
|
||||
return None
|
||||
|
||||
from_str = msg['from']
|
||||
|
||||
if " {AT} " in from_str:
|
||||
return format_from_token(from_str, '{AT}')
|
||||
elif " at " in from_str:
|
||||
return format_from_token(from_str, 'at')
|
||||
elif "@" in from_str:
|
||||
return format_from_token(from_str, '@')
|
||||
else:
|
||||
return "".join(from_str.split())
|
||||
|
||||
def format_to(msg):
|
||||
|
||||
if "to" not in msg or msg["to"] is None:
|
||||
return None
|
||||
|
||||
to_str = msg["to"]
|
||||
toks = email.utils.parseaddr(to_str)
|
||||
# print(toks)
|
||||
|
||||
if len(toks) == 2:
|
||||
to_str = toks[1]
|
||||
|
||||
return "".join(to_str.lower().split())
|
||||
|
||||
|
||||
# returns a utc timestamp --- older variant; see format_date below
def format_date_utc(msg, archive_name):

    if 'date' not in msg or msg['date'] is None:
        return None

    date_str = msg['date'].replace('.', '')
    time_tz = None
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        print("Format Date TypeError")
        print(" > " + date_str)
    except ValueError:
        print("Format Date ValueError")
        print(" > " + date_str)
    return time_tz

def format_date(msg, archive_name):

    if 'date' not in msg or msg['date'] is None:
        return None

    date_str = msg['date']

    # fix ex. 'Thu, 01 Aug 2002 17:33:08 +0900 (JST)' -- drop the parenthesised zone name
    if '(' in date_str:
        date_str = date_str.split('(')[0].rstrip()

    date_time = dateparser.parse(date_str)
    if date_time is None:

        # patch up the common malformed dates, then retry
        fix = False
        toks = date_str.split()

        if len(toks[-1]) == 5 or len(toks[-1]) == 4:
            # ex. Thu, 24 Jan 2002 15:21:31 -0000
            if toks[-1] in ['+0000', '-0000', '0000']:
                date_str = date_str[:-5]
                fix = True
            # ex. Fri, 25 Jan 2002 13:21:49 +1050
            elif toks[-1][-2] == '5':
                d = list(date_str)
                d[-2] = '3'
                date_str = "".join(d)
                fix = True

            if toks[-1][-1] != '0':
                # ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
                date_str = date_str[:-5]
                fix = True

        if 'Fru' in toks[0]:
            date_str = date_str.replace('Fru', 'Fri')
            fix = True
        elif 'Thur' in toks[0]:
            date_str = date_str.replace('Thur', 'Thu')
            fix = True

        if not fix:
            return None

        date_time = dateparser.parse(date_str)
        if date_time is None:

            fix = False
            if 'GMT' in date_str:
                # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
                date_str = date_str.split('GMT')[0].rstrip()
                fix = True

            if 'METDST' in toks[-1]:
                # ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
                date_str = date_str.replace('METDST', 'MET')
                fix = True

            if not fix:
                return None

            date_time = dateparser.parse(date_str)
            return date_time

    min_d = datetime.datetime.strptime(min_date(archive_name), "%d/%m/%Y")
    max_d = datetime.datetime.now()

    date_time_naive = date_time.replace(tzinfo=None)

    if date_time_naive < min_d or date_time_naive > max_d:
        return None

    return date_time

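# Hedged sketch (hypothetical fixture): the '(JST)' comment is stripped before
# parsing, and the result is range-checked against the list's start date:
d = format_date({'date': 'Thu, 01 Aug 2002 17:33:08 +0900 (JST)'}, 'nettime-l')
assert d is not None and d.year == 2002
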
def format_subject(msg, archive_name):

    if 'subject' not in msg or msg['subject'] is None:
        return None

    return msg['subject']


def format_id(msg, archive_name):
    if "message-id" in msg:
        return msg['message-id']
    else:
        # no Message-ID header: derive a stable surrogate key from author_name + date
        s = msg['author_name'] + msg['date']
        sha = hashlib.sha1(s.encode('utf-8'))
        return sha.hexdigest()


# earliest plausible message date per list, formatted '%d/%m/%Y'
def min_date(archive_name):
    if "nettime" in archive_name:
        return '01/10/1995'
    elif archive_name == "spectre":
        return '01/08/2001'
    elif archive_name == "empyre":
        return '01/01/2002'
    elif archive_name == "crumb":
        return '01/02/2001'
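# Hedged sketch (hypothetical fixture): the fallback key is a 40-char SHA-1 hex digest:
mid = format_id({'author_name': 'Jane Doe', 'date': 'Thu, 01 Aug 2002 17:33:08 +0900'}, 'nettime-l')
assert len(mid) == 40
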
@ -1,29 +1,34 @@
name: listservs
name: listserv
channels:
- defaults
- defaults
dependencies:
- beautiful-soup=4.3.2=py34_0
- click=6.7=py34_0
- flask=0.12=py34_0
- gunicorn=19.1.0=py34_0
- html5lib=0.999=py34_0
- itsdangerous=0.24=py34_0
- jinja2=2.9.6=py34_0
- markupsafe=0.23=py34_2
- openssl=1.0.2l=0
- pastedeploy=1.5.2=py34_1
- pip=9.0.1=py34_1
- python=3.4.5=0
- readline=6.2=2
- setuptools=27.2.0=py34_0
- six=1.10.0=py34_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- werkzeug=0.11.15=py34_0
- wheel=0.29.0=py34_0
- xz=5.2.2=1
- zlib=1.2.8=3
- pip:
  - beautifulsoup4==4.3.2
  - webencodings==0.5.1
- ca-certificates=2019.5.15=0
- openssl=1.0.2s=h1de35cc_0
- pip=9.0.1=py34_1
- python=3.4.5=0
- readline=6.2=2
- setuptools=27.2.0=py34_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py34_0
- xz=5.2.4=h1de35cc_4
- zlib=1.2.11=h1de35cc_3
- pip:
  - beautifulsoup4==4.7.1
  - click==7.0
  - dateparser==0.7.1
  - flask==1.0.4
  - gunicorn==19.9.0
  - itsdangerous==1.1.0
  - jinja2==2.10.1
  - markupsafe==1.1.1
  - mysql-connector-python==8.0.16
  - protobuf==3.8.0
  - python-dateutil==2.8.0
  - pytz==2019.1
  - regex==2019.6.8
  - six==1.12.0
  - soupsieve==1.9.2
  - tzlocal==1.5.1
  - werkzeug==0.15.4

@ -1,150 +0,0 @@
import logging, os, json, re
from datetime import datetime

import analysis.archive  ## circular...
import analysis.query
import analysis.format

import threading


class Archive():

    def __init__(self, archives_dir=None):
        if archives_dir is None:
            from www import config
            self.archives_dir = config.ARCHIVES_PATH
        else:
            self.archives_dir = archives_dir

        self.loaded = False

        self.lock_search = threading.Lock()
        self.lock_threads_ranking = threading.Lock()

    def load(self, archive_name=None):

        if archive_name is None:
            raise Exception('Archive is not specified')

        archive_path = os.path.join(self.archives_dir, archive_name)
        if not os.path.isdir(archive_path):
            raise Exception('Archive ' + archive_path + ' does not exist')

        self.archive_name = archive_name
        self.archive_path = archive_path

        files = [f for f in os.listdir(archive_path) if f.endswith('.json')]

        self.archive = {}

        for f in files:
            file_path = os.path.join(archive_path, f)
            label = f.replace('.json', '')
            with open(file_path) as fdata:
                self.archive[label] = json.load(fdata)

        self.loaded = True

    def search_message(self, keyword, msg, index_str, results, field='content'):

        nbr_hits = 0
        # find() returns -1 on a miss; testing != -1 also counts a hit at position 0
        if msg[field] is not None and msg[field].lower().find(keyword.lower()) != -1:
            nbr_hits += 1
            results.append({ "index_str": index_str, "subject": msg['subject'], "date": msg['date'], "author_name": msg['author_name'], "url": msg['url'] })

        if 'follow-up' in msg:
            i = 0
            for m in msg['follow-up']:
                current_index_str = index_str + '/' + str(i)
                nbr_hits += self.search_message(keyword, m, current_index_str, results, field)
                i += 1

        return nbr_hits

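    # Hedged sketch: follow-ups nest arbitrarily deep, and every hit records a
    # slash-separated path back to the thread root (fixtures are hypothetical):
    #   a = Archive(archives_dir='/tmp')
    #   msg = {'content': 'tactical media', 'subject': 's', 'date': 'd',
    #          'author_name': 'x', 'url': 'u',
    #          'follow-up': [{'content': 'more tactical media', 'subject': 's2',
    #                         'date': 'd', 'author_name': 'y', 'url': 'u2'}]}
    #   hits = []
    #   assert a.search_message('tactical', msg, 'nettime/2002/0', hits) == 2
    #   assert hits[1]['index_str'] == 'nettime/2002/0/0'
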
    def search(self, keyword, field='content', min_hits=0):

        with self.lock_search:

            search_results = { "keyword": keyword, "field": field, "archive": self.archive_name, "results": [] }

            for k, v in sorted(self.archive.items(), key=get_key, reverse=True):

                current_index_str = self.archive_name + '/' + k
                hits = []
                nbr_hits = 0
                i = 0
                for m in v['threads']:
                    current_index_str = self.archive_name + '/' + k + '/' + str(i)
                    nbr_hits += self.search_message(keyword, m, current_index_str, hits, field)
                    i += 1

                if nbr_hits > min_hits:
                    # nettime-l - fix (rename the thread from ex. 'nettime-l_Jan_01' to 'January_2001')
                    if k.startswith("nettime-l_"):
                        dt = datetime.strptime(k, "nettime-l_%b_%y")
                        k = dt.strftime("%B_%Y")
                    search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})

            return search_results

    def threads_ranking(self, rank=5):

        with self.lock_threads_ranking:

            search_results = { "keyword": "thread ranking", "field": "ranking", "archive": self.archive_name, "results": [] }

            a = analysis.archive.Archive(self)
            q = a.query()

            ranking = q.threads_ranking(rank=rank)

            for i in ranking:
                r = analysis.format.frame_to_dictionary_threads_ranking(ranking[i])
                for h in r:
                    hit = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]
                    search_results['results'].append({'thread': h['date'], 'nbr_hits': h['nbr-references'], 'hits': hit})
            del a
            del q

            return search_results

def get_key(kv_tuple):

    k = kv_tuple[0]

    # k is of the form "Month_Year" - ex.: "January_2001"
    try:
        return datetime.strptime(k, "%B_%Y")
    except Exception:
        pass

    # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
    try:
        return datetime.strptime(k, "%b_%y")
    except Exception:
        pass

    # k is of the form "Year" - ex.: "2001"
    try:
        return datetime.strptime(k, "%Y")
    except Exception:
        pass

    # nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
    try:
        return datetime.strptime(k, "nettime-l_%b_%y")
    except Exception:
        pass

    print("get_key: unrecognized key format: " + k)

    return None

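# Hedged sketch: mixed key styles still sort chronologically once mapped through get_key:
keys = ['Jan_01', 'January_2001', 'nettime-l_Feb_99', '2000']
assert sorted(keys, key=lambda k: get_key((k, None))) == ['nettime-l_Feb_99', '2000', 'Jan_01', 'January_2001']
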
43
terminal/progress.py
Normal file
@ -0,0 +1,43 @@
from __future__ import print_function
import sys
import re


# https://stackoverflow.com/questions/3160699/python-progress-bar

class ProgressBar(object):
    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'

    def __init__(self, title, total, width=40, fmt=DEFAULT, symbol='=',
                 output=sys.stderr):
        assert len(symbol) == 1

        self.title = title
        self.total = total
        self.width = width
        self.symbol = symbol
        self.output = output
        self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
                          r'\g<name>%dd' % len(str(total)), fmt)

        self.current = 0

    def __call__(self):
        percent = self.current / float(self.total)
        size = int(self.width * percent)
        remaining = self.total - self.current
        bar = self.title + ' [' + self.symbol * size + ' ' * (self.width - size) + ']'

        args = {
            'total': self.total,
            'bar': bar,
            'current': self.current,
            'percent': percent * 100,
            'remaining': remaining
        }
        print('\r' + self.fmt % args, file=self.output, end='')

    def done(self):
        self.current = self.total
        self()
        print('', file=self.output)

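# Hedged usage sketch (a plain counting loop stands in for real work):
if __name__ == '__main__':
    bar = ProgressBar('demo', total=50, fmt=ProgressBar.FULL)
    for _ in range(50):
        bar.current += 1
        bar()      # redraw in place via the leading '\r'
    bar.done()     # snap to 100% and emit the trailing newline
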
16
terminal/util.py
Normal file
@ -0,0 +1,16 @@
import sys


def y_n_question(question_str):

    yes = {'yes', 'y', 'ye', ''}
    no = {'no', 'n'}

    while True:
        sys.stdout.write(question_str + " [Y/n]: ")
        choice = input().lower()
        if choice in yes:
            return True
        elif choice in no:
            return False
        else:
            sys.stdout.write("\nPlease respond with 'yes' or 'no'\n")
            continue

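# Hedged usage sketch (blocks on stdin; a bare Enter counts as yes):
if __name__ == '__main__':
    if y_n_question('Proceed?'):
        print('continuing')
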
@ -1,2 +1 @@

gunicorn -w 1 -b 127.0.0.1:5555 www-serve:app
gunicorn -w 1 --bind 0.0.0.0:5555 www-serve:app
@ -1,2 +1,4 @@
from www import app
#app.run(debug=True, threaded=True, use_reloader=False) # uncomment this line to run flask's server

if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)
159
www/routes.py
@ -1,144 +1,46 @@
from flask import render_template, request, jsonify
from www import app
from www import archives
import search.archive
import archive.archive as archive
import config
import www.config as wconfig
from datetime import datetime

import logging
logging.info(' ------- arch = Archives() -------- ')
arch = archives.Archives()
arch.load()
archives_data = arch.data


@app.route('/')
def index():
    k = archives_data.keys()
    return render_template("index.html", archives=k)


@app.route('/<list>')
def get_list(list):
    if list in archives_data:
        d = []
        for k, v in sorted(archives_data[list].archive.items(), key=search.archive.get_key, reverse=True):
            d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
        return render_template("list.html", list_name=list, list=d)

    else:
        return 'nee nee'


@app.route('/<list>/<sublist>')
def get_sublist(list, sublist):

    print(list)
    print(sublist)

    sublist = sublist.replace(' ', '_')
    if list in archives_data and sublist in archives_data[list].archive:
        return render_template("threads.html", sublist_name=sublist, threads=archives_data[list].archive[sublist]['threads'])
    else:
        return 'na na'


@app.route('/<list>/<sublist>/<int:index>')
def get_message(list, sublist, index):

    sublist = sublist.replace(' ', '_')
    index = int(index)
    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
        return render_template("message.html", message=archives_data[list].archive[sublist]['threads'][index])
    else:
        return 'non non'


@app.route('/<list>/<sublist>/<int:index>/<path:follow_ups>')
def get_follow_ups(list, sublist, index, follow_ups):

    sublist = sublist.replace(' ', '_')
    index = int(index)

    ups = follow_ups.split('/')
    follow = []
    for u in ups:
        follow.append(int(u))

    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
        message = archives_data[list].archive[sublist]['threads'][index]
        for f in follow:
            message = message['follow-up'][f]
        return render_template("message.html", message=message)
    else:
        return render_template("index.html")


@app.route('/search')
def searh():

    if len(request.args) < 1:
        k = archives_data.keys()
        return render_template("search.html", archives=k, fields=['content', 'from(name)', 'from(email)'], hits=['n/a', '2', '3', '4', '5', '6', '7', '8', '9'])
        return render_template("search.html", archives=wconfig.lists_to_serve, fields=['content', 'from'])

    k_arg = request.args.get('keyword')
    l_arg = request.args.get('list')
    sl_arg = request.args.get('sublist')
    f_arg = request.args.get('field')
    h_arg = request.args.get('hits')

    if k_arg is None or k_arg.strip() == '':
        return "no keyword..."

    if l_arg is None:
        return "no list..."

    if not (l_arg == "all") and not (l_arg in archives_data):
    if l_arg != "all" and l_arg not in wconfig.lists_to_serve:
        return "list '" + l_arg + "' does not exist"

    if sl_arg is not None:
        if sl_arg not in archives_data[l_arg]:
            return "sublist '" + sl_arg + "' does not exist in list '" + l_arg + "'"
    if f_arg not in ['content', 'from']:
        return "field '" + f_arg + "' does not exist"

    if f_arg == "from(name)":
        f_arg = 'author_name'
    elif f_arg == "from(email)":
        f_arg = 'from'

    lists = []
    if l_arg == "all":
        for k in archives_data.keys():
            lists.append(k)
        lists = wconfig.lists_to_serve
    else:
        lists.append(l_arg)

    nbr_hits = 0
    if h_arg in ['2', '3', '4', '5', '6', '7', '8', '9']:
        nbr_hits = int(h_arg)


    ################################
    ##
    ## need to cache all the below
    ## need to cache all the below.....
    ##
    ################################

@ -147,18 +49,41 @@ def searh():
    logging.info("search keyword = " + k_arg)

    for l in lists:
        if k_arg == "rank":
            logging.info(" ranking " + l)
            s = archives_data[l].threads_ranking()
        else:
            s = archives_data[l].search(keyword=k_arg, field=f_arg, min_hits=nbr_hits)

        with archive.Archive(l, config=config.db) as a:
            if f_arg == 'content':
                r = a.content_search(k_arg)
            else:
                r = a.from_search(k_arg)

        results.append(s)
            # format data to return
            search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] }
            month_year_results = {}

    ## -- sort results?
    search_results = sorted(results, key=get_result_key)
            for (from_, author_name_, subject_, date_, url_) in r:
                m_y = date_.strftime("%B_%Y")
                if m_y not in month_year_results:
                    month_year_results[m_y] = []
                month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_})

    return jsonify(result=search_results)
            for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
                search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v})

            # search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
            # where:
            #   'thread' = "%B_%Y" aka. January 2001
            #   'nbr_hits' = nbr hits for that month
            #   'hits' = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]

            results.append(search_results)


    sorted_results = sorted(results, key=get_result_key)
    return jsonify(result=sorted_results)


def get_key(kv):
    return datetime.strptime(kv[0], "%B_%Y")

def get_result_key(r):
    return r['archive']

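# Hedged sketch of the kind of boolean-mode full-text query content_search
# presumably issues -- table and column names follow the commented SELECT near
# the top of this diff; connection parameters are hypothetical:
import mysql.connector

cnx = mysql.connector.connect(user='lists', password='...', database='listservs')
cur = cnx.cursor()
cur.execute(
    "SELECT from_, author_name_, subject_, date_, url_ "
    "FROM nettime_l WHERE MATCH(content_) AGAINST(%s IN BOOLEAN MODE)",
    ('tactical',))
for (from_, author_name_, subject_, date_, url_) in cur:
    print(date_, subject_)
cur.close()
cnx.close()
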
@ -1,8 +1,6 @@
<html>
<head></head>
<body>
  {% for a in archives %}
  <a href="/{{ a }}"><h3>{{ a }}</h3></a>
  {% endfor %}
  <a href="/search"><h3>---> SEARCH <---</h3></a>
</body>
</html>
@ -1,10 +0,0 @@
<html>
<head></head>
<body>
  <ul>
  {% for t in list %}
    <li><a href="{{ list_name }}/{{ t.name }}"><h3>{{ t.name }} -- {{ t.nbr_threads }}</h3></a></li>
  {% endfor %}
  </ul>
</body>
</html>
@ -1,11 +0,0 @@
<html>
<head>
  <meta charset="UTF-8">
</head>
<body>
  <h3>{{ message.subject }}</h3>
  <h4>{{ message.author_name }}</h4>
  <h4>{{ message.date }}</h4>
  <p>{{ message.content }}</p>
</body>
</html>
@ -20,11 +20,6 @@
    <option value="{{ a }}">{{ a }}</option>
  {% endfor %}
  </select>
  <select form="search" name="hits">
  {% for a in hits %}
    <option value="{{ a }}">{{ a }}</option>
  {% endfor %}
  </select>
  <input type="submit" value="search" id="submit">
  <div id="loading">Loading...</div>
</form>

@ -1,25 +0,0 @@
<html>
<head></head>
<body>
  {% macro message(m, index, urlpath) -%}
  {% set path = urlpath + '/' + index|string %}
  <li>
    {{ index }}. <a href="{{ path }}">{{ m.subject }}</a> <i>{{ m.author_name }}</i>
    {% if m.get('follow-up') %}
    <ul>
      {% for msg in m.get('follow-up') %}
      {{ message(m=msg, index=loop.index - 1, urlpath=path) }}
      {% endfor %}
    </ul>
    {% endif %}
  </li>
  {%- endmacro %}

  <ul>
  {% for m in threads recursive %}
    {{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
  {% endfor %}
  </ul>

</body>
</html>