diff --git a/.gitignore b/.gitignore
index 18c05fa..49fd8f2 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,11 @@
-# mailinglists specific
+# listservs specific
archives/
-figs/
+config/
config.py
+test.py
+
+# macOS
+.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/README b/README
index 998c786..9ae5b7a 100644
--- a/README
+++ b/README
@@ -1,3 +1,9 @@
+
+TODO (July 2019):
+ - refactor archive.py and search.py
+ - test lists import with mariadb backend
+
+
usage: archive.py [-h] [--arch ARCH] url [url ...]
Mailinglists are dead. Long live mailinglists!
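
For reference, a typical invocation matching the usage string above (the URL below is a placeholder, not a real list archive):

    python archive.py --arch archives/ https://lists.example.org/pipermail/somelist/
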
diff --git a/analyse.py b/analyse.py
deleted file mode 100644
index bef4381..0000000
--- a/analyse.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import os
-
-# matplot view/windows
-import matplotlib
-matplotlib.interactive(True)
-
-# pd display
-import pandas as pd
-pd.set_option('display.max_colwidth', 100)
-
-from analysis.archive import Archive
-from analysis.query import Query
-from analysis.plot import Plot
-
-import analysis.format
-
-# spectre: slategrey
-# nettime: red
-# crumb: purple
-# empyre: darkblue
-
-def save_fig_cohort(q, name, dir, color):
- t = name + " - Cohorts"
- pp = q.cohort().plot(color=color, title=t)
- ts = name + "_cohorts.png"
- filename = os.path.join(dir, ts)
- pp.get_figure().savefig(filename)
-
-def save_fig_messages_total(q, name, dir, color):
- t = name + " - Nbr. Messages"
- pp = q.activity_overall().plot(kind='bar', color=color, title=t)
- ts = name + "_messages.png"
- filename = os.path.join(dir, ts)
- pp.get_figure().savefig(filename)
-
-def save_fig_threads_total(q, name, dir, color):
- t = name + " - Nbr. Threads"
- pp = q.threads_overall().plot(kind='bar', color=color, title=t)
- ts = name + "_threads.png"
- filename = os.path.join(dir, ts)
- pp.get_figure().savefig(filename)
-
-def save_fig_messages_constituency(q, name, dir):
- t = name + " - Messages Constituency"
- replies = pd.Series(q.replies_overall(series=True))
- # threads = pd.Series(q.single_threads_overall(series=True))
- threads = pd.Series(q.threads_overall(series=True))
- messages = pd.Series(q.activity_overall(series=True))
- single_messages = messages - (replies + threads)
-
- # df = {'a': single_messages, 'b': threads, 'c': replies}
- # df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
- df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1)
- pp = df.plot(kind='bar', stacked=True, title=t)
-
- # pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)
-
- ts = name + "_constituency.png"
- filename = os.path.join(dir, ts)
- pp.get_figure().savefig(filename)
-
-def save_fig_avg_threads_replies(q, name, dir, color):
- t = name + " - Avg. Threads + Replies"
- replies = pd.Series(q.replies_overall(series=True))
- threads = pd.Series(q.threads_overall(series=True))
- messages = pd.Series(q.activity_overall(series=True))
-
- avg_threads_messages = (replies + threads) / messages
-
- pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)
-
- ts = name + "_avg_threads_replies.png"
- filename = os.path.join(dir, ts)
- pp.get_figure().savefig(filename)
-
-def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
- t = name + " - Diff. Threads + Replies vs Single Messages"
- replies = pd.Series(q.replies_overall(series=True))
- threads = pd.Series(q.threads_overall(series=True))
- rt = replies + threads
- messages = pd.Series(q.activity_overall(series=True))
-
- diff_threads_messages = (2 * rt) - messages
-
- pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)
-
- ts = name + "_diff_threads_replies_messages.png"
- filename = os.path.join(dir, ts)
- pp.get_figure().savefig(filename)
-
-def save_fig_ratio_replies_threads(q, name, dir, color):
- t = name + " - Ratio Replies per Thread"
- replies = pd.Series(q.replies_overall(series=True))
- threads = pd.Series(q.threads_overall(series=True))
-
- ratio_replies_threads = replies / threads
-
- pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)
-
- ts = name + "_ratio_replies_threads.png"
- filename = os.path.join(dir, ts)
- pp.get_figure().savefig(filename)
-
-def html_td_rank_year(year, data):
-    td_str = '<td>'
-    if year in data:
-        td_str += analysis.format.table_threads_ranking(data[year])
-    td_str += '</td>'
- return td_str
-
-def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):
-
-    html_str = '<table>'
-
-    html_str += '<tr>'
-    html_str += '<td>year</td>'
-    html_str += '<td>nettime</td>'
-    html_str += '<td>crumb</td>'
-    html_str += '<td>spectre</td>'
-    html_str += '<td>empyre</td>'
-    html_str += '</tr>'
-
-    years = sorted(ranking_nettime.keys())
-
-    print(years)
-
-    for i in years:
-        html_str += '<tr>'
-        html_str += '<td>' + i + '</td>'
-        html_str += html_td_rank_year(i, ranking_nettime)
-        html_str += html_td_rank_year(i, ranking_crumb)
-        html_str += html_td_rank_year(i, ranking_spectre)
-        html_str += html_td_rank_year(i, ranking_empyre)
-        html_str += '</tr>'
-
-    html_str += '</table>'
-    return html_str
-
-
-print("nettime")
-#nettime
-nt = Archive('nettime-l')
-ntq = nt.query()
-ntp = Plot(ntq)
-
-
-
-# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
-# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
-# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
-# save_fig_messages_constituency(ntq, 'nettime', 'figs/')
-
-# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
-# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
-# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')
-
-ranking_nettime = ntq.threads_ranking(rank=15)
-
-# print(r['2000'])
-
-# print(analysis.format.table_threads_ranking(r['2000']))
-
-
-print("crumb")
-#crumb
-cr = Archive('crumb')
-crq = cr.query()
-crp = Plot(crq)
-
-# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
-# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
-# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
-# save_fig_messages_constituency(crq, 'crumb', 'figs/')
-
-# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
-# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
-# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')
-
-ranking_crumb = crq.threads_ranking(rank=15)
-
-
-print("empyre")
-#empyre
-em = Archive('empyre')
-emq = em.query()
-emp = Plot(emq)
-
-# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
-# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
-# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
-# save_fig_messages_constituency(emq, 'empyre', 'figs/')
-
-# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
-# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
-# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')
-
-ranking_empyre = emq.threads_ranking(rank=15)
-
-print("spectre")
-#spectre
-sp = Archive('spectre')
-spq = sp.query()
-spp = Plot(spq)
-
-# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
-# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
-# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
-# save_fig_messages_constituency(spq, 'spectre', 'figs/')
-
-# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
-# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
-# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')
-
-ranking_spectre = spq.threads_ranking(rank=15)
-
-
-## comparative ranking
-
-rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)
-
-html_template = 'figs/ranking/index_template.html'
-with open(html_template, 'r') as fp:
- h = fp.read()
-
-html = h.replace("--table--", rankings)
-
-html_output = 'figs/ranking/index.html'
-with open(html_output, 'w+') as fp:
- fp.write(html)
-
diff --git a/analysis/archive.py b/analysis/archive.py
deleted file mode 100644
index 3fc77cd..0000000
--- a/analysis/archive.py
+++ /dev/null
@@ -1,165 +0,0 @@
-import numpy as np
-import pandas as pd
-import email, email.parser
-import os, datetime, json, gzip, re
-import analysis.util
-import analysis.query
-
-import search.archive ## circular...
-
-
-def filter_date(msg, archive_name):
-
- time_tz = analysis.util.format_date(msg, archive_name)
- if not time_tz:
- return None
-
- dt = datetime.datetime.fromtimestamp(time_tz)
- try:
- date_time = pd.to_datetime(dt)
- except pd.tslib.OutOfBoundsDatetime:
- print('time out of bound')
- print(dt)
- return None
-
- min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
- max_date = pd.to_datetime(datetime.datetime.now())
- if date_time < min_date or date_time > max_date:
- return None
-
- return date_time
-
-
-def message_to_tuple_record(msg, records, archive_name, references='X'):
-
- # check date first?
- date = filter_date(msg, archive_name)
- if not date:
- print("Archive::filter_date returned None. Skip.")
- return
-
- # check / filter from email address second?
- from_addr = analysis.util.format_from(msg, archive_name)
- if not from_addr:
- print("Archive::analysis.util.format_from returned None. Skip.")
- return
-
- url = analysis.util.format_url(msg, archive_name)
- author = analysis.util.format_author(msg, archive_name)
- subject = analysis.util.format_subject(msg, archive_name)
- message_id = analysis.util.format_id(msg, archive_name)
- content = analysis.util.format_content(msg, archive_name)
-
- records.append((message_id,
- from_addr,
- author,
- subject,
- date,
- url,
- len(content),
- 0 if not 'follow-up' in msg else len(msg['follow-up']),
- references))
-
- # recursive follow up -- but references is not keeping track really...
- if 'follow-up' in msg:
- for f in msg['follow-up']:
- message_to_tuple_record(f, records, archive_name, references=message_id)
-
- return
-
-def json_data_to_pd_dataframe(json_data, archive_name):
-
- records = []
- for d in json_data:
- for dd in d['threads']:
- message_to_tuple_record(dd, records, archive_name)
-
- print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records)))
-
- df = pd.DataFrame.from_records(records,
- index='date',
- columns=['message-id',
- 'from',
- 'author',
- 'subject',
- 'date',
- 'url',
- 'content-length',
- 'nbr-references',
- 'references'])
-
- df.index.name = 'date'
-
- return df
-
-def load_from_file(filename, archive_name, archive_dir, json_data=None):
-
- if not filename.endswith('.json.gz'):
- file_path = os.path.join(archive_dir, filename + '.json.gz')
- else:
- file_path = os.path.join(archive_dir, filename)
-
- if os.path.isfile(file_path):
- with gzip.open(file_path, 'r') as fp:
- json_data = json.load(fp)
- return json_data_to_pd_dataframe(json_data['threads'], archive_name)
- else:
- #list of all "filename[...].json.gz" in archive_dir
- files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
- if files:
- filename = files[-1] # take the most recent (listed alpha-chronological)
- file_path = os.path.join(archive_dir, filename)
- if os.path.isfile(file_path):
- with gzip.open(file_path, 'r') as fp:
- json_data = json.load(fp)
- return json_data_to_pd_dataframe(json_data['threads'], archive_name)
- else:
- #list of all json files in archive_dir/filename
- dir_path = os.path.join(archive_dir, filename)
- if not os.path.isdir(dir_path):
- return None
-
- files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
- if not files:
- return None
-
- # load all json files
- threads = []
- for file_path in files:
- with open(file_path, 'r') as fp:
- json_data = json.load(fp)
- threads.append(json_data)
-
- print('---> ' + archive_name)
- return json_data_to_pd_dataframe(threads, archive_name)
-
-def load_from_search_archive(archive):
- threads = []
- for k, v in archive.archive.items():
- threads.append(v)
- return json_data_to_pd_dataframe(threads, archive.archive_name)
-
-
-
-
-class Archive:
-
- data = None # "raw" json data
- dataframe = None # main pd dataframe
-
- def __init__(self, archive_name, archive_dir="archives"):
-
- if isinstance(archive_name, pd.core.frame.DataFrame):
- self.dataframe = archive_name ## no copies here
-
- if isinstance(archive_name, search.archive.Archive):
- self.dataframe = load_from_search_archive(archive_name)
-
- if isinstance(archive_name, str):
- # need a filename or a dir name....
- self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)
-
- def query(self):
- q = analysis.query.Query(self)
- return q
-
diff --git a/analysis/format.py b/analysis/format.py
deleted file mode 100644
index 2ac54a4..0000000
--- a/analysis/format.py
+++ /dev/null
@@ -1,165 +0,0 @@
-import analysis.query
-import logging, html, numpy
-from tabulate import tabulate
-
-def makeurl(text, url):
- return '' + text + ""
-
-def table_threads_ranking(ranking_dataframe):
-
-    html_str = '<table>'
-
-
-    html_str += '<tr>'
-    html_str += '<td>date</td>'
-    html_str += '<td>subject</td>'
-    html_str += '<td>from</td>'
-    html_str += '<td>replies</td>'
-    html_str += '</tr>'
-
-
-    for i, row in ranking_dataframe.iterrows():
-
-        html_str += '<tr>'
-        html_str += '<td>' + str(i) + '</td>'
-        html_str += '<td>' + makeurl(row['subject'], row['url']) + '</td>'
-        html_str += '<td>' + row['from'] + '</td>'
-        html_str += '<td>' + str(row['nbr-references']) + '</td>'
-        html_str += '</tr>'
-
-    html_str += "</table>"
-
- return html_str
-
-def frame_to_dictionary_threads_ranking(ranking_dataframe):
-
- results = []
- for i, row in ranking_dataframe.iterrows():
- d = {'date': str(i), 'subject': row['subject'], 'url': row['url'], 'from': row['from'], 'nbr-references': row['nbr-references']}
- results.append(d)
- return results
-
-
-
-class Html:
-
- query = None
-
- def __init__(self, q=None):
-
- if not isinstance(q, query.Query):
- logging.error("HtmlFormat constructor Error: query must be of type nettime.query.Query")
- raise Exception()
-
- self.query = q
-
- def threads_ranking(self, rank=5, resolution=None):
-
- data = self.query.threads_ranking(rank=rank)
-
- h = html.HTML()
- t = h.table()
-
- r = t.tr
- r.td('date', klass='td_date_t')
- r.td('from', klass='td_from_t')
- r.td('replies', klass='td_rep_t')
- r.td('subject', klass='td_subject_t')
-
- for i, row in data.iterrows():
- r = t.tr
-
- print(row.index)
-
- r.td(str(row['date']), klass='td_date')
- r.td(row['from'], klass='td_from')
- r.td(str(row['nbr-references']), klass='td_rep')
- r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)
-
- return str(t)
-
- @staticmethod
- def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):
-
- header = []
- if data_frame.index.name in name_map:
- header.append(name_map[data_frame.index.name])
- else:
- header.append(data_frame.index.name)
- for h in data_frame.columns:
- if h in name_map:
- h = name_map[h]
- header.append(h)
-
- css_header = []
- css_element = []
- for i in header:
- css_header.append('td_' + i + '_t')
- css_element.append('td_' + i)
-
- h = html.HTML()
- if table_name:
- t = h.table(id=table_name, klass=table_name + '_t')
- else:
- t = h.table()
-
- # url map
- url_hash = {}
- url_skip = []
- url_keys = url_map.keys()
- for u in url_keys:
- if u in header and url_map[u] in header:
- url_indx = header.index(url_map[u])
- url_hash[header.index(u)] = url_indx
- url_skip.append(url_indx)
- header.pop(url_indx)
-
- #header
- r = t.tr
- n = 0
- for j in header:
- r.td(str(j), klass=css_header[n])
- n += 1
-
-
- #elements
- for k, row in data_frame.iterrows():
- r = t.tr
- r.td(str(k), klass=css_element[0])
- n = 1
- for l in row:
-
- if n in url_skip:
- continue
-
- if isinstance(l, float):
- if l % 1 > 0:
- l = '{0:.4f}'.format(l)
- else:
- l = int(l)
-
- if n in url_hash.keys():
- url = row[url_hash[n] - 1]
- r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
-
- else:
- r.td(str(l), klass=css_element[n])
- n += 1
-
- return str(t)
-
-class Tab:
-
- @staticmethod
- def from_dataframe(data_frame, name_map={}, format=".0f"):
-
- header = []
- header.append(data_frame.index.name)
- for h in data_frame.columns:
- if h in name_map:
- h = name_map[h]
- header.append(h)
-
- return tabulate(data_frame, headers=header, floatfmt=format)
-
-
diff --git a/analysis/plot.py b/analysis/plot.py
deleted file mode 100644
index 8f08430..0000000
--- a/analysis/plot.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import numpy as np
-import pandas as pd
-import analysis.query
-
-# for colormaps see:
-# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
-# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
-# http://matplotlib.org/examples/color/colormaps_reference.html
-# for colors see:
-# http://matplotlib.org/examples/color/named_colors.html
-
-# spectre: slategrey
-# nettime: red
-# crumb: purple
-# empyre: darkblue
-
-def bar_plot_series(series, title, color='blueviolet', ylim=None):
- return series.plot(kind = 'bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)
-
-def save(plot, name):
- fig = plot.get_figure()
- fig.savefig(name)
-
-class Plot:
-
- query = None
-
- def __init__(self, q=None):
-
- if not isinstance(q, analysis.query.Query):
- logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query")
- raise Exception()
-
- self.query = q
-
- '''
- activity
- '''
-
- def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
-
- activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys()
- series = []
- for k in activity_rank:
- series.append(self.query.activity_from(k, resolution, series=True))
-
- df = pd.concat(series, axis=1)
-
- return df.plot.area(colormap='spectral', figsize=figsize, stacked=False)
-
- '''
-    content length
- '''
-
- def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
-
- content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys()
- series = []
- for k in content_rank:
- series.append(self.query.content_length_from(k, resolution, series=True))
-
- df = pd.concat(series, axis=1)
-
- return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
-
- '''
- threads
- '''
-
- def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
-
- threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys()
- series = []
- for k in threads_rank:
- series.append(self.query.threads_from(k, resolution, series=True))
-
- df = pd.concat(series, axis=1)
-
- return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
diff --git a/analysis/query.py b/analysis/query.py
deleted file mode 100644
index 5b46488..0000000
--- a/analysis/query.py
+++ /dev/null
@@ -1,573 +0,0 @@
-import numpy as np
-import pandas as pd
-import analysis.archive
-import logging
-
-class Query:
-
- archive = None # analysis.archive.Archive object
- activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
- content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
- threads = None # ...
- single_threads = None
- replies = None # ...
-
- def __init__(self, arch=None):
-
- if not isinstance(arch, analysis.archive.Archive):
- logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
- raise Exception()
-
- self.archive = arch
-
- '''
- activity
- '''
-
- def _activity(self):
-
- if self.activity is None:
- from_index = self.archive.dataframe.reindex(columns=['from'])
- self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)
-
- return self.activity
-
- def activity_from(self, email_address, resolution='y', series=False):
-
- eaddr = email_address.replace('@', '{at}').lower()
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- self._activity()
- try:
- af = self.activity[eaddr]
- except KeyError:
- return None
-
- activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
-
- if freq == 'AS':
- activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
- activity_from.index.name = 'year'
- else:
- activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- activity_from.index.name = 'year-month'
-
- if series:
- return activity_from
-
- return activity_from.to_frame('nbr-messages').astype(int)
-
- def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
-
- self._activity()
- afr = self.activity.sum(axis=0).order(ascending=False)
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- afr = afr[afr.index.str.contains(p)]
-
- if series:
- return afr[:rank]
-
- return afr[:rank].to_frame('nbr-messages').astype(int)
-
-
- # def activity_overall(self, resolution='y', series=False):
-
- # freq = 'M'
- # if resolution.lower() == 'y':
- # freq = 'AS'
- # elif resolution.lower() == 'm':
- # freq = 'M'
- # else:
- # return None
-
- # self._activity()
-
- # y = self.activity.sum(axis=1)
- # y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
-
- # if freq == 'AS':
- # y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
- # y.index.name = 'year'
- # else:
- # y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- # y.index.name = 'year-month'
-
- # if series:
- # return y
-
- # return y.to_frame('nbr-messages').astype(int)
-
- def activity_overall(self, resolution='y', series=False):
-
- a = self.archive.dataframe['url']
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()
-
- if freq == 'AS':
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
- y.index.name = 'year'
- else:
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- y.index.name = 'year-month'
-
- if series:
- return y
-
- return y.to_frame('nbr-messages').astype(int)
-
- def cohort(self, resolution='m', series=False):
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- self._activity()
-
- c = self.activity.idxmax().order().to_frame('date')
- c.index = c['date']
-
- cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()
-
- if freq == 'AS':
- cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))
- cohort.index.name = 'year'
- else:
- cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- cohort.index.name = 'year-month'
-
- if series:
- return cohort
-
- return cohort.to_frame('first-messages').astype(int)
-
- '''
-    content length
- '''
-
- def _content_length(self):
-
- if self.content_length is None:
- from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
- self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
- self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)
-
- return self.content_length
-
- def content_length_from(self, email_address, resolution='y', series=False):
-
- eaddr = email_address.replace('@', '{at}').lower()
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- self._content_length()
- try:
- af = self.content_length[eaddr]
- except KeyError:
- return None
-
- content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
-
- if freq == 'AS':
- content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
- content_length_from.index.name = 'year'
- else:
- content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- content_length_from.index.name = 'year-month'
-
- if series:
- return content_length_from
-
- return content_length_from.to_frame('nbr-bytes').astype(int)
-
- def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
-
- self._content_length()
- cfr = self.content_length.sum(axis=0).order(ascending=False)
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- cfr = cfr[cfr.index.str.contains(p)]
-
- if series:
- return cfr[:rank]
-
- return cfr[:rank].to_frame('nbr-bytes').astype(int)
-
- def content_length_overall(self, resolution='y', series=False):
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- self._content_length()
-
- y = self.content_length.sum(axis=1)
- y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
-
- if freq == 'AS':
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
- y.index.name = 'year'
- else:
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- y.index.name = 'year-month'
-
- if series:
- return y
-
- return y.to_frame('nbr-bytes').astype(int)
-
-
- '''
- threads
- '''
-
- def _threads(self, thresh=0):
-
- print("doing threads")
-
- if self.threads is None:
- self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
-
- if self.single_threads is None:
- self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
-
- return self.threads;
-
- def threads_ranking(self, rank=5, resolution='y'):
-
- self._threads()
-
- if resolution == None:
- data = self.threads.drop('message-id', axis=1)[:rank]
- return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- # get the threads ranking per time resolution
- #
- data = self.threads.drop('message-id', axis=1)
- data = data.groupby([pd.TimeGrouper(freq=freq)])
- r = {}
- for k, v in data:
- if freq == 'AS':
- time_key = k.strftime('%Y')
- else:
- time_key = k.strftime('%Y-%m')
- frame = v[:rank]
- frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
- r[time_key] = frame
- return r
-
- def threads_replies_to(self, email_address, resolution='y', series=False):
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- self._threads()
-
- eaddr = email_address.replace('@', '{at}').lower()
-
- self._threads()
- threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
- threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
- threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
-
- if series:
- return threads_from_ranking[eaddr]
-
- threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)
-
- if freq == 'AS':
- threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
- threads_from_ranking.index.name = 'year'
- else:
- threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- threads_from_ranking.index.name = 'year-month'
-
- return threads_from_ranking
-
- def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
-
- self._threads()
-
- tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)
-
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- tfr = tfr[tfr.index.str.contains(p)]
-
- tfr = tfr[:rank].astype(int)
- return tfr
-
- def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
-
- self._threads()
- tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- tir = tir[tir.index.str.contains(p)]
-
- if series:
- return tir[:rank]
-
- return tir[:rank].to_frame('nbr-initiated-threads').astype(int)
-
- def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
-
- # activity
- self._activity()
- afr = self.activity.sum(axis=0).astype(int)
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- afr = afr[afr.index.str.contains(p)]
-
- # initiated threads [top 25]
- self._threads()
- tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- tir = tir[tir.index.str.contains(p)]
-
- inter = afr.index.intersection(tir.index)
- avg = tir[inter] / afr[inter]
-
- labels = ['messages', 'threads', 'avg.threads']
- return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]
-
- def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
-
- self._threads()
-
- #initiated
- tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- tir = tir[tir.index.str.contains(p)]
-
- #replies [top 25]
- tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- tfr = tfr[tfr.index.str.contains(p)]
- tfr = tfr['nbr-references'] # dataframe to series
-
-
- inter = tir.index.intersection(tfr.index)
- avg = tfr[inter] / tir[inter]
-
- labels = ['threads', 'replies', 'avg.replies']
- return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
-
-
- def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- agg = aggregate.lower()
- if not agg in ['sum', 'mean', 'count']:
- return None
-
- if not self.threads is None:
- del self.threads
- self.threads = None
-
- self._threads(tresh)
-
- if agg == 'sum':
- # number of replies total (re: sum all the replies)
- y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
- elif agg == 'mean':
- y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
- else:
- # number of threads (re: msgs with at least one reply)
- y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
-
- if freq == 'AS':
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
- y.index.name = 'year'
- else:
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- y.index.name = 'year-month'
-
- if series:
- return y
-
- return y.to_frame('nbr-threads').astype(int)
-
- def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- agg = aggregate.lower()
- if not agg in ['sum', 'mean', 'count']:
- return None
-
- if not self.single_threads is None:
- del self.single_threads
- self.single_threads = None
-
- self._threads(tresh)
-
-
- y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
-
-
- if freq == 'AS':
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
- y.index.name = 'year'
- else:
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- y.index.name = 'year-month'
-
- if series:
- return y
-
- return y.to_frame('nbr-threads').astype(int)
-
-
- '''
- replies
- '''
-
- def _replies(self):
-
- if self.replies is None:
- self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from','references'])
- self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from','references'])
- return self.replies;
-
- def replies_ranking(self, rank=5, resolution=None):
-
- self._replies()
-
- if resolution == None:
- data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
- return data.to_frame('nbr_replies')
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- # get the threads ranking per time resolution
- #
- data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
- r = {}
- for k, v in data:
- if freq == 'AS':
- time_key = k.strftime('%Y')
- else:
- time_key = k.strftime('%Y-%m')
- frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
- r[time_key] = frame.to_frame('nbr-replies')
- return r
-
- def replies_avg_ranking(self, rank=5, filter_nettime=True):
-
- # activity
- self._activity()
- afr = self.activity.sum(axis=0)
- if filter_nettime:
- p = r'^((?!nettime*).)*$'
- afr = afr[afr.index.str.contains(p)]
-
- # replies in thread [top 25]
-
- self._replies()
- rpl = data = self.replies.groupby('from').size().sort_values(ascending=False)[:25]
-
- inter = afr.index.intersection(rpl.index)
- avg = rpl[inter] / afr[inter]
-
- labels = ['messages', 'replies', 'avg.replies']
- return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
-
- def replies_overall(self, resolution='y', series=False):
-
- freq = 'M'
- if resolution.lower() == 'y':
- freq = 'AS'
- elif resolution.lower() == 'm':
- freq = 'M'
- else:
- return None
-
- if not self.replies is None:
- del self.replies
- self.replies = None
-
- self._replies()
-
- y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()
-
-
- if freq == 'AS':
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
- y.index.name = 'year'
- else:
- y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
- y.index.name = 'year-month'
-
- if series:
- return y
-
- return y.to_frame('nbr-replies').astype(int)
-
-
-
-
diff --git a/analysis/util.py b/analysis/util.py
deleted file mode 100644
index 4602517..0000000
--- a/analysis/util.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import email
-import hashlib
-
-def format_content(msg, archive_name):
- return msg['content']
-
-def format_url(msg, archive_name):
- return msg['url']
-
-def format_author(msg, archive_name):
- return msg['author_name']
-
-def format_from_token(from_str, sep):
-
- fff = from_str
-
- from_addr = email.utils.parseaddr(from_str)[1]
-
- fffa = email.utils.parseaddr(from_str)
-
- if sep not in from_addr:
- tok = from_str.split()
- try:
- at = tok.index(sep)
- from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
- if from_addr.startswith('<') or from_addr.endswith('>'):
- from_addr = from_addr.strip('<').strip('>')
- except ValueError:
- print(tok)
- print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
- print("*** " + fff)
- print("+++")
- print(fffa)
- print("----")
-
- return None
- else:
- from_addr = from_addr.replace(sep, '{AT}')
- return from_addr.lower()
-
-def format_from(msg, archive_name):
- from_str = msg['from']
-
- if " {AT} " in from_str:
- return format_from_token(from_str, '{AT}')
- elif " at " in from_str:
- return format_from_token(from_str, 'at')
- elif "@" in from_str:
- return format_from_token(from_str, '@')
- else:
- return from_str
-
-# returns utc timestamp
-def format_date(msg, archive_name):
- date_str = msg['date']
- time_tz = None
- try:
- date_tz = email.utils.parsedate_tz(date_str)
- time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
- except TypeError:
- print("Format Date TypeError")
- print(" > " + date_str)
- return None
- except ValueError:
- print("Format Date ValueError")
- print(" > " + date_str)
- return None
- finally:
- return time_tz
-
-def format_subject(msg, archive_name):
- return msg['subject']
-
-def format_id(msg, archive_name):
- if "message-id" in msg:
- return msg['message-id']
- else:
- # create hash with author_name + date
- s = msg['author_name'] + msg['date']
- sha = hashlib.sha1(s.encode('utf-8'))
- return sha.hexdigest()
-
-# format='%d/%m/%Y'
-def min_date(archive_name):
- if "nettime" in archive_name:
- return '01/10/1995'
- elif archive_name == "spectre":
- return '01/08/2001'
- elif archive_name == "empyre":
- return '01/01/2002'
- elif archive_name == "crumb":
- return '01/02/2001'
diff --git a/search/__init__.py b/archive/__init__.py
similarity index 100%
rename from search/__init__.py
rename to archive/__init__.py
diff --git a/archive/archive.py b/archive/archive.py
new file mode 100644
index 0000000..4610904
--- /dev/null
+++ b/archive/archive.py
@@ -0,0 +1,257 @@
+import email, email.parser
+import os, json, gzip, re
+import mysql.connector as mariadb
+import archive.sql, archive.util
+from datetime import date, datetime
+from dateutil import parser
+import terminal.progress, terminal.util
+
+def load_from_file(filename, archive_name, archive_dir):
+
+ if not filename.endswith('.json.gz'):
+ file_path = os.path.join(archive_dir, filename + '.json.gz')
+ else:
+ file_path = os.path.join(archive_dir, filename)
+
+ if os.path.isfile(file_path):
+ with gzip.open(file_path, 'r') as fp:
+ json_data = json.load(fp)
+ return (json_data, archive_name)
+ else:
+ #list of all "filename[...].json.gz" in archive_dir
+ files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
+ if files:
+ filename = files[-1] # take the most recent (listed alpha-chronological)
+ file_path = os.path.join(archive_dir, filename)
+ if os.path.isfile(file_path):
+ with gzip.open(file_path, 'r') as fp:
+ json_data = json.load(fp)
+ return (json_data, archive_name) # <--- this makes no sense....
+
+ else:
+ #list of all json files in archive_dir/filename
+ dir_path = os.path.join(archive_dir, filename)
+ if not os.path.isdir(dir_path):
+ return None
+
+ files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
+ if not files:
+ return None
+
+ # load all json files
+ threads = []
+ for file_path in files:
+ with open(file_path, 'r') as fp:
+ json_data = json.load(fp)
+ threads.append(json_data)
+
+ return (threads, archive_name)
+
+def connect_db(database, host, user, password):
+
+    con = None
+    try:
+        con = mariadb.connect(host=host, user=user, password=password, database=database)
+    except mariadb.Error as error:
+        print("Error: {}".format(error))
+        if error.errno == 1049:  # unknown database
+            if terminal.util.y_n_question("Database " + database + " does not exist. Create it?"):
+                print("creating")
+            else:
+                print("not creating")
+        return None
+    return con
+
+
+class Archive:
+
+ data = None # "raw" json data
+ db_con = None
+
+ def __init__(self, archive_name, archive_dir):
+
+ if isinstance(archive_name, str):
+ # need a filename or a dir name....
+ print("reading archive " + archive_name, end='')
+ (self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
+ print(" - done.")
+
+ def __init__(self, archive_name, database, host, user, password):
+
+ self.archive_name = archive_name
+ self.db_con = connect_db(database, host, user, password)
+
+ def __init__(self, archive_name, config):
+
+ self.archive_name = archive_name
+ self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ if self.db_con is not None:
+ self.db_con.close()
+
+
+ def create_db(self, host, database, user, password):
+
+ print("creating table: " + self.archive_name, end='')
+ self.db_con = connect_db(database, host, user, password)
+ if self.db_con is None:
+ return
+
+ try:
+ cursor = self.db_con.cursor()
+ cursor.execute(archive.sql.CREATE.format(self.archive_name))
+ except mariadb.Error as error:
+ print("Error: {}".format(error))
+ finally:
+ cursor.close()
+
+ print(" - done.")
+
+ def insert_db(self, host, database, user, password):
+
+ self.db_con = connect_db(database, host, user, password)
+
+ if self.db_con is None:
+ return
+
+ try:
+ cursor = self.db_con.cursor()
+
+ progress = terminal.progress.ProgressBar(self.archive_name, len(self.data), fmt=terminal.progress.ProgressBar.FULL)
+
+ for t in self.data:
+
+ n_inserted = self.recursive_insert_db(cursor, t["threads"])
+ # print(" - insert: " + str(n_inserted), end='')
+ if n_inserted > 0:
+ self.db_con.commit()
+
+ progress.current += 1
+ progress()
+
+ progress.done()
+ self.db_con.commit()
+
+ except mariadb.Error as error:
+ pass
+ # print("Error: {}".format(error))
+ finally:
+ cursor.close()
+
+ def recursive_insert_db(self, cursor, thread):
+
+ n_inserted = 0
+ for m in thread:
+ try:
+
+ from_ = archive.util.format_from(m)
+ author_name_ = archive.util.format_author(m)
+ to_ = archive.util.format_to(m)
+ date_ = archive.util.format_date(m, self.archive_name)
+
+ if date_ is None or from_ is None:
+ # print("\nerrorororororo")
+ # print(m['from'] + " -- " + m['date'])
+ continue
+
+ cursor.execute(archive.sql.INSERT, (from_,author_name_,to_,m["subject"],date_,m["content-type"],m["content"],m["url"]))
+ n_inserted += 1
+
+ if "follow-up" in m:
+ n_inserted += self.recursive_insert_db(cursor, m["follow-up"])
+
+ except mariadb.Error as error:
+ if error.errno == 1062:
+ #duplication continue <------------------------- look this up...
+ # print("\nError: {}".format(error))
+ continue
+
+ return n_inserted
+
+ def content_search(self, term, bool=True):
+
+ if self.db_con is None:
+ print("Not connection to database...")
+ return
+
+ try:
+ cursor = self.db_con.cursor(buffered=True)
+ if bool:
+ cursor.execute(archive.sql.CONTENT_QUERY_BOOLEAN.format(self.archive_name, term))
+ else:
+                cursor.execute(archive.sql.CONTENT_QUERY_NL.format(self.archive_name, term))
+
+ # print(cursor.rowcount)
+ results = []
+ for (from_, author_name_, subject_, date_, url_) in cursor:
+ results.append((from_, author_name_, subject_, date_, url_))
+ # print("{} {} {}".format(from_, str(date_), url_))
+ return results
+
+ except mariadb.Error as error:
+ print("Error: {}".format(error))
+ finally:
+ cursor.close()
+
+ def from_search(self, term, bool=True):
+
+ if self.db_con is None:
+ print("Not connection to database...")
+ return
+
+ try:
+ cursor = self.db_con.cursor(buffered=True)
+ if bool:
+ cursor.execute(archive.sql.FROM_QUERY_BOOLEAN.format(self.archive_name, term))
+ else:
+                cursor.execute(archive.sql.FROM_QUERY_NL.format(self.archive_name, term))
+
+ # print(cursor.rowcount)
+ results = []
+ for (from_, author_name_, subject_, date_, url_) in cursor:
+ results.append((from_, author_name_, subject_, date_, url_))
+ # print("{} {} {}".format(from_, str(date_), url_))
+ return results
+
+ except mariadb.Error as error:
+ print("Error: {}".format(error))
+ finally:
+ cursor.close()
+
+ # analysis
+ def longest_field(self, field, thread, max_length=0):
+ import archive.util
+ for m in thread:
+ if not field in m:
+ if "threads" in m:
+ max_length = self.longest_field(field, m["threads"], max_length)
+ continue
+ if m[field] is None:
+ continue
+ if field == "from":
+ m[field] = archive.util.format_from(m)
+ elif field == "author_name":
+ m[field] = archive.util.format_author(m)
+ elif field == "to":
+ m[field] = archive.util.format_to(m)
+ elif field == "date":
+ m[field] = str(archive.util.format_date(m, self.archive_name))
+
+
+ if m[field] is None:
+ continue
+
+ l = len(m[field])
+ if l > max_length:
+ max_length = l
+ print(">> " + m[field])
+ if "follow-up" in m:
+ max_length = self.longest_field(field, m["follow-up"], max_length)
+ return max_length
+
+
+
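
Note: recursive_insert_db above walks the scraped JSON, where replies nest under "follow-up". Roughly the shape it expects, abridged to the fields the insert code actually reads (the values are made up; the real scraper output may carry more fields):

    month = {
        "threads": [
            {
                "from": "someone at example.org (Someone)",
                "author_name": "Someone",
                "to": "nettime-l@example.net",
                "subject": "<nettime> hello",
                "date": "Thu, 01 Aug 2002 17:33:08 +0900 (JST)",
                "content-type": "text/plain",
                "content": "message body ...",
                "url": "https://lists.example.org/msg00000.html",
                "follow-up": []   # replies, same shape, possibly nested further
            }
        ]
    }
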
diff --git a/archive/sql.py b/archive/sql.py
new file mode 100644
index 0000000..eee0474
--- /dev/null
+++ b/archive/sql.py
@@ -0,0 +1,31 @@
+CREATE = "CREATE TABLE `{}` (" \
+ "`from_` varchar(85) NOT NULL," \
+ "`author_name_` varchar(200) NOT NULL," \
+ "`to_` text(60)," \
+ "`subject_` varchar(3500) NOT NULL," \
+ "`date_` datetime NOT NULL," \
+ "`content_type_` varchar(15) NOT NULL," \
+ "`content_` mediumtext NOT NULL," \
+ "`url_` varchar(100) NOT NULL," \
+"PRIMARY KEY(`from_`, `date_`)," \
+"FULLTEXT (`subject_`, `content_`)," \
+"FULLTEXT (`from_`, `author_name_`)" \
+") ENGINE = InnoDB;"
+
+INSERT = ("INSERT INTO nettime_l"
+ "(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
+ "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
+
+CONTENT_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+ "WHERE MATCH(subject_, content_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
+
+CONTENT_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+ "WHERE MATCH(subject_, content_) AGAINST('{}') ORDER BY date_")
+
+FROM_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+ "WHERE MATCH(from_, author_name_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
+
+FROM_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+ "WHERE MATCH(from_, author_name_) AGAINST('{}') ORDER BY date_")
+
+# SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l WHERE MATCH(content_) AGAINST('%s' IN BOOLEAN MODE)
\ No newline at end of file
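
These statements pair with Archive.content_search / from_search above. A minimal sketch of a search session (the credentials are placeholders standing in for config.py, and 'nettime_l' assumes the table created by create_db):

    from archive.archive import Archive

    # hypothetical credentials; the real values live in config.py
    cfg = {'database': 'listserv', 'host': 'localhost',
           'user': 'lists', 'password': 'secret'}

    with Archive('nettime_l', cfg) as arch:
        # '+term' means "must contain term" in MariaDB boolean mode
        for from_, author, subject, date_, url in arch.content_search('+tactical +media') or []:
            print(date_, subject, url)
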
diff --git a/archive/util.py b/archive/util.py
new file mode 100755
index 0000000..70a9d67
--- /dev/null
+++ b/archive/util.py
@@ -0,0 +1,225 @@
+import email, datetime, sys
+import hashlib
+import dateparser
+
+def format_content(msg):
+ return msg['content']
+
+def format_url(msg):
+ return msg['url']
+
+def format_author(msg):
+
+ if 'author_name' not in msg or msg['author_name'] is None:
+ return None
+
+ author_str = msg['author_name'].replace('"', '')
+
+ if "by way of" in author_str:
+ toks = author_str.split("by way of")
+ if toks[0] == "":
+ author_str = format_from(msg)
+ elif toks[0][-1] == "(":
+ author_str = toks[0][:-1].strip()
+ else:
+ author_str = toks[0]
+
+ if ("(" in author_str) or ("<" in author_str):
+ # ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault
+ # print("±±±±±±")
+ # print("name: " + author_str)
+ # print("from: " + msg['from'])
+ if not '@' in author_str.lower().replace('{at}', '@').replace(' at ', '@'):
+ author_str = author_str.split('(')[0].strip()
+ else:
+ author_str = email.utils.parseaddr(author_str)[0]
+ # print(" Name:" + author_str.replace('"', ''))
+ # print(" From:" + format_from(msg))
+
+ if " ," in author_str:
+ # nettime's_roving_reporter , thing.net {AT} bbs.thing.net
+ author_str = author_str.split(' ,')[0]
+
+
+ return author_str
+
+def format_from_token(from_str, sep):
+ from_addr = email.utils.parseaddr(from_str)[1]
+ if sep not in from_addr:
+ tok = from_str.split()
+ try:
+ at = tok.index(sep)
+ from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
+ if from_addr.startswith('<') or from_addr.endswith('>'):
+ from_addr = from_addr.strip('<').strip('>')
+ except ValueError:
+ print(tok)
+ print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
+ return None
+ else:
+ from_addr = from_addr.replace(sep, '{AT}')
+ return "".join(from_addr.lower().split())
+
+def format_from(msg):
+
+ if 'from' not in msg or msg['from'] is None:
+ return None
+
+ from_str = msg['from']
+
+ if " {AT} " in from_str:
+ return format_from_token(from_str, '{AT}')
+ elif " at " in from_str:
+ return format_from_token(from_str, 'at')
+ elif "@" in from_str:
+ return format_from_token(from_str, '@')
+ else:
+ return "".join(from_str.split())
+
+def format_to(msg):
+
+ if "to" not in msg or msg["to"] is None:
+ return None
+
+ to_str = msg["to"]
+ toks = email.utils.parseaddr(to_str)
+ # print(toks)
+
+ if len(toks) == 2:
+ to_str = toks[1]
+
+ return "".join(to_str.lower().split())
+
+
+# returns utc timestamp --- old...
+def format_date_utc(msg, archive_name):
+
+ if 'date' not in msg or msg['date'] is None:
+ return None
+
+ date_str = msg['date'].replace('.', '')
+ time_tz = None
+ try:
+ date_tz = email.utils.parsedate_tz(date_str)
+ time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
+ except TypeError:
+ print("Format Date TypeError")
+ print(" > " + date_str)
+ return None
+ except ValueError:
+ print("Format Date ValueError")
+ print(" > " + date_str)
+ return None
+ finally:
+ return time_tz
+
+def format_date(msg, archive_name):
+
+ if 'date' not in msg or msg['date'] is None:
+ return None
+
+ # date_str = msg['date'].replace('.', '')
+ date_str = msg['date']
+
+ # fix Thu, 01 Aug 2002 17:33:08 +0900 (JST)
+ if '(' in date_str:
+ date_str = date_str.split('(')[0].rstrip()
+
+
+ date_time = dateparser.parse(date_str)
+ if date_time is None:
+
+ # random stuff...
+ fix = False
+ toks = date_str.split()
+
+ if len(toks[-1]) == 5 or len(toks[-1]) == 4:
+ # ex. Thu, 24 Jan 2002 15:21:31 -0000
+ if toks[-1] in ['+0000', '-0000', '0000']:
+ date_str = date_str[:-5]
+ fix = True
+ # ex. Fri, 25 Jan 2002 13:21:49 +1050
+ elif toks[-1][-2] == '5':
+ d = list(date_str)
+ d[-2] = '3'
+ date_str = "".join(d)
+ fix = True
+
+ if toks[-1][-1] != '0':
+ #ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
+ date_str = date_str[:-5]
+ fix = True
+
+ if 'Fru' in toks[0]:
+ date_str = date_str.replace('Fru', 'Fri')
+ fix = True
+ elif 'Thur' in toks[0]:
+ date_str = date_str.replace('Thur', 'Thu')
+ fix = True
+
+ if not fix:
+ # print("----")
+ return None
+
+ date_time = dateparser.parse(date_str)
+ if date_time is None:
+
+ if 'GMT' in date_str:
+ # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
+ date_str = date_str.split('GMT')[0].rstrip()
+ fix = True
+
+ if 'METDST' in toks[-1]:
+ # ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
+ date_str = date_str.replace('METDST', 'MET')
+ fix = True
+
+
+ if not fix:
+ # print("++++")
+ return None
+
+ date_time = dateparser.parse(date_str)
+ return date_time
+
+ # else:
+ # print(date_str)
+
+ # date_time = datetime.datetime.fromtimestamp(time_tz)
+
+ min_d = datetime.datetime.strptime(min_date(archive_name), "%d/%m/%Y")
+ max_d = datetime.datetime.now()
+
+ date_time_naive = date_time.replace(tzinfo=None)
+
+ if date_time_naive < min_d or date_time_naive > max_d:
+ return None
+
+ return date_time
+
+def format_subject(msg, archive_name):
+
+ if 'subject' not in msg or msg['subject'] is None:
+ return None
+
+ return msg['subject']
+
+def format_id(msg, archive_name):
+ if "message-id" in msg:
+ return msg['message-id']
+ else:
+ # create hash with author_name + date
+ s = msg['author_name'] + msg['date']
+ sha = hashlib.sha1(s.encode('utf-8'))
+ return sha.hexdigest()
+
+# format='%d/%m/%Y'
+def min_date(archive_name):
+ if "nettime" in archive_name:
+ return '01/10/1995'
+ elif archive_name == "spectre":
+ return '01/08/2001'
+ elif archive_name == "empyre":
+ return '01/01/2002'
+ elif archive_name == "crumb":
+ return '01/02/2001'
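
As an aside, format_from above collapses the three address obfuscation styles seen in the scraped archives (' {AT} ', ' at ', plain '@') into one lowercased canonical form. A quick illustration with made-up addresses:

    from archive.util import format_from

    print(format_from({'from': 'Geert <geert@example.org>'}))     # geert{at}example.org
    print(format_from({'from': 'pit at example.org (Pit)'}))      # pit{at}example.org
    print(format_from({'from': 'reporter {AT} bbs.example.net'})) # reporter{at}bbs.example.net
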
diff --git a/conda_env.yml b/conda_env.yml
index 370d12c..d5896ed 100644
--- a/conda_env.yml
+++ b/conda_env.yml
@@ -1,29 +1,34 @@
-name: listservs
+name: listserv
channels:
-- defaults
+ - defaults
dependencies:
-- beautiful-soup=4.3.2=py34_0
-- click=6.7=py34_0
-- flask=0.12=py34_0
-- gunicorn=19.1.0=py34_0
-- html5lib=0.999=py34_0
-- itsdangerous=0.24=py34_0
-- jinja2=2.9.6=py34_0
-- markupsafe=0.23=py34_2
-- openssl=1.0.2l=0
-- pastedeploy=1.5.2=py34_1
-- pip=9.0.1=py34_1
-- python=3.4.5=0
-- readline=6.2=2
-- setuptools=27.2.0=py34_0
-- six=1.10.0=py34_0
-- sqlite=3.13.0=0
-- tk=8.5.18=0
-- werkzeug=0.11.15=py34_0
-- wheel=0.29.0=py34_0
-- xz=5.2.2=1
-- zlib=1.2.8=3
-- pip:
- - beautifulsoup4==4.3.2
- - webencodings==0.5.1
+ - ca-certificates=2019.5.15=0
+ - openssl=1.0.2s=h1de35cc_0
+ - pip=9.0.1=py34_1
+ - python=3.4.5=0
+ - readline=6.2=2
+ - setuptools=27.2.0=py34_0
+ - sqlite=3.13.0=0
+ - tk=8.5.18=0
+ - wheel=0.29.0=py34_0
+ - xz=5.2.4=h1de35cc_4
+ - zlib=1.2.11=h1de35cc_3
+ - pip:
+ - beautifulsoup4==4.7.1
+ - click==7.0
+ - dateparser==0.7.1
+ - flask==1.0.4
+ - gunicorn==19.9.0
+ - itsdangerous==1.1.0
+ - jinja2==2.10.1
+ - markupsafe==1.1.1
+ - mysql-connector-python==8.0.16
+ - protobuf==3.8.0
+ - python-dateutil==2.8.0
+ - pytz==2019.1
+ - regex==2019.6.8
+ - six==1.12.0
+ - soupsieve==1.9.2
+ - tzlocal==1.5.1
+ - werkzeug==0.15.4
diff --git a/search/archive.py b/search/archive.py
deleted file mode 100644
index 7b4631a..0000000
--- a/search/archive.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import logging, os, json, re
-from datetime import datetime
-
-import analysis.archive ## circular...
-import analysis.query
-import analysis.format
-
-import threading
-
-class Archive():
-
- def __init__(self, archives_dir=None):
- if archives_dir==None:
- from www import config
- self.archives_dir = config.ARCHIVES_PATH
- else:
- self.archives_dir = archives_dir
-
- self.loaded = False
-
- self.lock_search = threading.Lock()
- self.lock_threads_ranking = threading.Lock()
-
- def load(self, archive_name=None):
-
- if archive_name == None:
- raise Exception('Archive is not specified')
-
- archive_path = os.path.join(self.archives_dir, archive_name)
- if not os.path.isdir(archive_path):
- raise Exception('Archive ' + path + ' does not exist')
-
- self.archive_name = archive_name
- self.archive_path = archive_path
-
- files = [f for f in os.listdir(archive_path) if f.endswith('.json')]
-
- self.archive = {}
-
- for f in files:
- file_path = os.path.join(archive_path, f)
- label = f.replace('.json', '')
- with open(file_path) as fdata:
- self.archive[label] = json.load(fdata)
-
- self.loaded = True
-
- def search_message(self, keyword, msg, index_str, results, field='content'):
-
- nbr_hits = 0
- if msg[field] is not None and msg[field].lower().find(keyword.lower()) > 0:
- nbr_hits += 1
- results.append({ "index_str": index_str, "subject": msg['subject'], "date": msg['date'], "author_name": msg['author_name'], "url": msg['url'] })
-
- if 'follow-up' in msg:
- i = 0
- for m in msg['follow-up']:
- current_index_str = index_str + '/' + str(i)
- nbr_hits += self.search_message(keyword, m, current_index_str, results, field)
- i += 1
-
- return nbr_hits
-
-
- def search(self, keyword, field='content', min_hits=0):
-
- with self.lock_search:
-
- search_results = { "keyword": keyword, "field": field, "archive": self.archive_name, "results": [] }
-
- for k, v in sorted(self.archive.items(), key=get_key, reverse=True):
-
- current_index_str = self.archive_name + '/' + k
- hits = []
- nbr_hits = 0
- i = 0
- for m in v['threads']:
- current_index_str = self.archive_name + '/' + k + '/' + str(i)
- nbr_hits += self.search_message(keyword, m, current_index_str, hits, field)
- i += 1
-
- if nbr_hits > min_hits:
- # nettime-l - fix (the name of the thread from ex. 'nettime-l_Jan_01' to 'January 2001')
- if k.startswith("nettime-l_"):
- dt = datetime.strptime(k, "nettime-l_%b_%y")
- k = dt.strftime("%B_%Y")
- search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
-
- return search_results
-
- def threads_ranking(self, rank=5):
-
- with self.lock_threads_ranking:
-
- search_results = { "keyword": "thread ranking", "field": "ranking", "archive": self.archive_name, "results": [] }
-
- a = analysis.archive.Archive(self)
- q = a.query();
-
- ranking = q.threads_ranking(rank=rank)
-
- for i in ranking:
- r = analysis.format.frame_to_dictionary_threads_ranking(ranking[i])
- for h in r:
- hit = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]
- search_results['results'].append({'thread': h['date'], 'nbr_hits': h['nbr-references'], 'hits': hit})
- del a
- del q
-
- return search_results
-
-
-
-def get_key(kv_tuple):
-
- k = kv_tuple[0]
-
- # k is of the form "Month_Year" - ex.: "January_2001"
- try:
- return datetime.strptime(k, "%B_%Y")
- except Exception:
- pass
-
- # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
- try:
- return datetime.strptime(k, "%b_%y")
- except Exception:
- pass
-
- # k is of the form "Year" - ex.: "2001"
- try:
- return datetime.strptime(k, "%Y")
- except Exception:
- pass
-
- # nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
- try:
- return datetime.strptime(k, "nettime-l_%b_%y")
- except Exception:
- pass
-
- print("--------------")
- print(k)
-
- return None
-
-
-
-
-
diff --git a/setenv b/setenv
index af68e55..fa717b3 100644
--- a/setenv
+++ b/setenv
@@ -1 +1 @@
-source activate listservs
\ No newline at end of file
+source activate listserv
diff --git a/terminal/progress.py b/terminal/progress.py
new file mode 100644
index 0000000..c66f3b7
--- /dev/null
+++ b/terminal/progress.py
@@ -0,0 +1,43 @@
+from __future__ import print_function
+import sys
+import re
+
+# https://stackoverflow.com/questions/3160699/python-progress-bar
+
+class ProgressBar(object):
+ DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
+ FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'
+
+ def __init__(self, title, total, width=40, fmt=DEFAULT, symbol='=',
+ output=sys.stderr):
+ assert len(symbol) == 1
+
+ self.title = title
+ self.total = total
+ self.width = width
+ self.symbol = symbol
+ self.output = output
+        self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
+            r'\g<name>%dd' % len(str(total)), fmt)
+
+ self.current = 0
+
+ def __call__(self):
+ percent = self.current / float(self.total)
+ size = int(self.width * percent)
+ remaining = self.total - self.current
+ bar = self.title + ' [' + self.symbol * size + ' ' * (self.width - size) + ']'
+
+ args = {
+ 'total': self.total,
+ 'bar': bar,
+ 'current': self.current,
+ 'percent': percent * 100,
+ 'remaining': remaining
+ }
+ print('\r' + self.fmt % args, file=self.output, end='')
+
+ def done(self):
+ self.current = self.total
+ self()
+ print('', file=self.output)
\ No newline at end of file
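
For reference, a minimal usage sketch of the ProgressBar added above (the loop body and total are illustrative). The re.sub in __init__ pads the %(current)d, %(total)d and %(remaining)d counters to the width of total, so the redrawn line keeps a stable length:

# usage sketch for terminal/progress.py; the work loop is illustrative
import time
from terminal.progress import ProgressBar

progress = ProgressBar('fetching', total=120, fmt=ProgressBar.FULL)
for _ in range(120):
    time.sleep(0.01)        # stand-in for one unit of work
    progress.current += 1
    progress()              # redraws in place via the '\r' carriage return
progress.done()             # fills the bar and emits the final newline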
diff --git a/terminal/util.py b/terminal/util.py
new file mode 100644
index 0000000..695b671
--- /dev/null
+++ b/terminal/util.py
@@ -0,0 +1,17 @@
+import sys
+
+def y_n_question(question_str):
+
+ yes = {'yes','y', 'ye', ''}
+ no = {'no','n'}
+
+ while True:
+ sys.stdout.write(question_str + " [Y/n]: ")
+ choice = input().lower()
+ if choice in yes:
+ return True
+ elif choice in no:
+ return False
+ else:
+ sys.stdout.write("\nPlease respond with 'yes' or 'no'\n")
+ continue
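
And a usage sketch for y_n_question (prompt text is illustrative); an empty answer counts as yes, matching the [Y/n] default:

# usage sketch for terminal/util.py; prompt text is illustrative
from terminal.util import y_n_question

if y_n_question("Overwrite the existing archive?"):
    print("overwriting...")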
diff --git a/www-serve b/www-serve
index 318a511..ee4b738 100644
--- a/www-serve
+++ b/www-serve
@@ -1,2 +1 @@
-
-gunicorn -w 1 -b 127.0.0.1:5555 www-serve:app
\ No newline at end of file
+gunicorn -w 1 --bind 0.0.0.0:5555 www-serve:app
\ No newline at end of file
diff --git a/www-serve.py b/www-serve.py
index c9c8833..6bac083 100644
--- a/www-serve.py
+++ b/www-serve.py
@@ -1,2 +1,4 @@
from www import app
-#app.run(debug=True, threaded=True, use_reloader=False) # uncomment this line to run flask's server
+
+if __name__ == "__main__":
+ app.run(debug=True, use_reloader=False)
\ No newline at end of file
diff --git a/www/routes.py b/www/routes.py
index ec29df4..f2b33be 100644
--- a/www/routes.py
+++ b/www/routes.py
@@ -1,144 +1,48 @@
from flask import render_template, request, jsonify
from www import app
-from www import archives
-import search.archive
+import archive.archive as archive
+import config
+import www.config as wconfig
from datetime import datetime
-
import logging
-logging.info(' ------- arch = Archives() -------- ')
-arch = archives.Archives()
-arch.load()
-archives_data = arch.data
@app.route('/')
def index():
- k = archives_data.keys()
- return render_template("index.html", archives=k)
-
-# def get_key(kv_tuple):
-
-# k = kv_tuple[0]
-
-# # k is of the form "Month_Year" - ex.: "January_2001"
-# try:
-# return datetime.strptime(k, "%B_%Y")
-# except Exception:
-# pass
-
-# # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
-# try:
-# return datetime.strptime(k, "%b_%y")
-# except Exception:
-# pass
-
-# # k is of the form "Year" - ex.: "2001"
-# try:
-# return datetime.strptime(k, "%Y")
-# except Exception:
-# pass
-
-# return None
-
-@app.route('/<list>')
-def get_list(list):
- if list in archives_data:
- d = []
- for k, v in sorted(archives_data[list].archive.items(), key=search.archive.get_key, reverse=True):
- d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
- return render_template("list.html", list_name=list, list=d)
-
- else:
- return 'nee nee'
-
-@app.route('/<list>/<sublist>')
-def get_sublist(list, sublist):
-
- print(list)
- print(sublist)
-
- sublist = sublist.replace(' ', '_')
- if list in archives_data and sublist in archives_data[list].archive:
- return render_template("threads.html", sublist_name=sublist, threads=archives_data[list].archive[sublist]['threads'])
- else:
- return 'na na'
-
-@app.route('/<list>/<sublist>/<index>')
-def get_message(list, sublist, index):
-
- sublist = sublist.replace(' ', '_')
- index = int(index)
- if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
- return render_template("message.html", message=archives_data[list].archive[sublist]['threads'][index])
- else:
- 'non non'
-
-@app.route('/<list>/<sublist>/<index>/<path:follow_ups>')
-def get_follow_ups(list, sublist, index, follow_ups):
-
- sublist = sublist.replace(' ', '_')
- index = int(index)
-
- ups = follow_ups.split('/')
- follow = []
- for u in ups:
- follow.append(int(u))
-
- if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
- message = archives_data[list].archive[sublist]['threads'][index]
- for f in follow:
- message = message['follow-up'][f]
- return render_template("message.html", message=message)
- else:
- 'nope nope'
+ return render_template("index.html")
@app.route('/search')
def search():
if len(request.args) < 1:
- k = archives_data.keys()
- return render_template("search.html", archives=k, fields=['content', 'from(name)', 'from(email)'], hits=['n/a', '2', '3', '4', '5', '6', '7', '8', '9'])
+ return render_template("search.html", archives=wconfig.lists_to_serve, fields=['content', 'from'])
k_arg = request.args.get('keyword')
l_arg = request.args.get('list')
- sl_arg = request.args.get('sublist')
f_arg = request.args.get('field')
- h_arg = request.args.get('hits')
if k_arg is None or k_arg.strip() == '':
return "no keyword..."
-
     if l_arg is None:
         return "no list..."
-    if not (l_arg == "all") and not (l_arg in archives_data):
+    if l_arg != "all" and l_arg not in wconfig.lists_to_serve:
return "list '" + l_arg + "' does not exist"
- if sl_arg is not None:
- if not sl_arg in archives_data[l]:
- return "sublist '" + sl_arg + "' does not exist in list '" + l_arg + "'"
+ if f_arg not in ['content', 'from']:
+ return "field '" + f_arg + "' does not exist"
- if f_arg == "from(name)":
- f_arg = 'author_name'
- elif f_arg == "from(email)":
- f_arg = 'from'
lists = []
if l_arg == "all":
- for k in archives_data.keys():
- lists.append(k)
+ lists = wconfig.lists_to_serve
else:
lists.append(l_arg)
- nbr_hits = 0
- if h_arg in ['2', '3', '4', '5', '6', '7', '8', '9']:
- nbr_hits = int(h_arg)
-
-
################################
##
- ## need to cache all the below
+    ## TODO: cache the search results below
##
################################
@@ -147,18 +51,41 @@ def search():
logging.info("search keyword = " + k_arg)
for l in lists:
- if k_arg == "rank":
- logging.info(" ranking " + l)
- s = archives_data[l].threads_ranking()
- else:
- s = archives_data[l].search(keyword=k_arg, field=f_arg, min_hits=nbr_hits)
+
+ with archive.Archive(l, config=config.db) as a:
+ if f_arg == 'content':
+ r = a.content_search(k_arg)
+ else:
+ r = a.from_search(k_arg)
- results.append(s)
+ # format data to return
+ search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] }
+ month_year_results = {}
- ## -- sort results?
- search_results = sorted(results, key=get_result_key)
+ for (from_, author_name_, subject_, date_, url_) in r:
+ m_y = date_.strftime("%B_%Y")
+ if m_y not in month_year_results:
+ month_year_results[m_y] = []
+ month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_})
- return jsonify(result=search_results)
+ for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
+ search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v})
+
+            # each entry appended to search_results['results'] has the shape
+            #   { 'thread': ..., 'nbr_hits': ..., 'hits': [...] }
+            # where 'thread' is the "%B_%Y" key, e.g. "January_2001",
+            # 'nbr_hits' is the number of hits for that month, and
+            # 'hits' = [{ 'url': ..., 'subject': ..., 'author_name': ... }]
+
+ results.append(search_results)
+
+
+ sorted_results = sorted(results, key=get_result_key)
+ return jsonify(result=sorted_results)
+
+
+def get_key(kv):
+ return datetime.strptime(kv[0], "%B_%Y")
def get_result_key(r):
return r['archive']
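
One way to act on the caching TODO in the search route above is to memoize the per-list database query. A minimal sketch under the Archive API this diff introduces (the helper name and cache size are illustrative, not part of the repo):

# hedged caching sketch for the search route; cached_search and maxsize are
# illustrative, the Archive usage mirrors the route above
from functools import lru_cache

@lru_cache(maxsize=256)
def cached_search(list_name, field, keyword):
    with archive.Archive(list_name, config=config.db) as a:
        rows = a.content_search(keyword) if field == 'content' else a.from_search(keyword)
    return tuple(rows)  # immutable, so cached results cannot be mutated by callers

Invalidation could be as simple as calling cached_search.cache_clear() whenever an archive is re-imported.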
diff --git a/www/templates/index.html b/www/templates/index.html
index 1476481..646c4a0 100644
--- a/www/templates/index.html
+++ b/www/templates/index.html
@@ -1,8 +1,6 @@
- {% for a in archives %}
- {{ a }}
- {% endfor %}
+ ---> SEARCH <---
\ No newline at end of file
diff --git a/www/templates/list.html b/www/templates/list.html
deleted file mode 100644
index 9a47098..0000000
--- a/www/templates/list.html
+++ /dev/null
@@ -1,10 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/www/templates/message.html b/www/templates/message.html
deleted file mode 100644
index 7125408..0000000
--- a/www/templates/message.html
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
-
- {{ message.subject }}
- {{ message.author_name }}
- {{ message.date }}
- {{ message.content }}
-
-
\ No newline at end of file
diff --git a/www/templates/search.html b/www/templates/search.html
index a92048e..c5b5715 100644
--- a/www/templates/search.html
+++ b/www/templates/search.html
@@ -20,11 +20,6 @@
{% endfor %}
-
Loading...
diff --git a/www/templates/threads.html b/www/templates/threads.html
deleted file mode 100644
index 050e3cf..0000000
--- a/www/templates/threads.html
+++ /dev/null
@@ -1,25 +0,0 @@
-
-
-
-{% macro message(m, index, urlpath)-%}
-{% set path = urlpath + '/' + index|string %}
-
- {{ index }}. {{ m.subject }} {{ m.author_name }}
- {% if m.get('follow-up') %}
-
- {% for msg in m.get('follow-up') %}
- {{ message(m=msg, index=loop.index - 1, urlpath=path) }}
- {% endfor %}
-
- {% endif %}
-
-{%- endmacro %}
-
-
- {% for m in threads recursive %}
- {{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
- {% endfor %}
-
-
-
-
\ No newline at end of file