many many things...

This commit is contained in:
David Gauthier 2017-11-04 13:34:05 +01:00
parent f540b26e4e
commit 874a27a8c9
18 changed files with 1574 additions and 23 deletions

1
.gitignore vendored Normal file → Executable file
View File

@ -1,5 +1,6 @@
# mailinglists specific
archives/
figs/
config.py
# Byte-compiled / optimized / DLL files

230
analyse.py Normal file
View File

@ -0,0 +1,230 @@
import os
# matplot view/windows
import matplotlib
matplotlib.interactive(True)
# pd display
import pandas as pd
pd.set_option('display.max_colwidth', 100)
from analysis.archive import Archive
from analysis.query import Query
from analysis.plot import Plot
import analysis.format
# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue
def save_fig_cohort(q, name, dir, color):
t = name + " - Cohorts"
pp = q.cohort().plot(color=color, title=t)
ts = name + "_cohorts.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_messages_total(q, name, dir, color):
t = name + " - Nbr. Messages"
pp = q.activity_overall().plot(kind='bar', color=color, title=t)
ts = name + "_messages.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_threads_total(q, name, dir, color):
t = name + " - Nbr. Threads"
pp = q.threads_overall().plot(kind='bar', color=color, title=t)
ts = name + "_threads.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_messages_constituency(q, name, dir):
t = name + " - Messages Constituency"
replies = pd.Series(q.replies_overall(series=True))
# threads = pd.Series(q.single_threads_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
messages = pd.Series(q.activity_overall(series=True))
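# "single" messages: posts that received no reply and are not replies themselves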
single_messages = messages - (replies + threads)
# df = {'a': single_messages, 'b': threads, 'c': replies}
# df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1)
pp = df.plot(kind='bar', stacked=True, title=t)
# pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)
ts = name + "_constituency.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_avg_threads_replies(q, name, dir, color):
t = name + " - Avg. Threads + Replies"
replies = pd.Series(q.replies_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
messages = pd.Series(q.activity_overall(series=True))
avg_threads_messages = (replies + threads) / messages
pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)
ts = name + "_avg_threads_replies.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
t = name + " - Diff. Threads + Replies vs Single Messages"
replies = pd.Series(q.replies_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
rt = replies + threads
messages = pd.Series(q.activity_overall(series=True))
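# 2*rt - messages == rt - (messages - rt): (replies + threads) minus the number of single messages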
diff_threads_messages = (2 * rt) - messages
pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)
ts = name + "_diff_threads_replies_messages.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_ratio_replies_threads(q, name, dir, color):
t = name + " - Ratio Replies per Thread"
replies = pd.Series(q.replies_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
ratio_replies_threads = replies / threads
pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)
ts = name + "_ratio_replies_threads.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def html_td_rank_year(year, data):
td_str = '<td class="td_list">'
if year in data:
td_str += analysis.format.table_threads_ranking(data[year])
td_str += '</td>'
return td_str
def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):
html_str = '<table id="rankings">'
html_str += '<tr>'
html_str += '<td class="td_year_t">year</td>'
html_str += '<td class="td_list_t">nettime</td>'
html_str += '<td class="td_list_t">crumb</td>'
html_str += '<td class="td_list_t">spectre</td>'
html_str += '<td class="td_list_t">empyre</td>'
html_str += '</tr>'
years = sorted(ranking_nettime.keys())
print(years)
for i in years:
html_str += '<tr>'
html_str += '<td class="td_list">' + i + '</td>'
html_str += html_td_rank_year(i, ranking_nettime)
html_str += html_td_rank_year(i, ranking_crumb)
html_str += html_td_rank_year(i, ranking_spectre)
html_str += html_td_rank_year(i, ranking_empyre)
html_str += '</tr>'
html_str += '</table>'
return html_str
print("nettime")
#nettime
nt = Archive('nettime-l')
ntq = nt.query()
ntp = Plot(ntq)
# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_constituency(ntq, 'nettime', 'figs/')
# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')
ranking_nettime = ntq.threads_ranking(rank=15)
# print(r['2000'])
# print(analysis.format.table_threads_ranking(r['2000']))
print("crumb")
#crumb
cr = Archive('crumb')
crq = cr.query()
crp = Plot(crq)
# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_constituency(crq, 'crumb', 'figs/')
# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')
ranking_crumb = crq.threads_ranking(rank=15)
print("empyre")
#empyre
em = Archive('empyre')
emq = em.query()
emp = Plot(emq)
# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_constituency(emq, 'empyre', 'figs/')
# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')
ranking_empyre = emq.threads_ranking(rank=15)
print("spectre")
#spectre
sp = Archive('spectre')
spq = sp.query()
spp = Plot(spq)
# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_constituency(spq, 'spectre', 'figs/')
# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')
ranking_spectre = spq.threads_ranking(rank=15)
## comparative ranking
rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)
html_template = 'figs/ranking/index_template.html'
with open(html_template, 'r') as fp:
h = fp.read()
html = h.replace("--table--", rankings)
html_output = 'figs/ranking/index.html'
with open(html_output, 'w+') as fp:
fp.write(html)

152
analysis/archive.py Normal file
View File

@ -0,0 +1,152 @@
import numpy as np
import pandas as pd
import email, email.parser
import os, datetime, json, gzip, re
import analysis.util
import analysis.query
def filter_date(msg, archive_name):
time_tz = analysis.util.format_date(msg, archive_name)
if not time_tz:
return None
dt = datetime.datetime.fromtimestamp(time_tz)
try:
date_time = pd.to_datetime(dt)
except pd.tslib.OutOfBoundsDatetime:
print('time out of bound')
print(dt)
return None
min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
max_date = pd.to_datetime(datetime.datetime.now())
if date_time < min_date or date_time > max_date:
return None
return date_time
def message_to_tuple_record(msg, records, archive_name, references='X'):
# check date first?
date = filter_date(msg, archive_name)
if not date:
print("Archive::filter_date returned None. Skip.")
return
# check / filter from email address second?
from_addr = analysis.util.format_from(msg, archive_name)
if not from_addr:
print("Archive::analysis.util.format_from returned None. Skip.")
return
url = analysis.util.format_url(msg, archive_name)
author = analysis.util.format_author(msg, archive_name)
subject = analysis.util.format_subject(msg, archive_name)
message_id = analysis.util.format_id(msg, archive_name)
content = analysis.util.format_content(msg, archive_name)
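# field order must match the 'columns' list passed to pd.DataFrame.from_records in json_data_to_pd_dataframe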
records.append((message_id,
from_addr,
author,
subject,
date,
url,
len(content),
0 if not 'follow-up' in msg else len(msg['follow-up']),
references))
# recurse into follow-ups -- note: 'references' only records the direct parent id, not the full reference chain
if 'follow-up' in msg:
for f in msg['follow-up']:
message_to_tuple_record(f, records, archive_name, references=message_id)
return
def json_data_to_pd_dataframe(json_data, archive_name):
records = []
for d in json_data:
for dd in d['threads']:
message_to_tuple_record(dd, records, archive_name)
print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records)))
df = pd.DataFrame.from_records(records,
index='date',
columns=['message-id',
'from',
'author',
'subject',
'date',
'url',
'content-length',
'nbr-references',
'references'])
df.index.name = 'date'
return df
def load_from_file(filename, archive_name, archive_dir, json_data=None):
if not filename.endswith('.json.gz'):
file_path = os.path.join(archive_dir, filename + '.json.gz')
else:
file_path = os.path.join(archive_dir, filename)
if os.path.isfile(file_path):
with gzip.open(file_path, 'r') as fp:
json_data = json.load(fp)
return json_data_to_pd_dataframe(json_data['threads'], archive_name)
else:
#list of all "filename[...].json.gz" in archive_dir
files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
if files:
filename = files[-1] # take the most recent (the alphabetical order is also chronological)
file_path = os.path.join(archive_dir, filename)
if os.path.isfile(file_path):
with gzip.open(file_path, 'r') as fp:
json_data = json.load(fp)
return json_data_to_pd_dataframe(json_data['threads'], archive_name)
else:
#list of all json files in archive_dir/filename
dir_path = os.path.join(archive_dir, filename)
if not os.path.isdir(dir_path):
return None
files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
if not files:
return None
# load all json files
threads = []
for file_path in files:
with open(file_path, 'r') as fp:
json_data = json.load(fp)
threads.append(json_data)
print('---> ' + archive_name)
return json_data_to_pd_dataframe(threads, archive_name)
class Archive:
data = None # "raw" json data
dataframe = None # main pd dataframe
def __init__(self, archive_name, archive_dir="archives"):
if isinstance(archive_name, pd.core.frame.DataFrame):
self.dataframe = archive_name.copy()
if isinstance(archive_name, str):
# need a filename or a dir name....
self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)
def query(self):
q = analysis.query.Query(self)
return q

158
analysis/format.py Normal file
View File

@ -0,0 +1,158 @@
import analysis.query
import logging, html, numpy
from tabulate import tabulate
def makeurl(text, url):
return '<a href="' + url + '">' + text + "</a>"
def table_threads_ranking(ranking_dataframe):
html_str = '<table class="threads_ranking">'
html_str += '<tr>'
html_str += '<td class="td_date_t">date</td>'
html_str += '<td class="td_subject_t">subject</td>'
html_str += '<td class="td_from_t">from</td>'
html_str += '<td class="td_rep_t">replies</td>'
html_str += '</tr>'
for i, row in ranking_dataframe.iterrows():
html_str += '<tr>'
html_str += '<td class="td_date">' + str(i) + '</td>'
html_str += '<td class="td_subject">' + makeurl(row['subject'], row['url']) + '</td>'
html_str += '<td class="td_from">' + row['from'] + '</td>'
html_str += '<td class="td_rep">' + str(row['nbr-references']) + '</td>'
html_str += '</tr>'
html_str += "</table>"
return html_str
class Html:
query = None
def __init__(self, q=None):
if not isinstance(q, analysis.query.Query):
logging.error("Html constructor Error: query must be of type analysis.query.Query")
raise Exception()
self.query = q
def threads_ranking(self, rank=5, resolution=None):
data = self.query.threads_ranking(rank=rank)
h = html.HTML()
t = h.table()
r = t.tr
r.td('date', klass='td_date_t')
r.td('from', klass='td_from_t')
r.td('replies', klass='td_rep_t')
r.td('subject', klass='td_subject_t')
for i, row in data.iterrows():
r = t.tr
print(row.index)
r.td(str(row['date']), klass='td_date')
r.td(row['from'], klass='td_from')
r.td(str(row['nbr-references']), klass='td_rep')
r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)
return str(t)
@staticmethod
def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):
header = []
if data_frame.index.name in name_map:
header.append(name_map[data_frame.index.name])
else:
header.append(data_frame.index.name)
for h in data_frame.columns:
if h in name_map:
h = name_map[h]
header.append(h)
css_header = []
css_element = []
for i in header:
css_header.append('td_' + i + '_t')
css_element.append('td_' + i)
h = html.HTML()
if table_name:
t = h.table(id=table_name, klass=table_name + '_t')
else:
t = h.table()
# url map
url_hash = {}
url_skip = []
url_keys = url_map.keys()
for u in url_keys:
if u in header and url_map[u] in header:
url_indx = header.index(url_map[u])
url_hash[header.index(u)] = url_indx
url_skip.append(url_indx)
header.pop(url_indx)
#header
r = t.tr
n = 0
for j in header:
r.td(str(j), klass=css_header[n])
n += 1
#elements
for k, row in data_frame.iterrows():
r = t.tr
r.td(str(k), klass=css_element[0])
n = 1
for l in row:
if n in url_skip:
continue
if isinstance(l, float):
if l % 1 > 0:
l = '{0:.4f}'.format(l)
else:
l = int(l)
if n in url_hash.keys():
url = row[url_hash[n] - 1]
r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
else:
r.td(str(l), klass=css_element[n])
n += 1
return str(t)
class Tab:
@staticmethod
def from_dataframe(data_frame, name_map={}, format=".0f"):
header = []
header.append(data_frame.index.name)
for h in data_frame.columns:
if h in name_map:
h = name_map[h]
header.append(h)
return tabulate(data_frame, headers=header, floatfmt=format)
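A minimal usage sketch for Tab.from_dataframe (it is not exercised elsewhere in this commit); the archive name and the 'nbr-messages' column come from the code above, the name_map value is illustrative:
from analysis.archive import Archive
from analysis.format import Tab
q = Archive('crumb').query()
df = q.activity_overall(resolution='y')  # one column, 'nbr-messages', indexed by year
print(Tab.from_dataframe(df, name_map={'nbr-messages': 'messages'}))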

79
analysis/plot.py Normal file
View File

@ -0,0 +1,79 @@
import numpy as np
import pandas as pd
import logging
import analysis.query
# for colormaps see:
# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
# http://matplotlib.org/examples/color/colormaps_reference.html
# for colors see:
# http://matplotlib.org/examples/color/named_colors.html
# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue
def bar_plot_series(series, title, color='blueviolet', ylim=None):
return series.plot(kind = 'bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)
def save(plot, name):
fig = plot.get_figure()
fig.savefig(name)
class Plot:
query = None
def __init__(self, q=None):
if not isinstance(q, analysis.query.Query):
logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query")
raise Exception()
self.query = q
'''
activity
'''
def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys()
series = []
for k in activity_rank:
series.append(self.query.activity_from(k, resolution, series=True))
df = pd.concat(series, axis=1)
return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
'''
content length
'''
def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys()
series = []
for k in content_rank:
series.append(self.query.content_length_from(k, resolution, series=True))
df = pd.concat(series, axis=1)
return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
'''
threads
'''
def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys()
series = []
for k in threads_rank:
series.append(self.query.threads_from(k, resolution, series=True))
df = pd.concat(series, axis=1)
return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
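A small sketch of how Plot could be driven, in the spirit of the save_fig_* helpers in analyse.py; the output filename is illustrative and figs/ is assumed to exist:
from analysis.archive import Archive
from analysis.plot import Plot
spq = Archive('spectre').query()
spp = Plot(spq)
ax = spp.activity_from_ranking(resolution='y', rank=5)  # area plot of the 5 most active posters
ax.get_figure().savefig('figs/spectre_top_posters.png')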

573
analysis/query.py Normal file
View File

@ -0,0 +1,573 @@
import numpy as np
import pandas as pd
import analysis.archive
import logging
class Query:
archive = None # analysis.archive.Archive object
activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
threads = None # dataframe of messages that received at least one reply (see _threads)
single_threads = None # thread-starting messages (references == 'X') that received replies
replies = None # dataframe of messages that reference another message (see _replies)
def __init__(self, arch=None):
if not isinstance(arch, analysis.archive.Archive):
logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
raise Exception()
self.archive = arch
'''
activity
'''
def _activity(self):
if self.activity is None:
from_index = self.archive.dataframe.reindex(columns=['from'])
self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)
return self.activity
def activity_from(self, email_address, resolution='y', series=False):
eaddr = email_address.replace('@', '{at}').lower()
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._activity()
try:
af = self.activity[eaddr]
except KeyError:
return None
activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
activity_from.index.name = 'year'
else:
activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
activity_from.index.name = 'year-month'
if series:
return activity_from
return activity_from.to_frame('nbr-messages').astype(int)
def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
self._activity()
afr = self.activity.sum(axis=0).sort_values(ascending=False)
if filter_nettime:
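# negative lookahead: keep only addresses that do not contain "nettime" (presumably the list's own/admin addresses)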
p = r'^((?!nettime*).)*$'
afr = afr[afr.index.str.contains(p)]
if series:
return afr[:rank]
return afr[:rank].to_frame('nbr-messages').astype(int)
# def activity_overall(self, resolution='y', series=False):
# freq = 'M'
# if resolution.lower() == 'y':
# freq = 'AS'
# elif resolution.lower() == 'm':
# freq = 'M'
# else:
# return None
# self._activity()
# y = self.activity.sum(axis=1)
# y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
# if freq == 'AS':
# y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
# y.index.name = 'year'
# else:
# y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
# y.index.name = 'year-month'
# if series:
# return y
# return y.to_frame('nbr-messages').astype(int)
def activity_overall(self, resolution='y', series=False):
a = self.archive.dataframe['url']
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-messages').astype(int)
def cohort(self, resolution='m', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._activity()
c = self.activity.idxmax().sort_values().to_frame('date')
c.index = c['date']
cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()
if freq == 'AS':
cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))
cohort.index.name = 'year'
else:
cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))
cohort.index.name = 'year-month'
if series:
return cohort
return cohort.to_frame('first-messages').astype(int)
'''
content length
'''
def _content_length(self):
if self.content_length is None:
from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)
return self.content_length
def content_length_from(self, email_address, resolution='y', series=False):
eaddr = email_address.replace('@', '{at}').lower()
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._content_length()
try:
af = self.content_length[eaddr]
except KeyError:
return None
content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
content_length_from.index.name = 'year'
else:
content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
content_length_from.index.name = 'year-month'
if series:
return content_length_from
return content_length_from.to_frame('nbr-bytes').astype(int)
def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
self._content_length()
cfr = self.content_length.sum(axis=0).sort_values(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
cfr = cfr[cfr.index.str.contains(p)]
if series:
return cfr[:rank]
return cfr[:rank].to_frame('nbr-bytes').astype(int)
def content_length_overall(self, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._content_length()
y = self.content_length.sum(axis=1)
y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-bytes').astype(int)
'''
threads
'''
def _threads(self, thresh=0):
print("doing threads")
if self.threads is None:
self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
if self.single_threads is None:
self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
return self.threads
def threads_ranking(self, rank=5, resolution='y'):
self._threads()
if resolution is None:
data = self.threads.drop('message-id', axis=1)[:rank]
return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
# get the threads ranking per time resolution
#
data = self.threads.drop('message-id', axis=1)
data = data.groupby([pd.TimeGrouper(freq=freq)])
r = {}
for k, v in data:
if freq == 'AS':
time_key = k.strftime('%Y')
else:
time_key = k.strftime('%Y-%m')
frame = v[:rank]
frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
r[time_key] = frame
return r
def threads_replies_to(self, email_address, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._threads()
eaddr = email_address.replace('@', '{at}').lower()
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
if series:
return threads_from_ranking[eaddr]
threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)
if freq == 'AS':
threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
threads_from_ranking.index.name = 'year'
else:
threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
threads_from_ranking.index.name = 'year-month'
return threads_from_ranking
def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
self._threads()
tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
tfr = tfr[tfr.index.str.contains(p)]
tfr = tfr[:rank].astype(int)
return tfr
def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
self._threads()
tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
tir = tir[tir.index.str.contains(p)]
if series:
return tir[:rank]
return tir[:rank].to_frame('nbr-initiated-threads').astype(int)
def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
# activity
self._activity()
afr = self.activity.sum(axis=0).astype(int)
if filter_nettime:
p = r'^((?!nettime*).)*$'
afr = afr[afr.index.str.contains(p)]
# initiated threads [top 25]
self._threads()
tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25
if filter_nettime:
p = r'^((?!nettime*).)*$'
tir = tir[tir.index.str.contains(p)]
inter = afr.index.intersection(tir.index)
avg = tir[inter] / afr[inter]
labels = ['messages', 'threads', 'avg.threads']
return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]
def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
self._threads()
#initiated
tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
tir = tir[tir.index.str.contains(p)]
#replies [top 25]
tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25
if filter_nettime:
p = r'^((?!nettime*).)*$'
tfr = tfr[tfr.index.str.contains(p)]
tfr = tfr['nbr-references'] # dataframe to series
inter = tir.index.intersection(tfr.index)
avg = tfr[inter] / tir[inter]
labels = ['threads', 'replies', 'avg.replies']
return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
def threads_overall(self, resolution='y', aggregate='count', series=False, thresh=0):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
agg = aggregate.lower()
if not agg in ['sum', 'mean', 'count']:
return None
if not self.threads is None:
del self.threads
self.threads = None
self._threads(thresh)
if agg == 'sum':
# number of replies total (re: sum all the replies)
y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
elif agg == 'mean':
y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
else:
# number of threads (re: msgs with at least one reply)
y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-threads').astype(int)
def single_threads_overall(self, resolution='y', aggregate='sum', series=False, thresh=1):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
agg = aggregate.lower()
if not agg in ['sum', 'mean', 'count']:
return None
if not self.single_threads is None:
del self.single_threads
self.single_threads = None
self._threads(thresh)
y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-threads').astype(int)
'''
replies
'''
def _replies(self):
if self.replies is None:
self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from','references'])
self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from','references'])
return self.replies
def replies_ranking(self, rank=5, resolution=None):
self._replies()
if resolution is None:
data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
return data.to_frame('nbr_replies')
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
# get the threads ranking per time resolution
#
data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
r = {}
for k, v in data:
if freq == 'AS':
time_key = k.strftime('%Y')
else:
time_key = k.strftime('%Y-%m')
frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
r[time_key] = frame.to_frame('nbr-replies')
return r
def replies_avg_ranking(self, rank=5, filter_nettime=True):
# activity
self._activity()
afr = self.activity.sum(axis=0)
if filter_nettime:
p = r'^((?!nettime*).)*$'
afr = afr[afr.index.str.contains(p)]
# replies in thread [top 25]
self._replies()
rpl = self.replies.groupby('from').size().sort_values(ascending=False)[:25]
inter = afr.index.intersection(rpl.index)
avg = rpl[inter] / afr[inter]
labels = ['messages', 'replies', 'avg.replies']
return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
def replies_overall(self, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
if not self.replies is None:
del self.replies
self.replies = None
self._replies()
y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-replies').astype(int)
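A short sketch of the resolution/series convention shared by most Query methods above: 'y' groups by year (freq 'AS'), 'm' by month (freq 'M'), and series=True returns a pandas Series instead of a one-column DataFrame. Variable names here are illustrative:
from analysis.archive import Archive
q = Archive('empyre').query()
threads_per_year = q.threads_overall(resolution='y')                 # DataFrame, index 'year', column 'nbr-threads'
replies_per_month = q.replies_overall(resolution='m', series=True)   # Series indexed by 'year-month'
msgs_per_month = q.activity_overall(resolution='m', series=True)
replies_per_message = replies_per_month / msgs_per_month             # ratio of replies to messages, per month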

81
analysis/util.py Normal file
View File

@ -0,0 +1,81 @@
import email
import hashlib
def format_content(msg, archive_name):
return msg['content']
def format_url(msg, archive_name):
return msg['url']
def format_author(msg, archive_name):
return msg['author_name']
def format_from_token(from_str, sep):
from_addr = email.utils.parseaddr(from_str)[1]
if sep not in from_addr:
tok = from_str.split()
try:
at = tok.index(sep)
from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
if from_addr.startswith('<') or from_addr.endswith('>'):
from_addr = from_addr.strip('<').strip('>')
except ValueError:
print(tok)
print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
return None
else:
from_addr = from_addr.replace(sep, '{AT}')
return from_addr.lower()
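# normalize the various "From" spellings seen in the archives (user@host, "user at host", "user {AT} host") into one lower-cased user{at}host form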
def format_from(msg, archive_name):
from_str = msg['from']
if " {AT} " in from_str:
return format_from_token(from_str, '{AT}')
elif " at " in from_str:
return format_from_token(from_str, 'at')
elif "@" in from_str:
return format_from_token(from_str, '@')
else:
return from_str
# returns utc timestamp
def format_date(msg, archive_name):
date_str = msg['date']
time_tz = None
try:
date_tz = email.utils.parsedate_tz(date_str)
time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
except TypeError:
print("Format Date TypeError")
print(" > " + date_str)
return None
except ValueError:
print("Format Date ValueError")
print(" > " + date_str)
return None
finally:
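# note: returning from finally also suppresses any exception not handled above; time_tz is still None in that case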
return time_tz
def format_subject(msg, archive_name):
return msg['subject']
def format_id(msg, archive_name):
if "message-id" in msg:
return msg['message-id']
else:
# create hash with author_name + date
s = msg['author_name'] + msg['date']
sha = hashlib.sha1(s.encode('utf-8'))
return sha.hexdigest()
# format='%d/%m/%Y'
def min_date(archive_name):
if "nettime" in archive_name:
return '01/10/1995'
elif archive_name == "spectre":
return '01/08/2001'
elif archive_name == "empyre":
return '01/01/2002'
elif archive_name == "crumb":
return '01/02/2001'

View File

@ -1,10 +1,12 @@
from urllib.parse import urlparse
import lists.pipermail as pipermail
import lists.listserv as listserv
import lists.mhonarc as mhonarc
import lists.mhonarc_nettime as mhonarc_nettime
DELAY = 0.2
def crawl(url, name, archive_dir):
def crawl(url, name, sublist_name=None, archive_dir="archives"):
u = urlparse(url)
# the following type 'tests' are very weak...
@ -21,6 +23,11 @@ def crawl(url, name, archive_dir):
elif 'cgi-bin' in u.path:
listserv.collect_from_url(url, name, archive_dir)
# special case -- nettime.
# the name should be the sublist_name (i.e nettime-l)
elif "nettime" in name:
mhonarc_nettime.collect_from_url(url, name, name, archive_dir)
else:
print('mhonarc?')

View File

@ -43,6 +43,17 @@ def collect_from_url(url, name, base_archive_dir):
del tb
continue
# archive['name'] = name
# archive['list'] = threads
# file_path = os.path.join(base_arch_dir, name + '.json')
# with open(file_path, 'w') as fp:
# json.dump(archive, fp, indent=4)
# logging.info("done.")
def collect_threads_from_url(url, name, base_arch_dir):
threads = {'name' : name, 'url' : url, 'threads' : []}

View File

@ -4,22 +4,27 @@ from bs4 import BeautifulSoup
DELAY = 0.2
def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
# base url
base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
#collect name
list_name = soup.select('body p:nth-of-type(2) base title')[0].string
list_name = soup.select('body p:nth-of-type(2) title')[0].string
logging.info("Getting " + list_name + " list archive for " + sublist_name)
lists = soup.select('ul:nth-of-type(2) li')
# create (main) directory
# this is where all temp files will be created
d = os.path.join(base_archive_dir, name)
if not os.path.exists(d):
os.makedirs(d)
threads = []
lists = soup.select('ul:nth-of-type(2) li')
for l in lists:
@ -33,31 +38,41 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
threads_url_list = []
threads_links = l.select('ul li a')
for t in threads_links:
thread_url = urlparse.urljoin(base_url, t.get('href'))
thread_url = urllib.parse.urljoin(base_url, t.get('href'))
threads_url_list.append(thread_url)
nbr_threads = str(len(threads_url_list))
n = 0
for u in threads_url_list:
time.sleep(DELAY)
n += 1
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
threads.append(collect_threads_from_url(u, base_arch_dir, mbox))
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
try:
threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
except KeyboardInterrupt:
sys.exit(0)
except:
logging.warning("Error archiving: " + l[1] + "... Continuing.")
ex_t, ex, tb = sys.exc_info()
print(ex_t)
traceback.print_tb(tb)
del tb
continue
return threads
# for u in threads_url_list[0:10]:
# print "---------------------------------------"
# tt = collect_threads_from_url(u, base_arch_dir, mbox)
# threads.append(tt)
# tt = collect_threads_from_url(u, base_archive_dir, mbox)
# threads.append(tt)
return None
def collect_threads_from_url(url, base_arch_dir, mbox):
def collect_threads_from_url(url, base_archive_dir, mbox=False):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
# base url
@ -73,7 +88,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
logging.info("Collecting Threads of: " + threads_name)
# check if archive already exists
file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
if os.path.isfile(file_path):
logging.info("archive already exists. loading from file " + file_path)
with open(file_path, 'r') as fpin:
@ -114,7 +129,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
def archive_thread(li, base_url, parent_thread_data):
thread_link = li.select('strong a')[0]
thread_url = urlparse.urljoin(base_url, thread_link.get('href'))
thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
thread_id = thread_link.get('name')
thread_title = thread_link.string
thread_author_name = li.select('em')[0].string
@ -145,6 +160,7 @@ def collect_message(url, message):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
# html = response.read()
soup = BeautifulSoup(html, "html5lib")
#note: this should follow an RFC header standard -- MHonArc puts the header info in the first <pre>
@ -184,6 +200,8 @@ def collect_message(url, message):
else:
message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):

214
lists/mhonarc_nettime.py Normal file
View File

@ -0,0 +1,214 @@
import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip
from bs4 import BeautifulSoup
DELAY = 0.2
def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, "html5lib")
# base url
base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
#collect name
list_name = soup.select('body p:nth-of-type(2) title')[0].string
logging.info("Getting " + list_name + " list archive for " + sublist_name)
# create (main) directory
# this is where all temp files will be created
d = os.path.join(base_archive_dir, name)
if not os.path.exists(d):
os.makedirs(d)
threads = []
lists = soup.select('ul:nth-of-type(2) li')
for l in lists:
if l.strong is None:
continue
name = l.strong.string
if name.lower() == sublist_name.lower():
threads_url_list = []
threads_links = l.select('ul li a')
for t in threads_links:
thread_url = urllib.parse.urljoin(base_url, t.get('href'))
threads_url_list.append(thread_url)
nbr_threads = str(len(threads_url_list))
n = 0
for u in threads_url_list:
time.sleep(DELAY)
n += 1
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
try:
threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
except KeyboardInterrupt:
sys.exit(0)
except:
logging.warning("Error archiving: " + l[1] + "... Continuing.")
ex_t, ex, tb = sys.exc_info()
print(ex_t)
traceback.print_tb(tb)
del tb
continue
return threads
# for u in threads_url_list[0:10]:
# print "---------------------------------------"
# tt = collect_threads_from_url(u, base_archive_dir, mbox)
# threads.append(tt)
return None
def collect_threads_from_url(url, base_archive_dir, mbox=False):
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, "html5lib")
# base url
base_url = url
# collect name
threads_name = soup.select('p:nth-of-type(1) title')[0].string
threads_name = threads_name.replace(' ', '_')
# thread data struct
threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
logging.info("Collecting Threads of: " + threads_name)
# check if archive already exists
file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
if os.path.isfile(file_path):
logging.info("archive already exists. loading from file " + file_path)
with open(file_path, 'r') as fpin:
threads = json.load(fpin)
else:
lists = soup.select('ul:nth-of-type(1) > li')
nbr_threads = str(len(lists))
n = 0
for l in lists:
n += 1
logging.info("> " + str(n) + " / " + nbr_threads)
try:
thread = archive_thread(l, base_url, None)
threads['threads'].append(thread)
except:
ex_type, ex, tb = sys.exc_info()
traceback.print_tb(tb)
del tb
continue
time.sleep(DELAY)
# write
logging.info("writing archive to file " + file_path)
with open(file_path, 'w') as fp:
json.dump(threads, fp, indent=4)
logging.info("done. ")
return threads
def archive_thread(li, base_url, parent_thread_data):
thread_link = li.select('strong a')[0]
thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
thread_id = thread_link.get('name')
thread_title = thread_link.string
thread_author_name = li.select('em')[0].string
message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
collect_message(thread_url, message)
follow = li.select('ul > li')
if len(follow) > 0:
for f in follow:
follow_link = f.select('strong a')
if len(follow_link) > 0:
archive_thread(f, base_url, message) ## recursion
if parent_thread_data is None:
return message
if u'follow-up' not in parent_thread_data:
parent_thread_data[u'follow-up'] = []
parent_thread_data[u'follow-up'].append(message)
return message
def collect_message(url, message):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
# html = response.read()
soup = BeautifulSoup(html, "html5lib")
#note: this should follow an RFC header standard -- MHonArc puts the header info in the first <pre>
message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
message['subject'] = parse_xcomment(soup, "X-Subject")
message['date'] = parse_xcomment(soup, "X-Date")
message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
message['content-type'] = parse_xcomment(soup, 'X-Content-Type')
# parse what is displayed on the page
info = soup.select('ul:nth-of-type(1) > li')
for i in info:
if i.em == None:
continue
field = i.em.string
if field.lower() in message_labels:
message[field.lower()] = i.text.strip(field + ": ")
## reformat from -- [author_name, email_addr]
# from_addr = email.utils.parseaddr(message['from'])
# message['author_name'] = from_addr[0]
# message['from'] = from_addr[1]
## -- content --
# test
# c1 = soup.select('pre:nth-of-type(1)')
# if len(c1) > 0:
# message['content'] = c1[0].text
# else:
# message['content'] = soup.select('pre:nth-of-type(2)')[0].text
message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
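# the comments look like <!-- X-Subject: some subject -->; strip the wrapper and the "X-...:" label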
def parse_xcomment(soup, xcom):
com = soup.find(text=re.compile(xcom))
if com is not None:
return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
return com
def test_xcomment(soup):
return soup.find(text=re.compile('X-Message-Id')) is not None

View File

@ -8,7 +8,8 @@ DELAY = 0.2
def collect_from_url(url, name, base_archive_dir):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
# html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
threads_list = soup.find_all('tr')
@ -195,7 +196,8 @@ def collect_message(url, message):
# logging.info(" + " + url)
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
# html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
if lists.mhonarc.test_xcomment(soup):

View File

@ -69,6 +69,10 @@ class Archive():
i += 1
if nbr_hits > 0:
# nettime-l - fix (the name of the thread from ex. 'nettime-l_Jan_01' to 'January 2001')
if k.startswith("nettime-l_"):
dt = datetime.strptime(k, "nettime-l_%b_%y")
k = dt.strftime("%B_%Y")
search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
return search_results
@ -97,6 +101,12 @@ def get_key(kv_tuple):
except Exception:
pass
# nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
try:
return datetime.strptime(k, "nettime-l_%b_%y")
except Exception:
pass
print("--------------")
print(k)

View File

@ -118,7 +118,7 @@ def searh():
################################
##
## need to chache all the below
## need to cache all the below??
##
################################
@ -128,7 +128,13 @@ def searh():
a.load(l)
results.append(a.search(k_arg))
return jsonify(result=results)
## -- sort results?
search_results = sorted(results, key=get_result_key)
return jsonify(result=search_results)
def get_result_key(r):
return r['archive']

0
www/static/c3.min.css vendored Executable file → Normal file
View File

0
www/static/c3.min.js vendored Executable file → Normal file
View File

View File

@ -1,18 +1,26 @@
$(document).ready(function(){
$('#search').on('submit', function(e) {
$('#loading').hide()
$('#search').submit(function(e) {
e.preventDefault();
args = $(this).serialize();
$('#graph').empty();
$('#results').empty();
$('#loading').show()
$.get('/search?'+args, function(data) {
$('#loading').hide()
console.log(data);
$('#graph').empty();
$('#results').empty();
// $('#graph').empty();
// $('#results').empty();
$.each(data.result, function(i, item) {
search_result_archive(item);
});
graph(data);
graph(data);
});
});
});
function search_result_archive(a) {

View File

@ -16,6 +16,7 @@
{% endfor %}
</select>
<input type="submit" value="search" id="submit">
<div id="loading">Loading...</div>
</form>
<div id="graph"></div>
<div id="results"></div>