MEGA -- DB

parent 3703dcc169
commit 4197cd4d32

.gitignore (vendored, 8 changed lines)
@@ -1,7 +1,11 @@
-# mailinglists specific
+# listservs specific
archives/
figs/
config/
config.py
test.py

#macos
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
README (6 changed lines)
@@ -1,3 +1,9 @@

TODO (July 2019):
- refactor archive.py and search.py
- test lists import with mariadb backend

usage: archive.py [-h] [--arch ARCH] url [url ...]

Mailinglists are dead. Long live mailinglists!
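For example, a hypothetical invocation of the scraper following the usage line above (the list URL is illustrative, not part of this commit):

    python archive.py --arch nettime-l https://example.org/pipermail/nettime-l/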
analyse.py (deleted, 230 lines)
@@ -1,230 +0,0 @@
import os

# matplot view/windows
import matplotlib
matplotlib.interactive(True)

# pd display
import pandas as pd
pd.set_option('display.max_colwidth', 100)

from analysis.archive import Archive
from analysis.query import Query
from analysis.plot import Plot

import analysis.format

# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue

def save_fig_cohort(q, name, dir, color):
    t = name + " - Cohorts"
    pp = q.cohort().plot(color=color, title=t)
    ts = name + "_cohorts.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_messages_total(q, name, dir, color):
    t = name + " - Nbr. Messages"
    pp = q.activity_overall().plot(kind='bar', color=color, title=t)
    ts = name + "_messages.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_threads_total(q, name, dir, color):
    t = name + " - Nbr. Threads"
    pp = q.threads_overall().plot(kind='bar', color=color, title=t)
    ts = name + "_threads.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_messages_constituency(q, name, dir):
    t = name + " - Messages Constituency"
    replies = pd.Series(q.replies_overall(series=True))
    # threads = pd.Series(q.single_threads_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    messages = pd.Series(q.activity_overall(series=True))
    single_messages = messages - (replies + threads)

    # df = {'a': single_messages, 'b': threads, 'c': replies}
    # df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
    df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1)
    pp = df.plot(kind='bar', stacked=True, title=t)

    # pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)

    ts = name + "_constituency.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_avg_threads_replies(q, name, dir, color):
    t = name + " - Avg. Threads + Replies"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    messages = pd.Series(q.activity_overall(series=True))

    avg_threads_messages = (replies + threads) / messages

    pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)

    ts = name + "_avg_threads_replies.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
    t = name + " - Diff. Threads + Replies vs Single Messages"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))
    rt = replies + threads
    messages = pd.Series(q.activity_overall(series=True))

    diff_threads_messages = (2 * rt) - messages

    pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)

    ts = name + "_diff_threads_replies_messages.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def save_fig_ratio_replies_threads(q, name, dir, color):
    t = name + " - Ratio Replies per Thread"
    replies = pd.Series(q.replies_overall(series=True))
    threads = pd.Series(q.threads_overall(series=True))

    ratio_replies_threads = replies / threads

    pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)

    ts = name + "_ratio_replies_threads.png"
    filename = os.path.join(dir, ts)
    pp.get_figure().savefig(filename)

def html_td_rank_year(year, data):
    td_str = '<td class="td_list">'
    if year in data:
        td_str += analysis.format.table_threads_ranking(data[year])
    td_str += '</td>'
    return td_str

def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):

    html_str = '<table id="rankings">'

    html_str += '<tr>'
    html_str += '<td class="td_year_t">year</td>'
    html_str += '<td class="td_list_t">nettime</td>'
    html_str += '<td class="td_list_t">crumb</td>'
    html_str += '<td class="td_list_t">spectre</td>'
    html_str += '<td class="td_list_t">empyre</td>'
    html_str += '</tr>'

    years = sorted(ranking_nettime.keys())

    print(years)

    for i in years:
        html_str += '<tr>'
        html_str += '<td class="td_list">' + i + '</td>'
        html_str += html_td_rank_year(i, ranking_nettime)
        html_str += html_td_rank_year(i, ranking_crumb)
        html_str += html_td_rank_year(i, ranking_spectre)
        html_str += html_td_rank_year(i, ranking_empyre)
        html_str += '</tr>'

    html_str += '</table>'
    return html_str


print("nettime")
#nettime
nt = Archive('nettime-l')
ntq = nt.query()
ntp = Plot(ntq)


# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_constituency(ntq, 'nettime', 'figs/')

# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')

ranking_nettime = ntq.threads_ranking(rank=15)

# print(r['2000'])

# print(analysis.format.table_threads_ranking(r['2000']))


print("crumb")
#crumb
cr = Archive('crumb')
crq = cr.query()
crp = Plot(crq)

# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_constituency(crq, 'crumb', 'figs/')

# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')

ranking_crumb = crq.threads_ranking(rank=15)


print("empyre")
#empyre
em = Archive('empyre')
emq = em.query()
emp = Plot(emq)

# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_constituency(emq, 'empyre', 'figs/')

# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')

ranking_empyre = emq.threads_ranking(rank=15)

print("spectre")
#spectre
sp = Archive('spectre')
spq = sp.query()
spp = Plot(spq)

# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_constituency(spq, 'spectre', 'figs/')

# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')

ranking_spectre = spq.threads_ranking(rank=15)


## comparative ranking

rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)

html_template = 'figs/ranking/index_template.html'
with open(html_template, 'r') as fp:
    h = fp.read()

html = h.replace("--table--", rankings)

html_output = 'figs/ranking/index.html'
with open(html_output, 'w+') as fp:
    fp.write(html)
analysis/archive.py (deleted, 165 lines)
@@ -1,165 +0,0 @@
import numpy as np
import pandas as pd
import email, email.parser
import os, datetime, json, gzip, re
import analysis.util
import analysis.query

import search.archive ## circular...


def filter_date(msg, archive_name):

    time_tz = analysis.util.format_date(msg, archive_name)
    if not time_tz:
        return None

    dt = datetime.datetime.fromtimestamp(time_tz)
    try:
        date_time = pd.to_datetime(dt)
    except pd.tslib.OutOfBoundsDatetime:
        print('time out of bound')
        print(dt)
        return None

    min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
    max_date = pd.to_datetime(datetime.datetime.now())
    if date_time < min_date or date_time > max_date:
        return None

    return date_time


def message_to_tuple_record(msg, records, archive_name, references='X'):

    # check date first?
    date = filter_date(msg, archive_name)
    if not date:
        print("Archive::filter_date returned None. Skip.")
        return

    # check / filter from email address second?
    from_addr = analysis.util.format_from(msg, archive_name)
    if not from_addr:
        print("Archive::analysis.util.format_from returned None. Skip.")
        return

    url = analysis.util.format_url(msg, archive_name)
    author = analysis.util.format_author(msg, archive_name)
    subject = analysis.util.format_subject(msg, archive_name)
    message_id = analysis.util.format_id(msg, archive_name)
    content = analysis.util.format_content(msg, archive_name)

    records.append((message_id,
                    from_addr,
                    author,
                    subject,
                    date,
                    url,
                    len(content),
                    0 if not 'follow-up' in msg else len(msg['follow-up']),
                    references))

    # recursive follow up -- but references is not keeping track really...
    if 'follow-up' in msg:
        for f in msg['follow-up']:
            message_to_tuple_record(f, records, archive_name, references=message_id)

    return


def json_data_to_pd_dataframe(json_data, archive_name):

    records = []
    for d in json_data:
        for dd in d['threads']:
            message_to_tuple_record(dd, records, archive_name)

    print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records)))

    df = pd.DataFrame.from_records(records,
                                   index='date',
                                   columns=['message-id',
                                            'from',
                                            'author',
                                            'subject',
                                            'date',
                                            'url',
                                            'content-length',
                                            'nbr-references',
                                            'references'])

    df.index.name = 'date'

    return df


def load_from_file(filename, archive_name, archive_dir, json_data=None):

    if not filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename + '.json.gz')
    else:
        file_path = os.path.join(archive_dir, filename)

    if os.path.isfile(file_path):
        with gzip.open(file_path, 'r') as fp:
            json_data = json.load(fp)
            return json_data_to_pd_dataframe(json_data['threads'], archive_name)
    else:
        #list of all "filename[...].json.gz" in archive_dir
        files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
        if files:
            filename = files[-1] # take the most recent (listed alpha-chronological)
            file_path = os.path.join(archive_dir, filename)
            if os.path.isfile(file_path):
                with gzip.open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    return json_data_to_pd_dataframe(json_data['threads'], archive_name)
        else:
            #list of all json files in archive_dir/filename
            dir_path = os.path.join(archive_dir, filename)
            if not os.path.isdir(dir_path):
                return None

            files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
            if not files:
                return None

            # load all json files
            threads = []
            for file_path in files:
                with open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    threads.append(json_data)

            print('---> ' + archive_name)
            return json_data_to_pd_dataframe(threads, archive_name)


def load_from_search_archive(archive):
    threads = []
    for k, v in archive.archive.items():
        threads.append(v)
    return json_data_to_pd_dataframe(threads, archive.archive_name)


class Archive:

    data = None # "raw" json data
    dataframe = None # main pd dataframe

    def __init__(self, archive_name, archive_dir="archives"):

        if isinstance(archive_name, pd.core.frame.DataFrame):
            self.dataframe = archive_name ## no copies here

        if isinstance(archive_name, search.archive.Archive):
            self.dataframe = load_from_search_archive(archive_name)

        if isinstance(archive_name, str):
            # need a filename or a dir name....
            self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)

    def query(self):
        q = analysis.query.Query(self)
        return q
analysis/format.py (deleted, 165 lines)
@@ -1,165 +0,0 @@
import analysis.query
import logging, html, numpy
from tabulate import tabulate

def makeurl(text, url):
    return '<a href="' + url + '">' + text + "</a>"

def table_threads_ranking(ranking_dataframe):

    html_str = '<table class="threads_ranking">'

    html_str += '<tr>'
    html_str += '<td class="td_date_t">date</td>'
    html_str += '<td class="td_subject_t">subject</td>'
    html_str += '<td class="td_from_t">from</td>'
    html_str += '<td class="td_rep_t">replies</td>'
    html_str += '</tr>'

    for i, row in ranking_dataframe.iterrows():

        html_str += '<tr>'
        html_str += '<td class="td_date">' + str(i) + '</td>'
        html_str += '<td class="td_subject">' + makeurl(row['subject'], row['url']) + '</td>'
        html_str += '<td class="td_from">' + row['from'] + '</td>'
        html_str += '<td class="td_rep">' + str(row['nbr-references']) + '</td>'
        html_str += '</tr>'

    html_str += "</table>"

    return html_str

def frame_to_dictionary_threads_ranking(ranking_dataframe):

    results = []
    for i, row in ranking_dataframe.iterrows():
        d = {'date': str(i), 'subject': row['subject'], 'url': row['url'], 'from': row['from'], 'nbr-references': row['nbr-references']}
        results.append(d)
    return results


class Html:

    query = None

    def __init__(self, q=None):

        if not isinstance(q, analysis.query.Query):
            logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query")
            raise Exception()

        self.query = q

    def threads_ranking(self, rank=5, resolution=None):

        data = self.query.threads_ranking(rank=rank)

        h = html.HTML()
        t = h.table()

        r = t.tr
        r.td('date', klass='td_date_t')
        r.td('from', klass='td_from_t')
        r.td('replies', klass='td_rep_t')
        r.td('subject', klass='td_subject_t')

        for i, row in data.iterrows():
            r = t.tr

            print(row.index)

            r.td(str(row['date']), klass='td_date')
            r.td(row['from'], klass='td_from')
            r.td(str(row['nbr-references']), klass='td_rep')
            r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)

        return str(t)

    @staticmethod
    def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):

        header = []
        if data_frame.index.name in name_map:
            header.append(name_map[data_frame.index.name])
        else:
            header.append(data_frame.index.name)
        for h in data_frame.columns:
            if h in name_map:
                h = name_map[h]
            header.append(h)

        css_header = []
        css_element = []
        for i in header:
            css_header.append('td_' + i + '_t')
            css_element.append('td_' + i)

        h = html.HTML()
        if table_name:
            t = h.table(id=table_name, klass=table_name + '_t')
        else:
            t = h.table()

        # url map
        url_hash = {}
        url_skip = []
        url_keys = url_map.keys()
        for u in url_keys:
            if u in header and url_map[u] in header:
                url_indx = header.index(url_map[u])
                url_hash[header.index(u)] = url_indx
                url_skip.append(url_indx)
                header.pop(url_indx)

        #header
        r = t.tr
        n = 0
        for j in header:
            r.td(str(j), klass=css_header[n])
            n += 1

        #elements
        for k, row in data_frame.iterrows():
            r = t.tr
            r.td(str(k), klass=css_element[0])
            n = 1
            for l in row:

                if n in url_skip:
                    continue

                if isinstance(l, float):
                    if l % 1 > 0:
                        l = '{0:.4f}'.format(l)
                    else:
                        l = int(l)

                if n in url_hash.keys():
                    url = row[url_hash[n] - 1]
                    r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
                else:
                    r.td(str(l), klass=css_element[n])
                n += 1

        return str(t)

class Tab:

    @staticmethod
    def from_dataframe(data_frame, name_map={}, format=".0f"):

        header = []
        header.append(data_frame.index.name)
        for h in data_frame.columns:
            if h in name_map:
                h = name_map[h]
            header.append(h)

        return tabulate(data_frame, headers=header, floatfmt=format)
analysis/plot.py (deleted, 79 lines)
@@ -1,79 +0,0 @@
import numpy as np
import pandas as pd
import logging
import analysis.query

# for colormaps see:
# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
# http://matplotlib.org/examples/color/colormaps_reference.html
# for colors see:
# http://matplotlib.org/examples/color/named_colors.html

# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue

def bar_plot_series(series, title, color='blueviolet', ylim=None):
    return series.plot(kind='bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)

def save(plot, name):
    fig = plot.get_figure()
    fig.savefig(name)

class Plot:

    query = None

    def __init__(self, q=None):

        if not isinstance(q, analysis.query.Query):
            logging.error("Plot constructor Error: query must be of type analysis.query.Query")
            raise Exception()

        self.query = q

    '''
    activity
    '''

    def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in activity_rank:
            series.append(self.query.activity_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    '''
    content length
    '''

    def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in content_rank:
            series.append(self.query.content_length_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

    '''
    threads
    '''

    def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):

        threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys()
        series = []
        for k in threads_rank:
            series.append(self.query.threads_from(k, resolution, series=True))

        df = pd.concat(series, axis=1)

        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
analysis/query.py (deleted, 573 lines)
@@ -1,573 +0,0 @@
import numpy as np
import pandas as pd
import analysis.archive
import logging

class Query:

    archive = None # analysis.archive.Archive object
    activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    threads = None # ...
    single_threads = None
    replies = None # ...

    def __init__(self, arch=None):

        if not isinstance(arch, analysis.archive.Archive):
            logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
            raise Exception()

        self.archive = arch

    '''
    activity
    '''

    def _activity(self):

        if self.activity is None:
            from_index = self.archive.dataframe.reindex(columns=['from'])
            self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)

        return self.activity

    def activity_from(self, email_address, resolution='y', series=False):

        eaddr = email_address.replace('@', '{at}').lower()

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._activity()
        try:
            af = self.activity[eaddr]
        except KeyError:
            return None

        activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
            activity_from.index.name = 'year'
        else:
            activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            activity_from.index.name = 'year-month'

        if series:
            return activity_from

        return activity_from.to_frame('nbr-messages').astype(int)

    def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):

        self._activity()
        afr = self.activity.sum(axis=0).order(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        if series:
            return afr[:rank]

        return afr[:rank].to_frame('nbr-messages').astype(int)


    # def activity_overall(self, resolution='y', series=False):

    #     freq = 'M'
    #     if resolution.lower() == 'y':
    #         freq = 'AS'
    #     elif resolution.lower() == 'm':
    #         freq = 'M'
    #     else:
    #         return None

    #     self._activity()

    #     y = self.activity.sum(axis=1)
    #     y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()

    #     if freq == 'AS':
    #         y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
    #         y.index.name = 'year'
    #     else:
    #         y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
    #         y.index.name = 'year-month'

    #     if series:
    #         return y

    #     return y.to_frame('nbr-messages').astype(int)

    def activity_overall(self, resolution='y', series=False):

        a = self.archive.dataframe['url']

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-messages').astype(int)

    def cohort(self, resolution='m', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._activity()

        c = self.activity.idxmax().order().to_frame('date')
        c.index = c['date']

        cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()

        if freq == 'AS':
            cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))
            cohort.index.name = 'year'
        else:
            cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            cohort.index.name = 'year-month'

        if series:
            return cohort

        return cohort.to_frame('first-messages').astype(int)

    '''
    content length
    '''

    def _content_length(self):

        if self.content_length is None:
            from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
            self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
            self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)

        return self.content_length

    def content_length_from(self, email_address, resolution='y', series=False):

        eaddr = email_address.replace('@', '{at}').lower()

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._content_length()
        try:
            af = self.content_length[eaddr]
        except KeyError:
            return None

        content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
            content_length_from.index.name = 'year'
        else:
            content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            content_length_from.index.name = 'year-month'

        if series:
            return content_length_from

        return content_length_from.to_frame('nbr-bytes').astype(int)

    def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):

        self._content_length()
        cfr = self.content_length.sum(axis=0).order(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            cfr = cfr[cfr.index.str.contains(p)]

        if series:
            return cfr[:rank]

        return cfr[:rank].to_frame('nbr-bytes').astype(int)

    def content_length_overall(self, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._content_length()

        y = self.content_length.sum(axis=1)
        y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-bytes').astype(int)


    '''
    threads
    '''

    def _threads(self, thresh=0):

        print("doing threads")

        if self.threads is None:
            self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)

        if self.single_threads is None:
            self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from', 'nbr-references', 'subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)

        return self.threads

    def threads_ranking(self, rank=5, resolution='y'):

        self._threads()

        if resolution is None:
            data = self.threads.drop('message-id', axis=1)[:rank]
            return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        # get the threads ranking per time resolution
        #
        data = self.threads.drop('message-id', axis=1)
        data = data.groupby([pd.TimeGrouper(freq=freq)])
        r = {}
        for k, v in data:
            if freq == 'AS':
                time_key = k.strftime('%Y')
            else:
                time_key = k.strftime('%Y-%m')
            frame = v[:rank]
            frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
            r[time_key] = frame
        return r

    def threads_replies_to(self, email_address, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        self._threads()

        eaddr = email_address.replace('@', '{at}').lower()

        self._threads()
        threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
        threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
        threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)

        if series:
            return threads_from_ranking[eaddr]

        threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)

        if freq == 'AS':
            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
            threads_from_ranking.index.name = 'year'
        else:
            threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            threads_from_ranking.index.name = 'year-month'

        return threads_from_ranking

    def threads_replies_to_ranking(self, rank=5, filter_nettime=True):

        self._threads()

        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)

        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tfr = tfr[tfr.index.str.contains(p)]

        tfr = tfr[:rank].astype(int)
        return tfr

    def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):

        self._threads()
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        if series:
            return tir[:rank]

        return tir[:rank].to_frame('nbr-initiated-threads').astype(int)

    def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):

        # activity
        self._activity()
        afr = self.activity.sum(axis=0).astype(int)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        # initiated threads [top 25]
        self._threads()
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        inter = afr.index.intersection(tir.index)
        avg = tir[inter] / afr[inter]

        labels = ['messages', 'threads', 'avg.threads']
        return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]

    def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):

        self._threads()

        #initiated
        tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tir = tir[tir.index.str.contains(p)]

        #replies [top 25]
        tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            tfr = tfr[tfr.index.str.contains(p)]
        tfr = tfr['nbr-references'] # dataframe to series

        inter = tir.index.intersection(tfr.index)
        avg = tfr[inter] / tir[inter]

        labels = ['threads', 'replies', 'avg.replies']
        return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]


    def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        agg = aggregate.lower()
        if not agg in ['sum', 'mean', 'count']:
            return None

        if not self.threads is None:
            del self.threads
            self.threads = None

        self._threads(tresh)

        if agg == 'sum':
            # number of replies total (re: sum all the replies)
            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
        elif agg == 'mean':
            y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
        else:
            # number of threads (re: msgs with at least one reply)
            y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-threads').astype(int)

    def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        agg = aggregate.lower()
        if not agg in ['sum', 'mean', 'count']:
            return None

        if not self.single_threads is None:
            del self.single_threads
            self.single_threads = None

        self._threads(tresh)

        y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-threads').astype(int)


    '''
    replies
    '''

    def _replies(self):

        if self.replies is None:
            self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from', 'references'])
            self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from', 'references'])
        return self.replies

    def replies_ranking(self, rank=5, resolution=None):

        self._replies()

        if resolution is None:
            data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
            return data.to_frame('nbr_replies')

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        # get the replies ranking per time resolution
        #
        data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
        r = {}
        for k, v in data:
            if freq == 'AS':
                time_key = k.strftime('%Y')
            else:
                time_key = k.strftime('%Y-%m')
            frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
            r[time_key] = frame.to_frame('nbr-replies')
        return r

    def replies_avg_ranking(self, rank=5, filter_nettime=True):

        # activity
        self._activity()
        afr = self.activity.sum(axis=0)
        if filter_nettime:
            p = r'^((?!nettime*).)*$'
            afr = afr[afr.index.str.contains(p)]

        # replies in thread [top 25]

        self._replies()
        rpl = data = self.replies.groupby('from').size().sort_values(ascending=False)[:25]

        inter = afr.index.intersection(rpl.index)
        avg = rpl[inter] / afr[inter]

        labels = ['messages', 'replies', 'avg.replies']
        return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]

    def replies_overall(self, resolution='y', series=False):

        freq = 'M'
        if resolution.lower() == 'y':
            freq = 'AS'
        elif resolution.lower() == 'm':
            freq = 'M'
        else:
            return None

        if not self.replies is None:
            del self.replies
            self.replies = None

        self._replies()

        y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()

        if freq == 'AS':
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
            y.index.name = 'year'
        else:
            y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
            y.index.name = 'year-month'

        if series:
            return y

        return y.to_frame('nbr-replies').astype(int)
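For reference, a short sketch of how this (now deleted) Query API was driven -- it mirrors the calls made in analyse.py above:

    import analysis.archive

    q = analysis.archive.Archive('nettime-l').query()
    yearly = q.activity_overall(resolution='y')                # DataFrame, index 'year', column 'nbr-messages'
    monthly = q.activity_overall(resolution='m', series=True)  # plain pandas Series, index 'year-month'
    top = q.threads_ranking(rank=15)                           # dict mapping year -> ranked DataFrame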
analysis/util.py (deleted, 92 lines)
@@ -1,92 +0,0 @@
import email
import hashlib

def format_content(msg, archive_name):
    return msg['content']

def format_url(msg, archive_name):
    return msg['url']

def format_author(msg, archive_name):
    return msg['author_name']

def format_from_token(from_str, sep):

    fff = from_str

    from_addr = email.utils.parseaddr(from_str)[1]

    fffa = email.utils.parseaddr(from_str)

    if sep not in from_addr:
        tok = from_str.split()
        try:
            at = tok.index(sep)
            from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
            if from_addr.startswith('<') or from_addr.endswith('>'):
                from_addr = from_addr.strip('<').strip('>')
        except ValueError:
            print(tok)
            print("error formatting 'from' " + from_str + " -- expecting sep: " + sep)
            print("*** " + fff)
            print("+++")
            print(fffa)
            print("----")

            return None
    else:
        from_addr = from_addr.replace(sep, '{AT}')
    return from_addr.lower()

def format_from(msg, archive_name):
    from_str = msg['from']

    if " {AT} " in from_str:
        return format_from_token(from_str, '{AT}')
    elif " at " in from_str:
        return format_from_token(from_str, 'at')
    elif "@" in from_str:
        return format_from_token(from_str, '@')
    else:
        return from_str

# returns utc timestamp
def format_date(msg, archive_name):
    date_str = msg['date']
    time_tz = None
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
    except TypeError:
        print("Format Date TypeError")
        print(" > " + date_str)
        return None
    except ValueError:
        print("Format Date ValueError")
        print(" > " + date_str)
        return None
    finally:
        return time_tz

def format_subject(msg, archive_name):
    return msg['subject']

def format_id(msg, archive_name):
    if "message-id" in msg:
        return msg['message-id']
    else:
        # create hash with author_name + date
        s = msg['author_name'] + msg['date']
        sha = hashlib.sha1(s.encode('utf-8'))
        return sha.hexdigest()

# format='%d/%m/%Y'
def min_date(archive_name):
    if "nettime" in archive_name:
        return '01/10/1995'
    elif archive_name == "spectre":
        return '01/08/2001'
    elif archive_name == "empyre":
        return '01/01/2002'
    elif archive_name == "crumb":
        return '01/02/2001'
archive/archive.py (new file, 257 lines)
@@ -0,0 +1,257 @@
import email, email.parser
import os, json, gzip, re
import mysql.connector as mariadb
import archive.sql, archive.util
from datetime import date, datetime
from dateutil import parser
import terminal.progress

def load_from_file(filename, archive_name, archive_dir):

    if not filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename + '.json.gz')
    else:
        file_path = os.path.join(archive_dir, filename)

    if os.path.isfile(file_path):
        with gzip.open(file_path, 'r') as fp:
            json_data = json.load(fp)
            return (json_data, archive_name)
    else:
        #list of all "filename[...].json.gz" in archive_dir
        files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
        if files:
            filename = files[-1] # take the most recent (listed alpha-chronological)
            file_path = os.path.join(archive_dir, filename)
            if os.path.isfile(file_path):
                with gzip.open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    return (json_data, archive_name) # <--- this makes no sense....
        else:
            #list of all json files in archive_dir/filename
            dir_path = os.path.join(archive_dir, filename)
            if not os.path.isdir(dir_path):
                return None

            files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
            if not files:
                return None

            # load all json files
            threads = []
            for file_path in files:
                with open(file_path, 'r') as fp:
                    json_data = json.load(fp)
                    threads.append(json_data)

            return (threads, archive_name)

def connect_db(database, host, user, password):

    try:
        con = mariadb.connect(host=host, user=user, password=password, database=database)
    except mariadb.Error as error:
        print("Error: {}".format(error))
        if error.errno == 1049:
            # NOTE: 'util' and 'archive_name' are not defined in this scope,
            # and 'con' is unbound on this path, so the finally clause below
            # will raise NameError instead of returning None.
            if util.y_n_question("Table " + archive_name + " does not exist. Create it?"):
                print("creating")
            else:
                print("not creating")
        return None
    finally:
        return con


class Archive:

    data = None # "raw" json data
    db_con = None

    # NOTE: Python does not overload constructors -- only the last __init__
    # below is in effect; the first two definitions are shadowed.
    def __init__(self, archive_name, archive_dir):

        if isinstance(archive_name, str):
            # need a filename or a dir name....
            print("reading archive " + archive_name, end='')
            (self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
            print(" - done.")

    def __init__(self, archive_name, database, host, user, password):

        self.archive_name = archive_name
        self.db_con = connect_db(database, host, user, password)

    def __init__(self, archive_name, config):

        self.archive_name = archive_name
        self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.db_con is not None:
            self.db_con.close()


    def create_db(self, host, database, user, password):

        print("creating table: " + self.archive_name, end='')
        self.db_con = connect_db(database, host, user, password)
        if self.db_con is None:
            return

        try:
            cursor = self.db_con.cursor()
            cursor.execute(archive.sql.CREATE.format(self.archive_name))
        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

        print(" - done.")

    def insert_db(self, host, database, user, password):

        self.db_con = connect_db(database, host, user, password)

        if self.db_con is None:
            return

        try:
            cursor = self.db_con.cursor()

            progress = terminal.progress.ProgressBar(self.archive_name, len(self.data), fmt=terminal.progress.ProgressBar.FULL)

            for t in self.data:

                n_inserted = self.recursive_insert_db(cursor, t["threads"])
                # print(" - insert: " + str(n_inserted), end='')
                if n_inserted > 0:
                    self.db_con.commit()

                progress.current += 1
                progress()

            progress.done()
            self.db_con.commit()

        except mariadb.Error as error:
            pass
            # print("Error: {}".format(error))
        finally:
            cursor.close()

    def recursive_insert_db(self, cursor, thread):

        n_inserted = 0
        for m in thread:
            try:

                from_ = archive.util.format_from(m)
                author_name_ = archive.util.format_author(m)
                to_ = archive.util.format_to(m)
                date_ = archive.util.format_date(m, self.archive_name)

                if date_ is None or from_ is None:
                    # print("\nerrorororororo")
                    # print(m['from'] + " -- " + m['date'])
                    continue

                cursor.execute(archive.sql.INSERT, (from_, author_name_, to_, m["subject"], date_, m["content-type"], m["content"], m["url"]))
                n_inserted += 1

                if "follow-up" in m:
                    n_inserted += self.recursive_insert_db(cursor, m["follow-up"])

            except mariadb.Error as error:
                if error.errno == 1062:
                    # duplicate entry: skip and continue <-- look this up...
                    # print("\nError: {}".format(error))
                    continue

        return n_inserted

    def content_search(self, term, bool=True):

        if self.db_con is None:
            print("No connection to database...")
            return

        try:
            cursor = self.db_con.cursor(buffered=True)
            if bool:
                cursor.execute(archive.sql.CONTENT_QUERY_BOOLEAN.format(self.archive_name, term))
            else:
                cursor.execute(archive.sql.CONTENT_QUERY_NL.format(self.archive_name, term))

            # print(cursor.rowcount)
            results = []
            for (from_, author_name_, subject_, date_, url_) in cursor:
                results.append((from_, author_name_, subject_, date_, url_))
                # print("{} {} {}".format(from_, str(date_), url_))
            return results

        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

    def from_search(self, term, bool=True):

        if self.db_con is None:
            print("No connection to database...")
            return

        try:
            cursor = self.db_con.cursor(buffered=True)
            if bool:
                cursor.execute(archive.sql.FROM_QUERY_BOOLEAN.format(self.archive_name, term))
            else:
                cursor.execute(archive.sql.FROM_QUERY_NL.format(self.archive_name, term))

            # print(cursor.rowcount)
            results = []
            for (from_, author_name_, subject_, date_, url_) in cursor:
                results.append((from_, author_name_, subject_, date_, url_))
                # print("{} {} {}".format(from_, str(date_), url_))
            return results

        except mariadb.Error as error:
            print("Error: {}".format(error))
        finally:
            cursor.close()

    # analysis
    def longest_field(self, field, thread, max_length=0):
        import archive.util
        for m in thread:
            if not field in m:
                if "threads" in m:
                    max_length = self.longest_field(field, m["threads"], max_length)
                continue
            if m[field] is None:
                continue
            if field == "from":
                m[field] = archive.util.format_from(m)
            elif field == "author_name":
                m[field] = archive.util.format_author(m)
            elif field == "to":
                m[field] = archive.util.format_to(m)
            elif field == "date":
                m[field] = str(archive.util.format_date(m, self.archive_name))

            if m[field] is None:
                continue

            l = len(m[field])
            if l > max_length:
                max_length = l
                print(">> " + m[field])
            if "follow-up" in m:
                max_length = self.longest_field(field, m["follow-up"], max_length)
        return max_length
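A minimal usage sketch of the new DB-backed Archive, assuming the config-dict constructor; the database name and credentials are hypothetical, not part of this commit:

    import archive.archive

    config = {'database': 'listservs', 'host': 'localhost',
              'user': 'user', 'password': 'secret'}  # hypothetical credentials

    with archive.archive.Archive('nettime_l', config) as arch:
        # full-text search over subject_/content_ (boolean mode by default)
        results = arch.content_search('tactical media')
        for (from_, author_name_, subject_, date_, url_) in results or []:
            print(subject_, url_)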
archive/sql.py (new file, 31 lines)
@@ -0,0 +1,31 @@
CREATE = "CREATE TABLE `{}` (" \
         "`from_` varchar(85) NOT NULL," \
         "`author_name_` varchar(200) NOT NULL," \
         "`to_` text(60)," \
         "`subject_` varchar(3500) NOT NULL," \
         "`date_` datetime NOT NULL," \
         "`content_type_` varchar(15) NOT NULL," \
         "`content_` mediumtext NOT NULL," \
         "`url_` varchar(100) NOT NULL," \
         "PRIMARY KEY(`from_`, `date_`)," \
         "FULLTEXT (`subject_`, `content_`)," \
         "FULLTEXT (`from_`, `author_name_`)" \
         ") ENGINE = InnoDB;"

# NOTE: unlike CREATE above, the table name here is hardcoded to nettime_l.
INSERT = ("INSERT INTO nettime_l"
          "(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
          "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")

CONTENT_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                         "WHERE MATCH(subject_, content_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")

CONTENT_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                    "WHERE MATCH(subject_, content_) AGAINST('{}') ORDER BY date_")

FROM_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                      "WHERE MATCH(from_, author_name_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")

FROM_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
                 "WHERE MATCH(from_, author_name_) AGAINST('{}') ORDER BY date_")

# SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l WHERE MATCH(content_) AGAINST('%s' IN BOOLEAN MODE)
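To make the template mechanics concrete, here is how archive.py fills these statements (the table name and search term are illustrative):

    import archive.sql

    # str.format substitutes the table name and the raw search term:
    stmt = archive.sql.CONTENT_QUERY_BOOLEAN.format('nettime_l', 'tactical +media')
    # SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l
    # WHERE MATCH(subject_, content_) AGAINST('tactical +media' IN BOOLEAN MODE) ORDER BY date_

Note that the search term is interpolated into the SQL string rather than passed as a bound parameter, so quotes in the term will break (or inject into) the query.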
225
archive/util.py
Executable file
225
archive/util.py
Executable file
@ -0,0 +1,225 @@
|
||||
import email, datetime, sys
|
||||
import hashlib
|
||||
import dateparser
|
||||
|
||||
def format_content(msg):
|
||||
return msg['content']
|
||||
|
||||
def format_url(msg):
|
||||
return msg['url']
|
||||
|
||||
def format_author(msg):
|
||||
|
||||
if 'author_name' not in msg or msg['author_name'] is None:
|
||||
return None
|
||||
|
||||
author_str = msg['author_name'].replace('"', '')
|
||||
|
||||
if "by way of" in author_str:
|
||||
toks = author_str.split("by way of")
|
||||
if toks[0] == "":
|
||||
author_str = format_from(msg)
|
||||
elif toks[0][-1] == "(":
|
||||
author_str = toks[0][:-1].strip()
|
||||
else:
|
||||
author_str = toks[0]
|
||||
|
||||
if ("(" in author_str) or ("<" in author_str):
|
||||
# ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault <zx {AT} xyz.net>
|
||||
# print("±±±±±±")
|
||||
# print("name: " + author_str)
|
||||
# print("from: " + msg['from'])
|
||||
if not '@' in author_str.lower().replace('{at}', '@').replace(' at ', '@'):
|
||||
author_str = author_str.split('(')[0].strip()
|
||||
else:
|
||||
author_str = email.utils.parseaddr(author_str)[0]
|
||||
# print(" Name:" + author_str.replace('"', ''))
|
||||
# print(" From:" + format_from(msg))
|
||||
|
||||
if " ," in author_str:
|
||||
# nettime's_roving_reporter , thing.net {AT} bbs.thing.net
|
||||
author_str = author_str.split(' ,')[0]
|
||||
|
||||
|
||||
return author_str
|
||||
|
||||
def format_from_token(from_str, sep):
|
||||
from_addr = email.utils.parseaddr(from_str)[1]
|
||||
if sep not in from_addr:
|
||||
tok = from_str.split()
|
||||
try:
|
||||
at = tok.index(sep)
|
||||
from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
|
||||
if from_addr.startswith('<') or from_addr.endswith('>'):
|
||||
from_addr = from_addr.strip('<').strip('>')
|
||||
except ValueError:
|
||||
print(tok)
|
||||
print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
|
||||
return None
|
||||
else:
|
||||
from_addr = from_addr.replace(sep, '{AT}')
|
||||
return "".join(from_addr.lower().split())
|
||||
|
||||
def format_from(msg):
|
||||
|
||||
if 'from' not in msg or msg['from'] is None:
|
||||
return None
|
||||
|
||||
from_str = msg['from']
|
||||
|
||||
if " {AT} " in from_str:
|
||||
return format_from_token(from_str, '{AT}')
|
||||
elif " at " in from_str:
|
||||
return format_from_token(from_str, 'at')
|
||||
elif "@" in from_str:
|
||||
return format_from_token(from_str, '@')
|
||||
else:
|
||||
return "".join(from_str.split())
|
||||
|
||||
def format_to(msg):
|
||||
|
||||
if "to" not in msg or msg["to"] is None:
|
||||
return None
|
||||
|
||||
to_str = msg["to"]
|
||||
toks = email.utils.parseaddr(to_str)
|
||||
# print(toks)
|
||||
|
||||
if len(toks) == 2:
|
||||
to_str = toks[1]
|
||||
|
||||
return "".join(to_str.lower().split())
|
||||
|
||||
|
||||
# returns a utc timestamp --- older variant; see format_date below
def format_date_utc(msg, archive_name):

    if 'date' not in msg or msg['date'] is None:
        return None

    date_str = msg['date'].replace('.', '')
    time_tz = None
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        print("Format Date TypeError")
        print(" > " + date_str)
    except ValueError:
        print("Format Date ValueError")
        print(" > " + date_str)
    return time_tz

def format_date(msg, archive_name):

    if 'date' not in msg or msg['date'] is None:
        return None

    date_str = msg['date']

    # fix ex. 'Thu, 01 Aug 2002 17:33:08 +0900 (JST)' -- drop the parenthesised zone name
    if '(' in date_str:
        date_str = date_str.split('(')[0].rstrip()

    date_time = dateparser.parse(date_str)
    if date_time is None:

        # patch up the common malformed dates, then retry
        fix = False
        toks = date_str.split()

        if len(toks[-1]) == 5 or len(toks[-1]) == 4:
            # ex. Thu, 24 Jan 2002 15:21:31 -0000
            if toks[-1] in ['+0000', '-0000', '0000']:
                date_str = date_str[:-5]
                fix = True
            # ex. Fri, 25 Jan 2002 13:21:49 +1050
            elif toks[-1][-2] == '5':
                d = list(date_str)
                d[-2] = '3'
                date_str = "".join(d)
                fix = True

            if toks[-1][-1] != '0':
                # ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
                date_str = date_str[:-5]
                fix = True

        if 'Fru' in toks[0]:
            date_str = date_str.replace('Fru', 'Fri')
            fix = True
        elif 'Thur' in toks[0]:
            date_str = date_str.replace('Thur', 'Thu')
            fix = True

        if not fix:
            return None

        date_time = dateparser.parse(date_str)
        if date_time is None:

            fix = False
            if 'GMT' in date_str:
                # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
                date_str = date_str.split('GMT')[0].rstrip()
                fix = True

            if 'METDST' in toks[-1]:
                # ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
                date_str = date_str.replace('METDST', 'MET')
                fix = True

            if not fix:
                return None

            date_time = dateparser.parse(date_str)
            return date_time

    min_d = datetime.datetime.strptime(min_date(archive_name), "%d/%m/%Y")
    max_d = datetime.datetime.now()

    date_time_naive = date_time.replace(tzinfo=None)

    if date_time_naive < min_d or date_time_naive > max_d:
        return None

    return date_time

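# Hedged sketch (hypothetical fixture): the '(JST)' comment is stripped before
# parsing, and the result is range-checked against the list's start date:
d = format_date({'date': 'Thu, 01 Aug 2002 17:33:08 +0900 (JST)'}, 'nettime-l')
assert d is not None and d.year == 2002
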
def format_subject(msg, archive_name):

    if 'subject' not in msg or msg['subject'] is None:
        return None

    return msg['subject']


def format_id(msg, archive_name):
    if "message-id" in msg:
        return msg['message-id']
    else:
        # no Message-ID header: derive a stable surrogate key from author_name + date
        s = msg['author_name'] + msg['date']
        sha = hashlib.sha1(s.encode('utf-8'))
        return sha.hexdigest()


# earliest plausible message date per list, formatted '%d/%m/%Y'
def min_date(archive_name):
    if "nettime" in archive_name:
        return '01/10/1995'
    elif archive_name == "spectre":
        return '01/08/2001'
    elif archive_name == "empyre":
        return '01/01/2002'
    elif archive_name == "crumb":
        return '01/02/2001'
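# Hedged sketch (hypothetical fixture): the fallback key is a 40-char SHA-1 hex digest:
mid = format_id({'author_name': 'Jane Doe', 'date': 'Thu, 01 Aug 2002 17:33:08 +0900'}, 'nettime-l')
assert len(mid) == 40
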
@ -1,29 +1,34 @@
name: listservs
name: listserv
channels:
- defaults
- defaults
dependencies:
- beautiful-soup=4.3.2=py34_0
- click=6.7=py34_0
- flask=0.12=py34_0
- gunicorn=19.1.0=py34_0
- html5lib=0.999=py34_0
- itsdangerous=0.24=py34_0
- jinja2=2.9.6=py34_0
- markupsafe=0.23=py34_2
- openssl=1.0.2l=0
- pastedeploy=1.5.2=py34_1
- pip=9.0.1=py34_1
- python=3.4.5=0
- readline=6.2=2
- setuptools=27.2.0=py34_0
- six=1.10.0=py34_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- werkzeug=0.11.15=py34_0
- wheel=0.29.0=py34_0
- xz=5.2.2=1
- zlib=1.2.8=3
- pip:
  - beautifulsoup4==4.3.2
  - webencodings==0.5.1
- ca-certificates=2019.5.15=0
- openssl=1.0.2s=h1de35cc_0
- pip=9.0.1=py34_1
- python=3.4.5=0
- readline=6.2=2
- setuptools=27.2.0=py34_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py34_0
- xz=5.2.4=h1de35cc_4
- zlib=1.2.11=h1de35cc_3
- pip:
  - beautifulsoup4==4.7.1
  - click==7.0
  - dateparser==0.7.1
  - flask==1.0.4
  - gunicorn==19.9.0
  - itsdangerous==1.1.0
  - jinja2==2.10.1
  - markupsafe==1.1.1
  - mysql-connector-python==8.0.16
  - protobuf==3.8.0
  - python-dateutil==2.8.0
  - pytz==2019.1
  - regex==2019.6.8
  - six==1.12.0
  - soupsieve==1.9.2
  - tzlocal==1.5.1
  - werkzeug==0.15.4

@ -1,150 +0,0 @@
import logging, os, json, re
from datetime import datetime

import analysis.archive  ## circular...
import analysis.query
import analysis.format

import threading


class Archive():

    def __init__(self, archives_dir=None):
        if archives_dir is None:
            from www import config
            self.archives_dir = config.ARCHIVES_PATH
        else:
            self.archives_dir = archives_dir

        self.loaded = False

        self.lock_search = threading.Lock()
        self.lock_threads_ranking = threading.Lock()

    def load(self, archive_name=None):

        if archive_name is None:
            raise Exception('Archive is not specified')

        archive_path = os.path.join(self.archives_dir, archive_name)
        if not os.path.isdir(archive_path):
            raise Exception('Archive ' + archive_path + ' does not exist')

        self.archive_name = archive_name
        self.archive_path = archive_path

        files = [f for f in os.listdir(archive_path) if f.endswith('.json')]

        self.archive = {}

        for f in files:
            file_path = os.path.join(archive_path, f)
            label = f.replace('.json', '')
            with open(file_path) as fdata:
                self.archive[label] = json.load(fdata)

        self.loaded = True

    def search_message(self, keyword, msg, index_str, results, field='content'):

        nbr_hits = 0
        # find() returns -1 on a miss; testing != -1 also counts a hit at position 0
        if msg[field] is not None and msg[field].lower().find(keyword.lower()) != -1:
            nbr_hits += 1
            results.append({ "index_str": index_str, "subject": msg['subject'], "date": msg['date'], "author_name": msg['author_name'], "url": msg['url'] })

        if 'follow-up' in msg:
            i = 0
            for m in msg['follow-up']:
                current_index_str = index_str + '/' + str(i)
                nbr_hits += self.search_message(keyword, m, current_index_str, results, field)
                i += 1

        return nbr_hits

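    # Hedged sketch: follow-ups nest arbitrarily deep, and every hit records a
    # slash-separated path back to the thread root (fixtures are hypothetical):
    #   a = Archive(archives_dir='/tmp')
    #   msg = {'content': 'tactical media', 'subject': 's', 'date': 'd',
    #          'author_name': 'x', 'url': 'u',
    #          'follow-up': [{'content': 'more tactical media', 'subject': 's2',
    #                         'date': 'd', 'author_name': 'y', 'url': 'u2'}]}
    #   hits = []
    #   assert a.search_message('tactical', msg, 'nettime/2002/0', hits) == 2
    #   assert hits[1]['index_str'] == 'nettime/2002/0/0'
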
    def search(self, keyword, field='content', min_hits=0):

        with self.lock_search:

            search_results = { "keyword": keyword, "field": field, "archive": self.archive_name, "results": [] }

            for k, v in sorted(self.archive.items(), key=get_key, reverse=True):

                current_index_str = self.archive_name + '/' + k
                hits = []
                nbr_hits = 0
                i = 0
                for m in v['threads']:
                    current_index_str = self.archive_name + '/' + k + '/' + str(i)
                    nbr_hits += self.search_message(keyword, m, current_index_str, hits, field)
                    i += 1

                if nbr_hits > min_hits:
                    # nettime-l - fix (rename the thread from ex. 'nettime-l_Jan_01' to 'January_2001')
                    if k.startswith("nettime-l_"):
                        dt = datetime.strptime(k, "nettime-l_%b_%y")
                        k = dt.strftime("%B_%Y")
                    search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})

            return search_results

    def threads_ranking(self, rank=5):

        with self.lock_threads_ranking:

            search_results = { "keyword": "thread ranking", "field": "ranking", "archive": self.archive_name, "results": [] }

            a = analysis.archive.Archive(self)
            q = a.query()

            ranking = q.threads_ranking(rank=rank)

            for i in ranking:
                r = analysis.format.frame_to_dictionary_threads_ranking(ranking[i])
                for h in r:
                    hit = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]
                    search_results['results'].append({'thread': h['date'], 'nbr_hits': h['nbr-references'], 'hits': hit})
            del a
            del q

            return search_results

def get_key(kv_tuple):

    k = kv_tuple[0]

    # k is of the form "Month_Year" - ex.: "January_2001"
    try:
        return datetime.strptime(k, "%B_%Y")
    except Exception:
        pass

    # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
    try:
        return datetime.strptime(k, "%b_%y")
    except Exception:
        pass

    # k is of the form "Year" - ex.: "2001"
    try:
        return datetime.strptime(k, "%Y")
    except Exception:
        pass

    # nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
    try:
        return datetime.strptime(k, "nettime-l_%b_%y")
    except Exception:
        pass

    print("get_key: unrecognized key format: " + k)

    return None

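# Hedged sketch: mixed key styles still sort chronologically once mapped through get_key:
keys = ['Jan_01', 'January_2001', 'nettime-l_Feb_99', '2000']
assert sorted(keys, key=lambda k: get_key((k, None))) == ['nettime-l_Feb_99', '2000', 'Jan_01', 'January_2001']
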
43
terminal/progress.py
Normal file
@ -0,0 +1,43 @@
from __future__ import print_function
import sys
import re


# https://stackoverflow.com/questions/3160699/python-progress-bar

class ProgressBar(object):
    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'

    def __init__(self, title, total, width=40, fmt=DEFAULT, symbol='=',
                 output=sys.stderr):
        assert len(symbol) == 1

        self.title = title
        self.total = total
        self.width = width
        self.symbol = symbol
        self.output = output
        self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
                          r'\g<name>%dd' % len(str(total)), fmt)

        self.current = 0

    def __call__(self):
        percent = self.current / float(self.total)
        size = int(self.width * percent)
        remaining = self.total - self.current
        bar = self.title + ' [' + self.symbol * size + ' ' * (self.width - size) + ']'

        args = {
            'total': self.total,
            'bar': bar,
            'current': self.current,
            'percent': percent * 100,
            'remaining': remaining
        }
        print('\r' + self.fmt % args, file=self.output, end='')

    def done(self):
        self.current = self.total
        self()
        print('', file=self.output)

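# Hedged usage sketch (a plain counting loop stands in for real work):
if __name__ == '__main__':
    bar = ProgressBar('demo', total=50, fmt=ProgressBar.FULL)
    for _ in range(50):
        bar.current += 1
        bar()      # redraw in place via the leading '\r'
    bar.done()     # snap to 100% and emit the trailing newline
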
16
terminal/util.py
Normal file
@ -0,0 +1,16 @@
import sys


def y_n_question(question_str):

    yes = {'yes', 'y', 'ye', ''}
    no = {'no', 'n'}

    while True:
        sys.stdout.write(question_str + " [Y/n]: ")
        choice = input().lower()
        if choice in yes:
            return True
        elif choice in no:
            return False
        else:
            sys.stdout.write("\nPlease respond with 'yes' or 'no'\n")
            continue

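# Hedged usage sketch (blocks on stdin; a bare Enter counts as yes):
if __name__ == '__main__':
    if y_n_question('Proceed?'):
        print('continuing')
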
@ -1,2 +1 @@

gunicorn -w 1 -b 127.0.0.1:5555 www-serve:app
gunicorn -w 1 --bind 0.0.0.0:5555 www-serve:app
@ -1,2 +1,4 @@
from www import app
#app.run(debug=True, threaded=True, use_reloader=False) # uncomment this line to run flask's server

if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)
159
www/routes.py
@ -1,144 +1,46 @@
from flask import render_template, request, jsonify
from www import app
from www import archives
import search.archive
import archive.archive as archive
import config
import www.config as wconfig
from datetime import datetime

import logging
logging.info(' ------- arch = Archives() -------- ')
arch = archives.Archives()
arch.load()
archives_data = arch.data


@app.route('/')
def index():
    k = archives_data.keys()
    return render_template("index.html", archives=k)


@app.route('/<list>')
def get_list(list):
    if list in archives_data:
        d = []
        for k, v in sorted(archives_data[list].archive.items(), key=search.archive.get_key, reverse=True):
            d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
        return render_template("list.html", list_name=list, list=d)

    else:
        return 'nee nee'


@app.route('/<list>/<sublist>')
def get_sublist(list, sublist):

    print(list)
    print(sublist)

    sublist = sublist.replace(' ', '_')
    if list in archives_data and sublist in archives_data[list].archive:
        return render_template("threads.html", sublist_name=sublist, threads=archives_data[list].archive[sublist]['threads'])
    else:
        return 'na na'


@app.route('/<list>/<sublist>/<int:index>')
def get_message(list, sublist, index):

    sublist = sublist.replace(' ', '_')
    index = int(index)
    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
        return render_template("message.html", message=archives_data[list].archive[sublist]['threads'][index])
    else:
        return 'non non'


@app.route('/<list>/<sublist>/<int:index>/<path:follow_ups>')
def get_follow_ups(list, sublist, index, follow_ups):

    sublist = sublist.replace(' ', '_')
    index = int(index)

    ups = follow_ups.split('/')
    follow = []
    for u in ups:
        follow.append(int(u))

    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
        message = archives_data[list].archive[sublist]['threads'][index]
        for f in follow:
            message = message['follow-up'][f]
        return render_template("message.html", message=message)
    else:
        return render_template("index.html")


@app.route('/search')
def searh():

    if len(request.args) < 1:
        k = archives_data.keys()
        return render_template("search.html", archives=k, fields=['content', 'from(name)', 'from(email)'], hits=['n/a', '2', '3', '4', '5', '6', '7', '8', '9'])
        return render_template("search.html", archives=wconfig.lists_to_serve, fields=['content', 'from'])

    k_arg = request.args.get('keyword')
    l_arg = request.args.get('list')
    sl_arg = request.args.get('sublist')
    f_arg = request.args.get('field')
    h_arg = request.args.get('hits')

    if k_arg is None or k_arg.strip() == '':
        return "no keyword..."

    if l_arg is None:
        return "no list..."

    if not (l_arg == "all") and not (l_arg in archives_data):
    if l_arg != "all" and l_arg not in wconfig.lists_to_serve:
        return "list '" + l_arg + "' does not exist"

    if sl_arg is not None:
        if sl_arg not in archives_data[l_arg]:
            return "sublist '" + sl_arg + "' does not exist in list '" + l_arg + "'"
    if f_arg not in ['content', 'from']:
        return "field '" + f_arg + "' does not exist"

    if f_arg == "from(name)":
        f_arg = 'author_name'
    elif f_arg == "from(email)":
        f_arg = 'from'

    lists = []
    if l_arg == "all":
        for k in archives_data.keys():
            lists.append(k)
        lists = wconfig.lists_to_serve
    else:
        lists.append(l_arg)

    nbr_hits = 0
    if h_arg in ['2', '3', '4', '5', '6', '7', '8', '9']:
        nbr_hits = int(h_arg)


    ################################
    ##
    ## need to cache all the below
    ## need to cache all the below.....
    ##
    ################################

@ -147,18 +49,41 @@ def searh():
    logging.info("search keyword = " + k_arg)

    for l in lists:
        if k_arg == "rank":
            logging.info(" ranking " + l)
            s = archives_data[l].threads_ranking()
        else:
            s = archives_data[l].search(keyword=k_arg, field=f_arg, min_hits=nbr_hits)

        with archive.Archive(l, config=config.db) as a:
            if f_arg == 'content':
                r = a.content_search(k_arg)
            else:
                r = a.from_search(k_arg)

        results.append(s)
            # format data to return
            search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] }
            month_year_results = {}

    ## -- sort results?
    search_results = sorted(results, key=get_result_key)
            for (from_, author_name_, subject_, date_, url_) in r:
                m_y = date_.strftime("%B_%Y")
                if m_y not in month_year_results:
                    month_year_results[m_y] = []
                month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_})

    return jsonify(result=search_results)
            for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
                search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v})

            # search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
            # where:
            #   'thread' = "%B_%Y" aka. January 2001
            #   'nbr_hits' = nbr hits for that month
            #   'hits' = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]

            results.append(search_results)


    sorted_results = sorted(results, key=get_result_key)
    return jsonify(result=sorted_results)


def get_key(kv):
    return datetime.strptime(kv[0], "%B_%Y")

def get_result_key(r):
    return r['archive']

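# Hedged sketch of the kind of boolean-mode full-text query content_search
# presumably issues -- table and column names follow the commented SELECT near
# the top of this diff; connection parameters are hypothetical:
import mysql.connector

cnx = mysql.connector.connect(user='lists', password='...', database='listservs')
cur = cnx.cursor()
cur.execute(
    "SELECT from_, author_name_, subject_, date_, url_ "
    "FROM nettime_l WHERE MATCH(content_) AGAINST(%s IN BOOLEAN MODE)",
    ('tactical',))
for (from_, author_name_, subject_, date_, url_) in cur:
    print(date_, subject_)
cur.close()
cnx.close()
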
@ -1,8 +1,6 @@
<html>
<head></head>
<body>
  {% for a in archives %}
  <a href="/{{ a }}"><h3>{{ a }}</h3></a>
  {% endfor %}
  <a href="/search"><h3>---> SEARCH <---</h3></a>
</body>
</html>
@ -1,10 +0,0 @@
<html>
<head></head>
<body>
  <ul>
  {% for t in list %}
    <li><a href="{{ list_name }}/{{ t.name }}"><h3>{{ t.name }} -- {{ t.nbr_threads }}</h3></a></li>
  {% endfor %}
  </ul>
</body>
</html>
@ -1,11 +0,0 @@
<html>
<head>
  <meta charset="UTF-8">
</head>
<body>
  <h3>{{ message.subject }}</h3>
  <h4>{{ message.author_name }}</h4>
  <h4>{{ message.date }}</h4>
  <p>{{ message.content }}</p>
</body>
</html>
@ -20,11 +20,6 @@
    <option value="{{ a }}">{{ a }}</option>
  {% endfor %}
  </select>
  <select form="search" name="hits">
  {% for a in hits %}
    <option value="{{ a }}">{{ a }}</option>
  {% endfor %}
  </select>
  <input type="submit" value="search" id="submit">
  <div id="loading">Loading...</div>
</form>

@ -1,25 +0,0 @@
<html>
<head></head>
<body>
  {% macro message(m, index, urlpath) -%}
  {% set path = urlpath + '/' + index|string %}
  <li>
    {{ index }}. <a href="{{ path }}">{{ m.subject }}</a> <i>{{ m.author_name }}</i>
    {% if m.get('follow-up') %}
    <ul>
      {% for msg in m.get('follow-up') %}
      {{ message(m=msg, index=loop.index - 1, urlpath=path) }}
      {% endfor %}
    </ul>
    {% endif %}
  </li>
  {%- endmacro %}

  <ul>
  {% for m in threads recursive %}
    {{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
  {% endfor %}
  </ul>

</body>
</html>