MEGA -- DB

gauthiier 2019-07-11 13:21:42 +02:00
parent 3703dcc169
commit 4197cd4d32
25 changed files with 663 additions and 1657 deletions

8
.gitignore vendored

@@ -1,7 +1,11 @@
-# mailinglists specific
+# listservs specific
 archives/
-figs/
+config/
 config.py
+test.py
+
+#macos
+.DS_Store
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

6
README

@@ -1,3 +1,9 @@
+TODO (July 2019):
+
+- refactor archive.py and search.py
+- test lists import with mariadb backend
+
+
 usage: archive.py [-h] [--arch ARCH] url [url ...]
 
 Mailinglists are dead. Long live mailinglists!


@@ -1,230 +0,0 @@
import os
# matplot view/windows
import matplotlib
matplotlib.interactive(True)
# pd display
import pandas as pd
pd.set_option('display.max_colwidth', 100)
from analysis.archive import Archive
from analysis.query import Query
from analysis.plot import Plot
import analysis.format
# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue
def save_fig_cohort(q, name, dir, color):
t = name + " - Cohorts"
pp = q.cohort().plot(color=color, title=t)
ts = name + "_cohorts.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_messages_total(q, name, dir, color):
t = name + " - Nbr. Messages"
pp = q.activity_overall().plot(kind='bar', color=color, title=t)
ts = name + "_messages.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_threads_total(q, name, dir, color):
t = name + " - Nbr. Threads"
pp = q.threads_overall().plot(kind='bar', color=color, title=t)
ts = name + "_threads.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_messages_constituency(q, name, dir):
t = name + " - Messages Constituency"
replies = pd.Series(q.replies_overall(series=True))
# threads = pd.Series(q.single_threads_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
messages = pd.Series(q.activity_overall(series=True))
single_messages = messages - (replies + threads)
# df = {'a': single_messages, 'b': threads, 'c': replies}
# df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c'])
df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1)
pp = df.plot(kind='bar', stacked=True, title=t)
# pp = [single_messages, threads, replies].plot(kind='bar', stacked=True)
ts = name + "_constituency.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_avg_threads_replies(q, name, dir, color):
t = name + " - Avg. Threads + Replies"
replies = pd.Series(q.replies_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
messages = pd.Series(q.activity_overall(series=True))
avg_threads_messages = (replies + threads) / messages
pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t)
ts = name + "_avg_threads_replies.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_diff_threads_replies_vs_messages(q, name, dir, color):
t = name + " - Diff. Threads + Replies vs Single Messages"
replies = pd.Series(q.replies_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
rt = replies + threads
messages = pd.Series(q.activity_overall(series=True))
diff_threads_messages = (2 * rt) - messages
pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t)
ts = name + "_diff_threads_replies_messages.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def save_fig_ratio_replies_threads(q, name, dir, color):
t = name + " - Ratio Replies per Thread"
replies = pd.Series(q.replies_overall(series=True))
threads = pd.Series(q.threads_overall(series=True))
ratio_replies_threads = replies / threads
pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t)
ts = name + "_ratio_replies_threads.png"
filename = os.path.join(dir, ts)
pp.get_figure().savefig(filename)
def html_td_rank_year(year, data):
td_str = '<td class="td_list">'
if year in data:
td_str += analysis.format.table_threads_ranking(data[year])
td_str += '</td>'
return td_str
def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre):
html_str = '<table id="rankings">'
html_str += '<tr>'
html_str += '<td class="td_year_t">year</td>'
html_str += '<td class="td_list_t">nettime</td>'
html_str += '<td class="td_list_t">crumb</td>'
html_str += '<td class="td_list_t">spectre</td>'
html_str += '<td class="td_list_t">empyre</td>'
html_str += '</tr>'
years = sorted(ranking_nettime.keys())
print(years)
for i in years:
html_str += '<tr>'
html_str += '<td class="td_list">' + i + '</td>'
html_str += html_td_rank_year(i, ranking_nettime)
html_str += html_td_rank_year(i, ranking_crumb)
html_str += html_td_rank_year(i, ranking_spectre)
html_str += html_td_rank_year(i, ranking_empyre)
html_str += '</tr>'
html_str += '</table>'
return html_str
print("nettime")
#nettime
nt = Archive('nettime-l')
ntq = nt.query()
ntp = Plot(ntq)
# save_fig_cohort(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red')
# save_fig_messages_constituency(ntq, 'nettime', 'figs/')
# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red')
# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red')
# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red')
ranking_nettime = ntq.threads_ranking(rank=15)
# print(r['2000'])
# print(analysis.format.table_threads_ranking(r['2000']))
print("crumb")
#crumb
cr = Archive('crumb')
crq = cr.query()
crp = Plot(crq)
# save_fig_cohort(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple')
# save_fig_messages_constituency(crq, 'crumb', 'figs/')
# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple')
# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple')
# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple')
ranking_crumb = crq.threads_ranking(rank=15)
print("empyre")
#empyre
em = Archive('empyre')
emq = em.query()
emp = Plot(emq)
# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_messages_constituency(emq, 'empyre', 'figs/')
# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue')
# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue')
ranking_empyre = emq.threads_ranking(rank=15)
print("spectre")
#spectre
sp = Archive('spectre')
spq = sp.query()
spp = Plot(spq)
# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_messages_constituency(spq, 'spectre', 'figs/')
# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey')
# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey')
ranking_spectre = spq.threads_ranking(rank=15)
## comparative ranking
rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre)
html_template = 'figs/ranking/index_template.html'
with open(html_template, 'r') as fp:
h = fp.read()
html = h.replace("--table--", rankings)
html_output = 'figs/ranking/index.html'
with open(html_output, 'w+') as fp:
fp.write(html)

analysis/archive.py

@@ -1,165 +0,0 @@
import numpy as np
import pandas as pd
import email, email.parser
import os, datetime, json, gzip, re
import analysis.util
import analysis.query
import search.archive ## circular...
def filter_date(msg, archive_name):
time_tz = analysis.util.format_date(msg, archive_name)
if not time_tz:
return None
dt = datetime.datetime.fromtimestamp(time_tz)
try:
date_time = pd.to_datetime(dt)
except pd.tslib.OutOfBoundsDatetime:
print('time out of bound')
print(dt)
return None
min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y')
max_date = pd.to_datetime(datetime.datetime.now())
if date_time < min_date or date_time > max_date:
return None
return date_time
def message_to_tuple_record(msg, records, archive_name, references='X'):
# check date first?
date = filter_date(msg, archive_name)
if not date:
print("Archive::filter_date returned None. Skip.")
return
# check / filter from email address second?
from_addr = analysis.util.format_from(msg, archive_name)
if not from_addr:
print("Archive::analysis.util.format_from returned None. Skip.")
return
url = analysis.util.format_url(msg, archive_name)
author = analysis.util.format_author(msg, archive_name)
subject = analysis.util.format_subject(msg, archive_name)
message_id = analysis.util.format_id(msg, archive_name)
content = analysis.util.format_content(msg, archive_name)
records.append((message_id,
from_addr,
author,
subject,
date,
url,
len(content),
0 if not 'follow-up' in msg else len(msg['follow-up']),
references))
# recursive follow up -- but references is not keeping track really...
if 'follow-up' in msg:
for f in msg['follow-up']:
message_to_tuple_record(f, records, archive_name, references=message_id)
return
def json_data_to_pd_dataframe(json_data, archive_name):
records = []
for d in json_data:
for dd in d['threads']:
message_to_tuple_record(dd, records, archive_name)
print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records)))
df = pd.DataFrame.from_records(records,
index='date',
columns=['message-id',
'from',
'author',
'subject',
'date',
'url',
'content-length',
'nbr-references',
'references'])
df.index.name = 'date'
return df
def load_from_file(filename, archive_name, archive_dir, json_data=None):
if not filename.endswith('.json.gz'):
file_path = os.path.join(archive_dir, filename + '.json.gz')
else:
file_path = os.path.join(archive_dir, filename)
if os.path.isfile(file_path):
with gzip.open(file_path, 'r') as fp:
json_data = json.load(fp)
return json_data_to_pd_dataframe(json_data['threads'], archive_name)
else:
#list of all "filename[...].json.gz" in archive_dir
files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
if files:
filename = files[-1] # take the most recent (listed alpha-chronological)
file_path = os.path.join(archive_dir, filename)
if os.path.isfile(file_path):
with gzip.open(file_path, 'r') as fp:
json_data = json.load(fp)
return json_data_to_pd_dataframe(json_data['threads'], archive_name)
else:
#list of all json files in archive_dir/filename
dir_path = os.path.join(archive_dir, filename)
if not os.path.isdir(dir_path):
return None
files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
if not files:
return None
# load all json files
threads = []
for file_path in files:
with open(file_path, 'r') as fp:
json_data = json.load(fp)
threads.append(json_data)
print('---> ' + archive_name)
return json_data_to_pd_dataframe(threads, archive_name)
def load_from_search_archive(archive):
threads = []
for k, v in archive.archive.items():
threads.append(v)
return json_data_to_pd_dataframe(threads, archive.archive_name)
class Archive:
data = None # "raw" json data
dataframe = None # main pd dataframe
def __init__(self, archive_name, archive_dir="archives"):
if isinstance(archive_name, pd.core.frame.DataFrame):
self.dataframe = archive_name ## no copies here
if isinstance(archive_name, search.archive.Archive):
self.dataframe = load_from_search_archive(archive_name)
if isinstance(archive_name, str):
# need a filename or a dir name....
self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)
def query(self):
q = analysis.query.Query(self)
return q

analysis/format.py

@@ -1,165 +0,0 @@
import analysis.query
import logging, html, numpy
from tabulate import tabulate
def makeurl(text, url):
return '<a href="' + url + '">' + text + "</a>"
def table_threads_ranking(ranking_dataframe):
html_str = '<table class="threads_ranking">'
html_str += '<tr>'
html_str += '<td class="td_date_t">date</td>'
html_str += '<td class="td_subject_t">subject</td>'
html_str += '<td class="td_from_t">from</td>'
html_str += '<td class="td_rep_t">replies</td>'
html_str += '</tr>'
for i, row in ranking_dataframe.iterrows():
html_str += '<tr>'
html_str += '<td class="td_date">' + str(i) + '</td>'
html_str += '<td class="td_subject">' + makeurl(row['subject'], row['url']) + '</td>'
html_str += '<td class="td_from">' + row['from'] + '</td>'
html_str += '<td class="td_rep">' + str(row['nbr-references']) + '</td>'
html_str += '</tr>'
html_str += "</table>"
return html_str
def frame_to_dictionary_threads_ranking(ranking_dataframe):
results = []
for i, row in ranking_dataframe.iterrows():
d = {'date': str(i), 'subject': row['subject'], 'url': row['url'], 'from': row['from'], 'nbr-references': row['nbr-references']}
results.append(d)
return results
class Html:
query = None
def __init__(self, q=None):
        if not isinstance(q, analysis.query.Query):
            logging.error("Html constructor Error: query must be of type analysis.query.Query")
raise Exception()
self.query = q
def threads_ranking(self, rank=5, resolution=None):
data = self.query.threads_ranking(rank=rank)
h = html.HTML()
t = h.table()
r = t.tr
r.td('date', klass='td_date_t')
r.td('from', klass='td_from_t')
r.td('replies', klass='td_rep_t')
r.td('subject', klass='td_subject_t')
for i, row in data.iterrows():
r = t.tr
print(row.index)
r.td(str(row['date']), klass='td_date')
r.td(row['from'], klass='td_from')
r.td(str(row['nbr-references']), klass='td_rep')
r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False)
return str(t)
@staticmethod
def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}):
header = []
if data_frame.index.name in name_map:
header.append(name_map[data_frame.index.name])
else:
header.append(data_frame.index.name)
for h in data_frame.columns:
if h in name_map:
h = name_map[h]
header.append(h)
css_header = []
css_element = []
for i in header:
css_header.append('td_' + i + '_t')
css_element.append('td_' + i)
h = html.HTML()
if table_name:
t = h.table(id=table_name, klass=table_name + '_t')
else:
t = h.table()
# url map
url_hash = {}
url_skip = []
url_keys = url_map.keys()
for u in url_keys:
if u in header and url_map[u] in header:
url_indx = header.index(url_map[u])
url_hash[header.index(u)] = url_indx
url_skip.append(url_indx)
header.pop(url_indx)
#header
r = t.tr
n = 0
for j in header:
r.td(str(j), klass=css_header[n])
n += 1
#elements
for k, row in data_frame.iterrows():
r = t.tr
r.td(str(k), klass=css_element[0])
n = 1
for l in row:
if n in url_skip:
continue
if isinstance(l, float):
if l % 1 > 0:
l = '{0:.4f}'.format(l)
else:
l = int(l)
if n in url_hash.keys():
url = row[url_hash[n] - 1]
r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False)
else:
r.td(str(l), klass=css_element[n])
n += 1
return str(t)
class Tab:
@staticmethod
def from_dataframe(data_frame, name_map={}, format=".0f"):
header = []
header.append(data_frame.index.name)
for h in data_frame.columns:
if h in name_map:
h = name_map[h]
header.append(h)
return tabulate(data_frame, headers=header, floatfmt=format)

analysis/plot.py

@@ -1,79 +0,0 @@
import numpy as np
import pandas as pd
import analysis.query
import logging
# for colormaps see:
# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps
# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps
# http://matplotlib.org/examples/color/colormaps_reference.html
# for colors see:
# http://matplotlib.org/examples/color/named_colors.html
# spectre: slategrey
# nettime: red
# crumb: purple
# empyre: darkblue
def bar_plot_series(series, title, color='blueviolet', ylim=None):
return series.plot(kind = 'bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim)
def save(plot, name):
fig = plot.get_figure()
fig.savefig(name)
class Plot:
query = None
def __init__(self, q=None):
if not isinstance(q, analysis.query.Query):
logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query")
raise Exception()
self.query = q
'''
activity
'''
def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys()
series = []
for k in activity_rank:
series.append(self.query.activity_from(k, resolution, series=True))
df = pd.concat(series, axis=1)
        return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
'''
    content length
'''
def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys()
series = []
for k in content_rank:
series.append(self.query.content_length_from(k, resolution, series=True))
df = pd.concat(series, axis=1)
return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)
'''
threads
'''
def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)):
threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys()
series = []
for k in threads_rank:
series.append(self.query.threads_from(k, resolution, series=True))
df = pd.concat(series, axis=1)
return df.plot.area(colormap=colormap, figsize=figsize, stacked=False)

analysis/query.py

@@ -1,573 +0,0 @@
import numpy as np
import pandas as pd
import analysis.archive
import logging
class Query:
archive = None # analysis.archive.Archive object
activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
threads = None # ...
single_threads = None
replies = None # ...
def __init__(self, arch=None):
if not isinstance(arch, analysis.archive.Archive):
logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
raise Exception()
self.archive = arch
'''
activity
'''
def _activity(self):
if self.activity is None:
from_index = self.archive.dataframe.reindex(columns=['from'])
self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)
return self.activity
def activity_from(self, email_address, resolution='y', series=False):
eaddr = email_address.replace('@', '{at}').lower()
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._activity()
try:
af = self.activity[eaddr]
except KeyError:
return None
activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))
activity_from.index.name = 'year'
else:
activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
activity_from.index.name = 'year-month'
if series:
return activity_from
return activity_from.to_frame('nbr-messages').astype(int)
def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
self._activity()
afr = self.activity.sum(axis=0).order(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
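            # negative lookahead: keep only senders whose address does not
            # contain "nettime" (filters the list's own house addresses);
            # the same pattern is reused throughout this class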
afr = afr[afr.index.str.contains(p)]
if series:
return afr[:rank]
return afr[:rank].to_frame('nbr-messages').astype(int)
# def activity_overall(self, resolution='y', series=False):
# freq = 'M'
# if resolution.lower() == 'y':
# freq = 'AS'
# elif resolution.lower() == 'm':
# freq = 'M'
# else:
# return None
# self._activity()
# y = self.activity.sum(axis=1)
# y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
# if freq == 'AS':
# y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
# y.index.name = 'year'
# else:
# y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
# y.index.name = 'year-month'
# if series:
# return y
# return y.to_frame('nbr-messages').astype(int)
def activity_overall(self, resolution='y', series=False):
a = self.archive.dataframe['url']
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-messages').astype(int)
def cohort(self, resolution='m', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._activity()
c = self.activity.idxmax().order().to_frame('date')
c.index = c['date']
cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()
if freq == 'AS':
cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))
cohort.index.name = 'year'
else:
cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))
cohort.index.name = 'year-month'
if series:
return cohort
return cohort.to_frame('first-messages').astype(int)
'''
    content length
'''
def _content_length(self):
if self.content_length is None:
from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])
self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)
return self.content_length
def content_length_from(self, email_address, resolution='y', series=False):
eaddr = email_address.replace('@', '{at}').lower()
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._content_length()
try:
af = self.content_length[eaddr]
except KeyError:
return None
content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))
content_length_from.index.name = 'year'
else:
content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))
content_length_from.index.name = 'year-month'
if series:
return content_length_from
return content_length_from.to_frame('nbr-bytes').astype(int)
def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
self._content_length()
cfr = self.content_length.sum(axis=0).order(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
cfr = cfr[cfr.index.str.contains(p)]
if series:
return cfr[:rank]
return cfr[:rank].to_frame('nbr-bytes').astype(int)
def content_length_overall(self, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._content_length()
y = self.content_length.sum(axis=1)
y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-bytes').astype(int)
'''
threads
'''
def _threads(self, thresh=0):
print("doing threads")
if self.threads is None:
self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
if self.single_threads is None:
self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
        return self.threads
def threads_ranking(self, rank=5, resolution='y'):
self._threads()
if resolution == None:
data = self.threads.drop('message-id', axis=1)[:rank]
return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
# get the threads ranking per time resolution
#
data = self.threads.drop('message-id', axis=1)
data = data.groupby([pd.TimeGrouper(freq=freq)])
r = {}
for k, v in data:
if freq == 'AS':
time_key = k.strftime('%Y')
else:
time_key = k.strftime('%Y-%m')
frame = v[:rank]
frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)
r[time_key] = frame
return r
def threads_replies_to(self, email_address, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
self._threads()
eaddr = email_address.replace('@', '{at}').lower()
self._threads()
threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references
threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
if series:
return threads_from_ranking[eaddr]
threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)
if freq == 'AS':
threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))
threads_from_ranking.index.name = 'year'
else:
threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))
threads_from_ranking.index.name = 'year-month'
return threads_from_ranking
def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
self._threads()
tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
tfr = tfr[tfr.index.str.contains(p)]
tfr = tfr[:rank].astype(int)
return tfr
def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
self._threads()
tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
tir = tir[tir.index.str.contains(p)]
if series:
return tir[:rank]
return tir[:rank].to_frame('nbr-initiated-threads').astype(int)
def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
# activity
self._activity()
afr = self.activity.sum(axis=0).astype(int)
if filter_nettime:
p = r'^((?!nettime*).)*$'
afr = afr[afr.index.str.contains(p)]
# initiated threads [top 25]
self._threads()
tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25
if filter_nettime:
p = r'^((?!nettime*).)*$'
tir = tir[tir.index.str.contains(p)]
inter = afr.index.intersection(tir.index)
avg = tir[inter] / afr[inter]
labels = ['messages', 'threads', 'avg.threads']
return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]
def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
self._threads()
#initiated
tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)
if filter_nettime:
p = r'^((?!nettime*).)*$'
tir = tir[tir.index.str.contains(p)]
#replies [top 25]
tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25
if filter_nettime:
p = r'^((?!nettime*).)*$'
tfr = tfr[tfr.index.str.contains(p)]
tfr = tfr['nbr-references'] # dataframe to series
inter = tir.index.intersection(tfr.index)
avg = tfr[inter] / tir[inter]
labels = ['threads', 'replies', 'avg.replies']
return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
agg = aggregate.lower()
if not agg in ['sum', 'mean', 'count']:
return None
if not self.threads is None:
del self.threads
self.threads = None
self._threads(tresh)
if agg == 'sum':
# number of replies total (re: sum all the replies)
y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
elif agg == 'mean':
y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
else:
# number of threads (re: msgs with at least one reply)
y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-threads').astype(int)
def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
agg = aggregate.lower()
if not agg in ['sum', 'mean', 'count']:
return None
if not self.single_threads is None:
del self.single_threads
self.single_threads = None
self._threads(tresh)
y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-threads').astype(int)
'''
replies
'''
def _replies(self):
if self.replies is None:
self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from','references'])
self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from','references'])
        return self.replies
def replies_ranking(self, rank=5, resolution=None):
self._replies()
if resolution == None:
data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
return data.to_frame('nbr_replies')
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
# get the threads ranking per time resolution
#
data = self.replies.groupby([pd.TimeGrouper(freq=freq)])
r = {}
for k, v in data:
if freq == 'AS':
time_key = k.strftime('%Y')
else:
time_key = k.strftime('%Y-%m')
frame = v.groupby('from').size().sort_values(ascending=False)[:rank]
r[time_key] = frame.to_frame('nbr-replies')
return r
def replies_avg_ranking(self, rank=5, filter_nettime=True):
# activity
self._activity()
afr = self.activity.sum(axis=0)
if filter_nettime:
p = r'^((?!nettime*).)*$'
afr = afr[afr.index.str.contains(p)]
# replies in thread [top 25]
self._replies()
        rpl = self.replies.groupby('from').size().sort_values(ascending=False)[:25]
inter = afr.index.intersection(rpl.index)
avg = rpl[inter] / afr[inter]
labels = ['messages', 'replies', 'avg.replies']
return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]
def replies_overall(self, resolution='y', series=False):
freq = 'M'
if resolution.lower() == 'y':
freq = 'AS'
elif resolution.lower() == 'm':
freq = 'M'
else:
return None
if not self.replies is None:
del self.replies
self.replies = None
self._replies()
y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()
if freq == 'AS':
y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))
y.index.name = 'year'
else:
y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))
y.index.name = 'year-month'
if series:
return y
return y.to_frame('nbr-replies').astype(int)

analysis/util.py

@@ -1,92 +0,0 @@
import email
import hashlib
def format_content(msg, archive_name):
return msg['content']
def format_url(msg, archive_name):
return msg['url']
def format_author(msg, archive_name):
return msg['author_name']
def format_from_token(from_str, sep):
fff = from_str
from_addr = email.utils.parseaddr(from_str)[1]
fffa = email.utils.parseaddr(from_str)
if sep not in from_addr:
tok = from_str.split()
try:
at = tok.index(sep)
from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
if from_addr.startswith('<') or from_addr.endswith('>'):
from_addr = from_addr.strip('<').strip('>')
except ValueError:
print(tok)
print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
print("*** " + fff)
print("+++")
print(fffa)
print("----")
return None
else:
from_addr = from_addr.replace(sep, '{AT}')
return from_addr.lower()
def format_from(msg, archive_name):
from_str = msg['from']
if " {AT} " in from_str:
return format_from_token(from_str, '{AT}')
elif " at " in from_str:
return format_from_token(from_str, 'at')
elif "@" in from_str:
return format_from_token(from_str, '@')
else:
return from_str
# returns utc timestamp
def format_date(msg, archive_name):
date_str = msg['date']
time_tz = None
try:
date_tz = email.utils.parsedate_tz(date_str)
time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
except TypeError:
print("Format Date TypeError")
print(" > " + date_str)
return None
except ValueError:
print("Format Date ValueError")
print(" > " + date_str)
return None
finally:
return time_tz
def format_subject(msg, archive_name):
return msg['subject']
def format_id(msg, archive_name):
if "message-id" in msg:
return msg['message-id']
else:
# create hash with author_name + date
s = msg['author_name'] + msg['date']
sha = hashlib.sha1(s.encode('utf-8'))
return sha.hexdigest()
# format='%d/%m/%Y'
def min_date(archive_name):
if "nettime" in archive_name:
return '01/10/1995'
elif archive_name == "spectre":
return '01/08/2001'
elif archive_name == "empyre":
return '01/01/2002'
elif archive_name == "crumb":
return '01/02/2001'

257
archive/archive.py Normal file

@@ -0,0 +1,257 @@
import email, email.parser
import os, json, gzip, re
import mysql.connector as mariadb
import archive.sql, archive.util
from datetime import date, datetime
from dateutil import parser
import terminal.progress, terminal.util
def load_from_file(filename, archive_name, archive_dir):
if not filename.endswith('.json.gz'):
file_path = os.path.join(archive_dir, filename + '.json.gz')
else:
file_path = os.path.join(archive_dir, filename)
if os.path.isfile(file_path):
with gzip.open(file_path, 'r') as fp:
json_data = json.load(fp)
return (json_data, archive_name)
else:
#list of all "filename[...].json.gz" in archive_dir
files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
if files:
filename = files[-1] # take the most recent (listed alpha-chronological)
file_path = os.path.join(archive_dir, filename)
if os.path.isfile(file_path):
with gzip.open(file_path, 'r') as fp:
json_data = json.load(fp)
return (json_data, archive_name) # <--- this makes no sense....
else:
#list of all json files in archive_dir/filename
dir_path = os.path.join(archive_dir, filename)
if not os.path.isdir(dir_path):
return None
files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
if not files:
return None
# load all json files
threads = []
for file_path in files:
with open(file_path, 'r') as fp:
json_data = json.load(fp)
threads.append(json_data)
return (threads, archive_name)
def connect_db(database, host, user, password):
    con = None
    try:
        con = mariadb.connect(host=host, user=user, password=password, database=database)
    except mariadb.Error as error:
        print("Error: {}".format(error))
        if error.errno == 1049:  # 1049 = unknown database
            if terminal.util.y_n_question("Database " + database + " does not exist. Create it?"):
                print("creating")
            else:
                print("not creating")
    finally:
        return con
class Archive:
data = None # "raw" json data
db_con = None
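    # NOTE: Python does not overload methods -- of the three __init__
    # definitions below only the last, __init__(self, archive_name, config),
    # is in effect; the file-based and credential-based variants are shadowed.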
def __init__(self, archive_name, archive_dir):
if isinstance(archive_name, str):
# need a filename or a dir name....
print("reading archive " + archive_name, end='')
(self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
print(" - done.")
def __init__(self, archive_name, database, host, user, password):
self.archive_name = archive_name
self.db_con = connect_db(database, host, user, password)
def __init__(self, archive_name, config):
self.archive_name = archive_name
self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
if self.db_con is not None:
self.db_con.close()
def create_db(self, host, database, user, password):
print("creating table: " + self.archive_name, end='')
self.db_con = connect_db(database, host, user, password)
if self.db_con is None:
return
try:
cursor = self.db_con.cursor()
cursor.execute(archive.sql.CREATE.format(self.archive_name))
except mariadb.Error as error:
print("Error: {}".format(error))
finally:
cursor.close()
print(" - done.")
def insert_db(self, host, database, user, password):
self.db_con = connect_db(database, host, user, password)
if self.db_con is None:
return
try:
cursor = self.db_con.cursor()
progress = terminal.progress.ProgressBar(self.archive_name, len(self.data), fmt=terminal.progress.ProgressBar.FULL)
for t in self.data:
n_inserted = self.recursive_insert_db(cursor, t["threads"])
# print(" - insert: " + str(n_inserted), end='')
if n_inserted > 0:
self.db_con.commit()
progress.current += 1
progress()
progress.done()
self.db_con.commit()
except mariadb.Error as error:
pass
# print("Error: {}".format(error))
finally:
cursor.close()
def recursive_insert_db(self, cursor, thread):
n_inserted = 0
for m in thread:
try:
from_ = archive.util.format_from(m)
author_name_ = archive.util.format_author(m)
to_ = archive.util.format_to(m)
date_ = archive.util.format_date(m, self.archive_name)
if date_ is None or from_ is None:
# print("\nerrorororororo")
# print(m['from'] + " -- " + m['date'])
continue
cursor.execute(archive.sql.INSERT, (from_,author_name_,to_,m["subject"],date_,m["content-type"],m["content"],m["url"]))
n_inserted += 1
if "follow-up" in m:
n_inserted += self.recursive_insert_db(cursor, m["follow-up"])
except mariadb.Error as error:
if error.errno == 1062:
#duplication continue <------------------------- look this up...
# print("\nError: {}".format(error))
continue
return n_inserted
def content_search(self, term, bool=True):
if self.db_con is None:
print("Not connection to database...")
return
try:
cursor = self.db_con.cursor(buffered=True)
if bool:
cursor.execute(archive.sql.CONTENT_QUERY_BOOLEAN.format(self.archive_name, term))
else:
                cursor.execute(archive.sql.CONTENT_QUERY_NL.format(self.archive_name, term))
# print(cursor.rowcount)
results = []
for (from_, author_name_, subject_, date_, url_) in cursor:
results.append((from_, author_name_, subject_, date_, url_))
# print("{} {} {}".format(from_, str(date_), url_))
return results
except mariadb.Error as error:
print("Error: {}".format(error))
finally:
cursor.close()
def from_search(self, term, bool=True):
if self.db_con is None:
print("Not connection to database...")
return
try:
cursor = self.db_con.cursor(buffered=True)
if bool:
cursor.execute(archive.sql.FROM_QUERY_BOOLEAN.format(self.archive_name, term))
else:
                cursor.execute(archive.sql.FROM_QUERY_NL.format(self.archive_name, term))
# print(cursor.rowcount)
results = []
for (from_, author_name_, subject_, date_, url_) in cursor:
results.append((from_, author_name_, subject_, date_, url_))
# print("{} {} {}".format(from_, str(date_), url_))
return results
except mariadb.Error as error:
print("Error: {}".format(error))
finally:
cursor.close()
# analysis
def longest_field(self, field, thread, max_length=0):
import archive.util
for m in thread:
if not field in m:
if "threads" in m:
max_length = self.longest_field(field, m["threads"], max_length)
continue
if m[field] is None:
continue
if field == "from":
m[field] = archive.util.format_from(m)
elif field == "author_name":
m[field] = archive.util.format_author(m)
elif field == "to":
m[field] = archive.util.format_to(m)
elif field == "date":
m[field] = str(archive.util.format_date(m, self.archive_name))
if m[field] is None:
continue
l = len(m[field])
if l > max_length:
max_length = l
print(">> " + m[field])
if "follow-up" in m:
max_length = self.longest_field(field, m["follow-up"], max_length)
return max_length
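
For orientation, a minimal sketch of how this DB-backed Archive is driven (inferred from the constructor and the search methods above, and from the call site in www/views.py; it assumes a config module whose db dict carries the host/database/user/password keys):

import config
import archive.archive as archive

# full-text search against the MariaDB archive; content_search() can
# return None after a mariadb.Error, hence the `or []` guard
with archive.Archive('nettime-l', config=config.db) as a:
    for (from_, author_name_, subject_, date_, url_) in a.content_search('tactical media') or []:
        print(date_, subject_, url_)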

31
archive/sql.py Normal file

@@ -0,0 +1,31 @@
CREATE = "CREATE TABLE `{}` (" \
"`from_` varchar(85) NOT NULL," \
"`author_name_` varchar(200) NOT NULL," \
"`to_` text(60)," \
"`subject_` varchar(3500) NOT NULL," \
"`date_` datetime NOT NULL," \
"`content_type_` varchar(15) NOT NULL," \
"`content_` mediumtext NOT NULL," \
"`url_` varchar(100) NOT NULL," \
"PRIMARY KEY(`from_`, `date_`)," \
"FULLTEXT (`subject_`, `content_`)," \
"FULLTEXT (`from_`, `author_name_`)" \
") ENGINE = InnoDB;"
INSERT = ("INSERT INTO nettime_l"
"(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
CONTENT_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
"WHERE MATCH(subject_, content_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
CONTENT_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
"WHERE MATCH(subject_, content_) AGAINST('{}') ORDER BY date_")
FROM_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
"WHERE MATCH(from_, author_name_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
FROM_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
"WHERE MATCH(from_, author_name_) AGAINST('{}') ORDER BY date_")
# SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l WHERE MATCH(content_) AGAINST('%s' IN BOOLEAN MODE)

225
archive/util.py Executable file

@@ -0,0 +1,225 @@
import email, datetime, sys
import hashlib
import dateparser
def format_content(msg):
return msg['content']
def format_url(msg):
return msg['url']
def format_author(msg):
if 'author_name' not in msg or msg['author_name'] is None:
return None
author_str = msg['author_name'].replace('"', '')
if "by way of" in author_str:
toks = author_str.split("by way of")
if toks[0] == "":
author_str = format_from(msg)
elif toks[0][-1] == "(":
author_str = toks[0][:-1].strip()
else:
author_str = toks[0]
if ("(" in author_str) or ("<" in author_str):
# ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault <zx {AT} xyz.net>
# print("±±±±±±")
# print("name: " + author_str)
# print("from: " + msg['from'])
if not '@' in author_str.lower().replace('{at}', '@').replace(' at ', '@'):
author_str = author_str.split('(')[0].strip()
else:
author_str = email.utils.parseaddr(author_str)[0]
# print(" Name:" + author_str.replace('"', ''))
# print(" From:" + format_from(msg))
if " ," in author_str:
# nettime's_roving_reporter , thing.net {AT} bbs.thing.net
author_str = author_str.split(' ,')[0]
return author_str
def format_from_token(from_str, sep):
from_addr = email.utils.parseaddr(from_str)[1]
if sep not in from_addr:
tok = from_str.split()
try:
at = tok.index(sep)
from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
if from_addr.startswith('<') or from_addr.endswith('>'):
from_addr = from_addr.strip('<').strip('>')
except ValueError:
print(tok)
print("error formating 'from' " + from_str + " -- expecting sep: " + sep)
return None
else:
from_addr = from_addr.replace(sep, '{AT}')
return "".join(from_addr.lower().split())
def format_from(msg):
if 'from' not in msg or msg['from'] is None:
return None
from_str = msg['from']
if " {AT} " in from_str:
return format_from_token(from_str, '{AT}')
elif " at " in from_str:
return format_from_token(from_str, 'at')
elif "@" in from_str:
return format_from_token(from_str, '@')
else:
return "".join(from_str.split())
def format_to(msg):
if "to" not in msg or msg["to"] is None:
return None
to_str = msg["to"]
toks = email.utils.parseaddr(to_str)
# print(toks)
if len(toks) == 2:
to_str = toks[1]
return "".join(to_str.lower().split())
# returns utc timestamp --- old...
def format_date_utc(msg, archive_name):
if 'date' not in msg or msg['date'] is None:
return None
date_str = msg['date'].replace('.', '')
time_tz = None
try:
date_tz = email.utils.parsedate_tz(date_str)
time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
except TypeError:
print("Format Date TypeError")
print(" > " + date_str)
return None
except ValueError:
print("Format Date ValueError")
print(" > " + date_str)
return None
finally:
return time_tz
def format_date(msg, archive_name):
if 'date' not in msg or msg['date'] is None:
return None
# date_str = msg['date'].replace('.', '')
date_str = msg['date']
# fix Thu, 01 Aug 2002 17:33:08 +0900 (JST)
if '(' in date_str:
date_str = date_str.split('(')[0].rstrip()
date_time = dateparser.parse(date_str)
if date_time is None:
# random stuff...
fix = False
toks = date_str.split()
if len(toks[-1]) == 5 or len(toks[-1]) == 4:
# ex. Thu, 24 Jan 2002 15:21:31 -0000
if toks[-1] in ['+0000', '-0000', '0000']:
date_str = date_str[:-5]
fix = True
# ex. Fri, 25 Jan 2002 13:21:49 +1050
elif toks[-1][-2] == '5':
d = list(date_str)
d[-2] = '3'
date_str = "".join(d)
fix = True
if toks[-1][-1] != '0':
#ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005'
date_str = date_str[:-5]
fix = True
if 'Fru' in toks[0]:
date_str = date_str.replace('Fru', 'Fri')
fix = True
elif 'Thur' in toks[0]:
date_str = date_str.replace('Thur', 'Thu')
fix = True
if not fix:
# print("----")
return None
date_time = dateparser.parse(date_str)
if date_time is None:
if 'GMT' in date_str:
# ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100'
date_str = date_str.split('GMT')[0].rstrip()
fix = True
if 'METDST' in toks[-1]:
# ex. 'Sat,', '3', 'May', '97', '21:07', 'METDST'
date_str = date_str.replace('METDST', 'MET')
fix = True
if not fix:
# print("++++")
return None
date_time = dateparser.parse(date_str)
return date_time
# else:
# print(date_str)
# date_time = datetime.datetime.fromtimestamp(time_tz)
min_d = datetime.datetime.strptime(min_date(archive_name), "%d/%m/%Y")
max_d = datetime.datetime.now()
date_time_naive = date_time.replace(tzinfo=None)
if date_time_naive < min_d or date_time_naive > max_d:
return None
return date_time
def format_subject(msg, archive_name):
if 'subject' not in msg or msg['subject'] is None:
return None
return msg['subject']
def format_id(msg, archive_name):
if "message-id" in msg:
return msg['message-id']
else:
# create hash with author_name + date
s = msg['author_name'] + msg['date']
sha = hashlib.sha1(s.encode('utf-8'))
return sha.hexdigest()
# format='%d/%m/%Y'
def min_date(archive_name):
if "nettime" in archive_name:
return '01/10/1995'
elif archive_name == "spectre":
return '01/08/2001'
elif archive_name == "empyre":
return '01/01/2002'
elif archive_name == "crumb":
return '01/02/2001'
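
A quick sketch of what these normalizers do in practice (hypothetical message dict; dateparser comes from the environment file below):

import archive.util as util

msg = {
    'from': 'mf {AT} example.net (Michel Foucault)',
    'date': 'Thu, 01 Aug 2002 17:33:08 +0900 (JST)',
}
# format_from() collapses and lower-cases the address around '{at}';
# format_date() strips the '(JST)' comment, lets dateparser have a go,
# and returns None for dates outside [min_date(archive), now()]
print(util.format_from(msg))        # -> 'mf{at}example.net'
print(util.format_date(msg, 'empyre'))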


@@ -1,29 +1,34 @@
-name: listservs
+name: listserv
 channels:
 - defaults
 dependencies:
-- beautiful-soup=4.3.2=py34_0
-- click=6.7=py34_0
-- flask=0.12=py34_0
-- gunicorn=19.1.0=py34_0
-- html5lib=0.999=py34_0
-- itsdangerous=0.24=py34_0
-- jinja2=2.9.6=py34_0
-- markupsafe=0.23=py34_2
-- openssl=1.0.2l=0
-- pastedeploy=1.5.2=py34_1
+- ca-certificates=2019.5.15=0
+- openssl=1.0.2s=h1de35cc_0
 - pip=9.0.1=py34_1
 - python=3.4.5=0
 - readline=6.2=2
 - setuptools=27.2.0=py34_0
-- six=1.10.0=py34_0
 - sqlite=3.13.0=0
 - tk=8.5.18=0
-- werkzeug=0.11.15=py34_0
 - wheel=0.29.0=py34_0
-- xz=5.2.2=1
-- zlib=1.2.8=3
+- xz=5.2.4=h1de35cc_4
+- zlib=1.2.11=h1de35cc_3
 - pip:
-  - beautifulsoup4==4.3.2
-  - webencodings==0.5.1
+  - beautifulsoup4==4.7.1
+  - click==7.0
+  - dateparser==0.7.1
+  - flask==1.0.4
+  - gunicorn==19.9.0
+  - itsdangerous==1.1.0
+  - jinja2==2.10.1
+  - markupsafe==1.1.1
+  - mysql-connector-python==8.0.16
+  - protobuf==3.8.0
+  - python-dateutil==2.8.0
+  - pytz==2019.1
+  - regex==2019.6.8
+  - six==1.12.0
+  - soupsieve==1.9.2
+  - tzlocal==1.5.1
+  - werkzeug==0.15.4

search/archive.py

@@ -1,150 +0,0 @@
import logging, os, json, re
from datetime import datetime
import analysis.archive ## circular...
import analysis.query
import analysis.format
import threading
class Archive():
def __init__(self, archives_dir=None):
if archives_dir==None:
from www import config
self.archives_dir = config.ARCHIVES_PATH
else:
self.archives_dir = archives_dir
self.loaded = False
self.lock_search = threading.Lock()
self.lock_threads_ranking = threading.Lock()
def load(self, archive_name=None):
if archive_name == None:
raise Exception('Archive is not specified')
archive_path = os.path.join(self.archives_dir, archive_name)
if not os.path.isdir(archive_path):
            raise Exception('Archive ' + archive_path + ' does not exist')
self.archive_name = archive_name
self.archive_path = archive_path
files = [f for f in os.listdir(archive_path) if f.endswith('.json')]
self.archive = {}
for f in files:
file_path = os.path.join(archive_path, f)
label = f.replace('.json', '')
with open(file_path) as fdata:
self.archive[label] = json.load(fdata)
self.loaded = True
def search_message(self, keyword, msg, index_str, results, field='content'):
nbr_hits = 0
if msg[field] is not None and msg[field].lower().find(keyword.lower()) > 0:
nbr_hits += 1
results.append({ "index_str": index_str, "subject": msg['subject'], "date": msg['date'], "author_name": msg['author_name'], "url": msg['url'] })
if 'follow-up' in msg:
i = 0
for m in msg['follow-up']:
current_index_str = index_str + '/' + str(i)
nbr_hits += self.search_message(keyword, m, current_index_str, results, field)
i += 1
return nbr_hits
def search(self, keyword, field='content', min_hits=0):
with self.lock_search:
search_results = { "keyword": keyword, "field": field, "archive": self.archive_name, "results": [] }
for k, v in sorted(self.archive.items(), key=get_key, reverse=True):
current_index_str = self.archive_name + '/' + k
hits = []
nbr_hits = 0
i = 0
for m in v['threads']:
current_index_str = self.archive_name + '/' + k + '/' + str(i)
nbr_hits += self.search_message(keyword, m, current_index_str, hits, field)
i += 1
if nbr_hits > min_hits:
# nettime-l - fix (the name of the thread from ex. 'nettime-l_Jan_01' to 'January 2001')
if k.startswith("nettime-l_"):
dt = datetime.strptime(k, "nettime-l_%b_%y")
k = dt.strftime("%B_%Y")
search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
return search_results
def threads_ranking(self, rank=5):
with self.lock_threads_ranking:
search_results = { "keyword": "thread ranking", "field": "ranking", "archive": self.archive_name, "results": [] }
a = analysis.archive.Archive(self)
            q = a.query()
ranking = q.threads_ranking(rank=rank)
for i in ranking:
r = analysis.format.frame_to_dictionary_threads_ranking(ranking[i])
for h in r:
hit = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]
search_results['results'].append({'thread': h['date'], 'nbr_hits': h['nbr-references'], 'hits': hit})
del a
del q
return search_results
def get_key(kv_tuple):
k = kv_tuple[0]
# k is of the form "Month_Year" - ex.: "January_2001"
try:
return datetime.strptime(k, "%B_%Y")
except Exception:
pass
# k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
try:
return datetime.strptime(k, "%b_%y")
except Exception:
pass
# k is of the form "Year" - ex.: "2001"
try:
return datetime.strptime(k, "%Y")
except Exception:
pass
# nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01"
try:
return datetime.strptime(k, "nettime-l_%b_%y")
except Exception:
pass
print("--------------")
print(k)
return None

2
setenv

@@ -1 +1 @@
-source activate listservs
+source activate listserv

43
terminal/progress.py Normal file

@@ -0,0 +1,43 @@
from __future__ import print_function
import sys
import re
# https://stackoverflow.com/questions/3160699/python-progress-bar
class ProgressBar(object):
DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'
def __init__(self, title, total, width=40, fmt=DEFAULT, symbol='=',
output=sys.stderr):
assert len(symbol) == 1
self.title = title
self.total = total
self.width = width
self.symbol = symbol
self.output = output
self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
r'\g<name>%dd' % len(str(total)), fmt)
self.current = 0
def __call__(self):
percent = self.current / float(self.total)
size = int(self.width * percent)
remaining = self.total - self.current
bar = self.title + ' [' + self.symbol * size + ' ' * (self.width - size) + ']'
args = {
'total': self.total,
'bar': bar,
'current': self.current,
'percent': percent * 100,
'remaining': remaining
}
print('\r' + self.fmt % args, file=self.output, end='')
def done(self):
self.current = self.total
self()
print('', file=self.output)
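
A minimal usage sketch (this mirrors how insert_db() in archive/archive.py drives the bar; output goes to stderr):

import time
import terminal.progress

items = range(120)
bar = terminal.progress.ProgressBar("nettime-l", len(items),
                                    fmt=terminal.progress.ProgressBar.FULL)
for _ in items:
    time.sleep(0.01)   # stand-in for inserting one thread
    bar.current += 1
    bar()              # redraw the bar in place
bar.done()             # snap to 100% and print the trailing newline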

16
terminal/util.py Normal file

@@ -0,0 +1,16 @@
import sys

def y_n_question(question_str):
yes = {'yes','y', 'ye', ''}
no = {'no','n'}
while True:
sys.stdout.write(question_str + " [Y/n]: ")
choice = input().lower()
if choice in yes:
return True
elif choice in no:
return False
else:
sys.stdout.write("\nPlease respond with 'yes' or 'no'\n")
continue


@@ -1,2 +1 @@
-gunicorn -w 1 --bind 0.0.0.0:5555 www-serve:app
 gunicorn -w 1 -b 127.0.0.1:5555 www-serve:app

www-serve.py

@@ -1,2 +1,4 @@
 from www import app
-#app.run(debug=True, threaded=True, use_reloader=False) # uncomment this line to run flask's server
+
+if __name__ == "__main__":
+    app.run(debug=True, use_reloader=False)

www/views.py

@@ -1,144 +1,46 @@
 from flask import render_template, request, jsonify
 from www import app
-from www import archives
-import search.archive
+import archive.archive as archive
+import config
+import www.config as wconfig
 from datetime import datetime
 import logging
 
-logging.info(' ------- arch = Archives() -------- ')
-arch = archives.Archives()
-arch.load()
-archives_data = arch.data
-
 @app.route('/')
 def index():
-    k = archives_data.keys()
-    return render_template("index.html", archives=k)
+    return render_template("index.html")
 
-# def get_key(kv_tuple):
-#     k = kv_tuple[0]
-#     # k is of the form "Month_Year" - ex.: "January_2001"
-#     try:
-#         return datetime.strptime(k, "%B_%Y")
-#     except Exception:
-#         pass
-#     # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
-#     try:
-#         return datetime.strptime(k, "%b_%y")
-#     except Exception:
-#         pass
-#     # k is of the form "Year" - ex.: "2001"
-#     try:
-#         return datetime.strptime(k, "%Y")
-#     except Exception:
-#         pass
-#     return None
-
-@app.route('/<list>')
-def get_list(list):
-    if list in archives_data:
-        d = []
-        for k, v in sorted(archives_data[list].archive.items(), key=search.archive.get_key, reverse=True):
-            d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
-        return render_template("list.html", list_name=list, list=d)
-    else:
-        return 'nee nee'
-
-@app.route('/<list>/<sublist>')
-def get_sublist(list, sublist):
-    print(list)
-    print(sublist)
-    sublist = sublist.replace(' ', '_')
-    if list in archives_data and sublist in archives_data[list].archive:
-        return render_template("threads.html", sublist_name=sublist, threads=archives_data[list].archive[sublist]['threads'])
-    else:
-        return 'na na'
-
-@app.route('/<list>/<sublist>/<int:index>')
-def get_message(list, sublist, index):
-    sublist = sublist.replace(' ', '_')
-    index = int(index)
-    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
-        return render_template("message.html", message=archives_data[list].archive[sublist]['threads'][index])
-    else:
-        'non non'
-
-@app.route('/<list>/<sublist>/<int:index>/<path:follow_ups>')
-def get_follow_ups(list, sublist, index, follow_ups):
-    sublist = sublist.replace(' ', '_')
-    index = int(index)
-    ups = follow_ups.split('/')
-    follow = []
-    for u in ups:
-        follow.append(int(u))
-    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
-        message = archives_data[list].archive[sublist]['threads'][index]
-        for f in follow:
-            message = message['follow-up'][f]
-        return render_template("message.html", message=message)
-    else:
-        'nope nope'
-
 @app.route('/search')
 def searh():
     if len(request.args) < 1:
-        k = archives_data.keys()
-        return render_template("search.html", archives=k, fields=['content', 'from(name)', 'from(email)'], hits=['n/a', '2', '3', '4', '5', '6', '7', '8', '9'])
+        return render_template("search.html", archives=wconfig.lists_to_serve, fields=['content', 'from'])
     k_arg = request.args.get('keyword')
     l_arg = request.args.get('list')
-    sl_arg = request.args.get('sublist')
     f_arg = request.args.get('field')
-    h_arg = request.args.get('hits')
     if k_arg is None or k_arg.strip() == '':
         return "no keyword..."
-    if l_arg is None:
-        return "no list..."
-    if not (l_arg == "all") and not (l_arg in archives_data):
+    if l_arg != "all" and l_arg not in wconfig.lists_to_serve:
         return "list '" + l_arg + "' does not exist"
-    if sl_arg is not None:
-        if not sl_arg in archives_data[l]:
-            return "sublist '" + sl_arg + "' does not exist in list '" + l_arg + "'"
-    if f_arg == "from(name)":
-        f_arg = 'author_name'
-    elif f_arg == "from(email)":
-        f_arg = 'from'
+    if f_arg not in ['content', 'from']:
+        return "field '" + f_arg + "' does not exist"
     lists = []
     if l_arg == "all":
-        for k in archives_data.keys():
-            lists.append(k)
+        lists = wconfig.lists_to_serve
     else:
         lists.append(l_arg)
-    nbr_hits = 0
-    if h_arg in ['2', '3', '4', '5', '6', '7', '8', '9']:
-        nbr_hits = int(h_arg)
 
     ################################
     ##
-    ## need to cache all the below
+    ## need to cache all the below.....
     ##
     ################################
@@ -147,18 +49,41 @@ def searh():
     logging.info("search keyword = " + k_arg)
 
     for l in lists:
-        if k_arg == "rank":
-            logging.info(" ranking " + l)
-            s = archives_data[l].threads_ranking()
-        else:
-            s = archives_data[l].search(keyword=k_arg, field=f_arg, min_hits=nbr_hits)
-        results.append(s)
+        with archive.Archive(l, config=config.db) as a:
+            if f_arg == 'content':
+                r = a.content_search(k_arg)
+            else:
+                r = a.from_search(k_arg)
 
-    ## -- sort results?
-    search_results = sorted(results, key=get_result_key)
-
-    return jsonify(result=search_results)
+            # format data to return
+            search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] }
+            month_year_results = {}
+            for (from_, author_name_, subject_, date_, url_) in r:
+                m_y = date_.strftime("%B_%Y")
+                if m_y not in month_year_results:
+                    month_year_results[m_y] = []
+                month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_})
+            for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
+                search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v})
+            # search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
+            # where:
+            #     'thread' = "%B_%Y" aka. January 2001
+            #     'nbr_hits' = nbr hits for that month
+            #     'hits' = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]
+            results.append(search_results)
 
+    sorted_results = sorted(results, key=get_result_key)
+    return jsonify(result=sorted_results)
+
+def get_key(kv):
+    return datetime.strptime(kv[0], "%B_%Y")
 
 def get_result_key(r):
     return r['archive']

www/templates/index.html

@@ -1,8 +1,6 @@
 <html>
 <head></head>
 <body>
-{% for a in archives %}
-<a href="/{{ a }}"><h3>{{ a }}</h3></a>
-{% endfor %}
+<a href="/search"><h3>---> SEARCH <---</h3></a>
 </body>
 </html>

www/templates/list.html

@@ -1,10 +0,0 @@
<html>
<head></head>
<body>
<ul>
{% for t in list %}
<li><a href="{{ list_name }}/{{ t.name }}"><h3>{{ t.name }} -- {{ t.nbr_threads }}</h3></a></li>
{% endfor %}
</ul>
</body>
</html>

www/templates/message.html

@@ -1,11 +0,0 @@
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<h3>{{ message.subject }}</h3>
<h4>{{ message.author_name }}</h4>
<h4>{{ message.date }}</h4>
<p>{{ message.content }} </p>
</body>
</html>

www/templates/search.html

@@ -20,11 +20,6 @@
 <option value="{{ a }}">{{ a }}</option>
 {% endfor %}
 </select>
-<select form="search" name="hits">
-{% for a in hits %}
-<option value="{{ a }}">{{ a }}</option>
-{% endfor %}
-</select>
 <input type="submit" value="search" id="submit">
 <div id="loading">Loading...</div>
 </form>

www/templates/threads.html

@@ -1,25 +0,0 @@
<html>
<head></head>
<body>
{% macro message(m, index, urlpath)-%}
{% set path = urlpath + '/' + index|string %}
<li>
{{ index }}. <a href="{{ path }}">{{ m.subject }}</a> <i>{{ m.author_name }}</i>
{% if m.get('follow-up') %}
<ul>
{% for msg in m.get('follow-up') %}
{{ message(m=msg, index=loop.index - 1, urlpath=path) }}
{% endfor %}
</ul>
{% endif %}
</li>
{%- endmacro %}
<ul>
{% for m in threads recursive %}
{{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
{% endfor %}
</ul>
</body>
</html>