From 4197cd4d32a6f7074baddfc43e552bd65bb9ffe0 Mon Sep 17 00:00:00 2001 From: gauthiier Date: Thu, 11 Jul 2019 13:21:42 +0200 Subject: [PATCH] MEGA -- DB --- .gitignore | 8 +- README | 6 + analyse.py | 230 ------------- analysis/archive.py | 165 --------- analysis/format.py | 165 --------- analysis/plot.py | 79 ----- analysis/query.py | 573 -------------------------------- analysis/util.py | 92 ----- {search => archive}/__init__.py | 0 archive/archive.py | 257 ++++++++++++++ archive/sql.py | 31 ++ archive/util.py | 225 +++++++++++++ conda_env.yml | 57 ++-- search/archive.py | 150 --------- setenv | 2 +- terminal/progress.py | 43 +++ terminal/util.py | 16 + www-serve | 3 +- www-serve.py | 4 +- www/routes.py | 159 +++------ www/templates/index.html | 4 +- www/templates/list.html | 10 - www/templates/message.html | 11 - www/templates/search.html | 5 - www/templates/threads.html | 25 -- 25 files changed, 663 insertions(+), 1657 deletions(-) delete mode 100644 analyse.py delete mode 100644 analysis/archive.py delete mode 100644 analysis/format.py delete mode 100644 analysis/plot.py delete mode 100644 analysis/query.py delete mode 100644 analysis/util.py rename {search => archive}/__init__.py (100%) create mode 100644 archive/archive.py create mode 100644 archive/sql.py create mode 100755 archive/util.py delete mode 100644 search/archive.py create mode 100644 terminal/progress.py create mode 100644 terminal/util.py delete mode 100644 www/templates/list.html delete mode 100644 www/templates/message.html delete mode 100644 www/templates/threads.html diff --git a/.gitignore b/.gitignore index 18c05fa..49fd8f2 100755 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,11 @@ -# mailinglists specific +# listservs specific archives/ -figs/ +config/ config.py +test.py + +#macos +.DS_Store # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README b/README index 998c786..9ae5b7a 100644 --- a/README +++ b/README @@ -1,3 +1,9 @@ + +TODO (July 2019): + - refactor archive.py and search.py + - test lists import with mariadb backend + + usage: archive.py [-h] [--arch ARCH] url [url ...] Mailinglists are dead. Long live mailinglists! diff --git a/analyse.py b/analyse.py deleted file mode 100644 index bef4381..0000000 --- a/analyse.py +++ /dev/null @@ -1,230 +0,0 @@ -import os - -# matplot view/windows -import matplotlib -matplotlib.interactive(True) - -# pd display -import pandas as pd -pd.set_option('display.max_colwidth', 100) - -from analysis.archive import Archive -from analysis.query import Query -from analysis.plot import Plot - -import analysis.format - -# spectre: slategrey -# nettime: red -# crumb: purple -# empyre: darkblue - -def save_fig_cohort(q, name, dir, color): - t = name + " - Cohorts" - pp = q.cohort().plot(color=color, title=t) - ts = name + "_cohorts.png" - filename = os.path.join(dir, ts) - pp.get_figure().savefig(filename) - -def save_fig_messages_total(q, name, dir, color): - t = name + " - Nbr. Messages" - pp = q.activity_overall().plot(kind='bar', color=color, title=t) - ts = name + "_messages.png" - filename = os.path.join(dir, ts) - pp.get_figure().savefig(filename) - -def save_fig_threads_total(q, name, dir, color): - t = name + " - Nbr. 
Threads" - pp = q.threads_overall().plot(kind='bar', color=color, title=t) - ts = name + "_threads.png" - filename = os.path.join(dir, ts) - pp.get_figure().savefig(filename) - -def save_fig_messages_constituency(q, name, dir): - t = name + " - Messages Constituency" - replies = pd.Series(q.replies_overall(series=True)) - # threads = pd.Series(q.single_threads_overall(series=True)) - threads = pd.Series(q.threads_overall(series=True)) - messages = pd.Series(q.activity_overall(series=True)) - single_messages = messages - (replies + threads) - - # df = {'a': single_messages, 'b': threads, 'c': replies} - # df = pd.DataFrame([single_messages, threads, replies], columns=['a', 'b', 'c']) - df = pd.concat([single_messages.to_frame('single-messages').astype(int), threads.to_frame('threads').astype(int), replies.to_frame('replies').astype(int)], axis=1) - pp = df.plot(kind='bar', stacked=True, title=t) - - # pp = [single_messages, threads, replies].plot(kind='bar', stacked=True) - - ts = name + "_constituency.png" - filename = os.path.join(dir, ts) - pp.get_figure().savefig(filename) - -def save_fig_avg_threads_replies(q, name, dir, color): - t = name + " - Avg. Threads + Replies" - replies = pd.Series(q.replies_overall(series=True)) - threads = pd.Series(q.threads_overall(series=True)) - messages = pd.Series(q.activity_overall(series=True)) - - avg_threads_messages = (replies + threads) / messages - - pp = pd.DataFrame(avg_threads_messages).plot(kind='bar', color=color, title=t) - - ts = name + "_avg_threads_replies.png" - filename = os.path.join(dir, ts) - pp.get_figure().savefig(filename) - -def save_fig_diff_threads_replies_vs_messages(q, name, dir, color): - t = name + " - Diff. Threads + Replies vs Single Messages" - replies = pd.Series(q.replies_overall(series=True)) - threads = pd.Series(q.threads_overall(series=True)) - rt = replies + threads - messages = pd.Series(q.activity_overall(series=True)) - - diff_threads_messages = (2 * rt) - messages - - pp = pd.DataFrame(diff_threads_messages).plot(kind='bar', color=color, title=t) - - ts = name + "_diff_threads_replies_messages.png" - filename = os.path.join(dir, ts) - pp.get_figure().savefig(filename) - -def save_fig_ratio_replies_threads(q, name, dir, color): - t = name + " - Ratio Replies per Thread" - replies = pd.Series(q.replies_overall(series=True)) - threads = pd.Series(q.threads_overall(series=True)) - - ratio_replies_threads = replies / threads - - pp = pd.DataFrame(ratio_replies_threads).plot(kind='bar', color=color, title=t) - - ts = name + "_ratio_replies_threads.png" - filename = os.path.join(dir, ts) - pp.get_figure().savefig(filename) - -def html_td_rank_year(year, data): - td_str = '' - if year in data: - td_str += analysis.format.table_threads_ranking(data[year]) - td_str += '' - return td_str - -def html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre): - - html_str = '' - - html_str += '' - html_str += '' - html_str += '' - html_str += '' - html_str += '' - html_str += '' - html_str += '' - - years = sorted(ranking_nettime.keys()) - - print(years) - - for i in years: - html_str += '' - html_str += '' - html_str += html_td_rank_year(i, ranking_nettime) - html_str += html_td_rank_year(i, ranking_crumb) - html_str += html_td_rank_year(i, ranking_spectre) - html_str += html_td_rank_year(i, ranking_empyre) - html_str += '' - - html_str += '
yearnettimecrumbspectreempyre
' + i + '
' - return html_str - - -print("nettime") -#nettime -nt = Archive('nettime-l') -ntq = nt.query() -ntp = Plot(ntq) - - - -# save_fig_cohort(ntq, 'nettime', 'figs/', 'red') -# save_fig_messages_total(ntq, 'nettime', 'figs/', 'red') -# save_fig_threads_total(ntq, 'nettime', 'figs/', 'red') -# save_fig_messages_constituency(ntq, 'nettime', 'figs/') - -# save_fig_avg_threads_replies(ntq, 'nettime', 'figs/', 'red') -# save_fig_diff_threads_replies_vs_messages(ntq, 'nettime', 'figs/', 'red') -# save_fig_ratio_replies_threads(ntq, 'nettime', 'figs/', 'red') - -ranking_nettime = ntq.threads_ranking(rank=15) - -# print(r['2000']) - -# print(analysis.format.table_threads_ranking(r['2000'])) - - -print("crumb") -#crumb -cr = Archive('crumb') -crq = cr.query() -crp = Plot(crq) - -# save_fig_cohort(crq, 'crumb', 'figs/', 'purple') -# save_fig_messages_total(crq, 'crumb', 'figs/', 'purple') -# save_fig_threads_total(crq, 'crumb', 'figs/', 'purple') -# save_fig_messages_constituency(crq, 'crumb', 'figs/') - -# save_fig_avg_threads_replies(crq, 'crumb', 'figs/', 'purple') -# save_fig_diff_threads_replies_vs_messages(crq, 'crumb', 'figs/', 'purple') -# save_fig_ratio_replies_threads(crq, 'crumb', 'figs/', 'purple') - -ranking_crumb = crq.threads_ranking(rank=15) - - -print("empyre") -#empyre -em = Archive('empyre') -emq = em.query() -emp = Plot(emq) - -# save_fig_cohort(emq, 'empyre', 'figs/', 'darkblue') -# save_fig_messages_total(emq, 'empyre', 'figs/', 'darkblue') -# save_fig_threads_total(emq, 'empyre', 'figs/', 'darkblue') -# save_fig_messages_constituency(emq, 'empyre', 'figs/') - -# save_fig_avg_threads_replies(emq, 'empyre', 'figs/', 'darkblue') -# save_fig_diff_threads_replies_vs_messages(emq, 'empyre', 'figs/', 'darkblue') -# save_fig_ratio_replies_threads(emq, 'empyre', 'figs/', 'darkblue') - -ranking_empyre = emq.threads_ranking(rank=15) - -print("spectre") -#spectre -sp = Archive('spectre') -spq = sp.query() -spp = Plot(spq) - -# save_fig_cohort(spq, 'spectre', 'figs/', 'slategrey') -# save_fig_messages_total(spq, 'spectre', 'figs/', 'slategrey') -# save_fig_threads_total(spq, 'spectre', 'figs/', 'slategrey') -# save_fig_messages_constituency(spq, 'spectre', 'figs/') - -# save_fig_avg_threads_replies(spq, 'spectre', 'figs/', 'slategrey') -# save_fig_diff_threads_replies_vs_messages(spq, 'spectre', 'figs/', 'slategrey') -# save_fig_ratio_replies_threads(spq, 'spectre', 'figs/', 'slategrey') - -ranking_spectre = spq.threads_ranking(rank=15) - - -## comparative ranking - -rankings = html_table_ranking_per_year(ranking_nettime, ranking_crumb, ranking_spectre, ranking_empyre) - -html_template = 'figs/ranking/index_template.html' -with open(html_template, 'r') as fp: - h = fp.read() - -html = h.replace("--table--", rankings) - -html_output = 'figs/ranking/index.html' -with open(html_output, 'w+') as fp: - fp.write(html) - diff --git a/analysis/archive.py b/analysis/archive.py deleted file mode 100644 index 3fc77cd..0000000 --- a/analysis/archive.py +++ /dev/null @@ -1,165 +0,0 @@ -import numpy as np -import pandas as pd -import email, email.parser -import os, datetime, json, gzip, re -import analysis.util -import analysis.query - -import search.archive ## circular... 
- - -def filter_date(msg, archive_name): - - time_tz = analysis.util.format_date(msg, archive_name) - if not time_tz: - return None - - dt = datetime.datetime.fromtimestamp(time_tz) - try: - date_time = pd.to_datetime(dt) - except pd.tslib.OutOfBoundsDatetime: - print('time out of bound') - print(dt) - return None - - min_date = pd.to_datetime(analysis.util.min_date(archive_name), format='%d/%m/%Y') - max_date = pd.to_datetime(datetime.datetime.now()) - if date_time < min_date or date_time > max_date: - return None - - return date_time - - -def message_to_tuple_record(msg, records, archive_name, references='X'): - - # check date first? - date = filter_date(msg, archive_name) - if not date: - print("Archive::filter_date returned None. Skip.") - return - - # check / filter from email address second? - from_addr = analysis.util.format_from(msg, archive_name) - if not from_addr: - print("Archive::analysis.util.format_from returned None. Skip.") - return - - url = analysis.util.format_url(msg, archive_name) - author = analysis.util.format_author(msg, archive_name) - subject = analysis.util.format_subject(msg, archive_name) - message_id = analysis.util.format_id(msg, archive_name) - content = analysis.util.format_content(msg, archive_name) - - records.append((message_id, - from_addr, - author, - subject, - date, - url, - len(content), - 0 if not 'follow-up' in msg else len(msg['follow-up']), - references)) - - # recursive follow up -- but references is not keeping track really... - if 'follow-up' in msg: - for f in msg['follow-up']: - message_to_tuple_record(f, records, archive_name, references=message_id) - - return - -def json_data_to_pd_dataframe(json_data, archive_name): - - records = [] - for d in json_data: - for dd in d['threads']: - message_to_tuple_record(dd, records, archive_name) - - print('zzzzzzzzz ----> ' + archive_name + " ---- " + str(len(records))) - - df = pd.DataFrame.from_records(records, - index='date', - columns=['message-id', - 'from', - 'author', - 'subject', - 'date', - 'url', - 'content-length', - 'nbr-references', - 'references']) - - df.index.name = 'date' - - return df - -def load_from_file(filename, archive_name, archive_dir, json_data=None): - - if not filename.endswith('.json.gz'): - file_path = os.path.join(archive_dir, filename + '.json.gz') - else: - file_path = os.path.join(archive_dir, filename) - - if os.path.isfile(file_path): - with gzip.open(file_path, 'r') as fp: - json_data = json.load(fp) - return json_data_to_pd_dataframe(json_data['threads'], archive_name) - else: - #list of all "filename[...].json.gz" in archive_dir - files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')]) - if files: - filename = files[-1] # take the most recent (listed alpha-chronological) - file_path = os.path.join(archive_dir, filename) - if os.path.isfile(file_path): - with gzip.open(file_path, 'r') as fp: - json_data = json.load(fp) - return json_data_to_pd_dataframe(json_data['threads'], archive_name) - else: - #list of all json files in archive_dir/filename - dir_path = os.path.join(archive_dir, filename) - if not os.path.isdir(dir_path): - return None - - files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')] - if not files: - return None - - # load all json files - threads = [] - for file_path in files: - with open(file_path, 'r') as fp: - json_data = json.load(fp) - threads.append(json_data) - - 
print('---> ' + archive_name)
-            return json_data_to_pd_dataframe(threads, archive_name)
-
-def load_from_search_archive(archive):
-    threads = []
-    for k, v in archive.archive.items():
-        threads.append(v)
-    return json_data_to_pd_dataframe(threads, archive.archive_name)
-
-
-
-
-class Archive:
-
-    data = None # "raw" json data
-    dataframe = None # main pd dataframe
-
-    def __init__(self, archive_name, archive_dir="archives"):
-
-        if isinstance(archive_name, pd.core.frame.DataFrame):
-            self.dataframe = archive_name ## no copies here
-
-        if isinstance(archive_name, search.archive.Archive):
-            self.dataframe = load_from_search_archive(archive_name)
-
-        if isinstance(archive_name, str):
-            # need a filename or a dir name....
-            self.dataframe = load_from_file(archive_name, archive_name, archive_dir, self.data)
-
-    def query(self):
-        q = analysis.query.Query(self)
-        return q
-
diff --git a/analysis/format.py b/analysis/format.py
deleted file mode 100644
index 2ac54a4..0000000
--- a/analysis/format.py
+++ /dev/null
@@ -1,165 +0,0 @@
-import analysis.query
-import logging, html, numpy
-from tabulate import tabulate
-
-def makeurl(text, url):
-    return '<a href="' + url + '">' + text + "</a>"
-
-def table_threads_ranking(ranking_dataframe):
-
-    html_str = '<table>'
-
-
-    html_str += '<tr>'
-    html_str += '<td>date</td>'
-    html_str += '<td>subject</td>'
-    html_str += '<td>from</td>'
-    html_str += '<td>replies</td>'
-    html_str += '</tr>'
-
-
-    for i, row in ranking_dataframe.iterrows():
-
-        html_str += '<tr>'
-        html_str += '<td>' + str(i) + '</td>'
-        html_str += '<td>' + makeurl(row['subject'], row['url']) + '</td>'
-        html_str += '<td>' + row['from'] + '</td>'
-        html_str += '<td>' + str(row['nbr-references']) + '</td>'
-        html_str += '</tr>'
-
-    html_str += "</table>
" - - return html_str - -def frame_to_dictionary_threads_ranking(ranking_dataframe): - - results = [] - for i, row in ranking_dataframe.iterrows(): - d = {'date': str(i), 'subject': row['subject'], 'url': row['url'], 'from': row['from'], 'nbr-references': row['nbr-references']} - results.append(d) - return results - - - -class Html: - - query = None - - def __init__(self, q=None): - - if not isinstance(q, query.Query): - logging.error("HtmlFormat constructor Error: query must be of type nettime.query.Query") - raise Exception() - - self.query = q - - def threads_ranking(self, rank=5, resolution=None): - - data = self.query.threads_ranking(rank=rank) - - h = html.HTML() - t = h.table() - - r = t.tr - r.td('date', klass='td_date_t') - r.td('from', klass='td_from_t') - r.td('replies', klass='td_rep_t') - r.td('subject', klass='td_subject_t') - - for i, row in data.iterrows(): - r = t.tr - - print(row.index) - - r.td(str(row['date']), klass='td_date') - r.td(row['from'], klass='td_from') - r.td(str(row['nbr-references']), klass='td_rep') - r.td('', klass='td_subject').text(str(h.a(row['subject'], href=row['url'])), escape=False) - - return str(t) - - @staticmethod - def from_dataframe(data_frame, table_name=None, name_map={}, url_map={}): - - header = [] - if data_frame.index.name in name_map: - header.append(name_map[data_frame.index.name]) - else: - header.append(data_frame.index.name) - for h in data_frame.columns: - if h in name_map: - h = name_map[h] - header.append(h) - - css_header = [] - css_element = [] - for i in header: - css_header.append('td_' + i + '_t') - css_element.append('td_' + i) - - h = html.HTML() - if table_name: - t = h.table(id=table_name, klass=table_name + '_t') - else: - t = h.table() - - # url map - url_hash = {} - url_skip = [] - url_keys = url_map.keys() - for u in url_keys: - if u in header and url_map[u] in header: - url_indx = header.index(url_map[u]) - url_hash[header.index(u)] = url_indx - url_skip.append(url_indx) - header.pop(url_indx) - - #header - r = t.tr - n = 0 - for j in header: - r.td(str(j), klass=css_header[n]) - n += 1 - - - #elements - for k, row in data_frame.iterrows(): - r = t.tr - r.td(str(k), klass=css_element[0]) - n = 1 - for l in row: - - if n in url_skip: - continue - - if isinstance(l, float): - if l % 1 > 0: - l = '{0:.4f}'.format(l) - else: - l = int(l) - - if n in url_hash.keys(): - url = row[url_hash[n] - 1] - r.td('', klass=css_element[n]).text(str(h.a(str(l), href=url)), escape=False) - - else: - r.td(str(l), klass=css_element[n]) - n += 1 - - return str(t) - -class Tab: - - @staticmethod - def from_dataframe(data_frame, name_map={}, format=".0f"): - - header = [] - header.append(data_frame.index.name) - for h in data_frame.columns: - if h in name_map: - h = name_map[h] - header.append(h) - - return tabulate(data_frame, headers=header, floatfmt=format) - - diff --git a/analysis/plot.py b/analysis/plot.py deleted file mode 100644 index 8f08430..0000000 --- a/analysis/plot.py +++ /dev/null @@ -1,79 +0,0 @@ -import numpy as np -import pandas as pd -import analysis.query - -# for colormaps see: -# http://scipy.github.io/old-wiki/pages/Cookbook/Matplotlib/Show_colormaps -# http://pandas.pydata.org/pandas-docs/stable/visualization.html#colormaps -# http://matplotlib.org/examples/color/colormaps_reference.html -# for colors see: -# http://matplotlib.org/examples/color/named_colors.html - -# spectre: slategrey -# nettime: red -# crumb: purple -# empyre: darkblue - -def bar_plot_series(series, title, color='blueviolet', ylim=None): - 
return series.plot(kind = 'bar', title=title, color=color, alpha=0.8, stacked=True, ylim=ylim) - -def save(plot, name): - fig = plot.get_figure() - fig.savefig(name) - -class Plot: - - query = None - - def __init__(self, q=None): - - if not isinstance(q, analysis.query.Query): - logging.error("HtmlFormat constructor Error: query must be of type analysis.query.Query") - raise Exception() - - self.query = q - - ''' - activity - ''' - - def activity_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)): - - activity_rank = self.query.activity_from_ranking(rank=rank, series=True).keys() - series = [] - for k in activity_rank: - series.append(self.query.activity_from(k, resolution, series=True)) - - df = pd.concat(series, axis=1) - - return df.plot.area(colormap='spectral', figsize=figsize, stacked=False) - - ''' - content lenght - ''' - - def content_length_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)): - - content_rank = self.query.content_length_from_ranking(rank=rank, series=True).keys() - series = [] - for k in content_rank: - series.append(self.query.content_length_from(k, resolution, series=True)) - - df = pd.concat(series, axis=1) - - return df.plot.area(colormap=colormap, figsize=figsize, stacked=False) - - ''' - threads - ''' - - def threads_from_ranking(self, resolution='y', rank=5, colormap='spectral', figsize=(8, 7)): - - threads_rank = self.query.threads_from_ranking(rank=rank, series=True).keys() - series = [] - for k in threads_rank: - series.append(self.query.threads_from(k, resolution, series=True)) - - df = pd.concat(series, axis=1) - - return df.plot.area(colormap=colormap, figsize=figsize, stacked=False) diff --git a/analysis/query.py b/analysis/query.py deleted file mode 100644 index 5b46488..0000000 --- a/analysis/query.py +++ /dev/null @@ -1,573 +0,0 @@ -import numpy as np -import pandas as pd -import analysis.archive -import logging - -class Query: - - archive = None # analysis.archive.Archive object - activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month)) - content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes)) - threads = None # ... - single_threads = None - replies = None # ... 
- - def __init__(self, arch=None): - - if not isinstance(arch, analysis.archive.Archive): - logging.error("Query constructor Error: arch must be of type analysis.archive.Archive") - raise Exception() - - self.archive = arch - - ''' - activity - ''' - - def _activity(self): - - if self.activity is None: - from_index = self.archive.dataframe.reindex(columns=['from']) - self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0) - - return self.activity - - def activity_from(self, email_address, resolution='y', series=False): - - eaddr = email_address.replace('@', '{at}').lower() - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - self._activity() - try: - af = self.activity[eaddr] - except KeyError: - return None - - activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum() - - if freq == 'AS': - activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y')) - activity_from.index.name = 'year' - else: - activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m')) - activity_from.index.name = 'year-month' - - if series: - return activity_from - - return activity_from.to_frame('nbr-messages').astype(int) - - def activity_from_ranking(self, rank=5, filter_nettime=True, series=False): - - self._activity() - afr = self.activity.sum(axis=0).order(ascending=False) - if filter_nettime: - p = r'^((?!nettime*).)*$' - afr = afr[afr.index.str.contains(p)] - - if series: - return afr[:rank] - - return afr[:rank].to_frame('nbr-messages').astype(int) - - - # def activity_overall(self, resolution='y', series=False): - - # freq = 'M' - # if resolution.lower() == 'y': - # freq = 'AS' - # elif resolution.lower() == 'm': - # freq = 'M' - # else: - # return None - - # self._activity() - - # y = self.activity.sum(axis=1) - # y = y.groupby([pd.TimeGrouper(freq=freq)]).sum() - - # if freq == 'AS': - # y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) - # y.index.name = 'year' - # else: - # y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) - # y.index.name = 'year-month' - - # if series: - # return y - - # return y.to_frame('nbr-messages').astype(int) - - def activity_overall(self, resolution='y', series=False): - - a = self.archive.dataframe['url'] - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count() - - if freq == 'AS': - y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) - y.index.name = 'year' - else: - y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) - y.index.name = 'year-month' - - if series: - return y - - return y.to_frame('nbr-messages').astype(int) - - def cohort(self, resolution='m', series=False): - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - self._activity() - - c = self.activity.idxmax().order().to_frame('date') - c.index = c['date'] - - cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size() - - if freq == 'AS': - cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y')) - cohort.index.name = 'year' - else: - cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m')) - cohort.index.name = 'year-month' - - if series: - return cohort - - return cohort.to_frame('first-messages').astype(int) - 
- ''' - content lenght - ''' - - def _content_length(self): - - if self.content_length is None: - from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length']) - self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum() - self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0) - - return self.content_length - - def content_length_from(self, email_address, resolution='y', series=False): - - eaddr = email_address.replace('@', '{at}').lower() - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - self._content_length() - try: - af = self.content_length[eaddr] - except KeyError: - return None - - content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum() - - if freq == 'AS': - content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y')) - content_length_from.index.name = 'year' - else: - content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m')) - content_length_from.index.name = 'year-month' - - if series: - return content_length_from - - return content_length_from.to_frame('nbr-bytes').astype(int) - - def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False): - - self._content_length() - cfr = self.content_length.sum(axis=0).order(ascending=False) - if filter_nettime: - p = r'^((?!nettime*).)*$' - cfr = cfr[cfr.index.str.contains(p)] - - if series: - return cfr[:rank] - - return cfr[:rank].to_frame('nbr-bytes').astype(int) - - def content_length_overall(self, resolution='y', series=False): - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - self._content_length() - - y = self.content_length.sum(axis=1) - y = y.groupby([pd.TimeGrouper(freq=freq)]).sum() - - if freq == 'AS': - y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) - y.index.name = 'year' - else: - y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) - y.index.name = 'year-month' - - if series: - return y - - return y.to_frame('nbr-bytes').astype(int) - - - ''' - threads - ''' - - def _threads(self, thresh=0): - - print("doing threads") - - if self.threads is None: - self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False) - - if self.single_threads is None: - self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False) - - return self.threads; - - def threads_ranking(self, rank=5, resolution='y'): - - self._threads() - - if resolution == None: - data = self.threads.drop('message-id', axis=1)[:rank] - return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1) - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - # get the threads ranking per time resolution - # - data = self.threads.drop('message-id', axis=1) - data = data.groupby([pd.TimeGrouper(freq=freq)]) - r = {} - for k, v in data: - if freq == 'AS': - time_key = k.strftime('%Y') - else: - 
time_key = k.strftime('%Y-%m') - frame = v[:rank] - frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1) - r[time_key] = frame - return r - - def threads_replies_to(self, email_address, resolution='y', series=False): - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - self._threads() - - eaddr = email_address.replace('@', '{at}').lower() - - self._threads() - threads_from = self.threads.reindex(columns=['from', 'nbr-references']) - threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references - threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0) - - if series: - return threads_from_ranking[eaddr] - - threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int) - - if freq == 'AS': - threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y')) - threads_from_ranking.index.name = 'year' - else: - threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m')) - threads_from_ranking.index.name = 'year-month' - - return threads_from_ranking - - def threads_replies_to_ranking(self, rank=5, filter_nettime=True): - - self._threads() - - tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False) - - if filter_nettime: - p = r'^((?!nettime*).)*$' - tfr = tfr[tfr.index.str.contains(p)] - - tfr = tfr[:rank].astype(int) - return tfr - - def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False): - - self._threads() - tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False) - if filter_nettime: - p = r'^((?!nettime*).)*$' - tir = tir[tir.index.str.contains(p)] - - if series: - return tir[:rank] - - return tir[:rank].to_frame('nbr-initiated-threads').astype(int) - - def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True): - - # activity - self._activity() - afr = self.activity.sum(axis=0).astype(int) - if filter_nettime: - p = r'^((?!nettime*).)*$' - afr = afr[afr.index.str.contains(p)] - - # initiated threads [top 25] - self._threads() - tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25 - if filter_nettime: - p = r'^((?!nettime*).)*$' - tir = tir[tir.index.str.contains(p)] - - inter = afr.index.intersection(tir.index) - avg = tir[inter] / afr[inter] - - labels = ['messages', 'threads', 'avg.threads'] - return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank] - - def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True): - - self._threads() - - #initiated - tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False) - if filter_nettime: - p = r'^((?!nettime*).)*$' - tir = tir[tir.index.str.contains(p)] - - #replies [top 25] - tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25 - if filter_nettime: - p = r'^((?!nettime*).)*$' - tfr = tfr[tfr.index.str.contains(p)] - tfr = tfr['nbr-references'] # dataframe to series - - - inter = tir.index.intersection(tfr.index) - avg = tfr[inter] / tir[inter] - - labels = 
['threads', 'replies', 'avg.replies'] - return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank] - - - def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0): - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - agg = aggregate.lower() - if not agg in ['sum', 'mean', 'count']: - return None - - if not self.threads is None: - del self.threads - self.threads = None - - self._threads(tresh) - - if agg == 'sum': - # number of replies total (re: sum all the replies) - y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum() - elif agg == 'mean': - y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean() - else: - # number of threads (re: msgs with at least one reply) - y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count() - - if freq == 'AS': - y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) - y.index.name = 'year' - else: - y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) - y.index.name = 'year-month' - - if series: - return y - - return y.to_frame('nbr-threads').astype(int) - - def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1): - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - agg = aggregate.lower() - if not agg in ['sum', 'mean', 'count']: - return None - - if not self.single_threads is None: - del self.single_threads - self.single_threads = None - - self._threads(tresh) - - - y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count() - - - if freq == 'AS': - y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) - y.index.name = 'year' - else: - y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) - y.index.name = 'year-month' - - if series: - return y - - return y.to_frame('nbr-threads').astype(int) - - - ''' - replies - ''' - - def _replies(self): - - if self.replies is None: - self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from','references']) - self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from','references']) - return self.replies; - - def replies_ranking(self, rank=5, resolution=None): - - self._replies() - - if resolution == None: - data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank] - return data.to_frame('nbr_replies') - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - # get the threads ranking per time resolution - # - data = self.replies.groupby([pd.TimeGrouper(freq=freq)]) - r = {} - for k, v in data: - if freq == 'AS': - time_key = k.strftime('%Y') - else: - time_key = k.strftime('%Y-%m') - frame = v.groupby('from').size().sort_values(ascending=False)[:rank] - r[time_key] = frame.to_frame('nbr-replies') - return r - - def replies_avg_ranking(self, rank=5, filter_nettime=True): - - # activity - self._activity() - afr = self.activity.sum(axis=0) - if filter_nettime: - p = r'^((?!nettime*).)*$' - afr = afr[afr.index.str.contains(p)] - - # replies in thread [top 25] - - self._replies() - rpl = data = self.replies.groupby('from').size().sort_values(ascending=False)[:25] - - inter = afr.index.intersection(rpl.index) - avg = rpl[inter] / afr[inter] - - 
labels = ['messages', 'replies', 'avg.replies'] - return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank] - - def replies_overall(self, resolution='y', series=False): - - freq = 'M' - if resolution.lower() == 'y': - freq = 'AS' - elif resolution.lower() == 'm': - freq = 'M' - else: - return None - - if not self.replies is None: - del self.replies - self.replies = None - - self._replies() - - y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count() - - - if freq == 'AS': - y.index = y.index.format(formatter=lambda x: x.strftime('%Y')) - y.index.name = 'year' - else: - y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m')) - y.index.name = 'year-month' - - if series: - return y - - return y.to_frame('nbr-replies').astype(int) - - - - diff --git a/analysis/util.py b/analysis/util.py deleted file mode 100644 index 4602517..0000000 --- a/analysis/util.py +++ /dev/null @@ -1,92 +0,0 @@ -import email -import hashlib - -def format_content(msg, archive_name): - return msg['content'] - -def format_url(msg, archive_name): - return msg['url'] - -def format_author(msg, archive_name): - return msg['author_name'] - -def format_from_token(from_str, sep): - - fff = from_str - - from_addr = email.utils.parseaddr(from_str)[1] - - fffa = email.utils.parseaddr(from_str) - - if sep not in from_addr: - tok = from_str.split() - try: - at = tok.index(sep) - from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]]) - if from_addr.startswith('<') or from_addr.endswith('>'): - from_addr = from_addr.strip('<').strip('>') - except ValueError: - print(tok) - print("error formating 'from' " + from_str + " -- expecting sep: " + sep) - print("*** " + fff) - print("+++") - print(fffa) - print("----") - - return None - else: - from_addr = from_addr.replace(sep, '{AT}') - return from_addr.lower() - -def format_from(msg, archive_name): - from_str = msg['from'] - - if " {AT} " in from_str: - return format_from_token(from_str, '{AT}') - elif " at " in from_str: - return format_from_token(from_str, 'at') - elif "@" in from_str: - return format_from_token(from_str, '@') - else: - return from_str - -# returns utc timestamp -def format_date(msg, archive_name): - date_str = msg['date'] - time_tz = None - try: - date_tz = email.utils.parsedate_tz(date_str) - time_tz = email.utils.mktime_tz(date_tz) #utc timestamp - except TypeError: - print("Format Date TypeError") - print(" > " + date_str) - return None - except ValueError: - print("Format Date ValueError") - print(" > " + date_str) - return None - finally: - return time_tz - -def format_subject(msg, archive_name): - return msg['subject'] - -def format_id(msg, archive_name): - if "message-id" in msg: - return msg['message-id'] - else: - # create hash with author_name + date - s = msg['author_name'] + msg['date'] - sha = hashlib.sha1(s.encode('utf-8')) - return sha.hexdigest() - -# format='%d/%m/%Y' -def min_date(archive_name): - if "nettime" in archive_name: - return '01/10/1995' - elif archive_name == "spectre": - return '01/08/2001' - elif archive_name == "empyre": - return '01/01/2002' - elif archive_name == "crumb": - return '01/02/2001' diff --git a/search/__init__.py b/archive/__init__.py similarity index 100% rename from search/__init__.py rename to archive/__init__.py diff --git a/archive/archive.py b/archive/archive.py new file mode 100644 index 0000000..4610904 --- /dev/null +++ b/archive/archive.py @@ -0,0 +1,257 @@ +import email, email.parser +import os, json, gzip, re +import 
mysql.connector as mariadb
+import archive.sql, archive.util
+import terminal.progress, terminal.util
+from datetime import date, datetime
+from dateutil import parser
+
+def load_from_file(filename, archive_name, archive_dir):
+
+    if not filename.endswith('.json.gz'):
+        file_path = os.path.join(archive_dir, filename + '.json.gz')
+    else:
+        file_path = os.path.join(archive_dir, filename)
+
+    if os.path.isfile(file_path):
+        with gzip.open(file_path, 'r') as fp:
+            json_data = json.load(fp)
+        return (json_data, archive_name)
+    else:
+        #list of all "filename[...].json.gz" in archive_dir
+        files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
+        if files:
+            filename = files[-1] # take the most recent (listed alpha-chronological)
+            file_path = os.path.join(archive_dir, filename)
+            if os.path.isfile(file_path):
+                with gzip.open(file_path, 'r') as fp:
+                    json_data = json.load(fp)
+                return (json_data, archive_name) # <--- this makes no sense....
+
+        else:
+            #list of all json files in archive_dir/filename
+            dir_path = os.path.join(archive_dir, filename)
+            if not os.path.isdir(dir_path):
+                return None
+
+            files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
+            if not files:
+                return None
+
+            # load all json files
+            threads = []
+            for file_path in files:
+                with open(file_path, 'r') as fp:
+                    json_data = json.load(fp)
+                threads.append(json_data)
+
+            return (threads, archive_name)
+
+def connect_db(database, host, user, password):
+
+    con = None
+    try:
+        con = mariadb.connect(host=host, user=user, password=password, database=database)
+    except mariadb.Error as error:
+        print("Error: {}".format(error))
+        if error.errno == 1049: # ER_BAD_DB_ERROR: the database itself is missing
+            if terminal.util.y_n_question("Database " + database + " does not exist. Create it?"):
+                print("creating")
+            else:
+                print("not creating")
+        return None
+    return con
+
+
+class Archive:
+
+    data = None # "raw" json data
+    db_con = None
+
+    # Python does not overload constructors, so the three original __init__
+    # definitions (file archive, database credentials, config dict) are
+    # merged into one; the last definition would otherwise silently win.
+    def __init__(self, archive_name, archive_dir=None, database=None, host=None, user=None, password=None, config=None):
+
+        self.archive_name = archive_name
+
+        if isinstance(archive_dir, dict) and config is None:
+            # tolerate positional Archive(name, config_dict) calls
+            config, archive_dir = archive_dir, None
+
+        if isinstance(archive_name, str) and archive_dir is not None:
+            # need a filename or a dir name....
+            print("reading archive " + archive_name, end='')
+            (self.data, self.archive_name) = load_from_file(archive_name, archive_name, archive_dir)
+            print(" - done.")
+        elif config is not None:
+            self.db_con = connect_db(config['database'], config['host'], config['user'], config['password'])
+        elif database is not None:
+            self.db_con = connect_db(database, host, user, password)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self.db_con is not None:
+            self.db_con.close()
+
+
+    def create_db(self, host, database, user, password):
+
+        print("creating table: " + self.archive_name, end='')
+        self.db_con = connect_db(database, host, user, password)
+        if self.db_con is None:
+            return
+
+        try:
+            cursor = self.db_con.cursor()
+            cursor.execute(archive.sql.CREATE.format(self.archive_name))
+        except mariadb.Error as error:
+            print("Error: {}".format(error))
+        finally:
+            cursor.close()
+
+        print(" - done.")
+
+    def insert_db(self, host, database, user, password):
+
+        self.db_con = connect_db(database, host, user, password)
+
+        if self.db_con is None:
+            return
+
+        try:
+            cursor = self.db_con.cursor()
+
+            progress = terminal.progress.ProgressBar(self.archive_name, len(self.data), fmt=terminal.progress.ProgressBar.FULL)
+
+            for t in self.data:
+
+                n_inserted = self.recursive_insert_db(cursor, t["threads"])
+                # print(" - insert: " + str(n_inserted), end='')
+                if n_inserted > 0:
+                    self.db_con.commit()
+
+                progress.current += 1
+                progress()
+
+            progress.done()
+            self.db_con.commit()
+
+        except mariadb.Error as error:
+            pass
+            # print("Error: {}".format(error))
+        finally:
+            cursor.close()
+
+    def recursive_insert_db(self, cursor, thread):
+
+        n_inserted = 0
+        for m in thread:
+            try:
+
+                from_ = archive.util.format_from(m)
+                author_name_ = archive.util.format_author(m)
+                to_ = archive.util.format_to(m)
+                date_ = archive.util.format_date(m, self.archive_name)
+
+                if date_ is None or from_ is None:
+                    # print("\nerrorororororo")
+                    # print(m['from'] + " -- " + m['date'])
+                    continue
+
+                # interpolate the table name so INSERT works for any archive,
+                # not only nettime_l (see archive/sql.py)
+                cursor.execute(archive.sql.INSERT.format(self.archive_name), (from_,author_name_,to_,m["subject"],date_,m["content-type"],m["content"],m["url"]))
+                n_inserted += 1
+
+                if "follow-up" in m:
+                    n_inserted += self.recursive_insert_db(cursor, m["follow-up"])
+
+            except mariadb.Error as error:
+                if error.errno == 1062:
+                    #duplication continue <------------------------- look this up...
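+                    # (looked up: errno 1062 is MySQL/MariaDB ER_DUP_ENTRY --
+                    # the message's (from_, date_) primary key already exists,
+                    # i.e. it was inserted on an earlier run, so skipping is safe)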
+                    # print("\nError: {}".format(error))
+                    continue
+
+        return n_inserted
+
+    def content_search(self, term, bool=True):
+
+        if self.db_con is None:
+            print("No connection to database...")
+            return
+
+        try:
+            cursor = self.db_con.cursor(buffered=True)
+            if bool:
+                cursor.execute(archive.sql.CONTENT_QUERY_BOOLEAN.format(self.archive_name, term))
+            else:
+                cursor.execute(archive.sql.CONTENT_QUERY_NL.format(self.archive_name, term))
+
+            # print(cursor.rowcount)
+            results = []
+            for (from_, author_name_, subject_, date_, url_) in cursor:
+                results.append((from_, author_name_, subject_, date_, url_))
+                # print("{} {} {}".format(from_, str(date_), url_))
+            return results
+
+        except mariadb.Error as error:
+            print("Error: {}".format(error))
+        finally:
+            cursor.close()
+
+    def from_search(self, term, bool=True):
+
+        if self.db_con is None:
+            print("No connection to database...")
+            return
+
+        try:
+            cursor = self.db_con.cursor(buffered=True)
+            if bool:
+                cursor.execute(archive.sql.FROM_QUERY_BOOLEAN.format(self.archive_name, term))
+            else:
+                cursor.execute(archive.sql.FROM_QUERY_NL.format(self.archive_name, term))
+
+            # print(cursor.rowcount)
+            results = []
+            for (from_, author_name_, subject_, date_, url_) in cursor:
+                results.append((from_, author_name_, subject_, date_, url_))
+                # print("{} {} {}".format(from_, str(date_), url_))
+            return results
+
+        except mariadb.Error as error:
+            print("Error: {}".format(error))
+        finally:
+            cursor.close()
+
+    # analysis
+    def longest_field(self, field, thread, max_length=0):
+        import archive.util
+        for m in thread:
+            if not field in m:
+                if "threads" in m:
+                    max_length = self.longest_field(field, m["threads"], max_length)
+                continue
+            if m[field] is None:
+                continue
+            if field == "from":
+                m[field] = archive.util.format_from(m)
+            elif field == "author_name":
+                m[field] = archive.util.format_author(m)
+            elif field == "to":
+                m[field] = archive.util.format_to(m)
+            elif field == "date":
+                m[field] = str(archive.util.format_date(m, self.archive_name))
+
+
+            if m[field] is None:
+                continue
+
+            l = len(m[field])
+            if l > max_length:
+                max_length = l
+                print(">> " + m[field])
+            if "follow-up" in m:
+                max_length = self.longest_field(field, m["follow-up"], max_length)
+        return max_length
+
+
+
diff --git a/archive/sql.py b/archive/sql.py
new file mode 100644
index 0000000..eee0474
--- /dev/null
+++ b/archive/sql.py
@@ -0,0 +1,31 @@
+CREATE = "CREATE TABLE `{}` (" \
+            "`from_` varchar(85) NOT NULL," \
+            "`author_name_` varchar(200) NOT NULL," \
+            "`to_` text(60)," \
+            "`subject_` varchar(3500) NOT NULL," \
+            "`date_` datetime NOT NULL," \
+            "`content_type_` varchar(15) NOT NULL," \
+            "`content_` mediumtext NOT NULL," \
+            "`url_` varchar(100) NOT NULL," \
+"PRIMARY KEY(`from_`, `date_`)," \
+"FULLTEXT (`subject_`, `content_`)," \
+"FULLTEXT (`from_`, `author_name_`)" \
+") ENGINE = InnoDB;"
+
+# the table name is filled in with str.format(), as in CREATE above
+INSERT = ("INSERT INTO `{}` "
+          "(from_, author_name_, to_, subject_, date_, content_type_, content_, url_) "
+          "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
+
+CONTENT_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+                         "WHERE MATCH(subject_, content_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
+
+CONTENT_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+                    "WHERE MATCH(subject_, content_) AGAINST('{}') ORDER BY date_")
+
+FROM_QUERY_BOOLEAN = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+                      "WHERE MATCH(from_, author_name_) AGAINST('{}' IN BOOLEAN MODE) ORDER BY date_")
+
+FROM_QUERY_NL = ("SELECT from_, author_name_, subject_, date_, url_ FROM {} "
+                 "WHERE MATCH(from_, author_name_) AGAINST('{}') ORDER BY date_")
+
+# SELECT from_, author_name_, subject_, date_, url_ FROM nettime_l WHERE MATCH(content_) AGAINST('%s' IN BOOLEAN MODE)
\ No newline at end of file
diff --git a/archive/util.py b/archive/util.py
new file mode 100755
index 0000000..70a9d67
--- /dev/null
+++ b/archive/util.py
@@ -0,0 +1,225 @@
+import email, datetime, sys
+import hashlib
+import dateparser
+
+def format_content(msg):
+    return msg['content']
+
+def format_url(msg):
+    return msg['url']
+
+def format_author(msg):
+
+    if 'author_name' not in msg or msg['author_name'] is None:
+        return None
+
+    author_str = msg['author_name'].replace('"', '')
+
+    if "by way of" in author_str:
+        toks = author_str.split("by way of")
+        if toks[0] == "":
+            author_str = format_from(msg)
+        elif toks[0][-1] == "(":
+            author_str = toks[0][:-1].strip()
+        else:
+            author_str = toks[0]
+
+    if ("(" in author_str) or ("<" in author_str):
+        # ex. zx {AT} xyz.net (Michel Foucault) OR Michel Foucault (c'estcommeca.com) OR Michel Foucault <zx {AT} xyz.net>
+        # print("±±±±±±")
+        # print("name: " + author_str)
+        # print("from: " + msg['from'])
+        if not '@' in author_str.lower().replace('{at}', '@').replace(' at ', '@'):
+            author_str = author_str.split('(')[0].strip()
+        else:
+            author_str = email.utils.parseaddr(author_str)[0]
+        # print(" Name:" + author_str.replace('"', ''))
+        # print(" From:" + format_from(msg))
+
+    if " ," in author_str:
+        # nettime's_roving_reporter , thing.net {AT} bbs.thing.net
+        author_str = author_str.split(' ,')[0]
+
+
+    return author_str
+
+def format_from_token(from_str, sep):
+    from_addr = email.utils.parseaddr(from_str)[1]
+    if sep not in from_addr:
+        tok = from_str.split()
+        try:
+            at = tok.index(sep)
+            from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]])
+            if from_addr.startswith('<') or from_addr.endswith('>'):
+                from_addr = from_addr.strip('<').strip('>')
+        except ValueError:
+            print(tok)
+            print("error formatting 'from' " + from_str + " -- expecting sep: " + sep)
+            return None
+    else:
+        from_addr = from_addr.replace(sep, '{AT}')
+    return "".join(from_addr.lower().split())
+
+def format_from(msg):
+
+    if 'from' not in msg or msg['from'] is None:
+        return None
+
+    from_str = msg['from']
+
+    if " {AT} " in from_str:
+        return format_from_token(from_str, '{AT}')
+    elif " at " in from_str:
+        return format_from_token(from_str, 'at')
+    elif "@" in from_str:
+        return format_from_token(from_str, '@')
+    else:
+        return "".join(from_str.split())
+
+def format_to(msg):
+
+    if "to" not in msg or msg["to"] is None:
+        return None
+
+    to_str = msg["to"]
+    toks = email.utils.parseaddr(to_str)
+    # print(toks)
+
+    if len(toks) == 2:
+        to_str = toks[1]
+
+    return "".join(to_str.lower().split())
+
+
+# returns utc timestamp --- old...
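+# (format_date_utc is kept for reference only; format_date below supersedes
+# it, parsing with dateparser and patching common malformed date headers)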
+def format_date_utc(msg, archive_name): + + if 'date' not in msg or msg['date'] is None: + return None + + date_str = msg['date'].replace('.', '') + time_tz = None + try: + date_tz = email.utils.parsedate_tz(date_str) + time_tz = email.utils.mktime_tz(date_tz) #utc timestamp + except TypeError: + print("Format Date TypeError") + print(" > " + date_str) + return None + except ValueError: + print("Format Date ValueError") + print(" > " + date_str) + return None + finally: + return time_tz + +def format_date(msg, archive_name): + + if 'date' not in msg or msg['date'] is None: + return None + + # date_str = msg['date'].replace('.', '') + date_str = msg['date'] + + # fix Thu, 01 Aug 2002 17:33:08 +0900 (JST) + if '(' in date_str: + date_str = date_str.split('(')[0].rstrip() + + + date_time = dateparser.parse(date_str) + if date_time is None: + + # random stuff... + fix = False + toks = date_str.split() + + if len(toks[-1]) == 5 or len(toks[-1]) == 4: + # ex. Thu, 24 Jan 2002 15:21:31 -0000 + if toks[-1] in ['+0000', '-0000', '0000']: + date_str = date_str[:-5] + fix = True + # ex. Fri, 25 Jan 2002 13:21:49 +1050 + elif toks[-1][-2] == '5': + d = list(date_str) + d[-2] = '3' + date_str = "".join(d) + fix = True + + if toks[-1][-1] != '0': + #ex. 'Fri,', '20', 'Jun', '1997', '02:58:59', '-0005' + date_str = date_str[:-5] + fix = True + + if 'Fru' in toks[0]: + date_str = date_str.replace('Fru', 'Fri') + fix = True + elif 'Thur' in toks[0]: + date_str = date_str.replace('Thur', 'Thu') + fix = True + + if not fix: + # print("----") + return None + + date_time = dateparser.parse(date_str) + if date_time is None: + + if 'GMT' in date_str: + # ex. 'Mon,', '15', 'Jan', '96', '02:55', 'GMT+0100' + date_str = date_str.split('GMT')[0].rstrip() + fix = True + + if 'METDST' in toks[-1]: + # ex. 
'Sat,', '3', 'May', '97', '21:07', 'METDST' + date_str = date_str.replace('METDST', 'MET') + fix = True + + + if not fix: + # print("++++") + return None + + date_time = dateparser.parse(date_str) + return date_time + + # else: + # print(date_str) + + # date_time = datetime.datetime.fromtimestamp(time_tz) + + min_d = datetime.datetime.strptime(min_date(archive_name), "%d/%m/%Y") + max_d = datetime.datetime.now() + + date_time_naive = date_time.replace(tzinfo=None) + + if date_time_naive < min_d or date_time_naive > max_d: + return None + + return date_time + +def format_subject(msg, archive_name): + + if 'subject' not in msg or msg['subject'] is None: + return None + + return msg['subject'] + +def format_id(msg, archive_name): + if "message-id" in msg: + return msg['message-id'] + else: + # create hash with author_name + date + s = msg['author_name'] + msg['date'] + sha = hashlib.sha1(s.encode('utf-8')) + return sha.hexdigest() + +# format='%d/%m/%Y' +def min_date(archive_name): + if "nettime" in archive_name: + return '01/10/1995' + elif archive_name == "spectre": + return '01/08/2001' + elif archive_name == "empyre": + return '01/01/2002' + elif archive_name == "crumb": + return '01/02/2001' diff --git a/conda_env.yml b/conda_env.yml index 370d12c..d5896ed 100644 --- a/conda_env.yml +++ b/conda_env.yml @@ -1,29 +1,34 @@ -name: listservs +name: listserv channels: -- defaults + - defaults dependencies: -- beautiful-soup=4.3.2=py34_0 -- click=6.7=py34_0 -- flask=0.12=py34_0 -- gunicorn=19.1.0=py34_0 -- html5lib=0.999=py34_0 -- itsdangerous=0.24=py34_0 -- jinja2=2.9.6=py34_0 -- markupsafe=0.23=py34_2 -- openssl=1.0.2l=0 -- pastedeploy=1.5.2=py34_1 -- pip=9.0.1=py34_1 -- python=3.4.5=0 -- readline=6.2=2 -- setuptools=27.2.0=py34_0 -- six=1.10.0=py34_0 -- sqlite=3.13.0=0 -- tk=8.5.18=0 -- werkzeug=0.11.15=py34_0 -- wheel=0.29.0=py34_0 -- xz=5.2.2=1 -- zlib=1.2.8=3 -- pip: - - beautifulsoup4==4.3.2 - - webencodings==0.5.1 + - ca-certificates=2019.5.15=0 + - openssl=1.0.2s=h1de35cc_0 + - pip=9.0.1=py34_1 + - python=3.4.5=0 + - readline=6.2=2 + - setuptools=27.2.0=py34_0 + - sqlite=3.13.0=0 + - tk=8.5.18=0 + - wheel=0.29.0=py34_0 + - xz=5.2.4=h1de35cc_4 + - zlib=1.2.11=h1de35cc_3 + - pip: + - beautifulsoup4==4.7.1 + - click==7.0 + - dateparser==0.7.1 + - flask==1.0.4 + - gunicorn==19.9.0 + - itsdangerous==1.1.0 + - jinja2==2.10.1 + - markupsafe==1.1.1 + - mysql-connector-python==8.0.16 + - protobuf==3.8.0 + - python-dateutil==2.8.0 + - pytz==2019.1 + - regex==2019.6.8 + - six==1.12.0 + - soupsieve==1.9.2 + - tzlocal==1.5.1 + - werkzeug==0.15.4 diff --git a/search/archive.py b/search/archive.py deleted file mode 100644 index 7b4631a..0000000 --- a/search/archive.py +++ /dev/null @@ -1,150 +0,0 @@ -import logging, os, json, re -from datetime import datetime - -import analysis.archive ## circular... 
-import analysis.query -import analysis.format - -import threading - -class Archive(): - - def __init__(self, archives_dir=None): - if archives_dir==None: - from www import config - self.archives_dir = config.ARCHIVES_PATH - else: - self.archives_dir = archives_dir - - self.loaded = False - - self.lock_search = threading.Lock() - self.lock_threads_ranking = threading.Lock() - - def load(self, archive_name=None): - - if archive_name == None: - raise Exception('Archive is not specified') - - archive_path = os.path.join(self.archives_dir, archive_name) - if not os.path.isdir(archive_path): - raise Exception('Archive ' + path + ' does not exist') - - self.archive_name = archive_name - self.archive_path = archive_path - - files = [f for f in os.listdir(archive_path) if f.endswith('.json')] - - self.archive = {} - - for f in files: - file_path = os.path.join(archive_path, f) - label = f.replace('.json', '') - with open(file_path) as fdata: - self.archive[label] = json.load(fdata) - - self.loaded = True - - def search_message(self, keyword, msg, index_str, results, field='content'): - - nbr_hits = 0 - if msg[field] is not None and msg[field].lower().find(keyword.lower()) > 0: - nbr_hits += 1 - results.append({ "index_str": index_str, "subject": msg['subject'], "date": msg['date'], "author_name": msg['author_name'], "url": msg['url'] }) - - if 'follow-up' in msg: - i = 0 - for m in msg['follow-up']: - current_index_str = index_str + '/' + str(i) - nbr_hits += self.search_message(keyword, m, current_index_str, results, field) - i += 1 - - return nbr_hits - - - def search(self, keyword, field='content', min_hits=0): - - with self.lock_search: - - search_results = { "keyword": keyword, "field": field, "archive": self.archive_name, "results": [] } - - for k, v in sorted(self.archive.items(), key=get_key, reverse=True): - - current_index_str = self.archive_name + '/' + k - hits = [] - nbr_hits = 0 - i = 0 - for m in v['threads']: - current_index_str = self.archive_name + '/' + k + '/' + str(i) - nbr_hits += self.search_message(keyword, m, current_index_str, hits, field) - i += 1 - - if nbr_hits > min_hits: - # nettime-l - fix (the name of the thread from ex. 
'nettime-l_Jan_01' to 'January 2001') - if k.startswith("nettime-l_"): - dt = datetime.strptime(k, "nettime-l_%b_%y") - k = dt.strftime("%B_%Y") - search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits}) - - return search_results - - def threads_ranking(self, rank=5): - - with self.lock_threads_ranking: - - search_results = { "keyword": "thread ranking", "field": "ranking", "archive": self.archive_name, "results": [] } - - a = analysis.archive.Archive(self) - q = a.query(); - - ranking = q.threads_ranking(rank=rank) - - for i in ranking: - r = analysis.format.frame_to_dictionary_threads_ranking(ranking[i]) - for h in r: - hit = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}] - search_results['results'].append({'thread': h['date'], 'nbr_hits': h['nbr-references'], 'hits': hit}) - del a - del q - - return search_results - - - -def get_key(kv_tuple): - - k = kv_tuple[0] - - # k is of the form "Month_Year" - ex.: "January_2001" - try: - return datetime.strptime(k, "%B_%Y") - except Exception: - pass - - # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01" - try: - return datetime.strptime(k, "%b_%y") - except Exception: - pass - - # k is of the form "Year" - ex.: "2001" - try: - return datetime.strptime(k, "%Y") - except Exception: - pass - - # nettime-l - fix - k is of the form "nettime-l_Month(abv)_Year(abv)" - ex.: "nettime-l_Jan_01" - try: - return datetime.strptime(k, "nettime-l_%b_%y") - except Exception: - pass - - print("--------------") - print(k) - - return None - - - - - diff --git a/setenv b/setenv index af68e55..fa717b3 100644 --- a/setenv +++ b/setenv @@ -1 +1 @@ -source activate listservs \ No newline at end of file +source activate listserv diff --git a/terminal/progress.py b/terminal/progress.py new file mode 100644 index 0000000..c66f3b7 --- /dev/null +++ b/terminal/progress.py @@ -0,0 +1,43 @@ +from __future__ import print_function +import sys +import re + +# https://stackoverflow.com/questions/3160699/python-progress-bar + +class ProgressBar(object): + DEFAULT = 'Progress: %(bar)s %(percent)3d%%' + FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go' + + def __init__(self, title, total, width=40, fmt=DEFAULT, symbol='=', + output=sys.stderr): + assert len(symbol) == 1 + + self.title = title + self.total = total + self.width = width + self.symbol = symbol + self.output = output + self.fmt = re.sub(r'(?P%\(.+?\))d', + r'\g%dd' % len(str(total)), fmt) + + self.current = 0 + + def __call__(self): + percent = self.current / float(self.total) + size = int(self.width * percent) + remaining = self.total - self.current + bar = self.title + ' [' + self.symbol * size + ' ' * (self.width - size) + ']' + + args = { + 'total': self.total, + 'bar': bar, + 'current': self.current, + 'percent': percent * 100, + 'remaining': remaining + } + print('\r' + self.fmt % args, file=self.output, end='') + + def done(self): + self.current = self.total + self() + print('', file=self.output) \ No newline at end of file diff --git a/terminal/util.py b/terminal/util.py new file mode 100644 index 0000000..695b671 --- /dev/null +++ b/terminal/util.py @@ -0,0 +1,16 @@ + +def y_n_question(question_str): + + yes = {'yes','y', 'ye', ''} + no = {'no','n'} + + while True: + sys.stdout.write(question_str + " [Y/n]: ") + choice = input().lower() + if choice in yes: + return True + elif choice in no: + return False + else: + sys.stdout.write("\nPlease respond with 'yes' or 'no'\n") + continue diff --git a/www-serve b/www-serve index 
diff --git a/www-serve b/www-serve
index 318a511..ee4b738 100644
--- a/www-serve
+++ b/www-serve
@@ -1,2 +1 @@
-
-gunicorn -w 1 -b 127.0.0.1:5555 www-serve:app
\ No newline at end of file
+gunicorn -w 1 --bind 0.0.0.0:5555 www-serve:app
\ No newline at end of file
diff --git a/www-serve.py b/www-serve.py
index c9c8833..6bac083 100644
--- a/www-serve.py
+++ b/www-serve.py
@@ -1,2 +1,4 @@
 from www import app
-#app.run(debug=True, threaded=True, use_reloader=False) # uncomment this line to run flask's server
+
+if __name__ == "__main__":
+    app.run(debug=True, use_reloader=False)
\ No newline at end of file
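
With the __main__ guard in place the app now has two explicit entry points; a usage note, not part of the patch:

    # development:  python www-serve.py   -> Flask's debug server (reloader off)
    # production:   ./www-serve           -> gunicorn, one worker, bound to 0.0.0.0:5555
    #
    # note: moving the bind address from 127.0.0.1 to 0.0.0.0 makes the port
    # reachable from other hosts, so front it with a firewall or reverse proxy.
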
diff --git a/www/routes.py b/www/routes.py
index ec29df4..f2b33be 100644
--- a/www/routes.py
+++ b/www/routes.py
@@ -1,144 +1,48 @@
 from flask import render_template, request, jsonify
 from www import app
-from www import archives
-import search.archive
+import archive.archive as archive
+import config
+import www.config as wconfig
 from datetime import datetime
-
 import logging
 
-logging.info(' ------- arch = Archives() -------- ')
-arch = archives.Archives()
-arch.load()
-archives_data = arch.data
 
 @app.route('/')
 def index():
-    k = archives_data.keys()
-    return render_template("index.html", archives=k)
-
-# def get_key(kv_tuple):
-
-#     k = kv_tuple[0]
-
-#     # k is of the form "Month_Year" - ex.: "January_2001"
-#     try:
-#         return datetime.strptime(k, "%B_%Y")
-#     except Exception:
-#         pass
-
-#     # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
-#     try:
-#         return datetime.strptime(k, "%b_%y")
-#     except Exception:
-#         pass
-
-#     # k is of the form "Year" - ex.: "2001"
-#     try:
-#         return datetime.strptime(k, "%Y")
-#     except Exception:
-#         pass
-
-#     return None
-
-@app.route('/<list>')
-def get_list(list):
-    if list in archives_data:
-        d = []
-        for k, v in sorted(archives_data[list].archive.items(), key=search.archive.get_key, reverse=True):
-            d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
-        return render_template("list.html", list_name=list, list=d)
-
-    else:
-        return 'nee nee'
-
-@app.route('/<list>/<sublist>')
-def get_sublist(list, sublist):
-
-    print(list)
-    print(sublist)
-
-    sublist = sublist.replace(' ', '_')
-    if list in archives_data and sublist in archives_data[list].archive:
-        return render_template("threads.html", sublist_name=sublist, threads=archives_data[list].archive[sublist]['threads'])
-    else:
-        return 'na na'
-
-@app.route('/<list>/<sublist>/<index>')
-def get_message(list, sublist, index):
-
-    sublist = sublist.replace(' ', '_')
-    index = int(index)
-    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
-        return render_template("message.html", message=archives_data[list].archive[sublist]['threads'][index])
-    else:
-        'non non'
-
-@app.route('/<list>/<sublist>/<index>/<path:follow_ups>')
-def get_follow_ups(list, sublist, index, follow_ups):
-
-    sublist = sublist.replace(' ', '_')
-    index = int(index)
-
-    ups = follow_ups.split('/')
-    follow = []
-    for u in ups:
-        follow.append(int(u))
-
-    if list in archives_data and sublist in archives_data[list].archive and index < len(archives_data[list].archive[sublist]['threads']):
-        message = archives_data[list].archive[sublist]['threads'][index]
-        for f in follow:
-            message = message['follow-up'][f]
-        return render_template("message.html", message=message)
-    else:
-        'nope nope'
+    return render_template("index.html")
 
 @app.route('/search')
 def searh():
 
     if len(request.args) < 1:
-        k = archives_data.keys()
-        return render_template("search.html", archives=k, fields=['content', 'from(name)', 'from(email)'], hits=['n/a', '2', '3', '4', '5', '6', '7', '8', '9'])
+        return render_template("search.html", archives=wconfig.lists_to_serve, fields=['content', 'from'])
 
     k_arg = request.args.get('keyword')
     l_arg = request.args.get('list')
-    sl_arg = request.args.get('sublist')
     f_arg = request.args.get('field')
-    h_arg = request.args.get('hits')
 
     if k_arg is None or k_arg.strip() == '':
         return "no keyword..."
 
     if l_arg is None:
         return "no list..."
 
-    if not (l_arg == "all") and not (l_arg in archives_data):
+    if l_arg != "all" and l_arg not in wconfig.lists_to_serve:
         return "list '" + l_arg + "' does not exist"
 
-    if sl_arg is not None:
-        if not sl_arg in archives_data[l]:
-            return "sublist '" + sl_arg + "' does not exist in list '" + l_arg + "'"
+    if f_arg not in ['content', 'from']:
+        return "field '" + str(f_arg) + "' does not exist"
 
-    if f_arg == "from(name)":
-        f_arg = 'author_name'
-    elif f_arg == "from(email)":
-        f_arg = 'from'
 
     lists = []
     if l_arg == "all":
-        for k in archives_data.keys():
-            lists.append(k)
+        lists = wconfig.lists_to_serve
     else:
         lists.append(l_arg)
 
-    nbr_hits = 0
-    if h_arg in ['2', '3', '4', '5', '6', '7', '8', '9']:
-        nbr_hits = int(h_arg)
-
     ################################
     ##
-    ## need to cache all the below
+    ## need to cache all the below.....
     ##
     ################################
@@ -147,18 +51,41 @@ def searh():
     logging.info("search keyword = " + k_arg)
 
     for l in lists:
-        if k_arg == "rank":
-            logging.info(" ranking " + l)
-            s = archives_data[l].threads_ranking()
-        else:
-            s = archives_data[l].search(keyword=k_arg, field=f_arg, min_hits=nbr_hits)
+
+        with archive.Archive(l, config=config.db) as a:
+            if f_arg == 'content':
+                r = a.content_search(k_arg)
+            else:
+                r = a.from_search(k_arg)
 
-        results.append(s)
+            # format data to return
+            search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] }
+            month_year_results = {}
 
-    ## -- sort results?
-    search_results = sorted(results, key=get_result_key)
+            for (from_, author_name_, subject_, date_, url_) in r:
+                m_y = date_.strftime("%B_%Y")
+                if m_y not in month_year_results:
+                    month_year_results[m_y] = []
+                month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_})
 
-    return jsonify(result=search_results)
+            for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
+                search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v})
+
+            # search_results['results'].append({ 'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})
+            # where:
+            #     'thread' = "%B_%Y" aka. January 2001
+            #     'nbr_hits' = nbr hits for that month
+            #     'hits' = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]
+
+            results.append(search_results)
+
+
+    sorted_results = sorted(results, key=get_result_key)
+    return jsonify(result=sorted_results)
+
+
+def get_key(kv):
+    return datetime.strptime(kv[0], "%B_%Y")
 
 def get_result_key(r):
     return r['archive']
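
routes.py now leans on the new archive package. The body of archive/archive.py lives elsewhere in this patch, so the sketch below only spells out the contract the /search route assumes: a context manager opened per list, whose content_search/from_search return rows unpacked as (from_, author_name_, subject_, date_, url_), with date_ a datetime so strftime("%B_%Y") works. Everything here is illustrative, not the real implementation.

    from datetime import datetime

    class Archive(object):
        # contract sketch -- see archive/archive.py for the real thing
        def __init__(self, archive_name, config=None):
            self.archive_name = archive_name
            self.config = config            # e.g. config.db connection settings

        def __enter__(self):
            # open the database connection here
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            # close the database connection here
            return False                    # do not swallow exceptions

        def content_search(self, keyword):
            # rows of (from, author_name, subject, date, url)
            return [('a@b.org', 'A. Author', 'Re: a subject',
                     datetime(2001, 1, 15), 'https://example.org/msg00001.html')]

        def from_search(self, keyword):
            return []
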

diff --git a/www/templates/index.html b/www/templates/index.html
index 1476481..646c4a0 100644
--- a/www/templates/index.html
+++ b/www/templates/index.html
@@ -1,8 +1,6 @@
 
 
-{% for a in archives %}
-<a href="/{{ a }}">{{ a }}</a>
-{% endfor %}
+<a href="/search"> ---> SEARCH <--- </a>
 
 
 
\ No newline at end of file
diff --git a/www/templates/list.html b/www/templates/list.html
deleted file mode 100644
index 9a47098..0000000
--- a/www/templates/list.html
+++ /dev/null
@@ -1,10 +0,0 @@
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/www/templates/message.html b/www/templates/message.html
deleted file mode 100644
index 7125408..0000000
--- a/www/templates/message.html
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-{{ message.subject }}
-
-{{ message.author_name }}
-
-{{ message.date }}
-
-{{ message.content }}
-
\ No newline at end of file
diff --git a/www/templates/search.html b/www/templates/search.html
index a92048e..c5b5715 100644
--- a/www/templates/search.html
+++ b/www/templates/search.html
@@ -20,11 +20,6 @@
 {% endfor %}
-
-
-
-
-
 Loading...
diff --git a/www/templates/threads.html b/www/templates/threads.html
deleted file mode 100644
index 050e3cf..0000000
--- a/www/templates/threads.html
+++ /dev/null
@@ -1,25 +0,0 @@
-
-
-
-{% macro message(m, index, urlpath)-%}
-{% set path = urlpath + '/' + index|string %}
-  •
-{{ index }}. {{ m.subject }} {{ m.author_name }}
-{% if m.get('follow-up') %}
-   
-{% for msg in m.get('follow-up') %}
-{{ message(m=msg, index=loop.index - 1, urlpath=path) }}
-{% endfor %}
-  
-{% endif %}
-  •
-{%- endmacro %}
-
-   
-{% for m in threads recursive %}
-{{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
-{% endfor %}
-  
-
-
-
\ No newline at end of file
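
For completeness, here is what consuming the resulting JSON API can look like; a sketch assuming the gunicorn bind above (port 5555), a made-up keyword, and the response shape built in routes.py (result -> per-archive objects -> results -> per-month hits):

    import requests

    resp = requests.get("http://localhost:5555/search",
                        params={"keyword": "tactical", "list": "all", "field": "content"})
    for archive_result in resp.json()["result"]:
        print(archive_result["archive"])
        for month in archive_result["results"]:
            print(" ", month["thread"], "-", month["nbr_hits"], "hit(s)")
            for hit in month["hits"]:
                print("   ", hit["subject"], "--", hit["url"])
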