From 3b01ec68c6103d274f1121dc319ef93f9a7fe5a6 Mon Sep 17 00:00:00 2001 From: gauthiier Date: Thu, 27 Jul 2017 10:09:33 +0200 Subject: [PATCH] search --- search.py | 37 ++++++++++ search/__init__.py | 0 search/archive.py | 107 +++++++++++++++++++++++++++ www/routes.py | 54 +++++++++++++- www/static/search.js | 150 ++++++++++++++++++++++++++++++++++++++ www/templates/search.html | 23 ++++++ 6 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 search.py create mode 100644 search/__init__.py create mode 100644 search/archive.py create mode 100644 www/static/search.js create mode 100644 www/templates/search.html diff --git a/search.py b/search.py new file mode 100644 index 0000000..6c0de83 --- /dev/null +++ b/search.py @@ -0,0 +1,37 @@ +import sys, logging, argparse +import search.archive + +logging.basicConfig(level=logging.DEBUG) + +def run(args): + + if not args.keyword: + sys.exit('No keyword. Aborting.') + + if not args.list: + args.list = ['spectre', 'crumb', 'empyre'] ## eh.... + + for l in args.list: + arch = search.archive.Archive('archives/') + arch.load(l) + + r = arch.search(keyword=args.keyword, field=args.field) + + for z in r['results']: + print(z['thread'] + " ---- " + str(z['nbr_hits'])) + for zz in z['hits']: + print(" " + zz['url']) + print(" " + zz['index_str']) + + sys.exit() + +if __name__ == "__main__": + + p = argparse.ArgumentParser(description='Searches mailinglists archives') + p.add_argument('keyword', metavar="keyword", help="keyword to search") + p.add_argument('--list', help="mailinglist(s') name(s)", nargs="+") + p.add_argument('--field', help="message field (i.e. 
import json
import logging
import os
import re
from datetime import datetime


class Archive():
    """In-memory view of one mailing-list archive stored as JSON files.

    An archive is a directory ``<archives_dir>/<archive_name>`` holding one
    ``<label>.json`` file per period. Each file is read as a mapping with a
    ``'threads'`` list; each message is a mapping that may carry a
    ``'follow-up'`` list of replies with the same layout.
    """

    def __init__(self, archives_dir=None):
        """Remember the root directory of all archives.

        Args:
            archives_dir: directory containing one sub-directory per archive.
                Defaults to ``"archives/"`` when ``None``.
        """
        self.archives_dir = "archives/" if archives_dir is None else archives_dir
        self.loaded = False

    def load(self, archive_name=None):
        """Read every ``*.json`` file of the named archive into ``self.archive``.

        Args:
            archive_name: name of the sub-directory of ``archives_dir`` to load.

        Raises:
            Exception: if ``archive_name`` is ``None`` or the archive directory
                does not exist.
        """
        if archive_name is None:
            raise Exception('Archive is not specified')

        archive_path = os.path.join(self.archives_dir, archive_name)
        if not os.path.isdir(archive_path):
            # The message previously referenced an undefined name and so
            # failed with NameError instead of reporting the missing path.
            raise Exception('Archive ' + archive_path + ' does not exist')

        self.archive_name = archive_name
        self.archive_path = archive_path
        self.archive = {}

        for file_name in os.listdir(archive_path):
            if not file_name.endswith('.json'):
                continue
            label = file_name.replace('.json', '')
            # JSON text is UTF-8; do not depend on the platform default.
            with open(os.path.join(archive_path, file_name), encoding='utf-8') as fdata:
                self.archive[label] = json.load(fdata)

        self.loaded = True

    def search_message(self, keyword, msg, index_str, results, field='content'):
        """Search one message and, recursively, its follow-ups for ``keyword``.

        Args:
            keyword: substring to look for (case-sensitive). An empty keyword
                never matches.
            msg: message mapping; a hit reads its ``'subject'``, ``'date'``,
                ``'author_name'`` and ``'url'`` entries.
            index_str: slash-separated position of ``msg`` in the archive.
            results: list that receives one summary dict per matching message.
            field: key of ``msg`` whose text is searched.

        Returns:
            Number of matching messages in ``msg`` and all of its follow-ups.
        """
        nbr_hits = 0
        text = msg.get(field)
        # Plain containment so that a match at offset 0 counts as a hit;
        # messages whose field is absent or not text are simply skipped.
        if keyword and isinstance(text, str) and keyword in text:
            nbr_hits += 1
            results.append({
                "index_str": index_str,
                "subject": msg['subject'],
                "date": msg['date'],
                "author_name": msg['author_name'],
                "url": msg['url'],
            })

        if 'follow-up' in msg:
            for i, reply in enumerate(msg['follow-up']):
                nbr_hits += self.search_message(
                    keyword, reply, index_str + '/' + str(i), results, field)

        return nbr_hits

    def search(self, keyword, field='content'):
        """Search every loaded file of the archive for ``keyword``.

        Files are visited newest first, ordered by ``get_key``.

        Args:
            keyword: substring to look for.
            field: message key whose text is searched.

        Returns:
            A dict with ``'keyword'``, ``'field'``, ``'archive'`` and
            ``'results'``; ``'results'`` holds one
            ``{'thread', 'nbr_hits', 'hits'}`` entry per file with at least
            one hit.
        """
        search_results = {
            "keyword": keyword,
            "field": field,
            "archive": self.archive_name,
            "results": [],
        }

        for label, data in sorted(self.archive.items(), key=get_key, reverse=True):
            hits = []
            nbr_hits = 0
            for i, thread in enumerate(data['threads']):
                position = self.archive_name + '/' + label + '/' + str(i)
                nbr_hits += self.search_message(keyword, thread, position, hits, field)

            if nbr_hits > 0:
                search_results['results'].append(
                    {'thread': label, 'nbr_hits': nbr_hits, 'hits': hits})

        return search_results


def get_key(kv_tuple):
    """Return a sortable ``datetime`` for an ``(label, value)`` archive item.

    Recognised label forms are ``"January_2001"``, ``"Jan_01"`` and ``"2001"``.
    Any other label is reported on stdout and mapped to ``datetime.min`` so
    that it can still be ordered against dated labels (it ends up last when
    sorting newest first) instead of breaking the sort with ``None``.
    """
    k = kv_tuple[0]

    for fmt in ("%B_%Y", "%b_%y", "%Y"):
        try:
            return datetime.strptime(k, fmt)
        except (ValueError, TypeError):
            continue

    print("--------------")
    print(k)

    return datetime.min
+ + if l_arg is None: + return "no list..." + + if not (l_arg == "all") and not (l_arg in archives.archives_data): + return "list '" + l_arg + "' does not exist" + + if sl_arg is not None: + if not sl_arg in archives.archives_data[l]: + return "sublist '" + sl_arg + "' does not exist in list '" + l_arg + "'" + + lists = [] + if l_arg == "all": + for k in archives.archives_data.keys(): + lists.append(k) + else: + lists.append(l_arg) + + ################################ + ## + ## need to chache all the below + ## + ################################ + + results = [] + for l in lists: + a = search.archive.Archive() + a.load(l) + results.append(a.search(k_arg)) + + return jsonify(result=results) + + + + diff --git a/www/static/search.js b/www/static/search.js new file mode 100644 index 0000000..1435edb --- /dev/null +++ b/www/static/search.js @@ -0,0 +1,150 @@ + +$(document).ready(function(){ + $('#search').on('submit', function(e) { + e.preventDefault(); + args = $(this).serialize(); + $.get('/search?'+args, function(data) { + console.log(data); + $('#graph').empty(); + $('#results').empty(); + $.each(data.result, function(i, item) { + search_result_archive(item); + }); + graph(data); + }); + }); +}); + +function search_result_archive(a) { + $('
', { + id: a.archive, + class: "archive", + }).appendTo('#results'); + $('#' + a.archive).append("

" + a.archive + "

"); + $.each(a.results, function(i, r) { + $('
    ', { + id: r.thread + "-" + a.archive, + text: r.thread.replace('_', ' ') + }).appendTo('#' + a.archive); + let hits = "
      "; + $.each(r.hits, function(j, h){ + let hit = '
    • ' + h.subject + ' -- ' + h.author_name + '
    • '; + hits += hit; + }); + hits += "
    "; + $('#' + r.thread + "-" + a.archive).append(hits); + }); +} + +var min_month = new Date(2000, 0); +var max_month = new Date(); + +function diff_months(d1, d2) { + var months; + months = (d2.getFullYear() - d1.getFullYear()) * 12; + months -= d1.getMonth(); + months += d2.getMonth(); + return months <= 0 ? 0 : months; +} + +function format(date) { + var month_names = [ + "Jan", "Feb", "Mar", + "Apr", "May", "Jun", "Jul", + "Aug", "Sep", "Oct", + "Nov", "Dec" + ]; + return month_names[date.getMonth()] + ' ' + date.getFullYear(); + //return date.getMonth() + ' - ' + date.getFullYear(); +} + + +function graph(data) { + var d = diff_months(min_month, max_month); + var vec = new Array(); + for(let ar of data.result) { + let ar_vec = new Array(d + 1).fill(0); + ar_vec[0] = ar.archive; + for(let r of ar.results) { + let date = new Date(Date.parse(r.thread.replace("_", " 1, "))); // this may blow... + let index = diff_months(min_month, date); + ar_vec[index + 1] = r.nbr_hits; + } + vec.push(ar_vec); + } + + + // var x_axis = new Array(d + 1); + // x_axis[0] = 'x'; + // for (let i = 1; i < d+1; i++) { + // let d = new Date(min_month.getFullYear(), min_month.getMonth()); + // d.setMonth(d.getMonth() + (i - 1)); + // x_axis[i] = format(d); + // } + + // vec.push(x_axis); + + var x_axis = new Array(d); + for (let i = 0; i < d; i++) { + let d = new Date(min_month.getFullYear(), min_month.getMonth()); + d.setMonth(d.getMonth() + i); + x_axis[i] = format(d); + } + + + console.log(vec); + + var chart = c3.generate({ + bindto: '#graph', + data: { + columns: vec, + type: 'bar' + }, + axis: { + x: { + type: 'category', + categories: x_axis, + tick: { + culling: { + max: 15 + }, + multiline:false + } + } + }, + bar: { + width: { + ratio: 0.9 + } + } + }); +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/www/templates/search.html b/www/templates/search.html new file mode 100644 index 0000000..f667087 --- /dev/null +++ b/www/templates/search.html @@ -0,0 
+1,23 @@ + + + + + + + + + + +
    +
    + + \ No newline at end of file