From 464194c9b930aef734554f7a6611f85a0b4d2ad1 Mon Sep 17 00:00:00 2001 From: gauthiier Date: Fri, 24 Nov 2017 09:52:14 +0100 Subject: [PATCH] hits --- analysis/archive.py | 15 ++++++++++++++- analysis/format.py | 7 +++++++ analysis/util.py | 15 +++++++++++++-- search/archive.py | 23 +++++++++++++++++++++++ www/archives.py | 1 - www/routes.py | 11 ++++++----- www/static/search.js | 9 +++++++++ 7 files changed, 72 insertions(+), 9 deletions(-) diff --git a/analysis/archive.py b/analysis/archive.py index 597615a..3fc77cd 100644 --- a/analysis/archive.py +++ b/analysis/archive.py @@ -5,6 +5,8 @@ import os, datetime, json, gzip, re import analysis.util import analysis.query +import search.archive ## circular... + def filter_date(msg, archive_name): @@ -130,6 +132,14 @@ def load_from_file(filename, archive_name, archive_dir, json_data=None): print('---> ' + archive_name) return json_data_to_pd_dataframe(threads, archive_name) + +def load_from_search_archive(archive): + threads = [] + for k, v in archive.archive.items(): + threads.append(v) + return json_data_to_pd_dataframe(threads, archive.archive_name) + + class Archive: @@ -140,7 +150,10 @@ class Archive: def __init__(self, archive_name, archive_dir="archives"): if isinstance(archive_name, pd.core.frame.DataFrame): - self.dataframe = archive_name.copy() + self.dataframe = archive_name ## no copies here + + if isinstance(archive_name, search.archive.Archive): + self.dataframe = load_from_search_archive(archive_name) if isinstance(archive_name, str): # need a filename or a dir name.... diff --git a/analysis/format.py b/analysis/format.py index 4c8e8b0..2ac54a4 100644 --- a/analysis/format.py +++ b/analysis/format.py @@ -31,6 +31,13 @@ def table_threads_ranking(ranking_dataframe): return html_str +def frame_to_dictionary_threads_ranking(ranking_dataframe): + + results = [] + for i, row in ranking_dataframe.iterrows(): + d = {'date': str(i), 'subject': row['subject'], 'url': row['url'], 'from': row['from'], 'nbr-references': row['nbr-references']} + results.append(d) + return results diff --git a/analysis/util.py b/analysis/util.py index cd39f54..4602517 100644 --- a/analysis/util.py +++ b/analysis/util.py @@ -11,9 +11,15 @@ def format_author(msg, archive_name): return msg['author_name'] def format_from_token(from_str, sep): + + fff = from_str + from_addr = email.utils.parseaddr(from_str)[1] + + fffa = email.utils.parseaddr(from_str) + if sep not in from_addr: - tok = from_str.split() + tok = from_str.split() try: at = tok.index(sep) from_addr = ''.join([tok[at-1], '{AT}', tok[at+1]]) @@ -22,13 +28,18 @@ def format_from_token(from_str, sep): except ValueError: print(tok) print("error formating 'from' " + from_str + " -- expecting sep: " + sep) + print("*** " + fff) + print("+++") + print(fffa) + print("----") + return None else: from_addr = from_addr.replace(sep, '{AT}') return from_addr.lower() def format_from(msg, archive_name): - from_str = msg['from'] + from_str = msg['from'] if " {AT} " in from_str: return format_from_token(from_str, '{AT}') diff --git a/search/archive.py b/search/archive.py index fc35844..664cd7b 100644 --- a/search/archive.py +++ b/search/archive.py @@ -1,6 +1,10 @@ import logging, os, json, re from datetime import datetime +import analysis.archive ## circular... +import analysis.query +import analysis.format + class Archive(): def __init__(self, archives_dir=None): @@ -77,6 +81,25 @@ class Archive(): return search_results + def threads_ranking(self, rank=5): + + search_results = { "keyword": "thread ranking", "field": "ranking", "archive": self.archive_name, "results": [] } + + a = analysis.archive.Archive(self) + q = a.query(); + + ranking = q.threads_ranking(rank=rank) + + for i in ranking: + r = analysis.format.frame_to_dictionary_threads_ranking(ranking[i]) + for h in r: + hit = [{ 'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}] + search_results['results'].append({'thread': h['date'], 'nbr_hits': h['nbr-references'], 'hits': hit}) + del a + del q + + return search_results + def get_key(kv_tuple): diff --git a/www/archives.py b/www/archives.py index 8ff9826..7402376 100644 --- a/www/archives.py +++ b/www/archives.py @@ -77,4 +77,3 @@ class Archives(metaclass=Singleton): # return arch - diff --git a/www/routes.py b/www/routes.py index d48492d..ec29df4 100644 --- a/www/routes.py +++ b/www/routes.py @@ -147,12 +147,13 @@ def searh(): logging.info("search keyword = " + k_arg) for l in lists: + if k_arg == "rank": + logging.info(" ranking " + l) + s = archives_data[l].threads_ranking() + else: + s = archives_data[l].search(keyword=k_arg, field=f_arg, min_hits=nbr_hits) - # this makes no sense... - # a = search.archive.Archive() - # a.load(l) - - results.append(archives_data[l].search(keyword=k_arg, field=f_arg, min_hits=nbr_hits)) + results.append(s) ## -- sort results? search_results = sorted(results, key=get_result_key) diff --git a/www/static/search.js b/www/static/search.js index 6366e91..b37eb38 100644 --- a/www/static/search.js +++ b/www/static/search.js @@ -35,12 +35,21 @@ function search_result_archive(a) { text: r.thread.replace('_', ' ') }).appendTo('#' + a.archive); let hits = ""; $('#' + r.thread + "-" + a.archive).append(hits); + + console.log("***"); + }); }