listservs/search/archive.py

151 lines
3.8 KiB
Python
Raw Normal View History

2017-07-27 10:09:33 +02:00
import logging, os, json, re
from datetime import datetime
2017-11-24 09:52:14 +01:00
import analysis.archive ## circular...
import analysis.query
import analysis.format
2018-11-20 11:46:10 +01:00
import threading
2017-07-27 10:09:33 +02:00
class Archive():
    """In-memory view of one mailing-list archive stored as JSON files.

    An archive is a directory (inside ``archives_dir``) holding one
    ``<label>.json`` file per period. After :meth:`load`, ``self.archive``
    maps each label to the decoded JSON document; :meth:`search` expects
    each document to carry a ``'threads'`` list of message dictionaries.
    """

    def __init__(self, archives_dir=None):
        """Remember where archives live; nothing is read from disk yet.

        :param archives_dir: directory containing one sub-directory per
            archive. When omitted, ``www.config.ARCHIVES_PATH`` is used.
        """
        if archives_dir is None:
            # Only imported when no directory is supplied by the caller.
            from www import config
            self.archives_dir = config.ARCHIVES_PATH
        else:
            self.archives_dir = archives_dir

        self.loaded = False

        # search() and threads_ranking() each run under their own lock.
        self.lock_search = threading.Lock()
        self.lock_threads_ranking = threading.Lock()

    def load(self, archive_name=None):
        """Read every ``*.json`` file of ``archive_name`` into memory.

        :param archive_name: name of the archive sub-directory.
        :raises Exception: if no name is given or the directory is missing.
        """
        if archive_name is None:
            raise Exception('Archive is not specified')

        archive_path = os.path.join(self.archives_dir, archive_name)
        if not os.path.isdir(archive_path):
            raise Exception('Archive ' + archive_path + ' does not exist')

        suffix = '.json'
        archive = {}
        for filename in os.listdir(archive_path):
            if not filename.endswith(suffix):
                continue
            # Strip only the trailing extension to obtain the label.
            label = filename[:-len(suffix)]
            with open(os.path.join(archive_path, filename)) as fdata:
                archive[label] = json.load(fdata)

        # Publish the state only once everything has been read, so a failed
        # load does not leave a half-filled archive behind.
        self.archive_name = archive_name
        self.archive_path = archive_path
        self.archive = archive
        self.loaded = True

    def search_message(self, keyword, msg, index_str, results, field='content'):
        """Search ``msg`` and, recursively, its follow-ups for ``keyword``.

        The match is a case-insensitive substring test on ``msg[field]``.
        Every matching message appends a summary dictionary to ``results``.

        :param keyword: text to look for; an empty keyword matches nothing.
        :param msg: message dictionary; replies are read from its optional
            ``'follow-up'`` list.
        :param index_str: slash-separated position of ``msg`` in the archive;
            follow-ups get ``index_str + '/' + <position>``.
        :param results: list that collects the hits (mutated in place).
        :param field: key of ``msg`` to search in.
        :returns: number of matching messages in this sub-tree.
        """
        nbr_hits = 0

        needle = keyword.lower()
        text = msg[field]
        # Plain containment also catches a match at the very start of the text.
        if text is not None and needle and needle in text.lower():
            nbr_hits += 1
            results.append({
                "index_str": index_str,
                "subject": msg['subject'],
                "date": msg['date'],
                "author_name": msg['author_name'],
                "url": msg['url'],
            })

        if 'follow-up' in msg:
            for i, m in enumerate(msg['follow-up']):
                current_index_str = index_str + '/' + str(i)
                nbr_hits += self.search_message(keyword, m, current_index_str, results, field)

        return nbr_hits

    @staticmethod
    def _sort_key(kv_tuple):
        """Sort key for archive items that tolerates unrecognised labels."""
        when = get_key(kv_tuple)
        # get_key() yields None for labels it cannot parse; None cannot be
        # ordered against datetimes, so such labels get the earliest date.
        return datetime.min if when is None else when

    def search(self, keyword, field='content', min_hits=0):
        """Search the whole loaded archive for ``keyword``.

        Labels are visited from most recent to oldest as ordered by
        ``get_key``; labels that cannot be dated come last.

        :param keyword: text to look for (case-insensitive).
        :param field: message key to search in.
        :param min_hits: a label is reported only when it has strictly more
            hits than this value.
        :returns: dictionary with ``keyword``, ``field``, ``archive`` and a
            ``results`` list of ``{'thread', 'nbr_hits', 'hits'}`` entries.
        """
        with self.lock_search:
            search_results = {
                "keyword": keyword,
                "field": field,
                "archive": self.archive_name,
                "results": [],
            }

            ordered = sorted(self.archive.items(), key=self._sort_key, reverse=True)
            for k, v in ordered:
                hits = []
                nbr_hits = 0
                for i, m in enumerate(v['threads']):
                    current_index_str = self.archive_name + '/' + k + '/' + str(i)
                    nbr_hits += self.search_message(keyword, m, current_index_str, hits, field)

                if nbr_hits > min_hits:
                    # nettime-l fix: report 'nettime-l_Jan_01' as 'January_2001'.
                    if k.startswith("nettime-l_"):
                        try:
                            k = datetime.strptime(k, "nettime-l_%b_%y").strftime("%B_%Y")
                        except ValueError:
                            # Unexpected label layout: keep the label unchanged.
                            pass
                    search_results['results'].append({'thread': k, 'nbr_hits': nbr_hits, 'hits': hits})

            return search_results

    def threads_ranking(self, rank=5):
        """Return the thread ranking in the same layout as :meth:`search`.

        The ranking itself is computed by
        ``analysis.archive.Archive(self).query().threads_ranking(rank=rank)``;
        each entry is converted with
        ``analysis.format.frame_to_dictionary_threads_ranking``.

        :param rank: forwarded unchanged to the query's ``threads_ranking``.
        :returns: dictionary with a ``results`` list holding one entry per
            ranked thread.
        """
        with self.lock_threads_ranking:
            search_results = {
                "keyword": "thread ranking",
                "field": "ranking",
                "archive": self.archive_name,
                "results": [],
            }

            a = analysis.archive.Archive(self)
            q = a.query()
            ranking = q.threads_ranking(rank=rank)

            for i in ranking:
                r = analysis.format.frame_to_dictionary_threads_ranking(ranking[i])
                for h in r:
                    hit = [{'url': h['url'], 'subject': h['subject'], 'author_name': h['from']}]
                    search_results['results'].append({
                        'thread': h['date'],
                        'nbr_hits': h['nbr-references'],
                        'hits': hit,
                    })

            return search_results
2017-11-24 09:52:14 +01:00
2017-07-27 10:09:33 +02:00
def get_key(kv_tuple):
    """Turn an archive label into a ``datetime`` usable as a sort key.

    :param kv_tuple: ``(label, value)`` pair, e.g. an item of a dict; only
        the label is looked at.
    :returns: the parsed ``datetime``, or ``None`` when the label matches
        none of the known layouts (a warning is logged in that case).
    """
    k = kv_tuple[0]

    # Known label layouts, tried in this order; the first match wins.
    formats = (
        "%B_%Y",            # "Month_Year"             - ex.: "January_2001"
        "%b_%y",            # "Month(abv)_Year(abv)"   - ex.: "Jan_01"
        "%Y",               # "Year"                   - ex.: "2001"
        "nettime-l_%b_%y",  # nettime-l fix            - ex.: "nettime-l_Jan_01"
    )
    for fmt in formats:
        try:
            return datetime.strptime(k, fmt)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: label is not a string.
            continue

    logging.warning("Unrecognised archive key: %r", k)
    return None