listservs/search.py

95 lines
2.9 KiB
Python
Raw Normal View History

2017-07-27 10:09:33 +02:00
import sys, logging, argparse
2019-12-21 16:27:37 +01:00
from datetime import datetime
import archive.archive as archive
import config
2017-07-27 10:09:33 +02:00
logging.basicConfig(level=logging.DEBUG)
2019-12-21 16:27:37 +01:00
def get_key(kv):
    """Return the chronological sort key for a ``(label, value)`` pair.

    The first element of *kv* is a month/year label in ``"%B_%Y"`` form
    (e.g. ``"July_2017"``); it is parsed into a ``datetime`` so that
    labels can be ordered by date rather than alphabetically.
    """
    month_year_label = kv[0]
    parsed = datetime.strptime(month_year_label, "%B_%Y")
    return parsed
def get_result_key(r):
    """Return the value stored under the ``'archive'`` key of *r*.

    Used as the ``key`` function when sorting the per-archive result
    dictionaries.
    """
    archive_name = r['archive']
    return archive_name
2017-07-27 10:09:33 +02:00
def run(args):
    """Search one or more mailing-list archives for a keyword.

    Args:
        args: Namespace-like object providing ``keyword`` (the search
            term), ``field`` (``"content"`` or ``"from"``) and ``list``
            (names of the lists to search; an empty/``None`` value or
            ``["all"]`` selects every list reported by
            ``archive.list_tables_db``). The object is not modified.

    Returns:
        A list with one dict per searched archive, sorted by archive
        name. Each dict has the keys ``"keyword"``, ``"field"``,
        ``"archive"`` and ``"results"``; ``"results"`` holds one
        ``{"thread", "nbr_hits", "hits"}`` entry per month, newest month
        first.

    Raises:
        SystemExit: If no keyword is given or ``field`` is not supported.
    """
    if not args.keyword:
        sys.exit('No keyword. Aborting.')

    if args.field not in ("content", "from"):
        # format() instead of "+" so a non-string field still yields the
        # intended exit message rather than a TypeError.
        sys.exit('Invalid field {}. Aborting.'.format(args.field))

    all_lists = archive.list_tables_db(
        config.db['database'],
        config.db['host'],
        config.db['user'],
        config.db['password'],
    )

    # Work on a local name so the caller's namespace is left untouched.
    requested = args.list
    if not requested or (len(requested) == 1 and requested[0] == "all"):
        requested = all_lists

    keyword = args.keyword
    field = args.field
    results = []

    for list_name in requested:
        if list_name not in all_lists:
            logging.warning("%s is not a valid list... continuing", list_name)
            continue

        with archive.Archive(list_name, config=config.db) as a:
            if field == 'content':
                rows = a.content_search(keyword)
            else:
                rows = a.from_search(keyword)

            # format data to return
            search_results = {
                "keyword": keyword,
                "field": field,
                "archive": a.archive_name,
                "results": [],
            }

            # Group the hits by "Month_Year" label.
            hits_by_month = {}
            for (_from, author_name, subject, date, url) in rows:
                month_year = date.strftime("%B_%Y")
                hits_by_month.setdefault(month_year, []).append(
                    {'url': url, 'subject': subject, 'author_name': author_name}
                )

            # Newest month first; get_key parses the label back to a date.
            for month_year, hits in sorted(hits_by_month.items(), key=get_key, reverse=True):
                search_results['results'].append(
                    {'thread': month_year, 'nbr_hits': len(hits), 'hits': hits}
                )

            results.append(search_results)

    return sorted(results, key=get_result_key)
# for l in args.list:
# arch = search.archive.Archive('archives/')
# arch.load(l)
# r = arch.search(keyword=args.keyword, field=args.field)
# for z in r['results']:
# print(z['thread'] + " ---- " + str(z['nbr_hits']))
# for zz in z['hits']:
# print(" " + zz['url'])
# print(" " + zz['index_str'])
# sys.exit()
2017-07-27 10:09:33 +02:00
# Command-line entry point: parse the arguments, run the search and print
# the result either as indented JSON (--json) or as a plain Python repr.
if __name__ == "__main__":
    p = argparse.ArgumentParser(description='Searches mailinglists archives')
    p.add_argument('keyword', metavar="keyword", help="keyword to search")
    p.add_argument('--list', '-l', help="mailinglist(s') name(s) (default 'all')", nargs="+")
    p.add_argument('--field', '-f', help="message field (i.e. 'content' or 'from' (default 'content'))", default="content")
    # store_true makes --json a real on/off flag; without it argparse
    # expects a value and a bare "--json" aborts with a usage error.
    p.add_argument('--json', '-j', help="json output", action="store_true")

    args = p.parse_args()

    result = run(args)

    if args.json:
        # Imported here because json is only needed for this output mode.
        import json
        print(json.dumps(result, indent=4))
    else:
        print(result)