From 4470df5f7aa6bb0f850d047c78ccd8f2c7da6b4f Mon Sep 17 00:00:00 2001 From: gauthiier Date: Sat, 21 Dec 2019 16:27:37 +0100 Subject: [PATCH] README --- README | 47 +++++++++++++++++++++--------- crawl.py | 2 +- index.py | 2 +- search.py | 87 +++++++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 108 insertions(+), 30 deletions(-) diff --git a/README b/README index 9ae5b7a..d81f6b3 100644 --- a/README +++ b/README @@ -1,31 +1,52 @@ -TODO (July 2019): - - refactor archive.py and search.py - - test lists import with mariadb backend +[crawl.py] [index.py] [search.py] [www-serve(.py)] ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -usage: archive.py [-h] [--arch ARCH] url [url ...] +usage: crawl.py [-h] [--names NAMES [NAMES ...]] [--archives ARCHIVES] + url [url ...] Mailinglists are dead. Long live mailinglists! positional arguments: - url mailinglist urls to archive + url mailinglist urls to archive optional arguments: - -h, --help show this help message and exit - --arch ARCH path to archives directory (default='archives') + -h, --help show this help message and exit + --names NAMES [NAMES ...], -n NAMES [NAMES ...] + mailinglists' names + --archives ARCHIVES, -a ARCHIVES + path to archives directory - --- ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] keyword +usage: index.py [-h] [--archives ARCHIVES] list [list ...] -Searches mailinglist's archives +Mailinglists are dead. 
+ +positional arguments: + list list(s) to index + +optional arguments: + -h, --help show this help message and exit + --archives ARCHIVES, -a ARCHIVES + path to archives directory (default='archives') + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] [--json JSON] + keyword + +Searches mailinglists archives positional arguments: keyword keyword to search optional arguments: -h, --help show this help message and exit - --list LIST [LIST ...] - mailinglist(s') name(s) - --field FIELD message field (i.e. 'content' or 'subject', etc.) \ No newline at end of file + --list LIST [LIST ...], -l LIST [LIST ...] + mailinglist(s') name(s) (default 'all') + --field FIELD, -f FIELD + message field (i.e. 'content' or 'from' (default + 'content')) + --json JSON, -j JSON json output diff --git a/crawl.py b/crawl.py index 07cf84a..11b97fb 100644 --- a/crawl.py +++ b/crawl.py @@ -25,7 +25,7 @@ if __name__ == "__main__": p = argparse.ArgumentParser(description='Mailinglists are dead. 
Long live mailinglists!') p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+") p.add_argument('--names', '-n', help="mailinglists' names", nargs="+") - p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives) + p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives) args = p.parse_args() diff --git a/index.py b/index.py index 5be515f..57c29b2 100644 --- a/index.py +++ b/index.py @@ -13,7 +13,7 @@ def run(lists, archives): for a in archives: archive = archive.Archive(a, archive_dir=archives) - archive.insert_db(host=config.host, database=config.database, user=config.user, password=config.password) + archive.insert_db(host=config.db['host'], database=config.db['database'], user=config.db['user'], password=config.db['password']) if __name__ == "__main__": diff --git a/search.py b/search.py index 6c0de83..5b56beb 100644 --- a/search.py +++ b/search.py @@ -1,37 +1,94 @@ import sys, logging, argparse -import search.archive +from datetime import datetime +import archive.archive as archive +import config logging.basicConfig(level=logging.DEBUG) +def get_key(kv): + return datetime.strptime(kv[0], "%B_%Y") + +def get_result_key(r): + return r['archive'] + def run(args): if not args.keyword: sys.exit('No keyword. Aborting.') - if not args.list: - args.list = ['spectre', 'crumb', 'empyre'] ## eh.... + if args.field not in ["content", "from"]: + sys.exit('Invalid field ' + args.field + '. Aborting.') + all_lists = archive.list_tables_db(config.db['database'], config.db['host'], config.db['user'], config.db['password']) + + if not args.list or (len(args.list) == 1 and args.list[0] == "all"): + args.list = all_lists + + results = [] for l in args.list: - arch = search.archive.Archive('archives/') - arch.load(l) - r = arch.search(keyword=args.keyword, field=args.field) + if l not in all_lists: + logging.warning(l + "is not a valud list... 
continuing") + continue - for z in r['results']: - print(z['thread'] + " ---- " + str(z['nbr_hits'])) - for zz in z['hits']: - print(" " + zz['url']) - print(" " + zz['index_str']) + k_arg = args.keyword + f_arg = args.field + + with archive.Archive(l, config=config.db) as a: + if f_arg == 'content': + r = a.content_search(k_arg) + else: + r = a.from_search(k_arg) - sys.exit() + # format data to return + search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] } + month_year_results = {} + + for (from_, author_name_, subject_, date_, url_) in r: + m_y = date_.strftime("%B_%Y") + if m_y not in month_year_results: + month_year_results[m_y] = [] + month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_}) + + for k, v in sorted(month_year_results.items(), key=get_key, reverse=True): + search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v}) + + results.append(search_results) + + + return sorted(results, key=get_result_key) + + + # for l in args.list: + # arch = search.archive.Archive('archives/') + # arch.load(l) + + # r = arch.search(keyword=args.keyword, field=args.field) + + # for z in r['results']: + # print(z['thread'] + " ---- " + str(z['nbr_hits'])) + # for zz in z['hits']: + # print(" " + zz['url']) + # print(" " + zz['index_str']) + + # sys.exit() if __name__ == "__main__": p = argparse.ArgumentParser(description='Searches mailinglists archives') p.add_argument('keyword', metavar="keyword", help="keyword to search") - p.add_argument('--list', help="mailinglist(s') name(s)", nargs="+") - p.add_argument('--field', help="message field (i.e. 'content' or 'subject', etc.)", default="content") + p.add_argument('--list', '-l', help="mailinglist(s') name(s) (default 'all')", nargs="+") + p.add_argument('--field', '-f', help="message field (i.e. 
'content' or 'from' (default 'content'))", default="content") + p.add_argument('--json', '-j', help="json output") args = p.parse_args() - run(args) + result = run(args) + + if args.json: + import json + print(json.dumps(result, indent=4)) + else: + print(result) + +