This commit is contained in:
gauthiier 2019-12-21 16:27:37 +01:00
parent e0ab850067
commit 4470df5f7a
4 changed files with 108 additions and 30 deletions

47
README
View File

@ -1,31 +1,52 @@
TODO (July 2019):
- refactor archive.py and search.py
- test lists import with mariadb backend
[crawl.py] [index.py] [search.py] [www-serve(.py)]
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
usage: archive.py [-h] [--arch ARCH] url [url ...]
usage: crawl.py [-h] [--names NAMES [NAMES ...]] [--archives ARCHIVES]
url [url ...]
Mailinglists are dead. Long live mailinglists!
positional arguments:
url mailinglist urls to archive
url mailinglist urls to archive
optional arguments:
-h, --help show this help message and exit
--arch ARCH path to archives directory (default='archives')
-h, --help show this help message and exit
--names NAMES [NAMES ...], -n NAMES [NAMES ...]
mailinglists' names
--archives ARCHIVES, -a ARCHIVES
path to archives directory
---
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] keyword
usage: index.py [-h] [--archives ARCHIVES] list [list ...]
Searches mailinglist's archives
Mailinglists are dead. Long live mailinglists!
positional arguments:
list list(s) to index
optional arguments:
-h, --help show this help message and exit
--archives ARCHIVES, -a ARCHIVES
path to archives directory (default='archives')
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] [--json JSON]
keyword
Searches mailinglists archives
positional arguments:
keyword keyword to search
optional arguments:
-h, --help show this help message and exit
--list LIST [LIST ...]
mailinglist(s') name(s)
--field FIELD message field (i.e. 'content' or 'subject', etc.)
--list LIST [LIST ...], -l LIST [LIST ...]
mailinglist(s') name(s) (default 'all')
--field FIELD, -f FIELD
message field (i.e. 'content' or 'from' (default
'content'))
--json JSON, -j JSON json output

View File

@ -25,7 +25,7 @@ if __name__ == "__main__":
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives)
p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives)
args = p.parse_args()

View File

@ -13,7 +13,7 @@ def run(lists, archives):
for a in archives:
archive = archive.Archive(a, archive_dir=archives)
archive.insert_db(host=config.host, database=config.database, user=config.user, password=config.password)
archive.insert_db(host=config.db['host'], database=config.db['database'], user=config.db['user'], password=config.db['password'])
if __name__ == "__main__":

View File

@ -1,37 +1,94 @@
import sys, logging, argparse
import search.archive
from datetime import datetime
import archive.archive as archive
import config
logging.basicConfig(level=logging.DEBUG)
def get_key(kv):
    """Return the chronological sort key for a ``(month_year, hits)`` pair.

    ``kv[0]`` is parsed with the ``"%B_%Y"`` format (full month name,
    underscore, four-digit year), so that entries sort by date rather
    than alphabetically by month name. ``kv[1]`` is ignored.

    Raises:
        ValueError: if ``kv[0]`` does not match ``"%B_%Y"``.
    """
    return datetime.strptime(kv[0], "%B_%Y")
def get_result_key(r):
    """Return the ``'archive'`` entry of a result dict, for use as a sort key.

    Raises:
        KeyError: if ``r`` has no ``'archive'`` key.
    """
    return r['archive']
def run(args):
    """Search the requested mailing-list archives for a keyword.

    Args:
        args: parsed command-line namespace providing ``keyword``,
            ``field`` (``"content"`` or ``"from"``) and ``list`` (list
            names; empty/``None`` or the single value ``"all"`` selects
            every list returned by ``archive.list_tables_db``).

    Returns:
        A list with one dict per searched archive, sorted by archive name.
        Each dict has the keys ``keyword``, ``field``, ``archive`` and
        ``results``; ``results`` holds one entry per month (newest first)
        with the keys ``thread``, ``nbr_hits`` and ``hits``.

    Exits the process (``sys.exit``) when the keyword is missing or the
    field is not supported.
    """
    if not args.keyword:
        sys.exit('No keyword. Aborting.')

    if args.field not in ["content", "from"]:
        sys.exit('Invalid field ' + args.field + '. Aborting.')

    all_lists = archive.list_tables_db(config.db['database'], config.db['host'], config.db['user'], config.db['password'])

    # No explicit selection, or the single pseudo-name "all": search every list.
    if not args.list or (len(args.list) == 1 and args.list[0] == "all"):
        args.list = all_lists

    # Invariant for the whole run, so read them once outside the loop.
    k_arg = args.keyword
    f_arg = args.field

    results = []
    for list_name in args.list:
        if list_name not in all_lists:
            # Unknown names are skipped rather than aborting the whole search.
            logging.warning("%s is not a valid list... continuing", list_name)
            continue

        with archive.Archive(list_name, config=config.db) as a:
            if f_arg == 'content':
                r = a.content_search(k_arg)
            else:
                r = a.from_search(k_arg)

            # format data to return
            search_results = {"keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": []}

            # Group the hits by "Month_Year" of their date.
            month_year_results = {}
            for (from_, author_name_, subject_, date_, url_) in r:
                m_y = date_.strftime("%B_%Y")
                month_year_results.setdefault(m_y, []).append(
                    {'url': url_, 'subject': subject_, 'author_name': author_name_})

            # Most recent month first.
            for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
                search_results['results'].append({'thread': k, 'nbr_hits': len(v), 'hits': v})

            results.append(search_results)

    return sorted(results, key=get_result_key)
# for l in args.list:
# arch = search.archive.Archive('archives/')
# arch.load(l)
# r = arch.search(keyword=args.keyword, field=args.field)
# for z in r['results']:
# print(z['thread'] + " ---- " + str(z['nbr_hits']))
# for zz in z['hits']:
# print(" " + zz['url'])
# print(" " + zz['index_str'])
# sys.exit()
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the search once and
    # print the result either as JSON or as the plain Python repr.
    p = argparse.ArgumentParser(description='Searches mailinglists archives')
    p.add_argument('keyword', metavar="keyword", help="keyword to search")
    # Each option string is registered exactly once: adding '--list' or
    # '--field' a second time makes argparse raise a conflicting-option error.
    p.add_argument('--list', '-l', help="mailinglist(s') name(s) (default 'all')", nargs="+")
    p.add_argument('--field', '-f', help="message field (i.e. 'content' or 'from' (default 'content'))", default="content")
    # NOTE(review): '--json' expects a value; any non-empty value turns JSON output on.
    p.add_argument('--json', '-j', help="json output")

    args = p.parse_args()

    result = run(args)

    if args.json:
        # Imported lazily: only needed for the JSON output path.
        import json
        print(json.dumps(result, indent=4))
    else:
        print(result)