README
This commit is contained in:
parent
e0ab850067
commit
4470df5f7a
47
README
47
README
@ -1,31 +1,52 @@
|
||||
|
||||
TODO (July 2019):
|
||||
- refactor archive.py and search.py
|
||||
- test lists import with mariadb backend
|
||||
[crawl.py] [index.py] [search.py] [www-serve(.py)]
|
||||
|
||||
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
usage: archive.py [-h] [--arch ARCH] url [url ...]
|
||||
usage: crawl.py [-h] [--names NAMES [NAMES ...]] [--archives ARCHIVES]
|
||||
url [url ...]
|
||||
|
||||
Mailinglists are dead. Long live mailinglists!
|
||||
|
||||
positional arguments:
|
||||
url mailinglist urls to archive
|
||||
url mailinglist urls to archive
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--arch ARCH path to archives directory (default='archives')
|
||||
-h, --help show this help message and exit
|
||||
--names NAMES [NAMES ...], -n NAMES [NAMES ...]
|
||||
mailinglists' names
|
||||
--archives ARCHIVES, -a ARCHIVES
|
||||
path to archives directory
|
||||
|
||||
---
|
||||
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] keyword
|
||||
usage: index.py [-h] [--archives ARCHIVES] list [list ...]
|
||||
|
||||
Searches mailinglist's archives
|
||||
Mailinglists are dead. Long live mailinglists!
|
||||
|
||||
positional arguments:
|
||||
list list(s) to index
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--archives ARCHIVES, -a ARCHIVES
|
||||
path to archives directory (default='archives')
|
||||
|
||||
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] [--json JSON]
|
||||
keyword
|
||||
|
||||
Searches mailinglists archives
|
||||
|
||||
positional arguments:
|
||||
keyword keyword to search
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--list LIST [LIST ...]
|
||||
mailinglist(s') name(s)
|
||||
--field FIELD message field (i.e. 'content' or 'subject', etc.)
|
||||
--list LIST [LIST ...], -l LIST [LIST ...]
|
||||
mailinglist(s') name(s) (default 'all')
|
||||
--field FIELD, -f FIELD
|
||||
message field (i.e. 'content' or 'from' (default
|
||||
'content'))
|
||||
--json JSON, -j JSON json output
|
||||
|
||||
2
crawl.py
2
crawl.py
@ -25,7 +25,7 @@ if __name__ == "__main__":
|
||||
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
|
||||
p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
|
||||
p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
|
||||
p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives)
|
||||
p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives)
|
||||
|
||||
args = p.parse_args()
|
||||
|
||||
|
||||
2
index.py
2
index.py
@ -13,7 +13,7 @@ def run(lists, archives):
|
||||
|
||||
for a in archives:
|
||||
archive = archive.Archive(a, archive_dir=archives)
|
||||
archive.insert_db(host=config.host, database=config.database, user=config.user, password=config.password)
|
||||
archive.insert_db(host=config.db['host'], database=config.db['database'], user=config.db['user'], password=config.db['password'])
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
||||
87
search.py
87
search.py
@ -1,37 +1,94 @@
|
||||
import sys, logging, argparse
|
||||
import search.archive
|
||||
from datetime import datetime
|
||||
import archive.archive as archive
|
||||
import config
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
def get_key(kv):
|
||||
return datetime.strptime(kv[0], "%B_%Y")
|
||||
|
||||
def get_result_key(r):
|
||||
return r['archive']
|
||||
|
||||
def run(args):
|
||||
|
||||
if not args.keyword:
|
||||
sys.exit('No keyword. Aborting.')
|
||||
|
||||
if not args.list:
|
||||
args.list = ['spectre', 'crumb', 'empyre'] ## eh....
|
||||
if args.field not in ["content", "from"]:
|
||||
sys.exit('Invalid field ' + args.field + '. Aborting.')
|
||||
|
||||
all_lists = archive.list_tables_db(config.db['database'], config.db['host'], config.db['user'], config.db['password'])
|
||||
|
||||
if not args.list or (len(args.list) == 1 and args.list[0] == "all"):
|
||||
args.list = all_lists
|
||||
|
||||
results = []
|
||||
for l in args.list:
|
||||
arch = search.archive.Archive('archives/')
|
||||
arch.load(l)
|
||||
|
||||
r = arch.search(keyword=args.keyword, field=args.field)
|
||||
if l not in all_lists:
|
||||
logging.warning(l + "is not a valud list... continuing")
|
||||
continue
|
||||
|
||||
for z in r['results']:
|
||||
print(z['thread'] + " ---- " + str(z['nbr_hits']))
|
||||
for zz in z['hits']:
|
||||
print(" " + zz['url'])
|
||||
print(" " + zz['index_str'])
|
||||
k_arg = args.keyword
|
||||
f_arg = args.field
|
||||
|
||||
sys.exit()
|
||||
with archive.Archive(l, config=config.db) as a:
|
||||
if f_arg == 'content':
|
||||
r = a.content_search(k_arg)
|
||||
else:
|
||||
r = a.from_search(k_arg)
|
||||
|
||||
# format data to return
|
||||
search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] }
|
||||
month_year_results = {}
|
||||
|
||||
for (from_, author_name_, subject_, date_, url_) in r:
|
||||
m_y = date_.strftime("%B_%Y")
|
||||
if m_y not in month_year_results:
|
||||
month_year_results[m_y] = []
|
||||
month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_})
|
||||
|
||||
for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
|
||||
search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v})
|
||||
|
||||
results.append(search_results)
|
||||
|
||||
|
||||
return sorted(results, key=get_result_key)
|
||||
|
||||
|
||||
# for l in args.list:
|
||||
# arch = search.archive.Archive('archives/')
|
||||
# arch.load(l)
|
||||
|
||||
# r = arch.search(keyword=args.keyword, field=args.field)
|
||||
|
||||
# for z in r['results']:
|
||||
# print(z['thread'] + " ---- " + str(z['nbr_hits']))
|
||||
# for zz in z['hits']:
|
||||
# print(" " + zz['url'])
|
||||
# print(" " + zz['index_str'])
|
||||
|
||||
# sys.exit()
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
p = argparse.ArgumentParser(description='Searches mailinglists archives')
|
||||
p.add_argument('keyword', metavar="keyword", help="keyword to search")
|
||||
p.add_argument('--list', help="mailinglist(s') name(s)", nargs="+")
|
||||
p.add_argument('--field', help="message field (i.e. 'content' or 'subject', etc.)", default="content")
|
||||
p.add_argument('--list', '-l', help="mailinglist(s') name(s) (default 'all')", nargs="+")
|
||||
p.add_argument('--field', '-f', help="message field (i.e. 'content' or 'from' (default 'content'))", default="content")
|
||||
p.add_argument('--json', '-j', help="json output")
|
||||
|
||||
args = p.parse_args()
|
||||
|
||||
run(args)
|
||||
result = run(args)
|
||||
|
||||
if args.json:
|
||||
import json
|
||||
print(json.dumps(result, indent=4))
|
||||
else:
|
||||
print(result)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user