README
This commit is contained in:
parent
e0ab850067
commit
4470df5f7a
43
README
43
README
@ -1,10 +1,10 @@
|
|||||||
|
|
||||||
TODO (July 2019):
|
[crawl.py] [index.py] [search.py] [www-serve(.py)]
|
||||||
- refactor archive.py and search.py
|
|
||||||
- test lists import with mariadb backend
|
|
||||||
|
|
||||||
|
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
|
||||||
usage: archive.py [-h] [--arch ARCH] url [url ...]
|
usage: crawl.py [-h] [--names NAMES [NAMES ...]] [--archives ARCHIVES]
|
||||||
|
url [url ...]
|
||||||
|
|
||||||
Mailinglists are dead. Long live mailinglists!
|
Mailinglists are dead. Long live mailinglists!
|
||||||
|
|
||||||
@ -13,19 +13,40 @@ positional arguments:
|
|||||||
|
|
||||||
optional arguments:
|
optional arguments:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
--arch ARCH path to archives directory (default='archives')
|
--names NAMES [NAMES ...], -n NAMES [NAMES ...]
|
||||||
|
mailinglists' names
|
||||||
|
--archives ARCHIVES, -a ARCHIVES
|
||||||
|
path to archives directory
|
||||||
|
|
||||||
---
|
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
|
||||||
usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] keyword
|
usage: index.py [-h] [--archives ARCHIVES] list [list ...]
|
||||||
|
|
||||||
Searches mailinglist's archives
|
Mailinglists are dead. Long live mailinglists!
|
||||||
|
|
||||||
|
positional arguments:
|
||||||
|
list list(s) to index
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
--archives ARCHIVES, -a ARCHIVES
|
||||||
|
path to archives directory (default='archives')
|
||||||
|
|
||||||
|
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
|
||||||
|
usage: search.py [-h] [--list LIST [LIST ...]] [--field FIELD] [--json JSON]
|
||||||
|
keyword
|
||||||
|
|
||||||
|
Searches mailinglists archives
|
||||||
|
|
||||||
positional arguments:
|
positional arguments:
|
||||||
keyword keyword to search
|
keyword keyword to search
|
||||||
|
|
||||||
optional arguments:
|
optional arguments:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
--list LIST [LIST ...]
|
--list LIST [LIST ...], -l LIST [LIST ...]
|
||||||
mailinglist(s') name(s)
|
mailinglist(s') name(s) (default 'all')
|
||||||
--field FIELD message field (i.e. 'content' or 'subject', etc.)
|
--field FIELD, -f FIELD
|
||||||
|
message field (i.e. 'content' or 'from' (default
|
||||||
|
'content'))
|
||||||
|
--json JSON, -j JSON json output
|
||||||
|
|||||||
2
crawl.py
2
crawl.py
@ -25,7 +25,7 @@ if __name__ == "__main__":
|
|||||||
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
|
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
|
||||||
p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
|
p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
|
||||||
p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
|
p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
|
||||||
p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives)
|
p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives)
|
||||||
|
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
|
|||||||
2
index.py
2
index.py
@ -13,7 +13,7 @@ def run(lists, archives):
|
|||||||
|
|
||||||
for a in archives:
|
for a in archives:
|
||||||
archive = archive.Archive(a, archive_dir=archives)
|
archive = archive.Archive(a, archive_dir=archives)
|
||||||
archive.insert_db(host=config.host, database=config.database, user=config.user, password=config.password)
|
archive.insert_db(host=config.db['host'], database=config.db['database'], user=config.db['user'], password=config.db['password'])
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
|||||||
87
search.py
87
search.py
@ -1,37 +1,94 @@
|
|||||||
import sys, logging, argparse
|
import sys, logging, argparse
|
||||||
import search.archive
|
from datetime import datetime
|
||||||
|
import archive.archive as archive
|
||||||
|
import config
|
||||||
|
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
def get_key(kv):
|
||||||
|
return datetime.strptime(kv[0], "%B_%Y")
|
||||||
|
|
||||||
|
def get_result_key(r):
|
||||||
|
return r['archive']
|
||||||
|
|
||||||
def run(args):
|
def run(args):
|
||||||
|
|
||||||
if not args.keyword:
|
if not args.keyword:
|
||||||
sys.exit('No keyword. Aborting.')
|
sys.exit('No keyword. Aborting.')
|
||||||
|
|
||||||
if not args.list:
|
if args.field not in ["content", "from"]:
|
||||||
args.list = ['spectre', 'crumb', 'empyre'] ## eh....
|
sys.exit('Invalid field ' + args.field + '. Aborting.')
|
||||||
|
|
||||||
|
all_lists = archive.list_tables_db(config.db['database'], config.db['host'], config.db['user'], config.db['password'])
|
||||||
|
|
||||||
|
if not args.list or (len(args.list) == 1 and args.list[0] == "all"):
|
||||||
|
args.list = all_lists
|
||||||
|
|
||||||
|
results = []
|
||||||
for l in args.list:
|
for l in args.list:
|
||||||
arch = search.archive.Archive('archives/')
|
|
||||||
arch.load(l)
|
|
||||||
|
|
||||||
r = arch.search(keyword=args.keyword, field=args.field)
|
if l not in all_lists:
|
||||||
|
logging.warning(l + "is not a valud list... continuing")
|
||||||
|
continue
|
||||||
|
|
||||||
for z in r['results']:
|
k_arg = args.keyword
|
||||||
print(z['thread'] + " ---- " + str(z['nbr_hits']))
|
f_arg = args.field
|
||||||
for zz in z['hits']:
|
|
||||||
print(" " + zz['url'])
|
|
||||||
print(" " + zz['index_str'])
|
|
||||||
|
|
||||||
sys.exit()
|
with archive.Archive(l, config=config.db) as a:
|
||||||
|
if f_arg == 'content':
|
||||||
|
r = a.content_search(k_arg)
|
||||||
|
else:
|
||||||
|
r = a.from_search(k_arg)
|
||||||
|
|
||||||
|
# format data to return
|
||||||
|
search_results = { "keyword": k_arg, "field": f_arg, "archive": a.archive_name, "results": [] }
|
||||||
|
month_year_results = {}
|
||||||
|
|
||||||
|
for (from_, author_name_, subject_, date_, url_) in r:
|
||||||
|
m_y = date_.strftime("%B_%Y")
|
||||||
|
if m_y not in month_year_results:
|
||||||
|
month_year_results[m_y] = []
|
||||||
|
month_year_results[m_y].append({ 'url': url_, 'subject': subject_, 'author_name': author_name_})
|
||||||
|
|
||||||
|
for k, v in sorted(month_year_results.items(), key=get_key, reverse=True):
|
||||||
|
search_results['results'].append({ 'thread': k, 'nbr_hits': len(v), 'hits': v})
|
||||||
|
|
||||||
|
results.append(search_results)
|
||||||
|
|
||||||
|
|
||||||
|
return sorted(results, key=get_result_key)
|
||||||
|
|
||||||
|
|
||||||
|
# for l in args.list:
|
||||||
|
# arch = search.archive.Archive('archives/')
|
||||||
|
# arch.load(l)
|
||||||
|
|
||||||
|
# r = arch.search(keyword=args.keyword, field=args.field)
|
||||||
|
|
||||||
|
# for z in r['results']:
|
||||||
|
# print(z['thread'] + " ---- " + str(z['nbr_hits']))
|
||||||
|
# for zz in z['hits']:
|
||||||
|
# print(" " + zz['url'])
|
||||||
|
# print(" " + zz['index_str'])
|
||||||
|
|
||||||
|
# sys.exit()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
p = argparse.ArgumentParser(description='Searches mailinglists archives')
|
p = argparse.ArgumentParser(description='Searches mailinglists archives')
|
||||||
p.add_argument('keyword', metavar="keyword", help="keyword to search")
|
p.add_argument('keyword', metavar="keyword", help="keyword to search")
|
||||||
p.add_argument('--list', help="mailinglist(s') name(s)", nargs="+")
|
p.add_argument('--list', '-l', help="mailinglist(s') name(s) (default 'all')", nargs="+")
|
||||||
p.add_argument('--field', help="message field (i.e. 'content' or 'subject', etc.)", default="content")
|
p.add_argument('--field', '-f', help="message field (i.e. 'content' or 'from' (default 'content'))", default="content")
|
||||||
|
p.add_argument('--json', '-j', help="json output")
|
||||||
|
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
run(args)
|
result = run(args)
|
||||||
|
|
||||||
|
if args.json:
|
||||||
|
import json
|
||||||
|
print(json.dumps(result, indent=4))
|
||||||
|
else:
|
||||||
|
print(result)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user