From 1ef4782fc8f82c9e69ee154b1aee8a9bd19a3538 Mon Sep 17 00:00:00 2001
From: gauthiier
Date: Sat, 21 Dec 2019 21:09:34 +0100
Subject: [PATCH] crawl all

---
 crawl.py                 | 36 ++++++++++++++++++++----------------
 lists/mhonarc_nettime.py | 13 +++++++++++--
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/crawl.py b/crawl.py
index 11b97fb..d91861a 100644
--- a/crawl.py
+++ b/crawl.py
@@ -3,30 +3,34 @@ import lists.crawl, config
 
 logging.basicConfig(level=logging.DEBUG)
 
-def run(args):
+def run(list_archive_name_url, archive_dir):
 
-    if not args.url:
-        sys.exit('No url(s). Aborting.')
-
-    if not args.names:
-        args.names = []
-
-    ## check valid url?... hmm... nej
-    i = 0
-    for u in args.url:
-        name = args.names[i] if i < len(args.names) else None
-        lists.crawl.crawl(url=u, name=name, sublist_name=name, archive_dir=args.arch) #<-- sublist for nettime
-        i = i + 1
+    ## check valid url?... hmm... nej
+    for l in list_archive_name_url:
+        logging.info("+++++ Crawling list: " + l['name'] + " +++++")
+        lists.crawl.crawl(url=l['url'], name=l['name'], sublist_name=l['subname'], archive_dir=archive_dir)
 
-    sys.exit()
 
 if __name__ == "__main__":
 
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
-    p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
+    p.add_argument('--names', '-n', help="mailinglists' names (for the archive)", nargs="+")
+    p.add_argument('--subnames', '-s', help="mailinglists' subnames (on the webpage)", nargs="+")
    p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives)
 
     args = p.parse_args()
 
-    run(args)
+    l = []
+    if len(args.url) == 1 and args.url[0] == 'all':
+        l = config.lists
+    elif len(args.url) == len(args.names) == len(args.subnames):
+        i = 0
+        for u in args.url:
+            l.append({'name': args.names[i], 'url': u, 'subname': args.subnames[i]})
+            i += 1
+    else:
+        sys.exit("Inconsistent list of names, urls, and subnames. Aborting.")
+
+
+    run(l, args.archives)

diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
index 5fc08cf..e5e106b 100644
--- a/lists/mhonarc_nettime.py
+++ b/lists/mhonarc_nettime.py
@@ -1,5 +1,6 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
+from datetime import datetime
 from bs4 import BeautifulSoup
 
 DELAY = 0.2
@@ -73,6 +74,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
 
     return None
 
+def new_name(n):
+    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    return dt.strftime('%B_%Y')
+
+
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
     response = urllib.request.urlopen(url)
@@ -88,12 +94,15 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     threads_name = soup.select('head title')[0].string
     threads_name = threads_name.replace(' ', '_')
 
+    # fix name for database (re: nettime-l to nettime_l) and consistency with other archives
+    new_threads_name = new_name(threads_name)
+
     logging.debug(threads_name)
 
     # thread data struct
-    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
+    threads = {'name' : new_threads_name, 'url' : base_url, 'threads' : []}
 
-    logging.info("Collecting Threads of: " + threads_name)
+    logging.info("Collecting Threads of: " + new_threads_name)
 
     # check if archive already exists
     file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
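
The new 'all' mode takes its worklist from config.lists, and run() reads
l['name'], l['url'] and l['subname'] from every entry, so config.py has to
define a list of dicts with exactly those keys. A minimal sketch of that
structure, with a hypothetical URL (the real entries live in config.py):

    # config.py (sketch) -- the keys are fixed by run() in crawl.py;
    # the entry and its URL are illustrative only
    lists = [
        {'name': 'nettime-l',                    # archive name (cf. --names)
         'url': 'https://example.org/archive/',  # hypothetical list URL
         'subname': 'nettime-l'},                # name on the webpage (cf. --subnames)
    ]

Assuming such an entry, the two ways of invoking the crawler are:

    python crawl.py all
    python crawl.py https://example.org/archive/ -n nettime-l -s nettime-l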
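
new_name() rewrites the nettime page title (parsed with the pattern
'nettime-l_%b_%y') into the '%B_%Y' scheme used by the other archives:

    >>> from lists.mhonarc_nettime import new_name
    >>> new_name('nettime-l_Dec_19')
    'December_2019'

Note that strftime('%B') is locale-dependent; the example above assumes an
English-language locale.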