crawl all

gauthiier 2019-12-21 21:09:34 +01:00
parent c13e00b919
commit 1ef4782fc8
2 changed files with 31 additions and 18 deletions

View File

@@ -3,30 +3,34 @@ import lists.crawl, config
 logging.basicConfig(level=logging.DEBUG)

-def run(args):
-    if not args.url:
-        sys.exit('No url(s). Aborting.')
-    if not args.names:
-        args.names = []
+def run(list_archive_name_url, archive_dir):
     ## check valid url?... hmm... nej
-    i = 0
-    for u in args.url:
-        name = args.names[i] if i < len(args.names) else None
-        lists.crawl.crawl(url=u, name=name, sublist_name=name, archive_dir=args.arch) #<-- sublist for nettime
-        i = i + 1
-    sys.exit()
+    for l in list_archive_name_url:
+        logging.info("+++++ Crawling list: " + l['name'] + " +++++")
+        lists.crawl.crawl(url=l['url'], name=l['name'], sublist_name=l['subname'], archive_dir=archive_dir)

 if __name__ == "__main__":
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
-    p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
+    p.add_argument('--names', '-n', help="mailinglists' names (for the archive)", nargs="+")
+    p.add_argument('--subnames', '-s', help="mailinglists' subnames (on the webpage)", nargs="+")
     p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives)
     args = p.parse_args()
-    run(args)
+
+    l = []
+    if len(args.url) == 1 and args.url[0] == 'all':
+        l = config.lists
+    elif len(args.url) == len(args.names) == len(args.subnames):
+        i = 0
+        for u in args.url:
+            l.append({'name': args.names[i], 'url': u, 'subname': args.subnames[i]})
+            i += 1
+    else:
+        sys.exit("Inconsistent list of names, urls, and subnames. Aborting.")
+    run(l, args.archives)
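
The new 'all' branch only works if config.lists already carries the same three keys that run() reads per entry. A minimal sketch of what that config module presumably looks like, assuming the module is config.py and using a placeholder list name and URL (neither the file contents nor the example entry are shown in this commit):

    # config.py -- sketch only; field names inferred from run() and the argparse branch above
    archives = "archives"   # default for --archives
    lists = [
        # one dict per mailinglist, mirroring {'name': ..., 'url': ..., 'subname': ...}
        {'name': 'nettime-l', 'url': 'https://example.org/nettime-l/', 'subname': 'nettime-l'},
    ]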

View File

@@ -1,5 +1,6 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
+from datetime import datetime
 from bs4 import BeautifulSoup

 DELAY = 0.2
@@ -73,6 +74,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     return None

+def new_name(n):
+    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    return dt.strftime('%B_%Y')
+
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
     response = urllib.request.urlopen(url)
@@ -88,12 +94,15 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     threads_name = soup.select('head title')[0].string
     threads_name = threads_name.replace(' ', '_')
+    # fix name for database (re: nettime-l to nettime_l) and consistency with other archives
+    new_threads_name = new_name(threads_name)
     logging.debug(threads_name)

     # thread data struct
-    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
-    logging.info("Collecting Threads of: " + threads_name)
+    threads = {'name' : new_threads_name, 'url' : base_url, 'threads' : []}
+    logging.info("Collecting Threads of: " + new_threads_name)

     # check if archive already exists
     file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
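
For reference, the new new_name() helper re-parses a month-archive title of the form nettime-l_<abbreviated month>_<2-digit year> and reprints it with a full month name and four-digit year. A quick standalone check of that mapping, where the input string is an assumed example of a flattened page title rather than one taken from the archive:

    from datetime import datetime

    def new_name(n):
        # same conversion as the committed helper
        dt = datetime.strptime(n, 'nettime-l_%b_%y')
        return dt.strftime('%B_%Y')

    print(new_name('nettime-l_Jan_99'))   # -> January_1999 (%y maps 69-99 to the 1900s)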