crawl all

gauthiier 2019-12-21 21:09:34 +01:00
parent c13e00b919
commit 1ef4782fc8
2 changed files with 31 additions and 18 deletions

View File

@@ -3,30 +3,34 @@ import lists.crawl, config

 logging.basicConfig(level=logging.DEBUG)

-def run(args):
-    if not args.url:
-        sys.exit('No url(s). Aborting.')
-    if not args.names:
-        args.names = []
+def run(list_archive_name_url, archive_dir):
+    ## check valid url?... hmm... no

-    i = 0
-    for u in args.url:
-        name = args.names[i] if i < len(args.names) else None
-        lists.crawl.crawl(url=u, name=name, sublist_name=name, archive_dir=args.arch) #<-- sublist for nettime
-        i = i + 1
+    for l in list_archive_name_url:
+        logging.info("+++++ Crawling list: " + l['name'] + " +++++")
+        lists.crawl.crawl(url=l['url'], name=l['name'], sublist_name=l['subname'], archive_dir=archive_dir)

     sys.exit()

 if __name__ == "__main__":
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
-    p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
+    p.add_argument('--names', '-n', help="mailinglists' names (for the archive)", nargs="+")
+    p.add_argument('--subnames', '-s', help="mailinglists' subnames (on the webpage)", nargs="+")
     p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives)

     args = p.parse_args()
-    run(args)
+
+    l = []
+    if len(args.url) == 1 and args.url[0] == 'all':
+        l = config.lists
+    elif len(args.url) == len(args.names) == len(args.subnames):
+        i = 0
+        for u in args.url:
+            l.append({'name': args.names[i], 'url': u, 'subname': args.subnames[i]})
+            i += 1
+    else:
+        sys.exit("Inconsistent list of names, urls, and subnames. Aborting.")
+
+    run(l, args.archives)
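
With the new __main__ block the crawler is driven either by aligned url / --names / --subnames arguments or entirely from config.lists via the 'all' keyword. A minimal sketch of what a config entry could look like, assuming only the three keys the run() loop actually reads ('name', 'url', 'subname'); the file name config.py and the URLs are illustrative, not taken from this commit:

    # config.py -- hypothetical entries for the 'all' mode;
    # run() only reads the keys 'name', 'url' and 'subname'
    archives = "archives"   # default picked up by the --archives option

    lists = [
        {'name': 'nettime-l', 'url': 'https://example.org/nettime-l/', 'subname': 'nettime-l'},
        {'name': 'spectre',   'url': 'https://example.org/spectre/',   'subname': 'spectre'},
    ]

Invocation is then either 'all' as the sole positional argument (crawl everything in config.lists) or explicit urls with matching -n and -s lists; in the explicit form the three lists must be equally long or the script aborts.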

View File

@@ -1,5 +1,6 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
+from datetime import datetime
 from bs4 import BeautifulSoup

 DELAY = 0.2
@@ -73,6 +74,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     return None

+def new_name(n):
+    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    return dt.strftime('%B_%Y')
+
 def collect_threads_from_url(url, base_archive_dir, mbox=False):

     response = urllib.request.urlopen(url)
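
For reference, the new helper rewrites the abbreviated page title into a full month name, which can be checked against the two format strings above:

    new_name('nettime-l_Dec_19')   # -> 'December_2019' (parsed as datetime 2019-12-01, then reformatted)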
@@ -88,12 +94,15 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     threads_name = soup.select('head title')[0].string
     threads_name = threads_name.replace(' ', '_')

+    # fix name for database (re: nettime-l to nettime_l) and consistency with other archives
+    new_threads_name = new_name(threads_name)
+
     logging.debug(threads_name)

     # thread data struct
-    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
+    threads = {'name' : new_threads_name, 'url' : base_url, 'threads' : []}

-    logging.info("Collecting Threads of: " + threads_name)
+    logging.info("Collecting Threads of: " + new_threads_name)

     # check if archive already exists
     file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
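
Net effect on disk, with illustrative values: a page titled 'nettime-l Dec 19' becomes threads_name 'nettime-l_Dec_19', so the archive-exists check (and the file written later) now points at

    os.path.join(base_archive_dir, 'December_2019' + '.json')   # e.g. 'archives/December_2019.json'

rather than 'archives/nettime-l_Dec_19.json', which is what the comment above means by consistency with the other archives.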