crawl all

parent c13e00b919
commit 1ef4782fc8

crawl.py | 34 lines changed
crawl.py

@@ -3,30 +3,34 @@ import lists.crawl, config
 
 logging.basicConfig(level=logging.DEBUG)
 
-def run(args):
+def run(list_archive_name_url, archive_dir):
 
-    if not args.url:
-        sys.exit('No url(s). Aborting.')
-
-    if not args.names:
-        args.names = []
-
     ## check valid url?... hmm... no
-    i = 0
-    for u in args.url:
-        name = args.names[i] if i < len(args.names) else None
-        lists.crawl.crawl(url=u, name=name, sublist_name=name, archive_dir=args.arch) #<-- sublist for nettime
-        i = i + 1
-
-    sys.exit()
+    for l in list_archive_name_url:
+        logging.info("+++++ Crawling list: " + l['name'] + " +++++")
+        lists.crawl.crawl(url=l['url'], name=l['name'], sublist_name=l['subname'], archive_dir=archive_dir)
 
 if __name__ == "__main__":
 
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
-    p.add_argument('--names', '-n', help="mailinglists' names", nargs="+")
+    p.add_argument('--names', '-n', help="mailinglists' names (for the archive)", nargs="+")
+    p.add_argument('--subnames', '-s', help="mailinglists' subnames (on the webpage)", nargs="+")
    p.add_argument('--archives', '-a', help="path to archives directory", default=config.archives)
 
     args = p.parse_args()
 
-    run(args)
+    l = []
+    if len(args.url) == 1 and args.url[0] == 'all':
+        l = config.lists
+    elif len(args.url) == len(args.names) == len(args.subnames):
+        i = 0
+        for u in args.url:
+            l.append({'name': args.names[i], 'url': u, 'subname': args.subnames[i]})
+            i += 1
+    else:
+        sys.exit("Inconsistent list of names, urls, and subnames. Aborting.")
+
+    run(l, args.archives)
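The new argument handling expects config.lists to be a list of dicts with 'name', 'url', and 'subname' keys, the same shape run() iterates over. A minimal sketch of a matching config.py, with placeholder values since the commit does not show the real file:

    # config.py -- hypothetical example; only the key names and the
    # 'archives' default are taken from how crawl.py uses the module
    archives = "archives"
    lists = [
        {'name': 'nettime_l', 'url': 'https://example.org/archive', 'subname': 'nettime-l'},
    ]

With such a config, running "python crawl.py all" crawls every configured list, while explicit urls still work as long as --names and --subnames carry one entry per url. Note that omitting either flag leaves it as None, so len(args.names) in the elif branch would raise a TypeError; the explicit-url form effectively requires both flags.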
lists/crawl.py

@@ -1,5 +1,6 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
+from datetime import datetime
 from bs4 import BeautifulSoup
 
 DELAY = 0.2
@@ -73,6 +74,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
 
     return None
 
+def new_name(n):
+    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    return dt.strftime('%B_%Y')
+
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
     response = urllib.request.urlopen(url)
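new_name() assumes the scraped title follows the nettime naming scheme encoded in the strptime format string; a quick standalone check, where the sample input is inferred from that format rather than taken from the commit:

    from datetime import datetime

    def new_name(n):
        dt = datetime.strptime(n, 'nettime-l_%b_%y')
        return dt.strftime('%B_%Y')

    print(new_name('nettime-l_Jan_99'))   # prints January_1999

Any title that does not match the nettime-l_<Mon>_<yy> pattern raises a ValueError in strptime, so the helper is effectively nettime-specific.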
@@ -88,12 +94,15 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     threads_name = soup.select('head title')[0].string
     threads_name = threads_name.replace(' ', '_')
 
+    # fix name for database (re: nettime-l to nettime_l) and consistency with other archives
+    new_threads_name = new_name(threads_name)
+
     logging.debug(threads_name)
 
     # thread data struct
-    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
+    threads = {'name' : new_threads_name, 'url' : base_url, 'threads' : []}
 
-    logging.info("Collecting Threads of: " + threads_name)
+    logging.info("Collecting Threads of: " + new_threads_name)
 
     # check if archive already exists
     file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
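Since file_path is built from threads['name'], the archive on disk is now also written under the reformatted name (e.g. January_1999.json rather than nettime-l_Jan_99.json), which is what the "consistency with other archives" comment refers to; logging.debug() still prints the original threads_name.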