crawl all
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import urllib.request, urllib.parse
|
||||
import logging, os, sys, traceback, re, time, json, gzip
|
||||
from datetime import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
DELAY = 0.2
|
||||
@@ -73,6 +74,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
|
||||
|
||||
return None
|
||||
|
||||
def new_name(n):
    """Rewrite a scraped list title into a month/year archive name.

    The input is parsed with the pattern ``nettime-l_%b_%y`` (literal
    ``nettime-l_`` prefix, abbreviated month name, two-digit year) and
    re-emitted as ``%B_%Y`` (full month name, four-digit year), e.g.
    ``nettime-l_Jan_99`` -> ``January_1999``.

    ``datetime.strptime`` raises ``ValueError`` when ``n`` does not match
    the expected pattern; that exception is not caught here.
    """
    source_format = 'nettime-l_%b_%y'
    target_format = '%B_%Y'
    return datetime.strptime(n, source_format).strftime(target_format)
|
||||
|
||||
|
||||
def collect_threads_from_url(url, base_archive_dir, mbox=False):
|
||||
|
||||
response = urllib.request.urlopen(url)
|
||||
@@ -88,12 +94,15 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
|
||||
threads_name = soup.select('head title')[0].string
|
||||
threads_name = threads_name.replace(' ', '_')
|
||||
|
||||
#fix name for database (re: nettime-l to nettime_l) and consistency with other archives
|
||||
new_threads_name = new_name(threads_name)
|
||||
|
||||
logging.debug(threads_name)
|
||||
|
||||
# thread data struct
|
||||
threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
|
||||
threads = {'name' : new_threads_name, 'url' : base_url, 'threads' : []}
|
||||
|
||||
logging.info("Collecting Threads of: " + threads_name)
|
||||
logging.info("Collecting Threads of: " + new_threads_name)
|
||||
|
||||
# check if archive already exists
|
||||
file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
|
||||
|
||||
Reference in New Issue
Block a user