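"""Crawl a mailing list archive with mhonarccrawl (nettime-l by default) and
save the collected threads plus some metadata as a gzipped JSON snapshot."""
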
import urllib2, urllib, urlparse
import os, re, json, gzip
import mhonarccrawl
import datetime

def archive_from_url(url, sublist_name="nettime-l", archive_dir="archives"):
    url = url.rstrip()
    archive_list_dir = check_dir(archive_dir, sublist_name)

    # Basic metadata for this snapshot of the list archive.
    archive_name = sublist_name.lower()
    archive_date = datetime.datetime.now().strftime('%Y-%m-%d')
    archive = {'name': archive_name, 'url': url, 'date': archive_date, 'threads': []}

    # Crawl the archive index and collect its threads (as mbox files) into
    # the per-list directory.
    archive['threads'] = mhonarccrawl.collect_from_url(url, sublist_name, archive_list_dir, mbox=True)

    # Write the snapshot as gzipped JSON, e.g. archives/nettime-l_<YYYY-MM-DD>.json.gz
    file_path = os.path.join(archive_dir, archive_name + "_" + archive_date + ".json.gz")
    with gzip.open(file_path, 'w') as fp:
        json.dump(archive, fp, indent=4)

    return

def check_dir(base_dir, list_name):
    # Create the per-list archive directory if it does not exist yet.
    arc_dir = os.path.join(base_dir, list_name)
    if not os.path.exists(arc_dir):
        os.makedirs(arc_dir)
    return arc_dir
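
# Example usage sketch: the URL below is a placeholder; point it at the index
# page of the mailing list archive you want to crawl.
if __name__ == "__main__":
    archive_from_url("http://example.org/nettime-l/index.html")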