2017-07-14 10:54:56 +02:00
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
import lists.pipermail as pipermail
|
2017-07-25 11:30:04 +02:00
|
|
|
import lists.listserv as listserv
|
2017-11-04 13:34:05 +01:00
|
|
|
import lists.mhonarc as mhonarc
|
|
|
|
|
import lists.mhonarc_nettime as mhonarc_nettime
|
2017-07-12 21:26:36 +02:00
|
|
|
|
2017-07-14 10:54:56 +02:00
|
|
|
# NOTE(review): not referenced in the visible part of this file; presumably a
# pause (in seconds) between requests -- confirm against the collectors.
DELAY = 0.2
|
|
|
|
|
|
2017-11-04 13:34:05 +01:00
|
|
|
def crawl(url, name, sublist_name=None, archive_dir="archives"):
    """Dispatch the crawl of a mailing-list archive to a matching collector.

    The archive type is guessed from the URL path and, for nettime, from the
    list name. These checks are weak heuristics, not real detection.

    Args:
        url: URL of the archive to crawl.
        name: List name handed to the collector. May be None; for pipermail
            URLs it is then derived from the last element of the URL path.
        sublist_name: Accepted for interface compatibility; not used by this
            function (the nettime branch passes ``name`` in its place).
        archive_dir: Passed through unchanged to the collector.

    Returns:
        None.
    """
    u = urlparse(url)

    # the following type 'tests' are very weak...
    # how to test if a list is pipermail / listserv / mhonarc?
    if 'pipermail' in u.path:
        # if no name, get the trailing path element
        # (re: /pipermail/xyz -- 'xyz')
        if name is None:
            # drop a single trailing slash so the last element is not empty
            path = u.path[:-1] if u.path.endswith('/') else u.path
            name = path.strip().split('/')[-1]
        pipermail.collect_from_url(url, name, archive_dir)

    elif 'cgi-bin' in u.path:
        listserv.collect_from_url(url, name, archive_dir)

    # special case -- nettime.
    # the name should be the sublist_name (i.e nettime-l)
    # `name` may be None here; guard it so the membership test cannot raise
    # TypeError and unnamed lists fall through to the fallback branch.
    elif name is not None and "nettime" in name:
        mhonarc_nettime.collect_from_url(url, name, name, archive_dir)

    else:
        # NOTE(review): no generic mhonarc collector is invoked here; the
        # unrecognised archive is only reported.
        print('mhonarc?')
|