listservs/lists/crawl.py
2019-12-22 17:14:17 +01:00

42 lines
1.2 KiB
Python

from urllib.parse import urlparse
import lists.pipermail as pipermail
import lists.listserv as listserv
import lists.mhonarc as mhonarc
import lists.mhonarc_nettime as mhonarc_nettime
# NOTE(review): DELAY is not referenced anywhere in the visible part of this
# module; presumably a pause (in seconds) between requests -- TODO confirm
# against the collector modules before relying on it.
DELAY = 0.2
def crawl(url, name, sublist_name=None, archive_dir="archives"):
    """Dispatch a mailing-list archive URL to the matching collector.

    The archive type is guessed from the URL with weak substring tests,
    checked in this order:

    1. ``'pipermail'`` in the URL path -> ``pipermail.collect_from_url``
    2. ``'cgi-bin'`` in the URL path   -> ``listserv.collect_from_url``
    3. ``'nettime'`` anywhere in URL   -> ``mhonarc_nettime.collect_from_url``

    Anything else is not crawled; ``'mhonarc?'`` is printed instead.

    Args:
        url: Address of the list archive.
        name: Name handed to the collector. May be ``None`` for pipermail
            archives, in which case the last path element of the URL is
            used (``/pipermail/xyz`` -> ``'xyz'``).
        sublist_name: Only used for nettime archives; falls back to
            ``name`` when ``None``.
        archive_dir: Passed through unchanged to the collector.

    Returns:
        None. Whatever the collectors return is discarded.
    """
    u = urlparse(url)

    # NOTE: these type tests are very weak -- there is no reliable check here
    # for whether a list is pipermail / listserv / mhonarc.
    if 'pipermail' in u.path:
        if name is None:
            # Trim whitespace first, then *every* trailing slash, so that
            # '/pipermail/xyz//' or '/pipermail/xyz/ ' still yield 'xyz'
            # rather than an empty name.
            name = u.path.strip().rstrip('/').split('/')[-1]
        pipermail.collect_from_url(url, name, archive_dir)
    elif 'cgi-bin' in u.path:
        listserv.collect_from_url(url, name, archive_dir)
    elif "nettime" in url:
        # Special case -- nettime: the collector also wants the sublist
        # name (i.e. nettime-l); default it to the list name.
        if sublist_name is None:
            sublist_name = name
        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
    else:
        # NOTE: a disabled special case for names containing "oldboys"
        # (routed to mhonarc_nettime) used to be sketched here.
        # Generic mhonarc archives are not handled; just flag the URL.
        print('mhonarc?')