listservs/lists/crawl.py

from urllib.parse import urlparse
import lists.pipermail as pipermail
import lists.listserv as listserv
import lists.mhonarc as mhonarc
import lists.mhonarc_nettime as mhonarc_nettime

# NOTE(review): DELAY is not referenced by the code visible in this module;
# presumably a pause (in seconds) between HTTP requests -- confirm against the
# modules that import it before changing or removing it.
DELAY = 0.2

def crawl(url, name, sublist_name=None, archive_dir="archives"):
	"""Dispatch a mailing-list archive URL to the matching collector.

	The archive type is guessed from the URL path (and, for nettime, from
	the list name); these checks are weak heuristics, not real detection.

	Args:
		url: URL of the list archive.
		name: list name handed to the collector. May be None; for pipermail
			URLs it is then derived from the last element of the URL path.
		sublist_name: accepted for interface compatibility; not used by this
			function (the nettime branch passes ``name`` as the sublist).
		archive_dir: directory argument forwarded to the collector.

	Returns:
		None.
	"""
	u = urlparse(url)

	# The following type 'tests' are very weak...
	# How to test whether a list is pipermail / listserv / mhonarc?

	if 'pipermail' in u.path:
		# If no name, take the trailing path element
		# (re: /pipermail/xyz -- 'xyz'), ignoring any trailing slashes.
		if name is None:
			name = u.path.strip().rstrip('/').split('/')[-1]

		pipermail.collect_from_url(url, name, archive_dir)

	elif 'cgi-bin' in u.path:
		listserv.collect_from_url(url, name, archive_dir)

	# Special case -- nettime.
	# The name should be the sublist_name (i.e. nettime-l).
	# name may legitimately be None here; guard so the membership test
	# does not raise TypeError and the call falls through to the fallback.
	elif name is not None and "nettime" in name:
		mhonarc_nettime.collect_from_url(url, name, name, archive_dir)

	else:
		print('mhonarc?')
pipermail init 2017-07-14 10:54:56 +02:00			`from urllib.parse import urlparse`
			`import lists.pipermail as pipermail`
listserv and www 2017-07-25 11:30:04 +02:00			`import lists.listserv as listserv`
many many things... 2017-11-04 13:34:05 +01:00			`import lists.mhonarc as mhonarc`
			`import lists.mhonarc_nettime as mhonarc_nettime`
haha! commit 2017-07-12 21:26:36 +02:00
pipermail init 2017-07-14 10:54:56 +02:00			`DELAY = 0.2`

many many things... 2017-11-04 13:34:05 +01:00			`def crawl(url, name, sublist_name=None, archive_dir="archives"):`
pipermail init 2017-07-14 10:54:56 +02:00			`u = urlparse(url)`

listserv and www 2017-07-25 11:30:04 +02:00			`# the following type 'tests' are very weak...`
			`# how to test is list is pipermail / listserv / mhonarc?`

pipermail init 2017-07-14 10:54:56 +02:00			`if 'pipermail' in u.path:`
			`# if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')`
			`if name is None:`
			`path = u.path if not u.path.endswith('/') else u.path[:len(u.path) - 1]`
			`name = path.strip().split('/')[-1]`

			`pipermail.collect_from_url(url, name, archive_dir)`

listserv and www 2017-07-25 11:30:04 +02:00			`elif 'cgi-bin' in u.path:`
			`listserv.collect_from_url(url, name, archive_dir)`

many many things... 2017-11-04 13:34:05 +01:00			`# special case -- nettime.`
			`# the name should be the sublist_name (i.e nettime-l)`
			`elif "nettime" in name:`
			`mhonarc_nettime.collect_from_url(url, name, name, archive_dir)`

pipermail init 2017-07-14 10:54:56 +02:00			`else:`
			`print('mhonarc?')`

haha! commit 2017-07-12 21:26:36 +02:00			`return`