# listservs/lists/listserv.py
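"""Scraper for LISTSERV web archives.

Walks an archive index page, fetches each linked thread list and message
page, and writes the collected messages to one JSON file per list.
"""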

import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip, difflib
from bs4 import BeautifulSoup
import lists.util

DELAY = 0.2


def collect_from_url(url, name, base_archive_dir):
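    """Collect every thread list linked from a LISTSERV archive index page.

    Creates the directory <base_archive_dir>/<name> and archives each linked
    thread list into its own JSON file via collect_threads_from_url().
    """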
    # response = urllib.request.urlopen(url)
    # #html = response.read().decode(encoding="utf-8")
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")
    soup = lists.util.request(url)
    threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')

    li = []
    for t in threads_list:
        thread_label = t.text.strip()
        thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
        li.append((thread_label, thread_url))

    # create (main) directory
    # this is where all temp files will be created
    d = os.path.join(base_archive_dir, name)
    if not os.path.exists(d):
        os.makedirs(d)

    threads = []
    nbr_threads = str(len(li))
    n = 0
    for l in li:  ### change this
        n += 1
        logging.info("## " + str(n) + " / " + nbr_threads + " ##")
        try:
            threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
        except KeyboardInterrupt:
            sys.exit(0)
        except:
            logging.warning("Error archiving: " + l[1] + "... Continuing.")
            ex_t, ex, tb = sys.exc_info()
            print(ex_t)
            traceback.print_tb(tb)
            del tb
            continue

    # archive['name'] = name
    # archive['list'] = threads
    # file_path = os.path.join(base_arch_dir, name + '.json')
    # with open(file_path, 'w') as fp:
    #     json.dump(archive, fp, indent=4)
    # logging.info("done.")


def collect_threads_from_url(url, name, base_arch_dir):
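    """Collect all messages listed on one thread-index page.

    If <base_arch_dir>/<name>.json already exists and parses, it is loaded
    and returned as-is; otherwise every message row is fetched with
    collect_message(), consecutive messages with near-identical subjects
    (see similar()) are additionally recorded as follow-ups of the previous
    one, and the result is written back to that JSON file.
    """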
    threads = {'name': name, 'url': url, 'threads': []}
    logging.info("Collecting threads of: " + name)
    arch_name = name.replace(' ', '_')

    # check if archive already exists
    file_path = os.path.join(base_arch_dir, arch_name + '.json')
    if os.path.isfile(file_path):
        logging.info("archive " + name + " already exists. loading from file " + file_path)
        with open(file_path, 'r') as fin:
            try:
                threads = json.load(fin)
                return threads
            except:
                logging.info("can't open archive " + file_path + "... rearchiving.")

    # response = urllib.request.urlopen(url)
    # #html = response.read().decode(encoding="utf-8")
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")
    soup = lists.util.request(url)
    table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')

    li = []
    for tr in table:
        if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
            li.append(tr)

    # the thread structure here is flat -- re: non-hierarchical, unlike pipermail
    # hence the thread parsing algorithm will also be flat -- re: a single loop
    nbr_msgs = str(len(li))
    n = 0
    last_message = None
    for tr in li:
        n += 1
        logging.info(" > " + str(n) + "/" + nbr_msgs)
        td = tr.find_all('td')
        thread_a = td[0].select("p span a")[0]
        thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
        thread_title = thread_a.text.strip()
        try:
            message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}
            threads['threads'].append(collect_message(thread_url, message))
            if last_message and similar(last_message['subject'], message['subject']):
                if u'follow-up' not in last_message:
                    last_message[u'follow-up'] = []
                print(message['subject'] + " - follows - " + last_message['subject'])
                last_message[u'follow-up'].append(message)
            else:
                last_message = message
        except KeyboardInterrupt:
            sys.exit(0)
        except:
            ex_t, ex, tb = sys.exc_info()
            print(ex_t)
            traceback.print_tb(tb)
            del tb
            continue
        time.sleep(DELAY)
logging.info("writing archive to file " + file_path)
with open(file_path, 'w') as fp:
json.dump(threads, fp, indent=4)
logging.info("done.")
def collect_message(url, message):
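    """Fetch a single message page and fill in the message dict.

    Pulls subject, sender, date, content type and body text out of the
    LISTSERV 'tableframe' layout; the obfuscated "<[log in to unmask]>"
    part of the sender is stripped.
    """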
    # response = urllib.request.urlopen(url)
    # #html = response.read().decode(encoding="utf-8")
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")
    soup = lists.util.request(url)
    tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
    header = tr[0].find_all('tbody')[0].find_all('tr', recursive=False)
    message['subject'] = header[0].select("p a")[0].text.strip()
    message['from'] = header[1].select("p")[1].text.replace("<[log in to unmask]>", "").strip()
    message['author_name'] = message['from']
    message['date'] = header[3].select("p")[1].text.strip()
    message['content-type'] = header[4].select("p")[1].text.strip()
    message['content'] = tr[1].find_all('pre')[0].text
    return message


def similar(str_a, str_b):
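    """Return True if two subject lines are near-duplicates (difflib ratio > 0.75)."""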
    r = difflib.SequenceMatcher(None, str_a, str_b).ratio()
    return r > 0.75
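

# Minimal usage sketch. The archive URL and output directory below are
# hypothetical placeholders; lists.util.request() is assumed to return a
# parsed BeautifulSoup document, as it does in the functions above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    collect_from_url(
        url="https://listserv.example.org/cgi-bin/wa?INDEX",  # hypothetical LISTSERV index page
        name="example-archive",
        base_archive_dir="archives",
    )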