# listservs/lists/listserv.py

import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip, difflib
from bs4 import BeautifulSoup
import lists.util


# Pause, in seconds, passed to time.sleep() after each successfully
# collected message in collect_threads_from_url().
DELAY = 0.2

def collect_from_url(url, name, base_archive_dir):
	"""Archive every thread list linked from an archive index page.

	Fetches `url`, takes the <li> entries of the first ``tr.normalgroup``
	row and calls collect_threads_from_url() on the link found in each of
	them. The per-list files are stored under ``<base_archive_dir>/<name>``,
	which is created if missing.

	Args:
		url: address of the archive index page.
		name: archive name; used as the name of the archive directory.
		base_archive_dir: directory in which the archive directory lives.

	Returns:
		A list holding the value returned by collect_threads_from_url()
		for every list that was processed without error. Lists that fail
		are logged (with traceback) and skipped.

	Raises:
		IndexError: if the page contains no ``tr.normalgroup`` row, or an
			entry has no link.
		SystemExit: on KeyboardInterrupt (exit status 0).
	"""
	# NOTE(review): lists.util.request presumably returns a parsed
	# BeautifulSoup document -- confirm against lists.util.
	soup = lists.util.request(url)

	threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
	li = [
		(t.text.strip(), urllib.parse.urljoin(url, t.select('a')[0].get('href')))
		for t in threads_list
	]

	# create (main) directory
	# this is where the per-list archive files will be created
	d = os.path.join(base_archive_dir, name)
	os.makedirs(d, exist_ok=True)

	threads = []
	nbr_threads = str(len(li))
	for n, (thread_label, thread_url) in enumerate(li, 1):
		logging.info("## " + str(n) + " / " + nbr_threads + " ##")
		try:
			threads.append(collect_threads_from_url(name=thread_label, url=thread_url, base_arch_dir=d))
		except KeyboardInterrupt:
			sys.exit(0)
		except Exception:
			# Deliberately not a bare `except:`: collect_threads_from_url()
			# turns Ctrl-C into SystemExit, which must propagate instead of
			# being reported as an archiving error. exc_info=True records the
			# exception type, message and traceback.
			logging.warning("Error archiving: " + thread_url + "... Continuing.", exc_info=True)

	return threads


def collect_threads_from_url(url, name, base_arch_dir):
	"""Collect all messages listed on one thread-list page into a JSON archive.

	The archive file is ``<base_arch_dir>/<name>.json`` with spaces in
	`name` replaced by underscores. If that file already exists and can be
	parsed, its content is returned and nothing is fetched. Otherwise the
	page at `url` is fetched, every listed message is collected with
	collect_message(), and the result is written to the archive file.

	Args:
		url: address of the thread-list page.
		name: label of the list; also used to build the archive file name.
		base_arch_dir: existing directory where the archive file is stored.

	Returns:
		The archive: either the parsed content of the existing file, or a
		dict with the keys 'name', 'url' and 'threads' (list of message
		dicts). Messages that fail to download or parse are logged and
		left out.

	Raises:
		IndexError: if the page does not have the expected table layout.
		SystemExit: on KeyboardInterrupt (exit status 0).
	"""
	threads = {'name' : name, 'url' : url, 'threads' : []}

	logging.info("Collecting threads of: " + name)

	arch_name = name.replace(' ', '_')

	# check if archive already exists
	file_path = os.path.join(base_arch_dir, arch_name + '.json')
	if os.path.isfile(file_path):
		logging.info("archive " + name + " already exists. loading from file " + file_path)
		try:
			with open(file_path, 'r') as fin:
				return json.load(fin)
		except (OSError, ValueError):
			# ValueError covers json.JSONDecodeError and UnicodeDecodeError;
			# an unreadable or corrupt archive is simply rebuilt below.
			logging.info("can't open archive " + file_path + "... rearchiving.")

	soup = lists.util.request(url)

	# message rows live in the second 'tableframe' table and carry one of two classes
	table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
	li = [
		tr for tr in table
		if tr.has_attr('class') and tr['class'][0] in ('normalgroup', 'emphasizedgroup')
	]

	# the thread structure here is flat -- re: non-hierarchical, unlike pipermail
	# hence the thread parsing algorithm will also be flat -- re: a single loop

	nbr_msgs = str(len(li))
	last_message = None
	for n, tr in enumerate(li, 1):
		logging.info("	> " + str(n) + "/" + nbr_msgs)
		td = tr.find_all('td')
		thread_a = td[0].select("p span a")[0]
		thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
		thread_title = thread_a.text.strip()

		try:

			message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}

			threads['threads'].append(collect_message(thread_url, message))

			# A message whose subject resembles the subject of the last
			# non-follow-up message is additionally attached to that
			# message's 'follow-up' list (it also stays in the flat list).
			if last_message and similar(last_message['subject'], message['subject']):
				follow_ups = last_message.setdefault(u'follow-up', [])
				print(message['subject'] + " - follows - " + last_message['subject'])
				follow_ups.append(message)
			else:
				last_message = message

		except KeyboardInterrupt:
			sys.exit(0)
		except Exception:
			# best effort: report the failure (type, message, traceback)
			# and move on to the next message
			logging.exception("Error collecting message: " + thread_url + "... Continuing.")
			continue

		time.sleep(DELAY)

	logging.info("writing archive to file " + file_path)

	with open(file_path, 'w') as fp:
		json.dump(threads, fp, indent=4)

	logging.info("done.")

	# same return shape as the cached path above, so callers get the archive either way
	return threads


def collect_message(url, message):
	"""Fetch a single message page and store its parsed fields in `message`.

	The page at `url` is requested through lists.util.request(). The keys
	'subject', 'from', 'author_name', 'date', 'content-type' and 'content'
	of `message` are set (in that order) from the page, and the same dict
	is returned. Any lookup that does not match the expected page layout
	raises (typically IndexError), leaving `message` partially updated.
	"""
	page = lists.util.request(url)

	# the fourth 'tableframe' table wraps the message: its first direct
	# row holds the header table, its second the body
	frame = page.find_all('table', {'class': 'tableframe'})[3]
	frame_body = frame.find_all('tbody', recursive=False)[0]
	rows = frame_body.find_all('tr', recursive=False)

	header_rows = rows[0].find_all('tbody')[0].find_all('tr', recursive=False)

	def value_of(row):
		# the second <p> of a header row carries the field value
		return row.select("p")[1].text

	message['subject'] = header_rows[0].select("p a")[0].text.strip()
	# drop the masked e-mail placeholder that follows the sender name
	message['from'] = value_of(header_rows[1]).replace("<[log in to unmask]>", "").strip()
	message['author_name'] = message['from']
	message['date'] = value_of(header_rows[3]).strip()
	message['content-type'] = value_of(header_rows[4]).strip()

	# body text is taken verbatim from the first <pre> of the second row
	message['content'] = rows[1].find_all('pre')[0].text

	return message


def similar(str_a, str_b):
	"""Return True when the difflib similarity ratio of the two strings exceeds 0.75."""
	matcher = difflib.SequenceMatcher(a=str_a, b=str_b)
	return matcher.ratio() > 0.75