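# Scraper for MHonArc-style mailing-list web archives (the comments below refer to
# the nettime lists): it walks an archive index page, follows each monthly thread
# index, downloads every message, and writes the result as nested JSON thread
# structures under an archive directory.
# Assumption: lists.util.request(url) fetches a page and returns it parsed with
# BeautifulSoup (see the commented-out urllib/BeautifulSoup calls kept below).
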
import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip
from datetime import datetime

from bs4 import BeautifulSoup

import lists.util


# pause between successive HTTP requests, in seconds, to go easy on the archive server
DELAY = 0.2


def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
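    """Collect all monthly thread archives of one sublist from an archive index page.

    `url` is the archive index, `name` the directory created under `base_archive_dir`
    to hold the downloaded data, and `sublist_name` the list entry to match on the
    page. Returns a list of per-month thread structures (see
    collect_threads_from_url), or None if no entry matches `sublist_name`. `mbox` is
    only passed through.
    """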
    # response = urllib.request.urlopen(url)
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")

    soup = lists.util.request(url)

    # base url
    try:
        base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
    except Exception:
        base_url = url

    logging.debug(base_url)

    # # collect name
    # list_name = soup.select('body p:nth-of-type(2) title')[0].string
    # logging.info("Getting " + list_name + " list archive for " + sublist_name)

    # create (main) directory
    # this is where all temp files will be created
    d = os.path.join(base_archive_dir, name)
    if not os.path.exists(d):
        os.makedirs(d)

    threads = []
    # lists = soup.select('ul:nth-of-type(2) li')
    li = soup.select('ul li')

    for l in li:
        if l.strong is None:
            continue

        # sublist entry name (e.g. "nettime-l"); kept in its own variable so it does
        # not shadow the `name` parameter used for the archive directory above
        entry_name = l.strong.string

        if entry_name.lower() == sublist_name.lower():
            logging.debug(entry_name)

            threads_url_list = []
            threads_links = l.select('ul li a')
            for t in threads_links:
                thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                threads_url_list.append(thread_url)

            nbr_threads = str(len(threads_url_list))
            n = 0

            for u in threads_url_list:
                time.sleep(DELAY)
                n += 1
                logging.info("## " + str(n) + " / " + nbr_threads + " ##")
                try:
                    threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
                except KeyboardInterrupt:
                    sys.exit(0)
                except Exception:
                    logging.warning("Error archiving: " + entry_name + "... Continuing.")
                    ex_t, ex, tb = sys.exc_info()
                    print(ex_t)
                    traceback.print_tb(tb)
                    del tb
                    continue

            return threads

            # for u in threads_url_list[0:10]:
            #     print("---------------------------------------")
            #     tt = collect_threads_from_url(u, base_archive_dir, mbox)
            #     threads.append(tt)

    return None


def new_name(n):
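    """Normalise an archive page title ending in '%b_%y' into '%B_%Y'.

    ex: 'nettime-bold_Mar_99' -> 'March_1999'
    """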
    # ex: nettime-bold_Mar_99
    dt = datetime.strptime(n[-6:], '%b_%y')
    return dt.strftime('%B_%Y')


def collect_threads_from_url(url, base_archive_dir, mbox=False):
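    """Collect every thread of one monthly archive page into a dict.

    The result looks like
        {'name': 'March_1999', 'url': <page url>, 'threads': [<message>, ...]}
    where each <message> is built by archive_thread(). The dict is cached in
    `base_archive_dir` as '<normalised name>.json'; if that file already exists it
    is loaded and returned instead of re-scraping. `mbox` is currently unused.
    """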
    # response = urllib.request.urlopen(url)
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")

    soup = lists.util.request(url)

    logging.debug("collecting: " + url)

    # base url
    base_url = url

    # collect name
    threads_name = soup.select('head title')[0].string
    threads_name = threads_name.replace(' ', '_')

    # fix name for the database (re: nettime-l to nettime_l) and consistency with other archives
    new_threads_name = new_name(threads_name)

    logging.debug(threads_name)

    # thread data struct
    threads = {'name': new_threads_name, 'url': base_url, 'threads': []}

    logging.info("Collecting Threads of: " + new_threads_name)

    # check if archive already exists
    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
        li = soup.select('ul:nth-of-type(1) > li')

        nbr_threads = str(len(li))
        n = 0

        for l in li:
            n += 1
            logging.info("> " + str(n) + " / " + nbr_threads)

            thread = archive_thread(l, base_url, None)
            threads['threads'].append(thread)

            # try:
            #     thread = archive_thread(l, base_url, None)
            #     threads['threads'].append(thread)
            # except:
            #     ex_type, ex, tb = sys.exc_info()
            #     traceback.print_tb(tb)
            #     del tb
            #     continue

            time.sleep(DELAY)

        # write
        logging.info("writing archive to file " + file_path)
        with open(file_path, 'w') as fp:
            json.dump(threads, fp, indent=4)

        logging.info("done.")

    return threads


def archive_thread(li, base_url, parent_thread_data):
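    """Archive one thread <li> element and, recursively, its follow-ups.

    Builds a message dict ('id', 'subject', 'url', 'author_name', plus the header
    and 'content' fields filled in by collect_message), appends it to
    parent_thread_data['follow-up'] when a parent is given, and returns it.
    """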
    thread_link = li.select('strong a')[0]
    thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
    thread_id = thread_link.get('name')
    thread_title = thread_link.string
    thread_author_name = li.select('em')[0].string

    message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}

    collect_message(thread_url, message)

    # follow-ups are nested <li> items; recurse into each one that carries a message link
    follow = li.select('ul > li')
    if len(follow) > 0:
        for f in follow:
            follow_link = f.select('strong a')
            if len(follow_link) > 0:
                archive_thread(f, base_url, message)  # recursion

    if parent_thread_data is None:
        return message

    if u'follow-up' not in parent_thread_data:
        parent_thread_data[u'follow-up'] = []

    parent_thread_data[u'follow-up'].append(message)

    return message


def collect_message(url, message):
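    """Fetch one message page and fill `message` with its headers and body.

    Header fields are taken both from MHonArc's X-* comments and from the header
    list displayed on the page; the body is extracted by parse_xmessage().
    Mutates `message` in place.
    """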
    logging.debug("collecting message: " + url)

    # response = urllib.request.urlopen(url)
    # html = response.read().decode(encoding="utf-8")
    # # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")

    soup = lists.util.request(url)

    # note: this should follow an RFC header standard -- MHonArc puts the header info in the first <pre>
    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    message['from'] = parse_xcomment(soup, "X-From-R13")  # useless...
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page
    info = soup.select('ul:nth-of-type(1) > li')

    for i in info:
        if i.em is None:
            continue
        field = i.em.string
        if field.lower() in message_labels:
            # drop the leading "Field: " label (str.strip() strips a character set,
            # not a prefix, and could eat the ends of the value itself)
            value = i.text
            prefix = field + ": "
            if value.startswith(prefix):
                value = value[len(prefix):]
            message[field.lower()] = value.strip()

    # old way
    # message['content'] = soup.select('pre:nth-of-type(2)')[0].text

    # new way
    message['content'] = parse_xmessage(str(soup))


# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
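    """Return the value of an MHonArc X-comment such as X-Subject or X-Date.

    MHonArc embeds header data in HTML comments, e.g.
        <!--X-Subject: Re: <nettime> some subject -->
    Returns None when no matching comment is found.
    """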
    com = soup.find(text=re.compile(xcom))
    if com is not None:
        # the comment text looks like "X-Subject: some value" (possibly still wrapped
        # in "<!-- -->"); extract only the value after the label
        m = re.search(re.escape(xcom) + r'\s*:\s*(.*?)\s*(?:-->)?\s*$', str(com), re.DOTALL)
        if m is not None:
            return m.group(1)
        return str(com).strip()
    return com


# (edit 21.12.2019): this is the new way as of 2018 -- since there is no more moderation on Nettime...
def parse_xmessage(html):
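    """Extract the message body from a page's raw HTML.

    The body sits between MHonArc's X-Body-of-Message comment markers; the matched
    fragment is re-parsed and returned as plain text (or None if the markers are
    missing).
    """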
    rr = r'<!--X-Body-of-Message-->.*?<!--X-Body-of-Message-End-->'
    s = re.search(rr, html, re.DOTALL)
    if s is None:
        # no body markers on this page
        return None
    se = BeautifulSoup(s.group(), "html5lib")
    return se.get_text()
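

# Minimal usage sketch (not part of the original module): how collect_from_url()
# might be driven. The index URL and sublist name below are placeholders /
# hypothetical; adjust them, and the logging setup, to your own setup.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    ARCHIVE_INDEX_URL = "https://example.org/archives/"  # hypothetical MHonArc index page
    SUBLIST = "nettime-l"                                # hypothetical sublist entry name

    collect_from_url(ARCHIVE_INDEX_URL, "nettime", SUBLIST,
                     base_archive_dir="archives", mbox=False)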