From e560cb4cdb68218fcd5eb2d977c41c5ed3186a28 Mon Sep 17 00:00:00 2001
From: gauthiier
Date: Sat, 21 Dec 2019 14:35:37 +0100
Subject: [PATCH] updated nettime crawler

---
 archive.py => crawl.py   |  0
 lists/mhonarc_nettime.py | 72 +++++++++++++++++++++-------------------
 2 files changed, 37 insertions(+), 35 deletions(-)
 rename archive.py => crawl.py (100%)

diff --git a/archive.py b/crawl.py
similarity index 100%
rename from archive.py
rename to crawl.py
diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
index 6380c75..5fc08cf 100644
--- a/lists/mhonarc_nettime.py
+++ b/lists/mhonarc_nettime.py
@@ -13,9 +13,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
 
-    #collect name
-    list_name = soup.select('body p:nth-of-type(2) title')[0].string
-    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+    logging.debug(base_url)
+
+    # #collect name
+    # list_name = soup.select('body p:nth-of-type(2) title')[0].string
+    # logging.info("Getting " + list_name + " list archive for " + sublist_name)
 
     # create (main) directory
     # this is where all temp files will be created
@@ -26,20 +28,19 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     threads = []
 
     lists = soup.select('ul:nth-of-type(2) li')
-
     for l in lists:
 
         if l.strong is None:
             continue
 
         name = l.strong.string
-        print(name + " - " + sublist_name)
 
         if name.lower() == sublist_name.lower():
 
+            logging.debug(name)
+
             threads_url_list = []
             threads_links = l.select('ul li a')
-
             for t in threads_links:
                 thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                 threads_url_list.append(thread_url)
@@ -56,7 +57,7 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
                 except KeyboardInterrupt:
                     sys.exit(0)
                 except:
-                    logging.warning("Error archiving: " + l[1] + "... Continuing.")
+                    logging.warning("Error archiving: " + name + "... Continuing.")
                     ex_t, ex, tb = sys.exc_info()
                     print(ex_t)
                     traceback.print_tb(tb)
@@ -78,17 +79,17 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     html = response.read()
     soup = BeautifulSoup(html, "html5lib")
 
+    logging.debug("collecting: " + url)
+
     # base url
     base_url = url
 
     # collect name
-
-    e = soup.select('p:nth-of-type(1) title')
-    print(soup)
-
-    threads_name = soup.select('p:nth-of-type(1) title')[0].string
+    threads_name = soup.select('head title')[0].string
     threads_name = threads_name.replace(' ', '_')
 
+    logging.debug(threads_name)
+
     # thread data struct
     threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
 
@@ -110,14 +111,18 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
         n += 1
         logging.info("> " + str(n) + " / " + nbr_threads)
 
-        try:
-            thread = archive_thread(l, base_url, None)
-            threads['threads'].append(thread)
-        except:
-            ex_type, ex, tb = sys.exc_info()
-            traceback.print_tb(tb)
-            del tb
-            continue
+        thread = archive_thread(l, base_url, None)
+        threads['threads'].append(thread)
+
+
+        # try:
+        #     thread = archive_thread(l, base_url, None)
+        #     threads['threads'].append(thread)
+        # except:
+        #     ex_type, ex, tb = sys.exc_info()
+        #     traceback.print_tb(tb)
+        #     del tb
+        #     continue
 
         time.sleep(DELAY)
 
@@ -165,6 +170,8 @@ def archive_thread(li, base_url, parent_thread_data):
 
 def collect_message(url, message):
 
+    logging.debug("collecting message: " + url)
+
     response = urllib.request.urlopen(url)
     html = response.read().decode(encoding="utf-8")
     # html = response.read()
@@ -193,21 +200,12 @@ def collect_message(url, message):
         if field.lower() in message_labels:
             message[field.lower()] = i.text.strip(field + ": ")
 
-    ## reformat from -- [author_name, email_addr]
-    # from_addr = email.utils.parseaddr(message['from'])
-    # message['author_name'] = from_addr[0]
-    # message['from'] = from_addr[1]
+    # old way
+    # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
-    ## -- content --
-    # test
-    # c1 = soup.select('pre:nth-of-type(1)')
-    # if len(c1) > 0:
-    #     message['content'] = c1[0].text
-    # else:
-    #     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
-
-    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+    # new way
+    message['content'] = parse_xmessage(html)
 
     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -217,5 +215,9 @@ def parse_xcomment(soup, xcom):
             return com.strip('').strip(xcom + ":").strip()
     return com
 
-def test_xcomment(soup):
-    return soup.find(text=re.compile('X-Message-Id')) is not None
+# (edit 21.12.2019): this is the new way as of 2018 -- when no more moderation on Nettime...
+def parse_xmessage(html):
+    rr = r'.*?'
+    s = re.search(rr, html, re.DOTALL)
+    se = BeautifulSoup(s.group(), "html5lib")
+    return se.get_text()