crawling
parent a6fa141bbd
commit a8cfaee935
@@ -30,6 +30,9 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
        sublist_name = name
        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
 
+    elif "oldboys" in name:
+        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
+
    else:
        print('mhonarc?')
 
@@ -1,16 +1,19 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
+import lists.util
 
 
 DELAY = 0.2
 
 def collect_from_url(url, name, base_archive_dir):
 
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
     lists = []
@@ -74,10 +77,12 @@ def collect_threads_from_url(url, name, base_arch_dir):
        logging.info("can't open archive " + file_path + "... rearchiving.")
 
 
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
     lists = []
@@ -135,10 +140,12 @@ def collect_threads_from_url(url, name, base_arch_dir):
 
 def collect_message(url, message):
 
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
 
@@ -1,14 +1,17 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from bs4 import BeautifulSoup
+import lists.util
 
 DELAY = 0.2
 
 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -71,10 +74,12 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
 
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     # base url
     base_url = url
 
@@ -2,17 +2,23 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from datetime import datetime
 from bs4 import BeautifulSoup
+import lists.util
 
 DELAY = 0.2
 
 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     # base url
-    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    try:
+        base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    except:
+        base_url = url
 
     logging.debug(base_url)
 
@@ -27,9 +33,10 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
        os.makedirs(d)
 
    threads = []
-    lists = soup.select('ul:nth-of-type(2) li')
+    # lists = soup.select('ul:nth-of-type(2) li')
+    li = soup.select('ul li')
 
-    for l in lists:
+    for l in li:
 
        if l.strong is None:
            continue
@@ -75,15 +82,18 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
        return None
 
 def new_name(n):
-    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    # ex: nettime-bold_Mar_99
+    dt = datetime.strptime(n[-6:], '%b_%y')
     return dt.strftime('%B_%Y')
 
 
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     logging.debug("collecting: " + url)
 
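The new_name() change above drops the hard-coded 'nettime-l_' prefix and parses the month/year from the last six characters of the archive name, so sublists such as nettime-bold_Mar_99 resolve as well. A quick standalone check of the new behaviour (the sample names are illustrative only, not part of the commit):

    from datetime import datetime

    def new_name(n):
        # ex: nettime-bold_Mar_99
        dt = datetime.strptime(n[-6:], '%b_%y')
        return dt.strftime('%B_%Y')

    print(new_name('nettime-bold_Mar_99'))  # -> March_1999
    print(new_name('nettime-l_Jan_02'))     # -> January_2002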
@@ -111,12 +121,12 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
-        lists = soup.select('ul:nth-of-type(1) > li')
+        li = soup.select('ul:nth-of-type(1) > li')
 
-        nbr_threads = str(len(lists))
+        nbr_threads = str(len(li))
        n = 0
 
-        for l in lists:
+        for l in li:
            n += 1
            logging.info("> " + str(n) + " / " + nbr_threads)
 
@@ -181,10 +191,12 @@ def collect_message(url, message):
 
     logging.debug("collecting message: " + url)
 
-    response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
-    # html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read().decode(encoding="utf-8")
+    # # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
 
@@ -214,7 +226,7 @@ def collect_message(url, message):
     # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
     # new way
-    message['content'] = parse_xmessage(html)
+    message['content'] = parse_xmessage(str(soup))
 
     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -2,15 +2,18 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
 import lists.mhonarc
+import lists.util
 
 DELAY = 0.2
 
 def collect_from_url(url, name, base_archive_dir):
 
-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
+
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
     threads_list = soup.find_all('tr')
     lists = []
@@ -67,16 +70,17 @@ def collect_threads_from_url(url, name, base_arch_dir):
    except:
        logging.info("can't open archive " + file_path + "... rearchiving.")
 
+    soup = lists.util.request(url)
 
-    response = urllib.request.urlopen(url)
+    # response = urllib.request.urlopen(url)
 
-    html = response.read()
-    try:
-        html = html.decode(encoding="utf-8")
-    except:
-        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+    # html = response.read()
+    # try:
+    #     html = html.decode(encoding="utf-8")
+    # except:
+    #     logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
 
-    soup = BeautifulSoup(html, "html5lib")
+    # soup = BeautifulSoup(html, "html5lib")
 
    ul = soup.find_all('ul')[1];
    lists = ul.find_all('li', recursive=False)
@@ -201,10 +205,12 @@ def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
 def collect_message(url, message):
     # logging.info(" + " + url)
 
-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     if lists.mhonarc.test_xcomment(soup):
         logging.info("Mhonarc detected, switching to mhonarc parsing...")
13  lists/util.py  Normal file
@@ -0,0 +1,13 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip, difflib
+from bs4 import BeautifulSoup
+
+def request(url):
+    response = urllib.request.urlopen(url)
+    html = response.read()
+    try:
+        html = html.decode(encoding="utf-8")
+    except:
+        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+    soup = BeautifulSoup(html, "html5lib")
+    return soup
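The new lists/util.py gathers the fetch-and-parse boilerplate behind a single request() helper: it downloads the page, attempts a UTF-8 decode (logging a warning and keeping the raw bytes if that fails), and returns the html5lib-parsed BeautifulSoup tree. A minimal usage sketch of how the crawler modules now call it (the URL below is a placeholder, not taken from the commit):

    import lists.util

    # one call replaces the per-module urlopen / decode / BeautifulSoup block
    soup = lists.util.request("https://example.org/archives/index.html")
    for li in soup.find_all('li'):
        print(li.get_text(strip=True))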