listserv and www

gauthiier 2017-07-25 11:30:04 +02:00
parent cca498d887
commit 064a05b806
12 changed files with 469 additions and 86 deletions

View File

@@ -1,11 +1,15 @@
from urllib.parse import urlparse
import lists.pipermail as pipermail
+ import lists.listserv as listserv
DELAY = 0.2
def crawl(url, name, archive_dir):
u = urlparse(url)
# the following type 'tests' are very weak...
# how to test if a list is pipermail / listserv / mhonarc?
if 'pipermail' in u.path:
# if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')
if name is None:
@@ -14,8 +18,10 @@ def crawl(url, name, archive_dir):
pipermail.collect_from_url(url, name, archive_dir)
+ elif 'cgi-bin' in u.path:
+     listserv.collect_from_url(url, name, archive_dir)
else:
print('mhonarc?')
return
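The comment above concedes these path checks are weak. One sturdier option (a sketch under assumptions, not part of this commit; detect_archive_engine is a hypothetical helper) is to fetch the index page once and look for each engine's signature in the HTML it generates:

import re, urllib.request

def detect_archive_engine(url):
    # each archiver tends to leave its own name somewhere in the generated page
    html = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
    for marker, engine in (('Pipermail', 'pipermail'), ('LISTSERV', 'listserv'), ('MHonArc', 'mhonarc')):
        if re.search(marker, html, re.IGNORECASE):
            return engine
    return None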

lists/listserv.py (new file, 149 lines)
View File

@@ -0,0 +1,149 @@
import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip, difflib
from bs4 import BeautifulSoup
DELAY = 0.2
def collect_from_url(url, name, base_archive_dir):
response = urllib.request.urlopen(url)
#html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
lists = []
for t in threads_list:
thread_label = t.text.strip()
thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
lists.append((thread_label, thread_url))
# create (main) directory
# this is where all temp files will be created
d = os.path.join(base_archive_dir, name)
if not os.path.exists(d):
os.makedirs(d)
threads = []
nbr_threads = str(len(lists))
n = 0
for l in lists: ### change this
n += 1
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
try:
threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
except KeyboardInterrupt:
sys.exit(0)
except:
logging.warning("Error archiving: " + l[1] + "... Continuing.")
ex_t, ex, tb = sys.exc_info()
print(ex_t)
traceback.print_tb(tb)
del tb
continue
def collect_threads_from_url(url, name, base_arch_dir):
threads = {'name' : name, 'url' : url, 'threads' : []}
logging.info("Collecting threads of: " + name)
arch_name = name.replace(' ', '_')
# check if archive already exists
file_path = os.path.join(base_arch_dir, arch_name + '.json')
if os.path.isfile(file_path):
logging.info("archive " + name + " already exists. loading from file " + file_path)
with open(file_path, 'r') as fin:
try:
threads = json.load(fin)
return threads
except:
logging.info("can't open archive " + file_path + "... rearchiving.")
response = urllib.request.urlopen(url)
#html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
lists = []
for tr in table:
if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
lists.append(tr)
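# LISTSERV renders the thread index as nested <table class="tableframe"> elements;
# message rows carry class 'normalgroup' or 'emphasizedgroup', which is what the
# filter above keys on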
# the thread structure here is flat -- re: non-hierarchical, unlike pipermail
# hence the thread parsing algorithm will also be flat -- re: a single loop
nbr_msgs = str(len(lists))
n = 0
last_message = None
for tr in lists:
n += 1
logging.info(" > " + str(n) + "/" + nbr_msgs)
td = tr.find_all('td')
thread_a = td[0].select("p span a")[0]
thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
thread_title = thread_a.text.strip()
try:
    message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}
    collect_message(thread_url, message)
    # group near-identical subjects as follow-ups of the previous top-level message,
    # instead of also repeating them at the top level
    if last_message and similar(last_message['subject'], message['subject']):
        if u'follow-up' not in last_message:
            last_message[u'follow-up'] = []
        print(message['subject'] + " - follows - " + last_message['subject'])
        last_message[u'follow-up'].append(message)
    else:
        threads['threads'].append(message)
        last_message = message
except KeyboardInterrupt:
sys.exit(0)
except:
ex_t, ex, tb = sys.exc_info()
print(ex_t)
traceback.print_tb(tb)
del tb
continue
time.sleep(DELAY)
logging.info("writing archive to file " + file_path)
with open(file_path, 'w') as fp:
json.dump(threads, fp, indent=4)
logging.info("done.")
def collect_message(url, message):
response = urllib.request.urlopen(url)
#html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
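# LISTSERV message page layout, as consumed below: tr[0] of the 4th 'tableframe'
# table holds the header rows (0: subject, 1: from, 3: date, 4: content-type),
# tr[1] holds the message body in a <pre>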
header = tr[0].find_all('tbody')[0].find_all('tr', recursive=False)
message['subject'] = header[0].select("p a")[0].text.strip()
message['from'] = header[1].select("p")[1].text.replace("<[log in to unmask]>", "").strip()
message['author_name'] = message['from']
message['date'] = header[3].select("p")[1].text.strip()
message['content-type'] = header[4].select("p")[1].text.strip()
message['content'] = tr[1].find_all('pre')[0].text
return message
def similar(str_a, str_b):
r = difflib.SequenceMatcher(None, str_a, str_b).ratio()
return r > 0.75
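The 0.75 cutoff in similar() is a heuristic. A quick way to eyeball it (a sketch, not part of the commit; subjects invented) is to print the ratio for typical subject pairs:

import difflib

pairs = [("Re: net art archives", "net art archives"),
         ("net art archives", "admin: list maintenance")]
for a, b in pairs:
    print(a, "|", b, "->", difflib.SequenceMatcher(None, a, b).ratio())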

lists/mhonarc.py
View File

@@ -1,26 +1,14 @@
- import urllib2, urllib, urlparse
- import logging
+ import urllib.request, urllib.parse
+ import logging, os, sys, traceback, re, time, json, gzip
from bs4 import BeautifulSoup
- import email, email.parser
- from email.mime.text import MIMEText
- import mailbox
- import time, dateutil, string
- from pprint import pprint as pp
- import sys, os, re, json, gzip
- import traceback
DELAY = 0.2
- # hack for the mailbox module (re: force mbox.add() encoding to utf8)
- reload(sys)
- sys.setdefaultencoding('utf8')
def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
- response = urllib2.urlopen(url)
- html = response.read()
- soup = BeautifulSoup(html, "html.parser")
+ response = urllib.request.urlopen(url)
+ html = response.read().decode(encoding="utf-8")
+ soup = BeautifulSoup(html, "html5lib")
# base url
base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -68,9 +56,9 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
def collect_threads_from_url(url, base_arch_dir, mbox):
- response = urllib2.urlopen(url)
- html = response.read()
- soup = BeautifulSoup(html, "html.parser")
+ response = urllib.request.urlopen(url)
+ html = response.read().decode(encoding="utf-8")
+ soup = BeautifulSoup(html, "html5lib")
# base url
base_url = url
@@ -105,8 +93,6 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
threads['threads'].append(thread)
except:
ex_type, ex, tb = sys.exc_info()
- print ex_type
- print ex
traceback.print_tb(tb)
del tb
continue
@@ -119,33 +105,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
with open(file_path, 'w') as fp:
json.dump(threads, fp, indent=4)
- if mbox:
-     mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
-     mbox_path_gz = mbox_path + ".gz"
-     logging.info("writing mbox " + mbox_path)
-     if not os.path.isfile(mbox_path):
-         box = mailbox.mbox(mbox_path)
-         box.lock()
-         try:
-             for t in threads['threads']:
-                 write_mbox_message(t, box)
-             box.flush()
-         except:
-             ex_type, ex, tb = sys.exc_info()
-             print ex_type
-             print ex
-             traceback.print_tb(tb)
-             del tb
-         finally:
-             box.unlock()
-         with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:
-             fpout.writelines(fpin)
-     else:
-         logging.info("mbox " + mbox_path + " already exists.")
- logging.info("done. ")
logging.info("done. ")
return threads
@@ -183,11 +143,9 @@ def archive_thread(li, base_url, parent_thread_data):
def collect_message(url, message):
- print url
- response = urllib2.urlopen(url)
- html = response.read()
- soup = BeautifulSoup(html, "html.parser")
+ response = urllib.request.urlopen(url)
+ html = response.read().decode(encoding="utf-8")
+ soup = BeautifulSoup(html, "html5lib")
# note: this should follow an RFC header standard -- MHonArc has header info in the first <pre>
@@ -208,9 +166,9 @@ def collect_message(url, message):
for i in info:
if i.em == None:
continue
- field = i.em.string
- if field.lower() in message_labels:
-     message[field.lower()] = i.text.strip(field + ": ")
+ field = i.em.string
+ if field.lower() in message_labels:
+     message[field.lower()] = i.text.strip(field + ": ")
## reformat from -- [author_name, email_addr]
@@ -219,7 +177,12 @@ def collect_message(url, message):
# message['from'] = from_addr[1]
## -- content --
- message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+ # some pages put the body in the first <pre>, others in the second
+ c1 = soup.select('pre:nth-of-type(1)')
+ if len(c1) > 0:
+     message['content'] = c1[0].text
+ else:
+     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -229,22 +192,5 @@ def parse_xcomment(soup, xcom):
return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
return com
- def to_mbox_message(msg):
-     mime = MIMEText('', 'plain', _charset='utf8')
-     mime['From'] = msg['from']
-     mime['Subject'] = msg['subject']
-     mime['Message-Id'] = msg['message-id']
-     mime['Date'] = msg['date']
-     mime.set_payload(msg['content'], charset='utf8')
-     mbox_message = mailbox.mboxMessage(mime)
-     mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
-     return mbox_message
- # throws exception
- def write_mbox_message(msg, mbox):
-     mbox_msg = to_mbox_message(msg)
-     mbox.add(mbox_msg) # here
-     if u'follow-up' in msg:
-         for f in msg['follow-up']:
-             write_mbox_message(f, mbox)
def test_xcomment(soup):
return soup.find(text=re.compile('X-Message-Id')) is not None
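test_xcomment() relies on MHonArc embedding message metadata as HTML comments; the markup shape below is inferred from parse_xcomment's stripping logic, so treat it as illustrative:

import re
from bs4 import BeautifulSoup

html = '<html><body><!-- X-Message-Id: 12345@example.org --><pre>body</pre></body></html>'
soup = BeautifulSoup(html, 'html5lib')
print(soup.find(text=re.compile('X-Message-Id')) is not None)  # True: bs4 text search also visits comments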

lists/pipermail.py
View File

@@ -1,6 +1,7 @@
import urllib.request, urllib.parse
- import logging, os, sys, traceback, time, json, gzip
+ import logging, os, sys, traceback, re, time, json, gzip, difflib
from bs4 import BeautifulSoup
import lists.mhonarc
DELAY = 0.2
@@ -34,11 +35,20 @@ def collect_from_url(url, name, base_archive_dir):
for l in lists: ### change this
n += 1
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
- threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+ try:
+     threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+ except KeyboardInterrupt:
+     sys.exit(0)
+ except:
+     logging.warning("Error archiving: " + l[1] + "... Continuing.")
+     ex_t, ex, tb = sys.exc_info()
+     print(ex_t)
+     traceback.print_tb(tb)
+     del tb
+     continue
def collect_threads_from_url(url, name, base_arch_dir):
threads = {'name' : name, 'url' : url, 'threads' : []}
logging.info("Collecting threads of: " + name)
@@ -56,6 +66,7 @@ def collect_threads_from_url(url, name, base_arch_dir):
except:
logging.info("can't open archive " + file_path + "... rearchiving.")
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
soup = BeautifulSoup(html, "html5lib")
@@ -63,6 +74,8 @@ def collect_threads_from_url(url, name, base_arch_dir):
ul = soup.find_all('ul')[1]
lists = ul.find_all('li', recursive=False)
+ is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
#lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
nbr_msgs = str(len(lists))
n = 0
@@ -70,7 +83,11 @@ def collect_threads_from_url(url, name, base_arch_dir):
n += 1
logging.info(" > " + str(n) + "/" + nbr_msgs)
try:
- thread = archive_thread(li, url.replace('thread.html', ''), None)
+ if is_mhonarc_hybrid:
+     logging.info("Mhonarc detected, switching to mhonarc parsing...")
+     thread = archive_thread_hybrid_mhonarc(li, url.replace('thread.html', ''), None)
+ else:
+     thread = archive_thread(li, url.replace('thread.html', ''), None)
threads['threads'].append(thread)
except KeyboardInterrupt:
sys.exit(0)
@@ -97,15 +114,17 @@ def archive_thread(li, base_url, parent_thread_data):
thread_a = li.select('a:nth-of-type(1)')[0]
url = (base_url + "/") if not base_url.endswith('/') else base_url
thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
- thread_id = li.select('a:nth-of-type(2)')[0].get("name")
thread_title = thread_a.text.strip()
+ # this may not always be there...
+ # ex. http://lists.cofa.unsw.edu.au/pipermail/empyre/2007-September/thread.html
+ thread_id = li.select('a:nth-of-type(2)')[0].get("name")
thread_author_name = li.select('i')[0].text.strip()
message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
collect_message(thread_url, message)
ul = li.find_all('ul')
if len(ul) == 0:
if parent_thread_data is None:
@@ -132,6 +151,45 @@ def archive_thread(li, base_url, parent_thread_data):
parent_thread_data[u'follow-up'].append(message)
return message
+ def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
+     thread_a = li.select('a:nth-of-type(1)')[0]
+     url = (base_url + "/") if not base_url.endswith('/') else base_url
+     thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+     thread_title = thread_a.text.strip()
+     thread_id = thread_a.get("name")
+     thread_author_name = 'n/a'
+     message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+     lists.mhonarc.collect_message(thread_url, message)
+     ul = li.find_all('ul')
+     if len(ul) == 0:
+         if parent_thread_data is None:
+             return message
+         if u'follow-up' not in parent_thread_data:
+             parent_thread_data[u'follow-up'] = []
+         parent_thread_data[u'follow-up'].append(message)
+         return message
+     follow = ul[0].find_all('li', recursive=False)
+     if len(follow) > 0:
+         for f in follow:
+             follow_a = f.select('a')
+             if len(follow_a) > 0:
+                 archive_thread_hybrid_mhonarc(f, base_url, message)
+     if parent_thread_data is None:
+         return message
+     if u'follow-up' not in parent_thread_data:
+         parent_thread_data[u'follow-up'] = []
+     parent_thread_data[u'follow-up'].append(message)
+     return message
def collect_message(url, message):
# logging.info(" + " + url)
@@ -140,6 +198,10 @@ def collect_message(url, message):
html = response.read().decode(encoding="utf-8")
soup = BeautifulSoup(html, "html5lib")
+ if lists.mhonarc.test_xcomment(soup):
+     logging.info("Mhonarc detected, switching to mhonarc parsing...")
+     lists.mhonarc.collect_message(url, message)
+     return message  # don't fall through to the pipermail header parsing below
#message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')
message['subject'] = soup.select('h1:nth-of-type(1)')[0].text.strip()

www-serve.py (new file, 2 lines)
View File

@@ -0,0 +1,2 @@
from www import app
app.run(debug=True)
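# note: this runs Flask's built-in development server, which listens on
# http://127.0.0.1:5000 by default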

www/__init__.py (new file, 10 lines)
View File

@@ -0,0 +1,10 @@
from flask import Flask
app = Flask(__name__)
from www import routes
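# routes must be imported after 'app' is created above: www/routes.py does
# 'from www import app', so a top-of-file import would be circular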
import logging
logging.basicConfig(level=logging.DEBUG)
# from www import archives

www/archives.py (new file, 63 lines)
View File

@@ -0,0 +1,63 @@
import logging, os, json
class Singleton(type):
_instances = {}
def __call__(cls, *args, **kwargs):
if cls not in cls._instances:
cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
return cls._instances[cls]
class Archives(metaclass=Singleton):
def __init__(self, archives_dir=None):
if archives_dir is None:
self.archives_dir = "archives/"
else:
self.archives_dir = archives_dir
self.loaded = False
def load(self):
if self.loaded:
return
if not os.path.isdir(self.archives_dir):
logging.error("Archives:: the path - " + self.archives_dir + " - is not a valid directory. Aborting.")
return
arch = [d for d in os.listdir(self.archives_dir) if os.path.isdir(os.path.join(self.archives_dir, d))]
self.data = {}
for a in arch:
logging.info("loading " + a)
archive_path = os.path.join(self.archives_dir, a)
self.data[a] = self.load_archive(archive_path)
logging.info("done.")
def load_archive(self, archive_dir):
if not os.path.isdir(archive_dir):
logging.error("Archives:: the path - " + archive_dir + " - is not a valid directory. Aborting.")
return {}  # empty archive instead of None, so callers can still iterate
files = [f for f in os.listdir(archive_dir) if f.endswith('.json')]
arch = {}
for f in files:
file_path = os.path.join(archive_dir, f)
with open(file_path) as fdata:
arch[f.replace('.json', '')] = json.load(fdata)
return arch
arch = Archives()
arch.load()
archives_data = arch.data
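A quick sanity check of the Singleton metaclass above (a sketch, not part of the commit): every Archives() call after the first returns the cached instance, and later constructor arguments are ignored.

a = Archives()
b = Archives("some/other/dir")  # ignored: __call__ returns the cached instance
assert a is b and b.archives_dir == "archives/"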

www/routes.py (new file, 91 lines)
View File

@@ -0,0 +1,91 @@
from flask import render_template
from www import app
from www import archives
from datetime import datetime
@app.route('/')
def index():
k = archives.archives_data.keys()
return render_template("index.html", archives=k)
def get_key(kv_tuple):
k = kv_tuple[0]
# k is of the form "Month_Year" - ex.: "January_2001"
try:
return datetime.strptime(k, "%B_%Y")
except Exception:
pass
# k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
try:
return datetime.strptime(k, "%b_%y")
except Exception:
pass
# k is of the form "Year" - ex.: "2001"
try:
return datetime.strptime(k, "%Y")
except Exception:
pass
# unparseable keys sort as oldest; returning None here would make sorted()
# raise a TypeError in Python 3 when comparing None with datetime
return datetime.min
@app.route('/<list>')
def get_list(list):
if list in archives.archives_data:
d = []
for k, v in sorted(archives.archives_data[list].items(), key=get_key, reverse=True):
d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
return render_template("list.html", list_name=list, list=d)
else:
return 'nee nee'
@app.route('/<list>/<sublist>')
def get_sublist(list, sublist):
sublist = sublist.replace(' ', '_')
if list in archives.archives_data and sublist in archives.archives_data[list]:
return render_template("threads.html", sublist_name=sublist, threads=archives.archives_data[list][sublist]['threads'])
else:
return 'na na'
@app.route('/<list>/<sublist>/<int:index>')
def get_message(list, sublist, index):
sublist = sublist.replace(' ', '_')
index = int(index)
if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
return render_template("message.html", message=archives.archives_data[list][sublist]['threads'][index])
else:
    return 'non non'
@app.route('/<list>/<sublist>/<int:index>/<path:follow_ups>')
def get_follow_ups(list, sublist, index, follow_ups):
sublist = sublist.replace(' ', '_')
index = int(index)
ups = follow_ups.split('/')
follow = []
for u in ups:
follow.append(int(u))
if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
message = archives.archives_data[list][sublist]['threads'][index]
for f in follow:
message = message['follow-up'][f]
return render_template("message.html", message=message)
else:
    return 'nope nope'
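get_key() exists so pipermail's mixed month-directory names still sort chronologically rather than alphabetically. A sketch of the effect (keys invented; get_key as defined above):

keys = ["January_2001", "Dec_00", "1999"]
print(sorted(keys, key=lambda k: get_key((k, None)), reverse=True))
# -> ['January_2001', 'Dec_00', '1999'], i.e. 2001-01 > 2000-12 > 1999-01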

www/templates/index.html (new file, 8 lines)
View File

@@ -0,0 +1,8 @@
<html>
<head></head>
<body>
{% for a in archives %}
<a href="/{{ a }}"><h3>{{ a }}</h3></a>
{% endfor %}
</body>
</html>

www/templates/list.html (new file, 10 lines)
View File

@@ -0,0 +1,10 @@
<html>
<head></head>
<body>
<ul>
{% for t in list %}
<li><a href="{{ list_name }}/{{ t.name }}"><h3>{{ t.name }} -- {{ t.nbr_threads }}</h3></a></li>
{% endfor %}
</ul>
</body>
</html>

www/templates/message.html (new file, 11 lines)
View File

@@ -0,0 +1,11 @@
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<h3>{{ message.subject }}</h3>
<h4>{{ message.author_name }}</h4>
<h4>{{ message.date }}</h4>
<p>{{ message.content }} </p>
</body>
</html>

www/templates/threads.html (new file, 25 lines)
View File

@@ -0,0 +1,25 @@
<html>
<head></head>
<body>
{% macro message(m, index, urlpath)-%}
{% set path = urlpath + '/' + index|string %}
<li>
{{ index }}. <a href="{{ path }}">{{ m.subject }}</a> <i>{{ m.author_name }}</i>
{% if m.get('follow-up') %}
<ul>
{% for msg in m.get('follow-up') %}
{{ message(m=msg, index=loop.index - 1, urlpath=path) }}
{% endfor %}
</ul>
{% endif %}
</li>
{%- endmacro %}
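{# each recursion level appends the message's zero-based index to the URL, so a
   nested follow-up links to e.g. /<list>/<sublist>/3/0/1, which get_follow_ups()
   in www/routes.py walks back down #}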
<ul>
{% for m in threads recursive %}
{{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
{% endfor %}
</ul>
</body>
</html>