listserv and www

parent cca498d887
commit 064a05b806
@@ -1,11 +1,15 @@
 from urllib.parse import urlparse
 import lists.pipermail as pipermail
+import lists.listserv as listserv

 DELAY = 0.2

 def crawl(url, name, archive_dir):
     u = urlparse(url)

+    # the following type 'tests' are very weak...
+    # how to test is list is pipermail / listserv / mhonarc?

     if 'pipermail' in u.path:
         # if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')
         if name is None:
@@ -14,8 +18,10 @@ def crawl(url, name, archive_dir):
         pipermail.collect_from_url(url, name, archive_dir)

+    elif 'cgi-bin' in u.path:
+        listserv.collect_from_url(url, name, archive_dir)

     else:
         print('mhonarc?')

     return
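The dispatcher above chooses a scraper from weak, path-based hints. A minimal standalone sketch of that heuristic (the helper name and example URLs are illustrative only, not part of the commit):

from urllib.parse import urlparse

def guess_backend(url):
    # mirrors the checks in crawl(): pipermail archives usually live under
    # /pipermail/, LISTSERV web archives under /cgi-bin/, and anything else
    # falls through to the mhonarc case
    path = urlparse(url).path
    if 'pipermail' in path:
        return 'pipermail'
    if 'cgi-bin' in path:
        return 'listserv'
    return 'mhonarc'

print(guess_backend('http://lists.example.org/pipermail/somelist/'))            # pipermail
print(guess_backend('http://listserv.example.org/cgi-bin/wa.exe?A0=SOMELIST'))  # listserv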
149  lists/listserv.py  (new file)
@@ -0,0 +1,149 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip, difflib
+from bs4 import BeautifulSoup
+
+
+DELAY = 0.2
+
+
+def collect_from_url(url, name, base_archive_dir):
+
+    response = urllib.request.urlopen(url)
+    #html = response.read().decode(encoding="utf-8")
+    html = response.read()
+    soup = BeautifulSoup(html, "html5lib")
+
+    threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
+    lists = []
+    for t in threads_list:
+        thread_label = t.text.strip()
+        thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
+        lists.append((thread_label, thread_url))
+
+    # create (main) directory
+    # this is where all temp files will be created
+    d = os.path.join(base_archive_dir, name)
+    if not os.path.exists(d):
+        os.makedirs(d)
+
+    threads = []
+    nbr_threads = str(len(lists))
+    n = 0
+    for l in lists: ### change this
+        n += 1
+        logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+        try:
+            threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            logging.warning("Error archiving: " + l[1] + "... Continuing.")
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue
+
+
+def collect_threads_from_url(url, name, base_arch_dir):
+
+    threads = {'name' : name, 'url' : url, 'threads' : []}
+
+    logging.info("Collecting threads of: " + name)
+
+    arch_name = name.replace(' ', '_')
+
+    # check if archive already exists
+    file_path = os.path.join(base_arch_dir, arch_name + '.json')
+    if os.path.isfile(file_path):
+        logging.info("archive " + name + " already exists. loading from file " + file_path)
+        with open(file_path, 'r') as fin:
+            try:
+                threads = json.load(fin)
+                return threads
+            except:
+                logging.info("can't open archive " + file_path + "... rearchiving.")
+
+    response = urllib.request.urlopen(url)
+    #html = response.read().decode(encoding="utf-8")
+    html = response.read()
+    soup = BeautifulSoup(html, "html5lib")
+
+    table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
+    lists = []
+    for tr in table:
+        if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
+            lists.append(tr)
+
+    # the thread structure here is flat -- re: non-hierarchical, unlike pipermail
+    # hence the thread parsing algorithm will also be flat -- re: a single loop
+
+    nbr_msgs = str(len(lists))
+    n = 0
+    last_message = None
+    for tr in lists:
+        n += 1
+        logging.info(" > " + str(n) + "/" + nbr_msgs)
+        td = tr.find_all('td')
+        thread_a = td[0].select("p span a")[0]
+        thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+        thread_title = thread_a.text.strip()
+
+        try:
+
+            message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}
+
+            threads['threads'].append(collect_message(thread_url, message))
+
+            if last_message and similar(last_message['subject'], message['subject']):
+                if u'follow-up' not in last_message:
+                    last_message[u'follow-up'] = []
+                print(message['subject'] + " - follows - " + last_message['subject'])
+                last_message[u'follow-up'].append(message)
+
+            else:
+                last_message = message
+
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue
+
+        time.sleep(DELAY)
+
+    logging.info("writing archive to file " + file_path)
+
+    with open(file_path, 'w') as fp:
+        json.dump(threads, fp, indent=4)
+
+    logging.info("done.")
+
+
+def collect_message(url, message):
+
+    response = urllib.request.urlopen(url)
+    #html = response.read().decode(encoding="utf-8")
+    html = response.read()
+    soup = BeautifulSoup(html, "html5lib")
+
+    tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
+
+    header = tr[0].find_all('tbody')[0].find_all('tr', recursive=False)
+    message['subject'] = header[0].select("p a")[0].text.strip()
+    message['from'] = header[1].select("p")[1].text.replace("<[log in to unmask]>", "").strip()
+    message['author_name'] = message['from']
+    message['date'] = header[3].select("p")[1].text.strip()
+    message['content-type'] = header[4].select("p")[1].text.strip()
+
+    message['content'] = tr[1].find_all('pre')[0].text
+
+    return message
+
+
+def similar(str_a, str_b):
+    r = difflib.SequenceMatcher(None, str_a, str_b).ratio()
+    return r > 0.75
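The follow-up grouping in collect_threads_from_url() relies on similar(), a fuzzy match on subject lines with a 0.75 ratio threshold. A self-contained sketch of that heuristic (the subject strings are made up):

import difflib

def similar(str_a, str_b, threshold=0.75):
    # same measure as listserv.similar(): SequenceMatcher ratio on the raw strings
    return difflib.SequenceMatcher(None, str_a, str_b).ratio() > threshold

print(similar("Call for works: digital art 2014",
              "Re: Call for works: digital art 2014"))   # True, grouped as a follow-up
print(similar("Call for works: digital art 2014",
              "Administrivia: list moderation"))         # False, treated as a new thread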
100  lists/mhonarc.py
@@ -1,26 +1,14 @@
-import urllib2, urllib, urlparse
-import logging
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip
 from bs4 import BeautifulSoup
-import email, email.parser
-from email.mime.text import MIMEText
-import mailbox
-import time, dateutil, string
-from pprint import pprint as pp
-import sys, os, re, json, gzip
-import traceback

 DELAY = 0.2

-# hack for the mailbox module (re: force mbox.add() encoding to utf8)
-reload(sys)
-sys.setdefaultencoding('utf8')


 def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):

-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")

     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -68,9 +56,9 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):

 def collect_threads_from_url(url, base_arch_dir, mbox):

-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")

     # base url
     base_url = url
@@ -105,8 +93,6 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
             threads['threads'].append(thread)
         except:
             ex_type, ex, tb = sys.exc_info()
-            print ex_type
-            print ex
             traceback.print_tb(tb)
             del tb
             continue
@@ -119,33 +105,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
     with open(file_path, 'w') as fp:
         json.dump(threads, fp, indent=4)

-    if mbox:
-        mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
-        mbox_path_gz = mbox_path + ".gz"
-        logging.info("writing mbox " + mbox_path)
-        if not os.path.isfile(mbox_path):
-            box = mailbox.mbox(mbox_path)
-            box.lock()
-            try:
-                for t in threads['threads']:
-                    write_mbox_message(t, box)
-                box.flush()
-            except:
-                ex_type, ex, tb = sys.exc_info()
-                print ex_type
-                print ex
-                traceback.print_tb(tb)
-                del tb
-            finally:
-                box.unlock()
-
-            with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:
-                fpout.writelines(fpin)
-
-        else:
-            logging.info("mbox " + mbox_path + " already exists.")
-
     logging.info("done. ")

     return threads
@@ -183,11 +143,9 @@ def archive_thread(li, base_url, parent_thread_data):

 def collect_message(url, message):

-    print url
-
-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")

     #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>

@@ -208,9 +166,9 @@ def collect_message(url, message):
     for i in info:
         if i.em == None:
             continue
         field = i.em.string
         if field.lower() in message_labels:
             message[field.lower()] = i.text.strip(field + ": ")

     ## reformat from -- [author_name, email_addr]

@@ -219,7 +177,12 @@ def collect_message(url, message):
     # message['from'] = from_addr[1]

     ## -- content --
-    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+    # test
+    c1 = soup.select('pre:nth-of-type(1)')
+    if len(c1) > 0:
+        message['content'] = c1[0].text
+    else:
+        message['content'] = soup.select('pre:nth-of-type(2)')[0].text

     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -229,22 +192,5 @@ def parse_xcomment(soup, xcom):
         return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
     return com

-def to_mbox_message(msg):
-    mime = MIMEText('', 'plain', _charset='utf8')
-    mime['From'] = msg['from']
-    mime['Subject'] = msg['subject']
-    mime['Message-Id'] = msg['message-id']
-    mime['Date'] = msg['date']
-    mime.set_payload(msg['content'], charset='utf8')
-    mbox_message = mailbox.mboxMessage(mime)
-    mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
-    return mbox_message
-
-# throws exception
-def write_mbox_message(msg, mbox):
-    mbox_msg = to_mbox_message(msg)
-    mbox.add(mbox_msg) # here
-    if u'follow-up' in msg:
-        for f in msg['follow-up']:
-            write_mbox_message(f, mbox)
+def test_xcomment(soup):
+    return soup.find(text=re.compile('X-Message-Id')) is not None
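The new test_xcomment() helper exploits the fact that MHonArc pages embed message metadata as HTML comments such as <!--X-Message-Id: ... -->. A standalone sketch of the same check, with an invented HTML snippet:

import re
from bs4 import BeautifulSoup

html = '<html><body><!--X-Message-Id: 1234@example.org--><pre>hello</pre></body></html>'
soup = BeautifulSoup(html, 'html5lib')

# comments are navigable strings in BeautifulSoup, so a text search finds them
print(soup.find(text=re.compile('X-Message-Id')) is not None)   # True, parse as MHonArc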
@@ -1,6 +1,7 @@
 import urllib.request, urllib.parse
-import logging, os, sys, traceback, time, json, gzip
+import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
+import lists.mhonarc

 DELAY = 0.2

@@ -34,10 +35,19 @@ def collect_from_url(url, name, base_archive_dir):
     for l in lists: ### change this
         n += 1
         logging.info("## " + str(n) + " / " + nbr_threads + " ##")
-        threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+        try:
+            threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            logging.warning("Error archiving: " + l[1] + "... Continuing.")
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue

 def collect_threads_from_url(url, name, base_arch_dir):

     threads = {'name' : name, 'url' : url, 'threads' : []}

@@ -56,6 +66,7 @@ def collect_threads_from_url(url, name, base_arch_dir):
             except:
                 logging.info("can't open archive " + file_path + "... rearchiving.")
+
     response = urllib.request.urlopen(url)
     html = response.read().decode(encoding="utf-8")
     soup = BeautifulSoup(html, "html5lib")
@@ -63,6 +74,8 @@ def collect_threads_from_url(url, name, base_arch_dir):
     ul = soup.find_all('ul')[1];
     lists = ul.find_all('li', recursive=False)

+    is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
+
     #lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
     nbr_msgs = str(len(lists))
     n = 0
@@ -70,7 +83,11 @@ def collect_threads_from_url(url, name, base_arch_dir):
         n += 1
         logging.info(" > " + str(n) + "/" + nbr_msgs)
         try:
-            thread = archive_thread(li, url.replace('thread.html', ''), None)
+            if is_mhonarc_hybrid:
+                logging.info("Mhonarc detected, switching to mhonarc parsing...")
+                thread = archive_thread_hybrid_mhonarc(li, url.replace('thread.html', ''), None)
+            else:
+                thread = archive_thread(li, url.replace('thread.html', ''), None)
             threads['threads'].append(thread)
         except KeyboardInterrupt:
             sys.exit(0)
@@ -96,15 +113,17 @@ def archive_thread(li, base_url, parent_thread_data):
     thread_a = li.select('a:nth-of-type(1)')[0]
     url = (base_url + "/") if not base_url.endswith('/') else base_url
     thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
-    thread_id = li.select('a:nth-of-type(2)')[0].get("name")
     thread_title = thread_a.text.strip()

+    # this may not always be there...
+    # ex. http://lists.cofa.unsw.edu.au/pipermail/empyre/2007-September/thread.html
+    thread_id = li.select('a:nth-of-type(2)')[0].get("name")
     thread_author_name = li.select('i')[0].text.strip()

     message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}

     collect_message(thread_url, message)


     ul = li.find_all('ul');
     if len(ul) == 0:
@@ -132,6 +151,45 @@ def archive_thread(li, base_url, parent_thread_data):
     parent_thread_data[u'follow-up'].append(message)
     return message

+def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
+
+    thread_a = li.select('a:nth-of-type(1)')[0]
+    url = (base_url + "/") if not base_url.endswith('/') else base_url
+    thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+    thread_title = thread_a.text.strip()
+
+    thread_id = thread_a.get("name")
+    thread_author_name = 'n/a'
+
+    message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+    lists.mhonarc.collect_message(thread_url, message)
+
+    ul = li.find_all('ul');
+    if len(ul) == 0:
+        if parent_thread_data is None:
+            return message
+
+        if u'follow-up' not in parent_thread_data:
+            parent_thread_data[u'follow-up'] = []
+        parent_thread_data[u'follow-up'].append(message)
+        return message
+
+
+    follow = ul[0].find_all('li', recursive=False)
+    if len(follow) > 0:
+        for f in follow:
+            follow_a = f.select('a')
+            if len(follow_a) > 0:
+                archive_thread_hybrid_mhonarc(f, base_url, message)
+
+    if parent_thread_data is None:
+        return message
+
+    if u'follow-up' not in parent_thread_data:
+        parent_thread_data[u'follow-up'] = []
+    parent_thread_data[u'follow-up'].append(message)
+    return message
+

 def collect_message(url, message):
     # logging.info(" + " + url)
@@ -140,6 +198,10 @@ def collect_message(url, message):
     html = response.read().decode(encoding="utf-8")
     soup = BeautifulSoup(html, "html5lib")

+    if lists.mhonarc.test_xcomment(soup):
+        logging.info("Mhonarc detected, switching to mhonarc parsing...")
+        lists.mhonarc.collect_message(url, message)
+
     #message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

     message['subject'] = soup.select('h1:nth-of-type(1)')[0].text.strip()
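archive_thread() and the new archive_thread_hybrid_mhonarc() both walk pipermail's nested thread index recursively: each <li> is a message and a nested <ul> holds its follow-ups. A toy illustration of that traversal (the HTML is invented; only the recursion pattern matches the code above):

from bs4 import BeautifulSoup

html = """
<ul>
  <li><a href="0001.html">Hello world</a>
    <ul>
      <li><a href="0002.html">Re: Hello world</a></li>
    </ul>
  </li>
</ul>
"""

def walk(li, depth=0):
    # one message per <li>; recurse into its direct <ul> children for follow-ups
    print('  ' * depth + li.select('a')[0].text)
    for ul in li.find_all('ul', recursive=False):
        for child in ul.find_all('li', recursive=False):
            walk(child, depth + 1)

soup = BeautifulSoup(html, 'html5lib')
for li in soup.ul.find_all('li', recursive=False):
    walk(li)
# prints "Hello world" and, indented below it, "Re: Hello world"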
2  www-serve.py  (new file)
@@ -0,0 +1,2 @@
+from www import app
+app.run(debug=True)
10  www/__init__.py  (new file)
@@ -0,0 +1,10 @@
+from flask import Flask
+
+app = Flask(__name__)
+
+from www import routes
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+# from www import archives
63  www/archives.py  (new file)
@@ -0,0 +1,63 @@
+import logging, os, json
+
+class Singleton(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+class Archives(metaclass=Singleton):
+
+    def __init__(self, archives_dir=None):
+        if archives_dir==None:
+            self.archives_dir = "archives/"
+        else:
+            self.archives_dir = archives_dir
+
+        self.loaded = False
+
+    def load(self):
+
+        if self.loaded:
+            return
+
+        if not os.path.isdir(self.archives_dir):
+            logging.error("Archives:: the path - " + self.archives_dir + " - is not a valid directory. Aborting.")
+            return
+
+        arch = [d for d in os.listdir(self.archives_dir) if os.path.isdir(os.path.join(self.archives_dir, d))]
+
+        self.data = {}
+        for a in arch:
+
+            logging.info("loading " + a)
+
+            archive_path = os.path.join(self.archives_dir, a)
+            self.data[a] = self.load_archive(archive_path)
+
+        logging.info("done.")
+
+
+    def load_archive(self, archive_dir):
+
+        if not os.path.isdir(archive_dir):
+            logging.error("Archives:: the path - " + archive_dir + " - is not a valid directory. Aborting.")
+            return
+
+        files = [f for f in os.listdir(archive_dir) if f.endswith('.json')]
+
+        arch = {}
+        for f in files:
+            file_path = os.path.join(archive_dir, f)
+            with open(file_path) as fdata:
+                arch[f.replace('.json', '')] = json.load(fdata)
+
+        return arch
+
+arch = Archives()
+arch.load()
+archives_data = arch.data
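A quick sketch of how this module is meant to be consumed (it mirrors www/routes.py and assumes an archives/ tree of <list>/<sublist>.json files produced by the scrapers):

from www import archives            # importing triggers Archives().load()
from www.archives import Archives

print(list(archives.archives_data.keys()))   # top-level archive (list) names

# Singleton metaclass: constructing Archives again returns the same instance,
# so the JSON files are only parsed once per process
assert Archives() is archives.arch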
91  www/routes.py  (new file)
@@ -0,0 +1,91 @@
+from flask import render_template
+from www import app
+from www import archives
+from datetime import datetime
+
+@app.route('/')
+def index():
+    k = archives.archives_data.keys()
+    return render_template("index.html", archives=k)
+
+def get_key(kv_tuple):
+
+    k = kv_tuple[0]
+
+    # k is of the form "Month_Year" - ex.: "January_2001"
+    try:
+        return datetime.strptime(k, "%B_%Y")
+    except Exception:
+        pass
+
+    # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
+    try:
+        return datetime.strptime(k, "%b_%y")
+    except Exception:
+        pass
+
+    # k is of the form "Year" - ex.: "2001"
+    try:
+        return datetime.strptime(k, "%Y")
+    except Exception:
+        pass
+
+    return None
+
+@app.route('/<list>')
+def get_list(list):
+    if list in archives.archives_data:
+        d = []
+        for k, v in sorted(archives.archives_data[list].items(), key=get_key, reverse=True):
+            d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
+        return render_template("list.html", list_name=list, list=d)
+
+    else:
+        return 'nee nee'
+
+@app.route('/<list>/<sublist>')
+def get_sublist(list, sublist):
+
+    sublist = sublist.replace(' ', '_')
+    if list in archives.archives_data and sublist in archives.archives_data[list]:
+        return render_template("threads.html", sublist_name=sublist, threads=archives.archives_data[list][sublist]['threads'])
+    else:
+        return 'na na'
+
+@app.route('/<list>/<sublist>/<int:index>')
+def get_message(list, sublist, index):
+
+    sublist = sublist.replace(' ', '_')
+    index = int(index)
+    if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
+        return render_template("message.html", message=archives.archives_data[list][sublist]['threads'][index])
+    else:
+        'non non'
+
+@app.route('/<list>/<sublist>/<int:index>/<path:follow_ups>')
+def get_follow_ups(list, sublist, index, follow_ups):
+
+    sublist = sublist.replace(' ', '_')
+    index = int(index)
+
+    ups = follow_ups.split('/')
+    follow = []
+    for u in ups:
+        follow.append(int(u))
+
+    if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
+        message = archives.archives_data[list][sublist]['threads'][index]
+        for f in follow:
+            message = message['follow-up'][f]
+        return render_template("message.html", message=message)
+    else:
+        'nope nope'
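The routes above expose the scraped JSON as /<list>, /<list>/<sublist>, /<list>/<sublist>/<index> and a trailing path of follow-up indices. A smoke test with Flask's built-in test client (the list and sublist names are hypothetical and assume a populated archives/ directory):

from www import app

client = app.test_client()
print(client.get('/').status_code)                     # index page of all archives
print(client.get('/empyre').status_code)               # sublists (months) of one archive
print(client.get('/empyre/May_2014').status_code)      # thread index of a month
print(client.get('/empyre/May_2014/0').status_code)    # first thread's message
print(client.get('/empyre/May_2014/0/1').status_code)  # its second follow-up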
8  www/templates/index.html  (new file)
@@ -0,0 +1,8 @@
+<html>
+<head></head>
+<body>
+{% for a in archives %}
+<a href="/{{ a }}"><h3>{{ a }}</h3></a>
+{% endfor %}
+</body>
+</html>
10  www/templates/list.html  (new file)
@@ -0,0 +1,10 @@
+<html>
+<head></head>
+<body>
+<ul>
+{% for t in list %}
+<li><a href="{{ list_name }}/{{ t.name }}"><h3>{{ t.name }} -- {{ t.nbr_threads }}</h3></a></li>
+{% endfor %}
+</ul>
+</body>
+</html>
11  www/templates/message.html  (new file)
@@ -0,0 +1,11 @@
+<html>
+<head>
+<meta charset="UTF-8">
+</head>
+<body>
+<h3>{{ message.subject }}</h3>
+<h4>{{ message.author_name }}</h4>
+<h4>{{ message.date }}</h4>
+<p>{{ message.content }} </p>
+</body>
+</html>
25  www/templates/threads.html  (new file)
@@ -0,0 +1,25 @@
+<html>
+<head></head>
+<body>
+{% macro message(m, index, urlpath)-%}
+{% set path = urlpath + '/' + index|string %}
+<li>
+{{ index }}. <a href="{{ path }}">{{ m.subject }}</a> <i>{{ m.author_name }}</i>
+{% if m.get('follow-up') %}
+<ul>
+{% for msg in m.get('follow-up') %}
+{{ message(m=msg, index=loop.index - 1, urlpath=path) }}
+{% endfor %}
+</ul>
+{% endif %}
+</li>
+{%- endmacro %}
+
+<ul>
+{% for m in threads recursive %}
+{{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
+{% endfor %}
+</ul>
+
+</body>
+</html>
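threads.html renders the nested follow-up structure with a macro that calls itself. A stripped-down standalone version of the same idea, rendered with Jinja2 directly (the thread data is made up, and the index/urlpath arguments are omitted for brevity):

from jinja2 import Template

tpl = Template("""
{% macro message(m) -%}
<li>{{ m.subject }}
{% if m.get('follow-up') %}<ul>{% for f in m.get('follow-up') %}{{ message(f) }}{% endfor %}</ul>{% endif %}
</li>
{%- endmacro %}
<ul>{% for m in threads %}{{ message(m) }}{% endfor %}</ul>
""")

threads = [{'subject': 'Hello', 'follow-up': [{'subject': 'Re: Hello'}]}]
print(tpl.render(threads=threads))   # nested <ul>/<li> markup for the toy thread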