diff --git a/lists/crawl.py b/lists/crawl.py
index d70b382..5bcb38b 100644
--- a/lists/crawl.py
+++ b/lists/crawl.py
@@ -1,11 +1,15 @@
 from urllib.parse import urlparse
 import lists.pipermail as pipermail
+import lists.listserv as listserv
 
 DELAY = 0.2
 
 def crawl(url, name, archive_dir):
 	u = urlparse(url)
 
+	# the following type 'tests' are very weak...
+	# how to test if list is pipermail / listserv / mhonarc?
+
 	if 'pipermail' in u.path:
 		# if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')
 		if name is None:
@@ -14,8 +18,10 @@ def crawl(url, name, archive_dir):
 
 		pipermail.collect_from_url(url, name, archive_dir)
 
+	elif 'cgi-bin' in u.path:
+		listserv.collect_from_url(url, name, archive_dir)
+
 	else:
 		print('mhonarc?')
-		return
\ No newline at end of file
diff --git a/lists/listserv.py b/lists/listserv.py
new file mode 100644
index 0000000..df3230d
--- /dev/null
+++ b/lists/listserv.py
@@ -0,0 +1,149 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip, difflib
+from bs4 import BeautifulSoup
+
+
+DELAY = 0.2
+
+def collect_from_url(url, name, base_archive_dir):
+
+	response = urllib.request.urlopen(url)
+	#html = response.read().decode(encoding="utf-8")
+	html = response.read()
+	soup = BeautifulSoup(html, "html5lib")
+
+	threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
+	lists = []
+	for t in threads_list:
+		thread_label = t.text.strip()
+		thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
+		lists.append((thread_label, thread_url))
+
+	# create (main) directory
+	# this is where all temp files will be created
+	d = os.path.join(base_archive_dir, name)
+	if not os.path.exists(d):
+		os.makedirs(d)
+
+	threads = []
+	nbr_threads = str(len(lists))
+	n = 0
+	for l in lists: ### change this
+		n += 1
+		logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+		try:
+			threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+		except KeyboardInterrupt:
+			sys.exit(0)
+		except:
+			logging.warning("Error archiving: " + l[1] + "... Continuing.")
+			ex_t, ex, tb = sys.exc_info()
+			print(ex_t)
+			traceback.print_tb(tb)
+			del tb
+			continue
+
+def collect_threads_from_url(url, name, base_arch_dir):
+
+	threads = {'name' : name, 'url' : url, 'threads' : []}
+
+	logging.info("Collecting threads of: " + name)
+
+	arch_name = name.replace(' ', '_')
+
+	# check if archive already exists
+	file_path = os.path.join(base_arch_dir, arch_name + '.json')
+	if os.path.isfile(file_path):
+		logging.info("archive " + name + " already exists. loading from file " + file_path)
+		with open(file_path, 'r') as fin:
+			try:
+				threads = json.load(fin)
+				return threads
+			except:
+				logging.info("can't open archive " + file_path + "... rearchiving.")
+
+
+	response = urllib.request.urlopen(url)
+	#html = response.read().decode(encoding="utf-8")
+	html = response.read()
+	soup = BeautifulSoup(html, "html5lib")
+
+	table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
+	lists = []
+	for tr in table:
+		if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
+			lists.append(tr)
+
+	# the thread structure here is flat -- re: non-hierarchical, unlike pipermail
+	# hence the thread parsing algorithm will also be flat -- re: a single loop
+
+	nbr_msgs = str(len(lists))
+	n = 0
+	last_message = None
+	for tr in lists:
+		n += 1
+		logging.info("	> " + str(n) + "/" + nbr_msgs)
+		td = tr.find_all('td')
+		thread_a = td[0].select("p span a")[0]
+		thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+		thread_title = thread_a.text.strip()
+
+		try:
+
+			message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}
+
+			threads['threads'].append(collect_message(thread_url, message))
+
+			if last_message and similar(last_message['subject'], message['subject']):
+				if u'follow-up' not in last_message:
+					last_message[u'follow-up'] = []
+				print(message['subject'] + " - follows - " + last_message['subject'])
+				last_message[u'follow-up'].append(message)
+
+			else:
+				last_message = message
+
+		except KeyboardInterrupt:
+			sys.exit(0)
+		except:
+			ex_t, ex, tb = sys.exc_info()
+			print(ex_t)
+			traceback.print_tb(tb)
+			del tb
+			continue
+
+		time.sleep(DELAY)
+
+	logging.info("writing archive to file " + file_path)
+
+	with open(file_path, 'w') as fp:
+		json.dump(threads, fp, indent=4)
+
+	logging.info("done.")
+
+
+def collect_message(url, message):
+
+	response = urllib.request.urlopen(url)
+	#html = response.read().decode(encoding="utf-8")
+	html = response.read()
+	soup = BeautifulSoup(html, "html5lib")
+
+	tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
+
+	header = tr[0].find_all('tbody')[0].find_all('tr', recursive=False)
+	message['subject'] = header[0].select("p a")[0].text.strip()
+	message['from'] = header[1].select("p")[1].text.replace("<[log in to unmask]>", "").strip()
+	message['author_name'] = message['from']
+	message['date'] = header[3].select("p")[1].text.strip()
+	message['content-type'] = header[4].select("p")[1].text.strip()
+
+	message['content'] = tr[1].find_all('pre')[0].text
+
+	return message
+
+
+def similar(str_a, str_b):
+	r = difflib.SequenceMatcher(None, str_a, str_b).ratio()
+	return r > 0.75
+
diff --git a/lists/mhonarc.py b/lists/mhonarc.py
index 192a0b2..68398df 100644
--- a/lists/mhonarc.py
+++ b/lists/mhonarc.py
@@ -1,26 +1,14 @@
-import urllib2, urllib, urlparse
-import logging
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip
 from bs4 import BeautifulSoup
-import email, email.parser
-from email.mime.text import MIMEText
-import mailbox
-import time, dateutil, string
-from pprint import pprint as pp
-import sys, os, re, json, gzip
-import traceback
 
 DELAY = 0.2
 
-# hack for the mailbox module (re: force mbox.add() encoding to utf8)
-reload(sys)
-sys.setdefaultencoding('utf8')
-
-
 def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
 
-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
 
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -68,9 +56,9 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
 
 def collect_threads_from_url(url, base_arch_dir, mbox):
 
-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
 
     # base url
     base_url = url
@@ -105,8 +93,6 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
             threads['threads'].append(thread)
         except:
             ex_type, ex, tb = sys.exc_info()
-            print ex_type
-            print ex
             traceback.print_tb(tb)
             del tb
             continue
@@ -119,33 +105,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
     with open(file_path, 'w') as fp:
         json.dump(threads, fp, indent=4)
 
-    if mbox:
-        mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
-        mbox_path_gz = mbox_path + ".gz"
-        logging.info("writing mbox " + mbox_path)
-        if not os.path.isfile(mbox_path):
-            box = mailbox.mbox(mbox_path)
-            box.lock()
-            try:
-                for t in threads['threads']:
-                    write_mbox_message(t, box)
-                box.flush()
-            except:
-                ex_type, ex, tb = sys.exc_info()
-                print ex_type
-                print ex
-                traceback.print_tb(tb)
-                del tb
-            finally:
-                box.unlock()
-
-            with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:
-                fpout.writelines(fpin)
-
-        else:
-            logging.info("mbox " + mbox_path + " already exists.")
-
-    logging.info("done. ")
+    logging.info("done. ")
 
     return threads
 
@@ -183,11 +143,9 @@ def archive_thread(li, base_url, parent_thread_data):
 
 def collect_message(url, message):
 
-    print url
-
-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
 
     #note: this should follow an RFC header standard -- MHonArc has header info in the 1th
 
@@ -208,9 +166,9 @@ def collect_message(url, message):
     for i in info:
         if i.em == None:
             continue
-    	field = i.em.string
-    	if field.lower() in message_labels:
-    		message[field.lower()] = i.text.strip(field + ": ")
+        field = i.em.string
+        if field.lower() in message_labels:
+            message[field.lower()] = i.text.strip(field + ": ")
 
     ## reformat from -- [author_name, email_addr]
 
@@ -219,7 +177,12 @@ def collect_message(url, message):
     # message['from'] = from_addr[1]
 
     ## -- content --
-    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+    # some archives put the message body in the first <pre>, others in the second -- try the first, fall back to the second
+    c1 = soup.select('pre:nth-of-type(1)')
+    if len(c1) > 0:
+        message['content'] = c1[0].text
+    else:
+        message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
 # mhonarc xcomments
 # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -229,22 +192,5 @@ def parse_xcomment(soup, xcom):
         return com.strip('').strip(xcom + ":").strip()
     return com
 
-def to_mbox_message(msg):
-    mime = MIMEText('', 'plain', _charset='utf8')
-    mime['From'] = msg['from']
-    mime['Subject'] = msg['subject']
-    mime['Message-Id'] = msg['message-id']
-    mime['Date'] = msg['date']
-    mime.set_payload(msg['content'], charset='utf8')
-    mbox_message = mailbox.mboxMessage(mime)
-    mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
-    return mbox_message
-
-# throws exception
-def write_mbox_message(msg, mbox):
-    mbox_msg = to_mbox_message(msg)
-    mbox.add(mbox_msg) # here
-    if u'follow-up' in msg:
-        for f in msg['follow-up']:
-            write_mbox_message(f, mbox)
-
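+# MHonArc pages typically embed comments like 'X-Message-Id' in their HTML; finding that text is a cheap way to recognize the format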
+def test_xcomment(soup):
+    return soup.find(text=re.compile('X-Message-Id')) is not None
diff --git a/lists/pipermail.py b/lists/pipermail.py
index c139611..dcdf757 100644
--- a/lists/pipermail.py
+++ b/lists/pipermail.py
@@ -1,6 +1,7 @@
 import urllib.request, urllib.parse
-import logging, os, sys, traceback, time, json, gzip
+import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
+import lists.mhonarc
 
 DELAY = 0.2
 
@@ -34,10 +35,19 @@ def collect_from_url(url, name, base_archive_dir):
 	for l in lists: ### change this
 		n += 1
 		logging.info("## " + str(n) + " / " + nbr_threads + " ##")
-		threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
-		
-def collect_threads_from_url(url, name, base_arch_dir):
+		try:
+			threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+		except KeyboardInterrupt:
+			sys.exit(0)					
+		except:
+			logging.warning("Error archiving: " + l[1] + "... Continuing.")
+			ex_t, ex, tb = sys.exc_info()
+			print(ex_t)
+			traceback.print_tb(tb)
+			del tb
+			continue
 
+def collect_threads_from_url(url, name, base_arch_dir):
 
 	threads = {'name' : name, 'url' : url, 'threads' : []}
 	
@@ -56,6 +66,7 @@ def collect_threads_from_url(url, name, base_arch_dir):
 			except:
 				logging.info("can't open archive " + file_path + "... rearchiving.")
 
+
 	response = urllib.request.urlopen(url)
 	html = response.read().decode(encoding="utf-8")
 	soup = BeautifulSoup(html, "html5lib")
@@ -63,6 +74,8 @@ def collect_threads_from_url(url, name, base_arch_dir):
 	ul = soup.find_all('ul')[1];
 	lists = ul.find_all('li', recursive=False)
 
+	is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
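+	# 'MHonArc' appearing in the index page means the per-message pages use MHonArc markup; handled by archive_thread_hybrid_mhonarc below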
+
 	#lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
 	nbr_msgs = str(len(lists))
 	n = 0		
@@ -70,7 +83,11 @@ def collect_threads_from_url(url, name, base_arch_dir):
 		n += 1
 		logging.info("	> " + str(n) + "/" + nbr_msgs)
 		try:
-			thread = archive_thread(li, url.replace('thread.html', ''), None)
+			if is_mhonarc_hybrid:
+				logging.info("Mhonarc detected, switching to mhonarc parsing...")
+				thread = archive_thread_hybrid_mhonarc(li, url.replace('thread.html', ''), None)
+			else:
+				thread = archive_thread(li, url.replace('thread.html', ''), None)
 			threads['threads'].append(thread)
 		except KeyboardInterrupt:
 			sys.exit(0)		
@@ -96,15 +113,17 @@ def archive_thread(li, base_url, parent_thread_data):
 
 	thread_a = li.select('a:nth-of-type(1)')[0]
 	url = (base_url + "/") if not base_url.endswith('/') else base_url
-	thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
-	thread_id = li.select('a:nth-of-type(2)')[0].get("name")
+	thread_url = urllib.parse.urljoin(url, thread_a.get("href"))	
 	thread_title = thread_a.text.strip()
+
+	# this may not always be there... 
+	# ex. http://lists.cofa.unsw.edu.au/pipermail/empyre/2007-September/thread.html
+	thread_id = li.select('a:nth-of-type(2)')[0].get("name")
 	thread_author_name = li.select('i')[0].text.strip()
 
 	message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
 
 	collect_message(thread_url, message)
-
 	
 	ul = li.find_all('ul');
 	if len(ul) == 0:
@@ -132,6 +151,45 @@ def archive_thread(li, base_url, parent_thread_data):
 	parent_thread_data[u'follow-up'].append(message)
 	return message
 
+def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
+
+	thread_a = li.select('a:nth-of-type(1)')[0]
+	url = (base_url + "/") if not base_url.endswith('/') else base_url
+	thread_url = urllib.parse.urljoin(url, thread_a.get("href"))	
+	thread_title = thread_a.text.strip()
+
+	thread_id = thread_a.get("name")
+	thread_author_name = 'n/a'
+
+	message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+	lists.mhonarc.collect_message(thread_url, message)
+	
+	ul = li.find_all('ul');
+	if len(ul) == 0:
+		if parent_thread_data is None:
+			return message
+
+		if u'follow-up' not in parent_thread_data:
+			parent_thread_data[u'follow-up'] = []
+		parent_thread_data[u'follow-up'].append(message)
+		return message
+
+
+	follow = ul[0].find_all('li', recursive=False)	
+	if len(follow) > 0:
+		for f in follow:
+			follow_a = f.select('a')
+			if len(follow_a) > 0:
+				archive_thread_hybrid_mhonarc(f, base_url, message)
+		
+	if parent_thread_data is None:
+		return message
+
+	if u'follow-up' not in parent_thread_data:
+		parent_thread_data[u'follow-up'] = []
+	parent_thread_data[u'follow-up'].append(message)
+	return message	
 
 def collect_message(url, message):
 	# logging.info("	+ " + url)
@@ -140,6 +198,10 @@ def collect_message(url, message):
 	html = response.read().decode(encoding="utf-8")
 	soup = BeautifulSoup(html, "html5lib")
 
+	if lists.mhonarc.test_xcomment(soup):
+		logging.info("Mhonarc detected, switching to mhonarc parsing...")
+		return lists.mhonarc.collect_message(url, message)  # don't fall through to the pipermail selectors below
+
 	#message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')
 
 	message['subject'] = soup.select('h1:nth-of-type(1)')[0].text.strip()
diff --git a/www-serve.py b/www-serve.py
new file mode 100644
index 0000000..df30941
--- /dev/null
+++ b/www-serve.py
@@ -0,0 +1,2 @@
+from www import app
+app.run(debug=True)
diff --git a/www/__init__.py b/www/__init__.py
new file mode 100644
index 0000000..6e525d6
--- /dev/null
+++ b/www/__init__.py
@@ -0,0 +1,10 @@
+from flask import Flask
+
+app = Flask(__name__)
+
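+# routes imports 'app' from this package, so it must be imported after app is created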
+from www import routes
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+# from www import archives
\ No newline at end of file
diff --git a/www/archives.py b/www/archives.py
new file mode 100644
index 0000000..c516a0e
--- /dev/null
+++ b/www/archives.py
@@ -0,0 +1,63 @@
+import logging, os, json
+
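+# metaclass-based singleton: every Archives() call returns the same shared instance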
+class Singleton(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+class Archives(metaclass=Singleton):
+
+	def __init__(self, archives_dir=None):
+		if archives_dir is None:
+			self.archives_dir = "archives/"
+		else:
+			self.archives_dir = archives_dir
+
+		self.loaded = False
+
+	def load(self):
+
+		if self.loaded:
+			return
+
+		if not os.path.isdir(self.archives_dir):
+			logging.error("Archives:: the path - " + self.archives_dir + " - is not a valid directory. Aborting.")
+			return
+
+		arch = [d for d in os.listdir(self.archives_dir) if os.path.isdir(os.path.join(self.archives_dir, d))]
+
+		self.data = {}
+		for a in arch:
+
+			logging.info("loading " + a)
+
+			archive_path = os.path.join(self.archives_dir, a)
+			self.data[a] = self.load_archive(archive_path)
+
+			logging.info("done.")
+		
+
+	def load_archive(self, archive_dir):
+
+		if not os.path.isdir(archive_dir):
+			logging.error("Archives:: the path - " + archive_dir + " - is not a valid directory. Aborting.")
+			return
+
+		files = [f for f in os.listdir(archive_dir) if f.endswith('.json')]
+
+		arch = {}
+		for f in files:
+			file_path = os.path.join(archive_dir, f)
+			with open(file_path) as fdata:
+				arch[f.replace('.json', '')] = json.load(fdata)
+
+		return arch	
+
+arch = Archives()
+arch.load()
+archives_data = arch.data
+
+
+
diff --git a/www/routes.py b/www/routes.py
new file mode 100644
index 0000000..e9ac177
--- /dev/null
+++ b/www/routes.py
@@ -0,0 +1,91 @@
+from flask import render_template
+from www import app
+from www import archives
+from datetime import datetime
+
+@app.route('/')
+def index():
+	k = archives.archives_data.keys()
+	return render_template("index.html", archives=k)
+
+def get_key(kv_tuple):
+
+	k = kv_tuple[0]
+
+	# k is of the form "Month_Year" - ex.: "January_2001"
+	try:
+		return datetime.strptime(k, "%B_%Y")
+	except Exception:
+		pass
+
+	# k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
+	try:
+		return datetime.strptime(k, "%b_%y")
+	except Exception:
+		pass
+
+	# k is of the form "Year" - ex.: "2001"
+	try:
+		return datetime.strptime(k, "%Y")
+	except Exception:
+		pass
+
+	return None
+
+@app.route('/<list>')
+def get_list(list):
+	if list in archives.archives_data:
+		d = []
+		for k, v in sorted(archives.archives_data[list].items(), key=get_key, reverse=True):
+			d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
+		return render_template("list.html", list_name=list, list=d)
+
+	else:
+		return 'nee nee'
+
+@app.route('/<list>/<sublist>')
+def get_sublist(list, sublist):
+
+	sublist = sublist.replace(' ', '_')
+	if list in archives.archives_data and sublist in archives.archives_data[list]:
+		return render_template("threads.html", sublist_name=sublist, threads=archives.archives_data[list][sublist]['threads'])
+	else:
+		return 'na na'
+
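+# index is the position of the thread in the stored 'threads' list for that sublist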
+@app.route('/<list>/<sublist>/<index>')
+def get_message(list, sublist, index):
+
+	sublist = sublist.replace(' ', '_')
+	index = int(index)
+	if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
+		return render_template("message.html", message=archives.archives_data[list][sublist]['threads'][index])
+	else:
+		return 'non non'
+
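+# follow_ups is a '/'-separated chain of indices that walks down nested 'follow-up' lists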
+@app.route('/<list>/<sublist>/<index>/<path:follow_ups>')
+def get_follow_ups(list, sublist, index, follow_ups):
+
+	sublist = sublist.replace(' ', '_')
+	index = int(index)
+
+	ups = follow_ups.split('/')
+	follow = []
+	for u in ups:
+		follow.append(int(u))
+
+	if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
+		message = archives.archives_data[list][sublist]['threads'][index]
+		for f in follow:
+			message = message['follow-up'][f]
+		return render_template("message.html", message=message)
+	else:
+		return 'nope nope'
+
+
+
+
+
+
+
+
+
diff --git a/www/templates/index.html b/www/templates/index.html
new file mode 100644
index 0000000..1476481
--- /dev/null
+++ b/www/templates/index.html
@@ -0,0 +1,8 @@
+
+
+
+	 {% for a in archives %}
+	 {{ a }}
+ {% endfor %}
+
+
\ No newline at end of file
diff --git a/www/templates/list.html b/www/templates/list.html
new file mode 100644
index 0000000..9a47098
--- /dev/null
+++ b/www/templates/list.html
@@ -0,0 +1,10 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/www/templates/message.html b/www/templates/message.html
new file mode 100644
index 0000000..7125408
--- /dev/null
+++ b/www/templates/message.html
@@ -0,0 +1,11 @@
+
+
+
+
+
+ {{ message.subject }}
+ {{ message.author_name }}
+ {{ message.date }}
+ {{ message.content }}
+
+
\ No newline at end of file
diff --git a/www/templates/threads.html b/www/templates/threads.html
new file mode 100644
index 0000000..050e3cf
--- /dev/null
+++ b/www/templates/threads.html
@@ -0,0 +1,25 @@
+
+
+
+{% macro message(m, index, urlpath)-%}
+{% set path = urlpath + '/' + index|string %}
+
+ {{ index }}. {{ m.subject }} {{ m.author_name }}
+ {% if m.get('follow-up') %}
+
+ {% for msg in m.get('follow-up') %}
+ {{ message(m=msg, index=loop.index - 1, urlpath=path) }}
+ {% endfor %}
+
+ {% endif %}
+
+{%- endmacro %}
+
+
+ {% for m in threads recursive %}
+ {{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
+ {% endfor %}
+
+
+
+
\ No newline at end of file