listserv and www

gauthiier 2017-07-25 11:30:04 +02:00
parent cca498d887
commit 064a05b806
12 changed files with 469 additions and 86 deletions

View File

@@ -1,11 +1,15 @@
from urllib.parse import urlparse
import lists.pipermail as pipermail
+ import lists.listserv as listserv
DELAY = 0.2
def crawl(url, name, archive_dir):
u = urlparse(url)
# the following type 'tests' are very weak...
# how to test if a list is pipermail / listserv / mhonarc?
if 'pipermail' in u.path:
# if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')
if name is None:
@@ -14,8 +18,10 @@ def crawl(url, name, archive_dir):
pipermail.collect_from_url(url, name, archive_dir)
+ elif 'cgi-bin' in u.path:
+     listserv.collect_from_url(url, name, archive_dir)
else:
print('mhonarc?')
return
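The comment above concedes these path checks are weak. One sturdier option (a sketch under assumptions, not part of this commit; detect_archive_engine is a hypothetical helper) is to fetch the index page once and look for each engine's signature in the HTML it generates:

import re, urllib.request

def detect_archive_engine(url):
    # each archiver tends to leave its own name somewhere in the generated page
    html = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
    for marker, engine in (('Pipermail', 'pipermail'), ('LISTSERV', 'listserv'), ('MHonArc', 'mhonarc')):
        if re.search(marker, html, re.IGNORECASE):
            return engine
    return None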

lists/listserv.py (new file, 149 lines)
View File

@@ -0,0 +1,149 @@
import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip, difflib
from bs4 import BeautifulSoup
DELAY = 0.2
def collect_from_url(url, name, base_archive_dir):
response = urllib.request.urlopen(url)
#html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
lists = []
for t in threads_list:
thread_label = t.text.strip()
thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
lists.append((thread_label, thread_url))
# create (main) directory
# this is where all temp files will be created
d = os.path.join(base_archive_dir, name)
if not os.path.exists(d):
os.makedirs(d)
threads = []
nbr_threads = str(len(lists))
n = 0
for l in lists: ### change this
n += 1
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
try:
threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
except KeyboardInterrupt:
sys.exit(0)
except:
logging.warning("Error archiving: " + l[1] + "... Continuing.")
ex_t, ex, tb = sys.exc_info()
print(ex_t)
traceback.print_tb(tb)
del tb
continue
def collect_threads_from_url(url, name, base_arch_dir):
threads = {'name' : name, 'url' : url, 'threads' : []}
logging.info("Collecting threads of: " + name)
arch_name = name.replace(' ', '_')
# check if archive already exists
file_path = os.path.join(base_arch_dir, arch_name + '.json')
if os.path.isfile(file_path):
logging.info("archive " + name + " already exists. loading from file " + file_path)
with open(file_path, 'r') as fin:
try:
threads = json.load(fin)
return threads
except:
logging.info("can't open archive " + file_path + "... rearchiving.")
response = urllib.request.urlopen(url)
#html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
lists = []
for tr in table:
if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
lists.append(tr)
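# LISTSERV renders the thread index as nested <table class="tableframe"> elements;
# message rows carry class 'normalgroup' or 'emphasizedgroup', which is what the
# filter above keys on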
# the thread structure here is flat -- re: non-hierarchical, unlike pipermail
# hence the thread parsing algorithm will also be flat -- re: a single loop
nbr_msgs = str(len(lists))
n = 0
last_message = None
for tr in lists:
n += 1
logging.info(" > " + str(n) + "/" + nbr_msgs)
td = tr.find_all('td')
thread_a = td[0].select("p span a")[0]
thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
thread_title = thread_a.text.strip()
try:
    message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}
    collect_message(thread_url, message)
    # group near-identical subjects as follow-ups of the previous top-level message,
    # instead of also repeating them at the top level
    if last_message and similar(last_message['subject'], message['subject']):
        if u'follow-up' not in last_message:
            last_message[u'follow-up'] = []
        print(message['subject'] + " - follows - " + last_message['subject'])
        last_message[u'follow-up'].append(message)
    else:
        threads['threads'].append(message)
        last_message = message
except KeyboardInterrupt:
sys.exit(0)
except:
ex_t, ex, tb = sys.exc_info()
print(ex_t)
traceback.print_tb(tb)
del tb
continue
time.sleep(DELAY)
logging.info("writing archive to file " + file_path)
with open(file_path, 'w') as fp:
json.dump(threads, fp, indent=4)
logging.info("done.")
def collect_message(url, message):
response = urllib.request.urlopen(url)
#html = response.read().decode(encoding="utf-8")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
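# LISTSERV message page layout, as consumed below: tr[0] of the 4th 'tableframe'
# table holds the header rows (0: subject, 1: from, 3: date, 4: content-type),
# tr[1] holds the message body in a <pre>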
header = tr[0].find_all('tbody')[0].find_all('tr', recursive=False)
message['subject'] = header[0].select("p a")[0].text.strip()
message['from'] = header[1].select("p")[1].text.replace("<[log in to unmask]>", "").strip()
message['author_name'] = message['from']
message['date'] = header[3].select("p")[1].text.strip()
message['content-type'] = header[4].select("p")[1].text.strip()
message['content'] = tr[1].find_all('pre')[0].text
return message
def similar(str_a, str_b):
r = difflib.SequenceMatcher(None, str_a, str_b).ratio()
return r > 0.75
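The 0.75 cutoff in similar() is a heuristic. A quick way to eyeball it (a sketch, not part of the commit; subjects invented) is to print the ratio for typical subject pairs:

import difflib

pairs = [("Re: net art archives", "net art archives"),
         ("net art archives", "admin: list maintenance")]
for a, b in pairs:
    print(a, "|", b, "->", difflib.SequenceMatcher(None, a, b).ratio())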

lists/mhonarc.py
View File

@@ -1,26 +1,14 @@
- import urllib2, urllib, urlparse
- import logging
+ import urllib.request, urllib.parse
+ import logging, os, sys, traceback, re, time, json, gzip
from bs4 import BeautifulSoup
- import email, email.parser
- from email.mime.text import MIMEText
- import mailbox
- import time, dateutil, string
- from pprint import pprint as pp
- import sys, os, re, json, gzip
- import traceback
DELAY = 0.2
- # hack for the mailbox module (re: force mbox.add() encoding to utf8)
- reload(sys)
- sys.setdefaultencoding('utf8')
def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
- response = urllib2.urlopen(url)
- html = response.read()
- soup = BeautifulSoup(html, "html.parser")
+ response = urllib.request.urlopen(url)
+ html = response.read().decode(encoding="utf-8")
+ soup = BeautifulSoup(html, "html5lib")
# base url
base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -68,9 +56,9 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
def collect_threads_from_url(url, base_arch_dir, mbox):
- response = urllib2.urlopen(url)
- html = response.read()
- soup = BeautifulSoup(html, "html.parser")
+ response = urllib.request.urlopen(url)
+ html = response.read().decode(encoding="utf-8")
+ soup = BeautifulSoup(html, "html5lib")
# base url
base_url = url
@@ -105,8 +93,6 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
threads['threads'].append(thread)
except:
ex_type, ex, tb = sys.exc_info()
- print ex_type
- print ex
traceback.print_tb(tb)
del tb
continue
@@ -119,33 +105,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
with open(file_path, 'w') as fp:
json.dump(threads, fp, indent=4)
- if mbox:
-     mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
-     mbox_path_gz = mbox_path + ".gz"
-     logging.info("writing mbox " + mbox_path)
-     if not os.path.isfile(mbox_path):
-         box = mailbox.mbox(mbox_path)
-         box.lock()
-         try:
-             for t in threads['threads']:
-                 write_mbox_message(t, box)
-             box.flush()
-         except:
-             ex_type, ex, tb = sys.exc_info()
-             print ex_type
-             print ex
-             traceback.print_tb(tb)
-             del tb
-         finally:
-             box.unlock()
-         with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:
-             fpout.writelines(fpin)
-     else:
-         logging.info("mbox " + mbox_path + " already exists.")
- logging.info("done. ")
logging.info("done. ")
return threads
@@ -183,11 +143,9 @@ def archive_thread(li, base_url, parent_thread_data):
def collect_message(url, message):
- print url
- response = urllib2.urlopen(url)
- html = response.read()
- soup = BeautifulSoup(html, "html.parser")
+ response = urllib.request.urlopen(url)
+ html = response.read().decode(encoding="utf-8")
+ soup = BeautifulSoup(html, "html5lib")
# note: this should follow an RFC header standard -- MHonArc has header info in the first <pre>
@@ -208,9 +166,9 @@ def collect_message(url, message):
for i in info:
if i.em == None:
continue
- field = i.em.string
- if field.lower() in message_labels:
-     message[field.lower()] = i.text.strip(field + ": ")
+ field = i.em.string
+ if field.lower() in message_labels:
+     message[field.lower()] = i.text.strip(field + ": ")
## reformat from -- [author_name, email_addr]
@@ -219,7 +177,12 @@ def collect_message(url, message):
# message['from'] = from_addr[1]
## -- content --
- message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+ # some pages put the body in the first <pre>, others in the second
+ c1 = soup.select('pre:nth-of-type(1)')
+ if len(c1) > 0:
+     message['content'] = c1[0].text
+ else:
+     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -229,22 +192,5 @@ def parse_xcomment(soup, xcom):
return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
return com
- def to_mbox_message(msg):
-     mime = MIMEText('', 'plain', _charset='utf8')
-     mime['From'] = msg['from']
-     mime['Subject'] = msg['subject']
-     mime['Message-Id'] = msg['message-id']
-     mime['Date'] = msg['date']
-     mime.set_payload(msg['content'], charset='utf8')
-     mbox_message = mailbox.mboxMessage(mime)
-     mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
-     return mbox_message
- # throws exception
- def write_mbox_message(msg, mbox):
-     mbox_msg = to_mbox_message(msg)
-     mbox.add(mbox_msg) # here
-     if u'follow-up' in msg:
-         for f in msg['follow-up']:
-             write_mbox_message(f, mbox)
def test_xcomment(soup):
return soup.find(text=re.compile('X-Message-Id')) is not None
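test_xcomment() relies on MHonArc embedding message metadata as HTML comments; the markup shape below is inferred from parse_xcomment's stripping logic, so treat it as illustrative:

import re
from bs4 import BeautifulSoup

html = '<html><body><!-- X-Message-Id: 12345@example.org --><pre>body</pre></body></html>'
soup = BeautifulSoup(html, 'html5lib')
print(soup.find(text=re.compile('X-Message-Id')) is not None)  # True: bs4 text search also visits comments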

lists/pipermail.py
View File

@@ -1,6 +1,7 @@
import urllib.request, urllib.parse
- import logging, os, sys, traceback, time, json, gzip
+ import logging, os, sys, traceback, re, time, json, gzip, difflib
from bs4 import BeautifulSoup
import lists.mhonarc
DELAY = 0.2
@@ -34,11 +35,20 @@ def collect_from_url(url, name, base_archive_dir):
for l in lists: ### change this
n += 1
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
- threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+ try:
+     threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+ except KeyboardInterrupt:
+     sys.exit(0)
+ except:
+     logging.warning("Error archiving: " + l[1] + "... Continuing.")
+     ex_t, ex, tb = sys.exc_info()
+     print(ex_t)
+     traceback.print_tb(tb)
+     del tb
+     continue
def collect_threads_from_url(url, name, base_arch_dir):
threads = {'name' : name, 'url' : url, 'threads' : []}
logging.info("Collecting threads of: " + name)
@@ -56,6 +66,7 @@ def collect_threads_from_url(url, name, base_arch_dir):
except:
logging.info("can't open archive " + file_path + "... rearchiving.")
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
soup = BeautifulSoup(html, "html5lib")
@@ -63,6 +74,8 @@ def collect_threads_from_url(url, name, base_arch_dir):
ul = soup.find_all('ul')[1]
lists = ul.find_all('li', recursive=False)
+ is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
#lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
nbr_msgs = str(len(lists))
n = 0
@@ -70,7 +83,11 @@ def collect_threads_from_url(url, name, base_arch_dir):
n += 1
logging.info(" > " + str(n) + "/" + nbr_msgs)
try:
- thread = archive_thread(li, url.replace('thread.html', ''), None)
+ if is_mhonarc_hybrid:
+     logging.info("Mhonarc detected, switching to mhonarc parsing...")
+     thread = archive_thread_hybrid_mhonarc(li, url.replace('thread.html', ''), None)
+ else:
+     thread = archive_thread(li, url.replace('thread.html', ''), None)
threads['threads'].append(thread)
except KeyboardInterrupt:
sys.exit(0)
@@ -97,15 +114,17 @@ def archive_thread(li, base_url, parent_thread_data):
thread_a = li.select('a:nth-of-type(1)')[0]
url = (base_url + "/") if not base_url.endswith('/') else base_url
thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
- thread_id = li.select('a:nth-of-type(2)')[0].get("name")
thread_title = thread_a.text.strip()
+ # this may not always be there...
+ # ex. http://lists.cofa.unsw.edu.au/pipermail/empyre/2007-September/thread.html
+ thread_id = li.select('a:nth-of-type(2)')[0].get("name")
thread_author_name = li.select('i')[0].text.strip()
message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
collect_message(thread_url, message)
ul = li.find_all('ul')
if len(ul) == 0:
if parent_thread_data is None:
@@ -132,6 +151,45 @@ def archive_thread(li, base_url, parent_thread_data):
parent_thread_data[u'follow-up'].append(message)
return message
+ def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
+     thread_a = li.select('a:nth-of-type(1)')[0]
+     url = (base_url + "/") if not base_url.endswith('/') else base_url
+     thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+     thread_title = thread_a.text.strip()
+     thread_id = thread_a.get("name")
+     thread_author_name = 'n/a'
+     message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+     lists.mhonarc.collect_message(thread_url, message)
+     ul = li.find_all('ul')
+     if len(ul) == 0:
+         if parent_thread_data is None:
+             return message
+         if u'follow-up' not in parent_thread_data:
+             parent_thread_data[u'follow-up'] = []
+         parent_thread_data[u'follow-up'].append(message)
+         return message
+     follow = ul[0].find_all('li', recursive=False)
+     if len(follow) > 0:
+         for f in follow:
+             follow_a = f.select('a')
+             if len(follow_a) > 0:
+                 archive_thread_hybrid_mhonarc(f, base_url, message)
+     if parent_thread_data is None:
+         return message
+     if u'follow-up' not in parent_thread_data:
+         parent_thread_data[u'follow-up'] = []
+     parent_thread_data[u'follow-up'].append(message)
+     return message
def collect_message(url, message):
# logging.info(" + " + url)
@@ -140,6 +198,10 @@ def collect_message(url, message):
html = response.read().decode(encoding="utf-8")
soup = BeautifulSoup(html, "html5lib")
+ if lists.mhonarc.test_xcomment(soup):
+     logging.info("Mhonarc detected, switching to mhonarc parsing...")
+     lists.mhonarc.collect_message(url, message)
+     return message  # don't fall through to the pipermail header parsing below
#message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')
message['subject'] = soup.select('h1:nth-of-type(1)')[0].text.strip()

www-serve.py (new file, 2 lines)
View File

@@ -0,0 +1,2 @@
from www import app
app.run(debug=True)
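# note: this runs Flask's built-in development server, which listens on
# http://127.0.0.1:5000 by default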

www/__init__.py (new file, 10 lines)
View File

@@ -0,0 +1,10 @@
from flask import Flask
app = Flask(__name__)
from www import routes
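# routes must be imported after 'app' is created above: www/routes.py does
# 'from www import app', so a top-of-file import would be circular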
import logging
logging.basicConfig(level=logging.DEBUG)
# from www import archives

www/archives.py (new file, 63 lines)
View File

@@ -0,0 +1,63 @@
import logging, os, json
class Singleton(type):
_instances = {}
def __call__(cls, *args, **kwargs):
if cls not in cls._instances:
cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
return cls._instances[cls]
class Archives(metaclass=Singleton):
def __init__(self, archives_dir=None):
if archives_dir is None:
self.archives_dir = "archives/"
else:
self.archives_dir = archives_dir
self.loaded = False
def load(self):
if self.loaded:
return
if not os.path.isdir(self.archives_dir):
logging.error("Archives:: the path - " + self.archives_dir + " - is not a valid directory. Aborting.")
return
arch = [d for d in os.listdir(self.archives_dir) if os.path.isdir(os.path.join(self.archives_dir, d))]
self.data = {}
for a in arch:
logging.info("loading " + a)
archive_path = os.path.join(self.archives_dir, a)
self.data[a] = self.load_archive(archive_path)
logging.info("done.")
def load_archive(self, archive_dir):
if not os.path.isdir(archive_dir):
logging.error("Archives:: the path - " + archive_dir + " - is not a valid directory. Aborting.")
return {}  # empty archive instead of None, so callers can still iterate
files = [f for f in os.listdir(archive_dir) if f.endswith('.json')]
arch = {}
for f in files:
file_path = os.path.join(archive_dir, f)
with open(file_path) as fdata:
arch[f.replace('.json', '')] = json.load(fdata)
return arch
arch = Archives()
arch.load()
archives_data = arch.data
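A quick sanity check of the Singleton metaclass above (a sketch, not part of the commit): every Archives() call after the first returns the cached instance, and later constructor arguments are ignored.

a = Archives()
b = Archives("some/other/dir")  # ignored: __call__ returns the cached instance
assert a is b and b.archives_dir == "archives/"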

www/routes.py (new file, 91 lines)
View File

@@ -0,0 +1,91 @@
from flask import render_template
from www import app
from www import archives
from datetime import datetime
@app.route('/')
def index():
k = archives.archives_data.keys()
return render_template("index.html", archives=k)
def get_key(kv_tuple):
k = kv_tuple[0]
# k is of the form "Month_Year" - ex.: "January_2001"
try:
return datetime.strptime(k, "%B_%Y")
except Exception:
pass
# k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
try:
return datetime.strptime(k, "%b_%y")
except Exception:
pass
# k is of the form "Year" - ex.: "2001"
try:
return datetime.strptime(k, "%Y")
except Exception:
pass
# unparseable keys sort as oldest; returning None here would make sorted()
# raise a TypeError in Python 3 when comparing None with datetime
return datetime.min
@app.route('/<list>')
def get_list(list):
if list in archives.archives_data:
d = []
for k, v in sorted(archives.archives_data[list].items(), key=get_key, reverse=True):
d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
return render_template("list.html", list_name=list, list=d)
else:
return 'nee nee'
@app.route('/<list>/<sublist>')
def get_sublist(list, sublist):
sublist = sublist.replace(' ', '_')
if list in archives.archives_data and sublist in archives.archives_data[list]:
return render_template("threads.html", sublist_name=sublist, threads=archives.archives_data[list][sublist]['threads'])
else:
return 'na na'
@app.route('/<list>/<sublist>/<int:index>')
def get_message(list, sublist, index):
sublist = sublist.replace(' ', '_')
index = int(index)
if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
return render_template("message.html", message=archives.archives_data[list][sublist]['threads'][index])
else:
    return 'non non'
@app.route('/<list>/<sublist>/<int:index>/<path:follow_ups>')
def get_follow_ups(list, sublist, index, follow_ups):
sublist = sublist.replace(' ', '_')
index = int(index)
ups = follow_ups.split('/')
follow = []
for u in ups:
follow.append(int(u))
if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
message = archives.archives_data[list][sublist]['threads'][index]
for f in follow:
message = message['follow-up'][f]
return render_template("message.html", message=message)
else:
    return 'nope nope'
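get_key() exists so pipermail's mixed month-directory names still sort chronologically rather than alphabetically. A sketch of the effect (keys invented; get_key as defined above):

keys = ["January_2001", "Dec_00", "1999"]
print(sorted(keys, key=lambda k: get_key((k, None)), reverse=True))
# -> ['January_2001', 'Dec_00', '1999'], i.e. 2001-01 > 2000-12 > 1999-01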

www/templates/index.html (new file, 8 lines)
View File

@@ -0,0 +1,8 @@
<html>
<head></head>
<body>
{% for a in archives %}
<a href="/{{ a }}"><h3>{{ a }}</h3></a>
{% endfor %}
</body>
</html>

www/templates/list.html (new file, 10 lines)
View File

@@ -0,0 +1,10 @@
<html>
<head></head>
<body>
<ul>
{% for t in list %}
<li><a href="{{ list_name }}/{{ t.name }}"><h3>{{ t.name }} -- {{ t.nbr_threads }}</h3></a></li>
{% endfor %}
</ul>
</body>
</html>

www/templates/message.html (new file, 11 lines)
View File

@@ -0,0 +1,11 @@
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<h3>{{ message.subject }}</h3>
<h4>{{ message.author_name }}</h4>
<h4>{{ message.date }}</h4>
<p>{{ message.content }} </p>
</body>
</html>

www/templates/threads.html (new file, 25 lines)
View File

@@ -0,0 +1,25 @@
<html>
<head></head>
<body>
{% macro message(m, index, urlpath)-%}
{% set path = urlpath + '/' + index|string %}
<li>
{{ index }}. <a href="{{ path }}">{{ m.subject }}</a> <i>{{ m.author_name }}</i>
{% if m.get('follow-up') %}
<ul>
{% for msg in m.get('follow-up') %}
{{ message(m=msg, index=loop.index - 1, urlpath=path) }}
{% endfor %}
</ul>
{% endif %}
</li>
{%- endmacro %}
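{# each recursion level appends the message's zero-based index to the URL, so a
   nested follow-up links to e.g. /<list>/<sublist>/3/0/1, which get_follow_ups()
   in www/routes.py walks back down #}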
<ul>
{% for m in threads recursive %}
{{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
{% endfor %}
</ul>
</body>
</html>