listserv and www

parent cca498d887
commit 064a05b806
@@ -1,11 +1,15 @@
 from urllib.parse import urlparse
 import lists.pipermail as pipermail
+import lists.listserv as listserv

 DELAY = 0.2

 def crawl(url, name, archive_dir):
     u = urlparse(url)

+    # the following type 'tests' are very weak...
+    # how to test is list is pipermail / listserv / mhonarc?

     if 'pipermail' in u.path:
         # if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')
         if name is None:
@@ -14,8 +18,10 @@ def crawl(url, name, archive_dir):
         pipermail.collect_from_url(url, name, archive_dir)

+    elif 'cgi-bin' in u.path:
+        listserv.collect_from_url(url, name, archive_dir)

     else:
         print('mhonarc?')

     return
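The dispatcher above chooses a scraper from weak, path-based hints. A minimal standalone sketch of that heuristic (the helper name and example URLs are illustrative only, not part of the commit):

from urllib.parse import urlparse

def guess_backend(url):
    # mirrors the checks in crawl(): pipermail archives usually live under
    # /pipermail/, LISTSERV web archives under /cgi-bin/, and anything else
    # falls through to the mhonarc case
    path = urlparse(url).path
    if 'pipermail' in path:
        return 'pipermail'
    if 'cgi-bin' in path:
        return 'listserv'
    return 'mhonarc'

print(guess_backend('http://lists.example.org/pipermail/somelist/'))            # pipermail
print(guess_backend('http://listserv.example.org/cgi-bin/wa.exe?A0=SOMELIST'))  # listserv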
149  lists/listserv.py  (new file)
@@ -0,0 +1,149 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip, difflib
+from bs4 import BeautifulSoup
+
+
+DELAY = 0.2
+
+
+def collect_from_url(url, name, base_archive_dir):
+
+    response = urllib.request.urlopen(url)
+    #html = response.read().decode(encoding="utf-8")
+    html = response.read()
+    soup = BeautifulSoup(html, "html5lib")
+
+    threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
+    lists = []
+    for t in threads_list:
+        thread_label = t.text.strip()
+        thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
+        lists.append((thread_label, thread_url))
+
+    # create (main) directory
+    # this is where all temp files will be created
+    d = os.path.join(base_archive_dir, name)
+    if not os.path.exists(d):
+        os.makedirs(d)
+
+    threads = []
+    nbr_threads = str(len(lists))
+    n = 0
+    for l in lists: ### change this
+        n += 1
+        logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+        try:
+            threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            logging.warning("Error archiving: " + l[1] + "... Continuing.")
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue
+
+
+def collect_threads_from_url(url, name, base_arch_dir):
+
+    threads = {'name' : name, 'url' : url, 'threads' : []}
+
+    logging.info("Collecting threads of: " + name)
+
+    arch_name = name.replace(' ', '_')
+
+    # check if archive already exists
+    file_path = os.path.join(base_arch_dir, arch_name + '.json')
+    if os.path.isfile(file_path):
+        logging.info("archive " + name + " already exists. loading from file " + file_path)
+        with open(file_path, 'r') as fin:
+            try:
+                threads = json.load(fin)
+                return threads
+            except:
+                logging.info("can't open archive " + file_path + "... rearchiving.")
+
+    response = urllib.request.urlopen(url)
+    #html = response.read().decode(encoding="utf-8")
+    html = response.read()
+    soup = BeautifulSoup(html, "html5lib")
+
+    table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
+    lists = []
+    for tr in table:
+        if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
+            lists.append(tr)
+
+    # the thread structure here is flat -- re: non-hierarchical, unlike pipermail
+    # hence the thread parsing algorithm will also be flat -- re: a single loop
+
+    nbr_msgs = str(len(lists))
+    n = 0
+    last_message = None
+    for tr in lists:
+        n += 1
+        logging.info(" > " + str(n) + "/" + nbr_msgs)
+        td = tr.find_all('td')
+        thread_a = td[0].select("p span a")[0]
+        thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+        thread_title = thread_a.text.strip()
+
+        try:
+
+            message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}
+
+            threads['threads'].append(collect_message(thread_url, message))
+
+            if last_message and similar(last_message['subject'], message['subject']):
+                if u'follow-up' not in last_message:
+                    last_message[u'follow-up'] = []
+                print(message['subject'] + " - follows - " + last_message['subject'])
+                last_message[u'follow-up'].append(message)
+
+            else:
+                last_message = message
+
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue
+
+        time.sleep(DELAY)
+
+    logging.info("writing archive to file " + file_path)
+
+    with open(file_path, 'w') as fp:
+        json.dump(threads, fp, indent=4)
+
+    logging.info("done.")
+
+
+def collect_message(url, message):
+
+    response = urllib.request.urlopen(url)
+    #html = response.read().decode(encoding="utf-8")
+    html = response.read()
+    soup = BeautifulSoup(html, "html5lib")
+
+    tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
+
+    header = tr[0].find_all('tbody')[0].find_all('tr', recursive=False)
+    message['subject'] = header[0].select("p a")[0].text.strip()
+    message['from'] = header[1].select("p")[1].text.replace("<[log in to unmask]>", "").strip()
+    message['author_name'] = message['from']
+    message['date'] = header[3].select("p")[1].text.strip()
+    message['content-type'] = header[4].select("p")[1].text.strip()
+
+    message['content'] = tr[1].find_all('pre')[0].text
+
+    return message
+
+
+def similar(str_a, str_b):
+    r = difflib.SequenceMatcher(None, str_a, str_b).ratio()
+    return r > 0.75
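The follow-up grouping in collect_threads_from_url() relies on similar(), a fuzzy match on subject lines with a 0.75 ratio threshold. A self-contained sketch of that heuristic (the subject strings are made up):

import difflib

def similar(str_a, str_b, threshold=0.75):
    # same measure as listserv.similar(): SequenceMatcher ratio on the raw strings
    return difflib.SequenceMatcher(None, str_a, str_b).ratio() > threshold

print(similar("Call for works: digital art 2014",
              "Re: Call for works: digital art 2014"))   # True, grouped as a follow-up
print(similar("Call for works: digital art 2014",
              "Administrivia: list moderation"))         # False, treated as a new thread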
100  lists/mhonarc.py
@@ -1,26 +1,14 @@
-import urllib2, urllib, urlparse
-import logging
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip
 from bs4 import BeautifulSoup
-import email, email.parser
-from email.mime.text import MIMEText
-import mailbox
-import time, dateutil, string
-from pprint import pprint as pp
-import sys, os, re, json, gzip
-import traceback

 DELAY = 0.2

-# hack for the mailbox module (re: force mbox.add() encoding to utf8)
-reload(sys)
-sys.setdefaultencoding('utf8')


 def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):

-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")

     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -68,9 +56,9 @@ def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):

 def collect_threads_from_url(url, base_arch_dir, mbox):

-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")

     # base url
     base_url = url
@@ -105,8 +93,6 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
             threads['threads'].append(thread)
         except:
             ex_type, ex, tb = sys.exc_info()
-            print ex_type
-            print ex
             traceback.print_tb(tb)
             del tb
             continue
@@ -119,33 +105,7 @@ def collect_threads_from_url(url, base_arch_dir, mbox):
     with open(file_path, 'w') as fp:
         json.dump(threads, fp, indent=4)

-    if mbox:
-        mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
-        mbox_path_gz = mbox_path + ".gz"
-        logging.info("writing mbox " + mbox_path)
-        if not os.path.isfile(mbox_path):
-            box = mailbox.mbox(mbox_path)
-            box.lock()
-            try:
-                for t in threads['threads']:
-                    write_mbox_message(t, box)
-                box.flush()
-            except:
-                ex_type, ex, tb = sys.exc_info()
-                print ex_type
-                print ex
-                traceback.print_tb(tb)
-                del tb
-            finally:
-                box.unlock()
-
-            with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:
-                fpout.writelines(fpin)
-
-        else:
-            logging.info("mbox " + mbox_path + " already exists.")
-
     logging.info("done. ")

     return threads
@@ -183,11 +143,9 @@ def archive_thread(li, base_url, parent_thread_data):

 def collect_message(url, message):

-    print url
-
-    response = urllib2.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html.parser")
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")

     #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>

@@ -208,9 +166,9 @@ def collect_message(url, message):
     for i in info:
         if i.em == None:
             continue
         field = i.em.string
         if field.lower() in message_labels:
             message[field.lower()] = i.text.strip(field + ": ")

     ## reformat from -- [author_name, email_addr]

@@ -219,7 +177,12 @@ def collect_message(url, message):
     # message['from'] = from_addr[1]

     ## -- content --
-    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+    # test
+    c1 = soup.select('pre:nth-of-type(1)')
+    if len(c1) > 0:
+        message['content'] = c1[0].text
+    else:
+        message['content'] = soup.select('pre:nth-of-type(2)')[0].text

     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -229,22 +192,5 @@ def parse_xcomment(soup, xcom):
         return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
     return com

-def to_mbox_message(msg):
-    mime = MIMEText('', 'plain', _charset='utf8')
-    mime['From'] = msg['from']
-    mime['Subject'] = msg['subject']
-    mime['Message-Id'] = msg['message-id']
-    mime['Date'] = msg['date']
-    mime.set_payload(msg['content'], charset='utf8')
-    mbox_message = mailbox.mboxMessage(mime)
-    mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
-    return mbox_message
-
-# throws exception
-def write_mbox_message(msg, mbox):
-    mbox_msg = to_mbox_message(msg)
-    mbox.add(mbox_msg) # here
-    if u'follow-up' in msg:
-        for f in msg['follow-up']:
-            write_mbox_message(f, mbox)
+def test_xcomment(soup):
+    return soup.find(text=re.compile('X-Message-Id')) is not None
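The new test_xcomment() helper exploits the fact that MHonArc pages embed message metadata as HTML comments such as <!--X-Message-Id: ... -->. A standalone sketch of the same check, with an invented HTML snippet:

import re
from bs4 import BeautifulSoup

html = '<html><body><!--X-Message-Id: 1234@example.org--><pre>hello</pre></body></html>'
soup = BeautifulSoup(html, 'html5lib')

# comments are navigable strings in BeautifulSoup, so a text search finds them
print(soup.find(text=re.compile('X-Message-Id')) is not None)   # True, parse as MHonArc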
@@ -1,6 +1,7 @@
 import urllib.request, urllib.parse
-import logging, os, sys, traceback, time, json, gzip
+import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
+import lists.mhonarc

 DELAY = 0.2

@@ -34,10 +35,19 @@ def collect_from_url(url, name, base_archive_dir):
     for l in lists: ### change this
         n += 1
         logging.info("## " + str(n) + " / " + nbr_threads + " ##")
-        threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+        try:
+            threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            logging.warning("Error archiving: " + l[1] + "... Continuing.")
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue

 def collect_threads_from_url(url, name, base_arch_dir):

     threads = {'name' : name, 'url' : url, 'threads' : []}

@@ -56,6 +66,7 @@ def collect_threads_from_url(url, name, base_arch_dir):
             except:
                 logging.info("can't open archive " + file_path + "... rearchiving.")
+
     response = urllib.request.urlopen(url)
     html = response.read().decode(encoding="utf-8")
     soup = BeautifulSoup(html, "html5lib")
@@ -63,6 +74,8 @@ def collect_threads_from_url(url, name, base_arch_dir):
     ul = soup.find_all('ul')[1];
     lists = ul.find_all('li', recursive=False)

+    is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
+
     #lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
     nbr_msgs = str(len(lists))
     n = 0
@@ -70,7 +83,11 @@ def collect_threads_from_url(url, name, base_arch_dir):
         n += 1
         logging.info(" > " + str(n) + "/" + nbr_msgs)
         try:
-            thread = archive_thread(li, url.replace('thread.html', ''), None)
+            if is_mhonarc_hybrid:
+                logging.info("Mhonarc detected, switching to mhonarc parsing...")
+                thread = archive_thread_hybrid_mhonarc(li, url.replace('thread.html', ''), None)
+            else:
+                thread = archive_thread(li, url.replace('thread.html', ''), None)
             threads['threads'].append(thread)
         except KeyboardInterrupt:
             sys.exit(0)
@@ -96,15 +113,17 @@ def archive_thread(li, base_url, parent_thread_data):
     thread_a = li.select('a:nth-of-type(1)')[0]
     url = (base_url + "/") if not base_url.endswith('/') else base_url
     thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
-    thread_id = li.select('a:nth-of-type(2)')[0].get("name")
     thread_title = thread_a.text.strip()

+    # this may not always be there...
+    # ex. http://lists.cofa.unsw.edu.au/pipermail/empyre/2007-September/thread.html
+    thread_id = li.select('a:nth-of-type(2)')[0].get("name")
     thread_author_name = li.select('i')[0].text.strip()

     message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}

     collect_message(thread_url, message)


     ul = li.find_all('ul');
     if len(ul) == 0:
@@ -132,6 +151,45 @@ def archive_thread(li, base_url, parent_thread_data):
     parent_thread_data[u'follow-up'].append(message)
     return message

+def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
+
+    thread_a = li.select('a:nth-of-type(1)')[0]
+    url = (base_url + "/") if not base_url.endswith('/') else base_url
+    thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+    thread_title = thread_a.text.strip()
+
+    thread_id = thread_a.get("name")
+    thread_author_name = 'n/a'
+
+    message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+    lists.mhonarc.collect_message(thread_url, message)
+
+    ul = li.find_all('ul');
+    if len(ul) == 0:
+        if parent_thread_data is None:
+            return message
+
+        if u'follow-up' not in parent_thread_data:
+            parent_thread_data[u'follow-up'] = []
+        parent_thread_data[u'follow-up'].append(message)
+        return message
+
+
+    follow = ul[0].find_all('li', recursive=False)
+    if len(follow) > 0:
+        for f in follow:
+            follow_a = f.select('a')
+            if len(follow_a) > 0:
+                archive_thread_hybrid_mhonarc(f, base_url, message)
+
+    if parent_thread_data is None:
+        return message
+
+    if u'follow-up' not in parent_thread_data:
+        parent_thread_data[u'follow-up'] = []
+    parent_thread_data[u'follow-up'].append(message)
+    return message
+

 def collect_message(url, message):
     # logging.info(" + " + url)
@@ -140,6 +198,10 @@ def collect_message(url, message):
     html = response.read().decode(encoding="utf-8")
     soup = BeautifulSoup(html, "html5lib")

+    if lists.mhonarc.test_xcomment(soup):
+        logging.info("Mhonarc detected, switching to mhonarc parsing...")
+        lists.mhonarc.collect_message(url, message)
+
     #message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

     message['subject'] = soup.select('h1:nth-of-type(1)')[0].text.strip()
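archive_thread() and the new archive_thread_hybrid_mhonarc() both walk pipermail's nested thread index recursively: each <li> is a message and a nested <ul> holds its follow-ups. A toy illustration of that traversal (the HTML is invented; only the recursion pattern matches the code above):

from bs4 import BeautifulSoup

html = """
<ul>
  <li><a href="0001.html">Hello world</a>
    <ul>
      <li><a href="0002.html">Re: Hello world</a></li>
    </ul>
  </li>
</ul>
"""

def walk(li, depth=0):
    # one message per <li>; recurse into its direct <ul> children for follow-ups
    print('  ' * depth + li.select('a')[0].text)
    for ul in li.find_all('ul', recursive=False):
        for child in ul.find_all('li', recursive=False):
            walk(child, depth + 1)

soup = BeautifulSoup(html, 'html5lib')
for li in soup.ul.find_all('li', recursive=False):
    walk(li)
# prints "Hello world" and, indented below it, "Re: Hello world"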
2  www-serve.py  (new file)
@@ -0,0 +1,2 @@
+from www import app
+app.run(debug=True)
10  www/__init__.py  (new file)
@@ -0,0 +1,10 @@
+from flask import Flask
+
+app = Flask(__name__)
+
+from www import routes
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+# from www import archives
63  www/archives.py  (new file)
@@ -0,0 +1,63 @@
+import logging, os, json
+
+class Singleton(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+class Archives(metaclass=Singleton):
+
+    def __init__(self, archives_dir=None):
+        if archives_dir==None:
+            self.archives_dir = "archives/"
+        else:
+            self.archives_dir = archives_dir
+
+        self.loaded = False
+
+    def load(self):
+
+        if self.loaded:
+            return
+
+        if not os.path.isdir(self.archives_dir):
+            logging.error("Archives:: the path - " + self.archives_dir + " - is not a valid directory. Aborting.")
+            return
+
+        arch = [d for d in os.listdir(self.archives_dir) if os.path.isdir(os.path.join(self.archives_dir, d))]
+
+        self.data = {}
+        for a in arch:
+
+            logging.info("loading " + a)
+
+            archive_path = os.path.join(self.archives_dir, a)
+            self.data[a] = self.load_archive(archive_path)
+
+        logging.info("done.")
+
+
+    def load_archive(self, archive_dir):
+
+        if not os.path.isdir(archive_dir):
+            logging.error("Archives:: the path - " + archive_dir + " - is not a valid directory. Aborting.")
+            return
+
+        files = [f for f in os.listdir(archive_dir) if f.endswith('.json')]
+
+        arch = {}
+        for f in files:
+            file_path = os.path.join(archive_dir, f)
+            with open(file_path) as fdata:
+                arch[f.replace('.json', '')] = json.load(fdata)
+
+        return arch
+
+arch = Archives()
+arch.load()
+archives_data = arch.data
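A quick sketch of how this module is meant to be consumed (it mirrors www/routes.py and assumes an archives/ tree of <list>/<sublist>.json files produced by the scrapers):

from www import archives            # importing triggers Archives().load()
from www.archives import Archives

print(list(archives.archives_data.keys()))   # top-level archive (list) names

# Singleton metaclass: constructing Archives again returns the same instance,
# so the JSON files are only parsed once per process
assert Archives() is archives.arch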
91  www/routes.py  (new file)
@@ -0,0 +1,91 @@
+from flask import render_template
+from www import app
+from www import archives
+from datetime import datetime
+
+@app.route('/')
+def index():
+    k = archives.archives_data.keys()
+    return render_template("index.html", archives=k)
+
+def get_key(kv_tuple):
+
+    k = kv_tuple[0]
+
+    # k is of the form "Month_Year" - ex.: "January_2001"
+    try:
+        return datetime.strptime(k, "%B_%Y")
+    except Exception:
+        pass
+
+    # k is of the form "Month(abv)_Year(abv)" - ex.: "Jan_01"
+    try:
+        return datetime.strptime(k, "%b_%y")
+    except Exception:
+        pass
+
+    # k is of the form "Year" - ex.: "2001"
+    try:
+        return datetime.strptime(k, "%Y")
+    except Exception:
+        pass
+
+    return None
+
+@app.route('/<list>')
+def get_list(list):
+    if list in archives.archives_data:
+        d = []
+        for k, v in sorted(archives.archives_data[list].items(), key=get_key, reverse=True):
+            d.append({"name": k, "url": v['url'], "nbr_threads": len(v['threads'])})
+        return render_template("list.html", list_name=list, list=d)
+
+    else:
+        return 'nee nee'
+
+@app.route('/<list>/<sublist>')
+def get_sublist(list, sublist):
+
+    sublist = sublist.replace(' ', '_')
+    if list in archives.archives_data and sublist in archives.archives_data[list]:
+        return render_template("threads.html", sublist_name=sublist, threads=archives.archives_data[list][sublist]['threads'])
+    else:
+        return 'na na'
+
+@app.route('/<list>/<sublist>/<int:index>')
+def get_message(list, sublist, index):
+
+    sublist = sublist.replace(' ', '_')
+    index = int(index)
+    if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
+        return render_template("message.html", message=archives.archives_data[list][sublist]['threads'][index])
+    else:
+        'non non'
+
+@app.route('/<list>/<sublist>/<int:index>/<path:follow_ups>')
+def get_follow_ups(list, sublist, index, follow_ups):
+
+    sublist = sublist.replace(' ', '_')
+    index = int(index)
+
+    ups = follow_ups.split('/')
+    follow = []
+    for u in ups:
+        follow.append(int(u))
+
+    if list in archives.archives_data and sublist in archives.archives_data[list] and index < len(archives.archives_data[list][sublist]['threads']):
+        message = archives.archives_data[list][sublist]['threads'][index]
+        for f in follow:
+            message = message['follow-up'][f]
+        return render_template("message.html", message=message)
+    else:
+        'nope nope'
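The routes above expose the scraped JSON as /<list>, /<list>/<sublist>, /<list>/<sublist>/<index> and a trailing path of follow-up indices. A smoke test with Flask's built-in test client (the list and sublist names are hypothetical and assume a populated archives/ directory):

from www import app

client = app.test_client()
print(client.get('/').status_code)                     # index page of all archives
print(client.get('/empyre').status_code)               # sublists (months) of one archive
print(client.get('/empyre/May_2014').status_code)      # thread index of a month
print(client.get('/empyre/May_2014/0').status_code)    # first thread's message
print(client.get('/empyre/May_2014/0/1').status_code)  # its second follow-up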
8  www/templates/index.html  (new file)
@@ -0,0 +1,8 @@
+<html>
+<head></head>
+<body>
+{% for a in archives %}
+<a href="/{{ a }}"><h3>{{ a }}</h3></a>
+{% endfor %}
+</body>
+</html>
10  www/templates/list.html  (new file)
@@ -0,0 +1,10 @@
+<html>
+<head></head>
+<body>
+<ul>
+{% for t in list %}
+<li><a href="{{ list_name }}/{{ t.name }}"><h3>{{ t.name }} -- {{ t.nbr_threads }}</h3></a></li>
+{% endfor %}
+</ul>
+</body>
+</html>
11  www/templates/message.html  (new file)
@@ -0,0 +1,11 @@
+<html>
+<head>
+<meta charset="UTF-8">
+</head>
+<body>
+<h3>{{ message.subject }}</h3>
+<h4>{{ message.author_name }}</h4>
+<h4>{{ message.date }}</h4>
+<p>{{ message.content }} </p>
+</body>
+</html>
25  www/templates/threads.html  (new file)
@@ -0,0 +1,25 @@
+<html>
+<head></head>
+<body>
+{% macro message(m, index, urlpath)-%}
+{% set path = urlpath + '/' + index|string %}
+<li>
+{{ index }}. <a href="{{ path }}">{{ m.subject }}</a> <i>{{ m.author_name }}</i>
+{% if m.get('follow-up') %}
+<ul>
+{% for msg in m.get('follow-up') %}
+{{ message(m=msg, index=loop.index - 1, urlpath=path) }}
+{% endfor %}
+</ul>
+{% endif %}
+</li>
+{%- endmacro %}
+
+<ul>
+{% for m in threads recursive %}
+{{ message(m=m, index=loop.index - 1, urlpath=sublist_name) }}
+{% endfor %}
+</ul>
+
+</body>
+</html>
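threads.html renders the nested follow-up structure with a macro that calls itself. A stripped-down standalone version of the same idea, rendered with Jinja2 directly (the thread data is made up, and the index/urlpath arguments are omitted for brevity):

from jinja2 import Template

tpl = Template("""
{% macro message(m) -%}
<li>{{ m.subject }}
{% if m.get('follow-up') %}<ul>{% for f in m.get('follow-up') %}{{ message(f) }}{% endfor %}</ul>{% endif %}
</li>
{%- endmacro %}
<ul>{% for m in threads %}{{ message(m) }}{% endfor %}</ul>
""")

threads = [{'subject': 'Hello', 'follow-up': [{'subject': 'Re: Hello'}]}]
print(tpl.render(threads=threads))   # nested <ul>/<li> markup for the toy thread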