# listservs/lists/listserv.py
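"""Scraper for LISTSERV web archives.

Walks an archive index page, fetches each linked thread list and message
page, and writes the collected messages to one JSON file per list.
"""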

import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip, difflib
from bs4 import BeautifulSoup
import lists.util

DELAY = 0.2


def collect_from_url(url, name, base_archive_dir):
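    """Collect every thread list linked from a LISTSERV archive index page.

    Creates the directory <base_archive_dir>/<name> and archives each linked
    thread list into its own JSON file via collect_threads_from_url().
    """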
    # response = urllib.request.urlopen(url)
    # #html = response.read().decode(encoding="utf-8")
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")
    soup = lists.util.request(url)
    threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')

    li = []
    for t in threads_list:
        thread_label = t.text.strip()
        thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
        li.append((thread_label, thread_url))

    # create (main) directory
    # this is where all temp files will be created
    d = os.path.join(base_archive_dir, name)
    if not os.path.exists(d):
        os.makedirs(d)

    threads = []
    nbr_threads = str(len(li))
    n = 0
    for l in li:  ### change this
        n += 1
        logging.info("## " + str(n) + " / " + nbr_threads + " ##")
        try:
            threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
        except KeyboardInterrupt:
            sys.exit(0)
        except:
            logging.warning("Error archiving: " + l[1] + "... Continuing.")
            ex_t, ex, tb = sys.exc_info()
            print(ex_t)
            traceback.print_tb(tb)
            del tb
            continue

    # archive['name'] = name
    # archive['list'] = threads
    # file_path = os.path.join(base_arch_dir, name + '.json')
    # with open(file_path, 'w') as fp:
    #     json.dump(archive, fp, indent=4)
    # logging.info("done.")


def collect_threads_from_url(url, name, base_arch_dir):
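    """Collect all messages listed on one thread-index page.

    If <base_arch_dir>/<name>.json already exists and parses, it is loaded
    and returned as-is; otherwise every message row is fetched with
    collect_message(), consecutive messages with near-identical subjects
    (see similar()) are additionally recorded as follow-ups of the previous
    one, and the result is written back to that JSON file.
    """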
    threads = {'name': name, 'url': url, 'threads': []}
    logging.info("Collecting threads of: " + name)
    arch_name = name.replace(' ', '_')

    # check if archive already exists
    file_path = os.path.join(base_arch_dir, arch_name + '.json')
    if os.path.isfile(file_path):
        logging.info("archive " + name + " already exists. loading from file " + file_path)
        with open(file_path, 'r') as fin:
            try:
                threads = json.load(fin)
                return threads
            except:
                logging.info("can't open archive " + file_path + "... rearchiving.")

    # response = urllib.request.urlopen(url)
    # #html = response.read().decode(encoding="utf-8")
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")
    soup = lists.util.request(url)
    table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')

    li = []
    for tr in table:
        if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
            li.append(tr)

    # the thread structure here is flat -- re: non-hierarchical, unlike pipermail
    # hence the thread parsing algorithm will also be flat -- re: a single loop
    nbr_msgs = str(len(li))
    n = 0
    last_message = None
    for tr in li:
        n += 1
        logging.info(" > " + str(n) + "/" + nbr_msgs)
        td = tr.find_all('td')
        thread_a = td[0].select("p span a")[0]
        thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
        thread_title = thread_a.text.strip()
        try:
            message = {u'id': 0, u'subject': thread_title, u'url': thread_url, u'author_name': 'n/a'}
            threads['threads'].append(collect_message(thread_url, message))
            if last_message and similar(last_message['subject'], message['subject']):
                if u'follow-up' not in last_message:
                    last_message[u'follow-up'] = []
                print(message['subject'] + " - follows - " + last_message['subject'])
                last_message[u'follow-up'].append(message)
            else:
                last_message = message
        except KeyboardInterrupt:
            sys.exit(0)
        except:
            ex_t, ex, tb = sys.exc_info()
            print(ex_t)
            traceback.print_tb(tb)
            del tb
            continue
        time.sleep(DELAY)
logging.info("writing archive to file " + file_path)
with open(file_path, 'w') as fp:
json.dump(threads, fp, indent=4)
logging.info("done.")
def collect_message(url, message):
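    """Fetch a single message page and fill in the message dict.

    Pulls subject, sender, date, content type and body text out of the
    LISTSERV 'tableframe' layout; the obfuscated "<[log in to unmask]>"
    part of the sender is stripped.
    """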
    # response = urllib.request.urlopen(url)
    # #html = response.read().decode(encoding="utf-8")
    # html = response.read()
    # soup = BeautifulSoup(html, "html5lib")
    soup = lists.util.request(url)
    tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
    header = tr[0].find_all('tbody')[0].find_all('tr', recursive=False)
    message['subject'] = header[0].select("p a")[0].text.strip()
    message['from'] = header[1].select("p")[1].text.replace("<[log in to unmask]>", "").strip()
    message['author_name'] = message['from']
    message['date'] = header[3].select("p")[1].text.strip()
    message['content-type'] = header[4].select("p")[1].text.strip()
    message['content'] = tr[1].find_all('pre')[0].text
    return message


def similar(str_a, str_b):
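    """Return True if two subject lines are near-duplicates (difflib ratio > 0.75)."""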
    r = difflib.SequenceMatcher(None, str_a, str_b).ratio()
    return r > 0.75
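

# Minimal usage sketch. The archive URL and output directory below are
# hypothetical placeholders; lists.util.request() is assumed to return a
# parsed BeautifulSoup document, as it does in the functions above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    collect_from_url(
        url="https://listserv.example.org/cgi-bin/wa?INDEX",  # hypothetical LISTSERV index page
        name="example-archive",
        base_archive_dir="archives",
    )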