listservs/lists/mhonarc_nettime.py

245 lines
6.4 KiB
Python
Raw Normal View History

2017-11-04 13:34:05 +01:00
import urllib.request, urllib.parse
import logging, os, sys, traceback, re, time, json, gzip
2019-12-21 21:09:34 +01:00
from datetime import datetime
2017-11-04 13:34:05 +01:00
from bs4 import BeautifulSoup
2019-12-22 08:22:20 +01:00
import lists.util
2017-11-04 13:34:05 +01:00
DELAY = 0.2
def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
    """Archive every thread-index page linked under one sublist of an archive index.

    The page at ``url`` is fetched through ``lists.util.request``. The first
    ``<li>`` whose ``<strong>`` label equals ``sublist_name`` (compared
    case-insensitively) is selected, and every link nested below it is handed
    to ``collect_threads_from_url``.

    Args:
        url: URL of the archive index page.
        name: Sub-directory of ``base_archive_dir`` in which archives are
            written; it is created when missing.
        sublist_name: Label of the sublist to collect.
        base_archive_dir: Parent directory of the archive directory.
        mbox: Passed through unchanged to ``collect_threads_from_url``.

    Returns:
        A list holding the result of ``collect_threads_from_url`` for every
        page that was collected without error, or ``None`` when no list item
        is labelled ``sublist_name``.
    """
    soup = lists.util.request(url)

    # Relative links are resolved against the page's <base href> when one is
    # found; otherwise (lookup failed or href missing) against the page URL.
    # ``except Exception`` keeps the best-effort fallback without swallowing
    # KeyboardInterrupt / SystemExit like the former bare ``except`` did.
    try:
        base_url = soup.select('body p:nth-of-type(2) base')[0].get('href') or url
    except Exception:
        base_url = url

    logging.debug(base_url)

    # create (main) directory
    # this is where all temp files will be created
    archive_dir = os.path.join(base_archive_dir, name)
    os.makedirs(archive_dir, exist_ok=True)

    for item in soup.select('ul li'):
        if item.strong is None:
            continue

        # Beautiful Soup's ``.string`` is None when <strong> wraps more than
        # one child node; such items cannot be the requested sublist.
        label = item.strong.string
        if label is None or label.lower() != sublist_name.lower():
            continue

        logging.debug(label)
        thread_urls = [
            urllib.parse.urljoin(base_url, link.get('href'))
            for link in item.select('ul li a')
        ]
        total = str(len(thread_urls))

        threads = []
        for count, thread_url in enumerate(thread_urls, start=1):
            time.sleep(DELAY)  # throttle requests to the archive server
            logging.info("## " + str(count) + " / " + total + " ##")
            try:
                threads.append(
                    collect_threads_from_url(thread_url, base_archive_dir=archive_dir, mbox=mbox)
                )
            except KeyboardInterrupt:
                sys.exit(0)
            except Exception:
                # One failing page must not abort the whole run: report the
                # full traceback (type and message included) and move on.
                logging.warning("Error archiving: " + label + "... Continuing.")
                traceback.print_exc()

        return threads

    return None
2017-11-04 13:34:05 +01:00
2019-12-21 21:09:34 +01:00
def new_name(n):
    """Turn the date suffix of an index title into ``Month_Year`` form.

    Only the last six characters of ``n`` are read; they must look like
    ``Mar_99`` (abbreviated month name, underscore, two-digit year).
    Example: ``nettime-bold_Mar_99`` gives ``March_1999``.

    Raises:
        ValueError: from ``datetime.strptime`` when the suffix does not
            match that layout.
    """
    suffix = n[-6:]
    month_start = datetime.strptime(suffix, '%b_%y')
    return '{:%B_%Y}'.format(month_start)
2017-11-04 13:34:05 +01:00
def collect_threads_from_url(url, base_archive_dir, mbox=False):
    """Collect all threads listed on one thread-index page, caching them as JSON.

    The archive name is derived from the page ``<title>`` via ``new_name``.
    When ``<base_archive_dir>/<name>.json`` already exists it is loaded and
    returned as-is; otherwise each top-level list item is archived with
    ``archive_thread`` and the result is written to that file.

    Args:
        url: URL of the thread-index page; also used as base for relative links.
        base_archive_dir: Directory holding the JSON archive files.
        mbox: Accepted for interface compatibility; not used in this function.

    Returns:
        A dict with keys ``name``, ``url`` and ``threads`` (or whatever the
        cached JSON file contains).
    """
    soup = lists.util.request(url)
    logging.debug("collecting: " + url)

    # The page title, with spaces turned into underscores, identifies the index.
    page_title = soup.select('head title')[0].string.replace(' ', '_')

    # Normalised name, for the database and consistency with other archives.
    archive_name = new_name(page_title)
    logging.debug(page_title)

    logging.info("Collecting Threads of: " + archive_name)

    # Reuse an archive written by a previous run instead of re-crawling.
    file_path = os.path.join(base_archive_dir, archive_name + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as cached:
            return json.load(cached)

    archive = {'name': archive_name, 'url': url, 'threads': []}

    entries = soup.select('ul:nth-of-type(1) > li')
    total = str(len(entries))
    for position, entry in enumerate(entries, start=1):
        logging.info("> " + str(position) + " / " + total)
        archive['threads'].append(archive_thread(entry, url, None))
        time.sleep(DELAY)  # throttle requests to the archive server

    logging.info("writing archive to file " + file_path)
    with open(file_path, 'w') as out:
        json.dump(archive, out, indent=4)
    logging.info("done. ")

    return archive
2017-11-04 13:34:05 +01:00
def archive_thread(li, base_url, parent_thread_data):
    """Archive the message behind one thread-index ``<li>`` and its replies.

    Builds a message dict from the item's ``strong a`` link and ``em`` author,
    fills it through ``collect_message``, then recurses into nested list
    items that carry their own link.

    Args:
        li: List-item node of the thread index.
        base_url: URL against which the item's ``href`` is resolved.
        parent_thread_data: Message dict of the parent, or ``None`` for a
            top-level thread. When given, the new message is appended to its
            ``follow-up`` list (created on demand).

    Returns:
        The message dict built for ``li``.
    """
    anchor = li.select('strong a')[0]
    message_url = urllib.parse.urljoin(base_url, anchor.get('href'))
    message = {
        'id': anchor.get('name'),
        'subject': anchor.string,
        'url': message_url,
        'author_name': li.select('em')[0].string,
    }

    collect_message(message_url, message)

    # NOTE(review): 'ul > li' is evaluated over all descendants of ``li``, so
    # replies nested deeper than one level appear to be visited here as well
    # as through the recursion -- confirm this is intended.
    for reply in li.select('ul > li'):
        if not reply.select('strong a'):
            continue  # placeholder item without a message link
        archive_thread(reply, base_url, message)  # recursion

    if parent_thread_data is not None:
        parent_thread_data.setdefault('follow-up', []).append(message)

    return message
def _strip_field_label(text, label):
    """Return ``text`` without its leading ``label`` and the ``:`` after it."""
    value = text.strip()
    if value.startswith(label):
        value = value[len(label):].lstrip()
        if value.startswith(':'):
            value = value[1:]
    return value.strip()


def collect_message(url, message):
    """Fetch one message page and store its headers and body in ``message``.

    ``message`` is modified in place; nothing is returned. Values are filled
    in three steps, later steps overriding earlier ones for the same key:

    1. MHonArc ``X-...`` comments (``subject``, ``date``, ``from``,
       ``message-id``, ``content-type``) read with ``parse_xcomment``;
    2. the header list displayed on the page, for the labels listed in
       ``message_labels``;
    3. the message body, stored under ``content`` via ``parse_xmessage``.

    Args:
        url: URL of the message page.
        message: Dict to fill.
    """
    logging.debug("collecting message: " + url)
    soup = lists.util.request(url)

    # note: this should follow an RFC header standard -- MHonArc has header
    # info in the first <pre>

    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    message['from'] = parse_xcomment(soup, "X-From-R13")  # useless...
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page
    for item in soup.select('ul:nth-of-type(1) > li'):
        if item.em is None:
            continue
        field = item.em.string
        if field is None:
            # <em> wraps more than one node: there is no plain label to read.
            continue
        key = field.lower()
        if key in message_labels:
            # Remove the label as a prefix. ``str.strip(field + ": ")`` would
            # treat the label as a *set of characters* and also eat matching
            # characters at both ends of the value itself.
            message[key] = _strip_field_label(item.text, field)

    # Body is taken from between MHonArc's X-Body-of-Message markers.
    message['content'] = parse_xmessage(str(soup))
2017-11-04 13:34:05 +01:00
# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
    """Return the value of the MHonArc ``<!--xcom: value -->`` comment.

    Args:
        soup: Parsed page; its ``find(text=...)`` is used to locate the first
            text node matching ``xcom``.
        xcom: Comment name, e.g. ``"X-Subject"`` (without the colon).

    Returns:
        The value with the comment delimiters, the ``xcom:`` prefix and the
        surrounding whitespace removed, or ``None`` when nothing matches.
    """
    com = soup.find(text=re.compile(xcom))
    if com is None:
        return None

    value = str(com).strip()

    # Delimiters and the name are removed as whole prefixes/suffixes.
    # ``str.strip(chars)`` treats ``chars`` as a set of characters, so the
    # former chained strips also ate value characters that happened to occur
    # in "<!-- ", " -->" or the comment name.
    if value.startswith('<!--'):
        value = value[len('<!--'):]
    if value.endswith('-->'):
        value = value[:-len('-->')]
    value = value.strip()

    prefix = xcom + ":"
    if value.startswith(prefix):
        value = value[len(prefix):]
    return value.strip()
2017-11-04 13:34:05 +01:00
2019-12-21 14:35:37 +01:00
# (edit 21.12.2019): this is the new way as of 2018 -- when no more moderation on Nettime...
def parse_xmessage(html):
    """Extract the plain-text body of a MHonArc message page.

    The body is the part of ``html`` between the ``<!--X-Body-of-Message-->``
    and ``<!--X-Body-of-Message-End-->`` markers (markers included); it is
    re-parsed with html5lib and returned as text.

    Raises:
        AttributeError: when the markers are not found in ``html``.
    """
    body_pattern = re.compile(
        r'<!--X-Body-of-Message-->.*?<!--X-Body-of-Message-End-->',
        re.DOTALL,
    )
    found = body_pattern.search(html)
    fragment = found.group()
    return BeautifulSoup(fragment, "html5lib").get_text()