commit 520cb7ea44afe52c922544f1784c360cf1c74c31
Author: gauthiier
Date:   Thu Jun 23 15:12:16 2016 +0200

    haha! commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8a8accc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+archives
diff --git a/README b/README
new file mode 100644
index 0000000..ef216a8
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+Usage: archive_nettime.py [options]
+
+Options:
+  -h, --help            show this help message and exit
+  -u URL, --url=URL     nettime url
+  -l LIST, --list=LIST  nettime's list name (ex: nettime-l)
+
+ Dependencies: bs4
\ No newline at end of file
diff --git a/archive_nettime.py b/archive_nettime.py
new file mode 100644
index 0000000..3f36fac
--- /dev/null
+++ b/archive_nettime.py
@@ -0,0 +1,28 @@
+import sys, logging
+from optparse import OptionParser
+import lib.nettime as nettime
+
+logging.basicConfig(level=logging.DEBUG)
+
+def run(options):
+
+    if not options.url:
+        sys.exit('No url. Aborting.')
+
+    if not options.list:
+        sys.exit('No list. Aborting.')
+
+    ## check valid url?... nej
+
+    nettime.archive_from_url(options.url, options.list)
+    sys.exit()
+
+if __name__ == "__main__":
+
+    p = OptionParser()
+    p.add_option('-u', '--url', action="store", help="nettime url", default="http://www.nettime.org/archives.php")
+    p.add_option('-l', '--list', action="store", help="nettime's list name (ex: nettime-l)", default="nettime-l")
+
+    options, args = p.parse_args()
+
+    run(options)
diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/mhonarccrawl.py b/lib/mhonarccrawl.py
new file mode 100644
index 0000000..2391faf
--- /dev/null
+++ b/lib/mhonarccrawl.py
@@ -0,0 +1,242 @@
+import urllib2, urllib, urlparse
+import logging
+from bs4 import BeautifulSoup
+import email, email.parser
+from email.mime.text import MIMEText
+import mailbox
+import time, dateutil, string
+from pprint import pprint as pp
+import sys, os, re, json, gzip
+import traceback
+
+DELAY = 0.5
+
+# hack for the mailbox module (re: force mbox.add() encoding to utf8)
+reload(sys)
+sys.setdefaultencoding('utf8')
+
+
+def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
+
+    response = urllib2.urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, "html.parser")
+
+    # base url
+    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+
+    #collect name
+    list_name = soup.select('body p:nth-of-type(2) base title')[0].string
+    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+
+    lists = soup.select('ul:nth-of-type(2) li')
+
+    threads = []
+
+    for l in lists:
+
+        if l.strong is None:
+            continue
+
+        name = l.strong.string
+
+        if name.lower() == sublist_name.lower():
+
+            threads_url_list = []
+            threads_links = l.select('ul li a')
+            for t in threads_links:
+                thread_url = urlparse.urljoin(base_url, t.get('href'))
+                threads_url_list.append(thread_url)
+
+            nbr_threads = str(len(threads_url_list))
+            n = 0
+
+            for u in threads_url_list:
+                n += 1
+                logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+                threads.append(collect_threads_from_url(u, base_arch_dir, mbox))
+
+            return threads
+
+            # for u in threads_url_list[0:10]:
+            #     print "---------------------------------------"
+            #     tt = collect_threads_from_url(u, base_arch_dir, mbox)
+            #     threads.append(tt)
+
+
+    return None
+
+def collect_threads_from_url(url, base_arch_dir, mbox):
+
+    response = urllib2.urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, "html.parser")
+
+    # base url
+    base_url = url
+
+    # collect name
+    threads_name = soup.select('p:nth-of-type(1) title')[0].string
+    threads_name = threads_name.replace(' ', '_')
+
+    # thread data struct
+    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
+
+    logging.info("Collecting Threads of: " + threads_name)
+
+    # check if archive already exists
+    file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
+    if os.path.isfile(file_path):
+        logging.info("archive already exists. loading from file " + file_path)
+        with open(file_path, 'r') as fpin:
+            threads = json.load(fpin)
+    else:
+        lists = soup.select('ul:nth-of-type(1) > li')
+
+        nbr_threads = str(len(lists))
+        n = 0
+
+        for l in lists:
+            n += 1
+            logging.info("> " + str(n) + " / " + nbr_threads)
+            thread = archive_thread(l, base_url, None)
+            threads['threads'].append(thread)
+            time.sleep(DELAY)
+
+        # write
+        logging.info("writing archive to file " + file_path)
+
+        with open(file_path, 'w') as fp:
+            json.dump(threads, fp, indent=4)
+
+    if mbox:
+        mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
+        mbox_path_gz = mbox_path + ".gz"
+        logging.info("writing mbox " + mbox_path)
+        if not os.path.isfile(mbox_path):
+            box = mailbox.mbox(mbox_path)
+            box.lock()
+            try:
+                for t in threads['threads']:
+                    write_mbox_message(t, box)
+                box.flush()
+            except:
+                ex_type, ex, tb = sys.exc_info()
+                print ex_type
+                print ex
+                traceback.print_tb(tb)
+                del tb
+            finally:
+                box.unlock()
+
+            with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:
+                fpout.writelines(fpin)
+
+        else:
+            logging.info("mbox " + mbox_path + " already exists.")
+
+    logging.info("done. ")
+
+    return threads
+
+
+
+def archive_thread(li, base_url, parent_thread_data):
+
+    thread_link = li.select('strong a')[0]
+    thread_url = urlparse.urljoin(base_url, thread_link.get('href'))
+    thread_id = thread_link.get('name')
+    thread_title = thread_link.string
+    thread_author_name = li.select('em')[0].string
+
+    message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+    collect_message(thread_url, message)
+
+    follow = li.select('ul > li')
+    if len(follow) > 0:
+        for f in follow:
+            follow_link = f.select('strong a')
+            if len(follow_link) > 0:
+                archive_thread(f, base_url, message) ## recursion
+
+    if parent_thread_data is None:
+        return message
+
+    if u'follow-up' not in parent_thread_data:
+        parent_thread_data[u'follow-up'] = []
+
+    parent_thread_data[u'follow-up'].append(message)
+
+    return message
+
+
+def collect_message(url, message):
+
+    print url
+
+    response = urllib2.urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, "html.parser")
+
+    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th
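+    # (illustrative sketch of the markup assumed by the loop below -- field names sit in
+    #  <em> tags inside the first <ul> of the page, roughly:
+    #    <ul>
+    #      <li><em>To</em>: ...</li>
+    #      <li><em>Subject</em>: ...</li>
+    #      <li><em>Date</em>: ...</li>
+    #    </ul>
+    #  this layout is an assumption, not copied from a live nettime page)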
+
+    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')    
+
+    # mhonarc xcomments
+    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
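+    # (assumption: an MHonArc page carries the original headers as HTML comments of the
+    #  form <!--X-Subject: ... -->, <!--X-Date: ... -->, etc.; parse_xcomment() below
+    #  just greps those comment strings out of the soup)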
+    message['subject'] = parse_xcomment(soup, "X-Subject")
+    message['date'] = parse_xcomment(soup, "X-Date")
+    message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
+    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
+    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')
+
+    # parse what is displayed on the page
+
+    info = soup.select('ul:nth-of-type(1) > li')
+
+    for i in info:
+        if i.em is None:
+            continue
+        field = i.em.string
+        if field.lower() in message_labels:
+            # remove only the leading "<Field>:" label -- str.strip() takes a set of
+            # characters, not a prefix, and would eat into the value itself
+            message[field.lower()] = i.text.replace(field + ":", "", 1).strip()
+
+    ## reformat from -- [author_name, email_addr]
+
+    # from_addr = email.utils.parseaddr(message['from'])
+    # message['author_name'] = from_addr[0]
+    # message['from'] = from_addr[1]
+
+    ## -- content --
+    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
+# mhonarc xcomments
+# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+def parse_xcomment(soup, xcom):
+    com = soup.find(text=re.compile(xcom))
+    if com is not None:
+        # drop the leading "<xcom>:" label -- str.strip() strips a character set, not a
+        # prefix, and would mangle values that begin or end with those characters
+        return re.sub(r'^\s*' + re.escape(xcom) + r'\s*:\s*', '', com).strip()
+    return com
+
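+# e.g. (hypothetical values) a page containing "<!--X-Date: Thu, 23 Jun 2016 15:12:16 +0200 -->"
+# should make parse_xcomment(soup, "X-Date") return "Thu, 23 Jun 2016 15:12:16 +0200"
+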
+def to_mbox_message(msg):
+    mime = MIMEText('', 'plain', _charset='utf8')
+    mime['From'] = msg['from']
+    mime['Subject'] = msg['subject']
+    mime['Message-Id'] = msg['message-id']
+    mime['Date'] = msg['date']
+    mime.set_payload(msg['content'], charset='utf8')
+    mbox_message = mailbox.mboxMessage(mime)
+    mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
+    return mbox_message
+
+# throws exception
+def write_mbox_message(msg, mbox):
+    mbox_msg = to_mbox_message(msg)
+    mbox.add(mbox_msg) # here
+    if u'follow-up' in msg:
+        for f in msg['follow-up']:
+            write_mbox_message(f, mbox)
+
+
+
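+# for reference, a collected thread roughly has this shape (keys as set in
+# archive_thread() / collect_message(); illustrative only, values elided):
+# {
+#   "id": ..., "subject": ..., "url": ..., "author_name": ...,
+#   "to": ..., "from": ..., "date": ..., "message-id": ..., "content-type": ...,
+#   "content": ...,
+#   "follow-up": [ ...nested replies of the same shape... ]
+# }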
diff --git a/lib/nettime.py b/lib/nettime.py
new file mode 100644
index 0000000..56af522
--- /dev/null
+++ b/lib/nettime.py
@@ -0,0 +1,26 @@
+import urllib2, urllib, urlparse
+import os, re, json, gzip
+import mhonarccrawl
+import datetime
+
+def archive_from_url(url, sublist_name="nettime-l", archive_dir="archives"):
+    url = url.rstrip()
+    archive_list_dir = check_dir(archive_dir, sublist_name)
+
+    archive_name = sublist_name.lower()
+    archive_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
+    archive = {'name' : sublist_name.lower(), 'url': url, 'date': archive_date, 'threads' : []}
+
+    archive['threads'] = mhonarccrawl.collect_from_url(url, sublist_name, archive_list_dir, mbox=True)
+
+    file_path = os.path.join(archive_dir, archive_name + "_" + archive_date + ".json.gz")
+    with gzip.open(file_path, 'w') as fp:
+        json.dump(archive, fp, indent=4)
+
+    return
+
+def check_dir(base_dir, list_name):
+    arc_dir = os.path.join(base_dir, list_name)
+    if not os.path.exists(arc_dir):
+        os.makedirs(arc_dir)
+    return arc_dir
\ No newline at end of file