nettime/lib/mhonarccrawl.py

import urllib2, urllib, urlparse
import logging
from bs4 import BeautifulSoup
import email, email.parser
from email.mime.text import MIMEText
import mailbox
import time, dateutil, string
from pprint import pprint as pp
import sys, os, re, json, gzip
import traceback

# Pause, in seconds, taken after each top-level thread entry that was
# archived successfully (see collect_threads_from_url).
DELAY = 0.2

# hack for the mailbox module (re: force mbox.add() encoding to utf8)
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again
# (site.py removes it at startup). Neither call exists on Python 3.
reload(sys) 
sys.setdefaultencoding('utf8')


def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
    """Crawl every thread index of one sub-list found on an archive page.

    The page at ``url`` is fetched and parsed.  The items of its second
    ``<ul>`` are scanned for one whose ``<strong>`` label equals
    ``sublist_name`` (case-insensitive).  Every link nested under that item
    is resolved against the page's ``<base href>`` and handed to
    ``collect_threads_from_url``.

    Args:
        url: address of the archive overview page.
        sublist_name: label of the sub-list to crawl.
        base_arch_dir: directory passed through to
            ``collect_threads_from_url``.
        mbox: passed through to ``collect_threads_from_url``.

    Returns:
        A list with one ``collect_threads_from_url`` result per thread
        index, or ``None`` when no item matches ``sublist_name``.

    Raises:
        IndexError: when the page has no ``<base>``/``<title>`` at the
            expected position.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    # base url
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')

    # collect name
    list_name = soup.select('body p:nth-of-type(2) base title')[0].string
    logging.info("Getting %s list archive for %s", list_name, sublist_name)

    wanted = sublist_name.lower()

    for item in soup.select('ul:nth-of-type(2) li'):
        if item.strong is None:
            continue

        # .string is None when <strong> wraps more than a single text node;
        # such an item cannot be the label we are looking for.
        name = item.strong.string
        if name is None or name.lower() != wanted:
            continue

        thread_urls = [urlparse.urljoin(base_url, link.get('href'))
                       for link in item.select('ul li a')]
        total = len(thread_urls)

        threads = []
        for n, thread_url in enumerate(thread_urls, 1):
            logging.info("## %d / %d ##", n, total)
            threads.append(
                collect_threads_from_url(thread_url, base_arch_dir, mbox))

        # Only the first matching sub-list is crawled.
        return threads

    return None

def collect_threads_from_url(url, base_arch_dir, mbox):
    """Archive all threads listed on one thread-index page.

    The archive is cached as ``<base_arch_dir>/<name>.json``, where ``name``
    is the page title with spaces replaced by underscores.  When that file
    already exists it is loaded instead of crawling the messages again.
    Otherwise every item selected by ``'ul:nth-of-type(1) > li'`` is passed
    to ``archive_thread`` and the result is written to the JSON file.

    Args:
        url: address of the thread-index page; also used as base URL for the
            message links.
        base_arch_dir: directory holding the JSON (and mbox) archives.
        mbox: when true, additionally export the archive as an mbox file.

    Returns:
        A dict with the keys ``'name'``, ``'url'`` and ``'threads'`` (or
        whatever was stored in the cached JSON file).

    Raises:
        IndexError: when the page has no ``<title>`` at the expected place.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    # base url
    base_url = url

    # collect name
    threads_name = soup.select('p:nth-of-type(1) title')[0].string
    threads_name = threads_name.replace(' ', '_')

    # thread data struct
    threads = {'name': threads_name, 'url': base_url, 'threads': []}

    logging.info("Collecting Threads of: " + threads_name)

    # check if archive already exists
    file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
        items = soup.select('ul:nth-of-type(1) > li')
        total = len(items)

        for n, item in enumerate(items, 1):
            logging.info("> %d / %d", n, total)

            try:
                thread = archive_thread(item, base_url, None)
            except Exception:
                # Best effort: one broken entry must not abort the crawl.
                # KeyboardInterrupt/SystemExit are deliberately not caught
                # so a long crawl can still be interrupted.
                logging.exception("could not archive entry %d / %d of %s",
                                  n, total, url)
            else:
                threads['threads'].append(thread)
                # Pause only after an entry that was actually archived.
                time.sleep(DELAY)

        # write
        logging.info("writing archive to file " + file_path)

        with open(file_path, 'w') as fp:
            json.dump(threads, fp, indent=4)

    if mbox:
        _export_mbox(threads, base_arch_dir)

    logging.info("done. ")

    return threads


def _export_mbox(threads, base_arch_dir):
    """Write ``threads`` to ``<name>.txt`` (mbox) plus a gzipped copy.

    Nothing is written when the mbox file already exists.
    """
    mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
    mbox_path_gz = mbox_path + ".gz"
    logging.info("writing mbox  " + mbox_path)

    if os.path.isfile(mbox_path):
        logging.info("mbox  " + mbox_path + " already exists.")
        return

    box = mailbox.mbox(mbox_path)
    box.lock()
    try:
        for thread in threads['threads']:
            write_mbox_message(thread, box)
        box.flush()
    except Exception:
        # NOTE(review): the file created above stays on disk after a
        # failure, so later runs report it as already existing -- confirm
        # whether that is intended.
        logging.exception("could not write mbox " + mbox_path)
    finally:
        box.unlock()

    # Copy the bytes verbatim; text mode could alter or reject them.
    with open(mbox_path, 'rb') as fpin, gzip.open(mbox_path_gz, 'wb') as fpout:
        fpout.writelines(fpin)

    
def archive_thread(li, base_url, parent_thread_data):
    """Archive the message behind one thread-index item and its replies.

    The first ``strong a`` link inside ``li`` supplies the id (``name``
    attribute), the subject and the message URL; the first ``<em>``
    supplies the author name.  ``collect_message`` then fills in the
    remaining fields.  Every ``li`` matched by ``'ul > li'`` that contains
    such a link is archived recursively with this message as its parent.

    Args:
        li: parsed ``<li>`` element of the thread index.
        base_url: URL the link's ``href`` is resolved against.
        parent_thread_data: message dict of the parent, or ``None`` for a
            top-level thread.  When given, the new message is appended to
            its ``u'follow-up'`` list (created on demand).

    Returns:
        The message dict built for ``li``.
    """
    anchor = li.select('strong a')[0]

    message = {
        u'id': anchor.get('name'),
        u'subject': anchor.string,
        u'url': urlparse.urljoin(base_url, anchor.get('href')),
        u'author_name': li.select('em')[0].string,
    }

    collect_message(message[u'url'], message)

    # NOTE(review): 'ul > li' is matched against all descendants of `li`,
    # so replies nested more than one level deep look like they are visited
    # here as well as by the recursive call -- confirm against real markup.
    for reply in li.select('ul > li'):
        if reply.select('strong a'):
            archive_thread(reply, base_url, message)  ## recursion

    if parent_thread_data is not None:
        parent_thread_data.setdefault(u'follow-up', []).append(message)

    return message


def collect_message(url, message):
    """Fetch one message page and store its fields in ``message``.

    The MHonArc X-comments provide ``subject``, ``date``, ``from``,
    ``message-id`` and ``content-type``.  Fields displayed in the first
    ``<ul>`` of the page (``<li><em>Label</em>: value``) then override or
    add ``to``, ``subject``, ``from``, ``date``, ``message-id`` and
    ``content-type``.  The text of the second ``<pre>`` becomes
    ``content``.

    Args:
        url: address of the message page.
        message: dict that is updated in place.

    Raises:
        IndexError: when the page has no second ``<pre>`` element.
    """
    print(url)

    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>

    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page
    for item in soup.select('ul:nth-of-type(1) > li'):
        if item.em is None:
            continue

        # .string is None when <em> holds more than a single text node.
        field = item.em.string
        if field is None:
            continue

        key = field.lower()
        if key not in message_labels:
            continue

        # Drop the leading "Label:" as a prefix.  str.strip(chars) treats
        # its argument as a set of characters and would also remove
        # matching letters from both ends of the value itself.
        value = item.text.strip()
        if value.startswith(field):
            value = value[len(field):]
        message[key] = value.lstrip(':').strip()

    ## reformat from -- [author_name, email_addr]

    # from_addr = email.utils.parseaddr(message['from'])
    # message['author_name'] = from_addr[0]
    # message['from'] = from_addr[1]

    ## -- content --
    message['content'] = soup.select('pre:nth-of-type(2)')[0].text

# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
    """Return the value of the MHonArc X-comment named ``xcom``.

    The first text node of ``soup`` containing ``xcom`` is used.  Optional
    ``<!--`` / ``-->`` delimiters and the leading ``"<xcom>:"`` label are
    removed as prefix/suffix, so the characters of the value itself are
    never touched.

    Args:
        soup: parsed document offering ``find(text=<compiled regex>)``.
        xcom: header name, e.g. ``"X-Subject"``.

    Returns:
        The stripped value, or ``None`` when no node contains ``xcom``.
    """
    # Escape the name so it is matched literally, not as a regex.
    com = soup.find(text=re.compile(re.escape(xcom)))
    if com is None:
        return None

    value = com.strip()
    if value.startswith('<!--'):
        value = value[len('<!--'):]
    if value.endswith('-->'):
        value = value[:-len('-->')]
    value = value.strip()

    label = xcom + ":"
    if value.startswith(label):
        value = value[len(label):]
    return value.strip()

def to_mbox_message(msg):
    """Convert an archived message dict into a ``mailbox.mboxMessage``.

    Args:
        msg: dict with the keys ``'from'``, ``'subject'``, ``'message-id'``,
            ``'date'`` and ``'content'``.  Header values may be ``None``;
            such headers are left out.

    Returns:
        The mbox message.  Its envelope sender is ``msg['from']`` (or
        ``'MAILER-DAEMON'`` when that is missing) followed by the parsed
        date when one is available.
    """
    mime = MIMEText('', 'plain', _charset='utf8')

    header_keys = (('From', 'from'), ('Subject', 'subject'),
                   ('Message-Id', 'message-id'), ('Date', 'date'))
    for header, key in header_keys:
        value = msg[key]
        # A None header value cannot be serialised; skip it.
        if value is not None:
            mime[header] = value

    mime.set_payload(msg['content'], charset='utf8')
    # The empty placeholder body above made MIMEText declare base64, and
    # set_payload() keeps that header while storing the real body
    # unencoded.  Declare what is actually stored.
    del mime['Content-Transfer-Encoding']
    mime['Content-Transfer-Encoding'] = '8bit'

    mbox_message = mailbox.mboxMessage(mime)
    sender = msg['from'] or 'MAILER-DAEMON'
    sent = email.utils.parsedate(msg['date']) if msg['date'] else None
    mbox_message.set_from(sender, sent)
    return mbox_message

# throws exception
def write_mbox_message(msg, mbox):
    """Add ``msg`` to ``mbox``, then each of its follow-ups, depth first.

    Args:
        msg: message dict; replies are read from its ``u'follow-up'`` key
            when present.
        mbox: mailbox object whose ``add`` method receives each message.

    Exceptions raised while converting or adding a message propagate to
    the caller.
    """
    mbox.add(to_mbox_message(msg))  # here
    for reply in msg.get(u'follow-up', ()):
        write_mbox_message(reply, mbox)
haha! commit 2016-06-23 15:12:16 +02:00			`import urllib2, urllib, urlparse`
			`import logging`
			`from bs4 import BeautifulSoup`
			`import email, email.parser`
			`from email.mime.text import MIMEText`
			`import mailbox`
			`import time, dateutil, string`
			`from pprint import pprint as pp`
			`import sys, os, re, json, gzip`
			`import traceback`

archive directory arg/option 2016-06-27 16:08:51 +02:00			`DELAY = 0.2`
haha! commit 2016-06-23 15:12:16 +02:00
			`# hack for the mailbox module (re: force mbox.add() encoding to utf8)`
			`reload(sys)`
			`sys.setdefaultencoding('utf8')`


			`def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):`

			`response = urllib2.urlopen(url)`
			`html = response.read()`
			`soup = BeautifulSoup(html, "html.parser")`

			`# base url`
			`base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')`

			`#collect name`
			`list_name = soup.select('body p:nth-of-type(2) base title')[0].string`
			`logging.info("Getting " + list_name + " list archive for " + sublist_name)`

			`lists = soup.select('ul:nth-of-type(2) li')`

			`threads = []`

			`for l in lists:`

			`if l.strong is None:`
			`continue`

			`name = l.strong.string`

			`if name.lower() == sublist_name.lower():`

			`threads_url_list = []`
			`threads_links = l.select('ul li a')`
			`for t in threads_links:`
			`thread_url = urlparse.urljoin(base_url, t.get('href'))`
			`threads_url_list.append(thread_url)`

			`nbr_threads = str(len(threads_url_list))`
			`n = 0`

			`for u in threads_url_list:`
			`n += 1`
			`logging.info("## " + str(n) + " / " + nbr_threads + " ##")`
			`threads.append(collect_threads_from_url(u, base_arch_dir, mbox))`

			`return threads`

			`# for u in threads_url_list[0:10]:`
			`# print "---------------------------------------"`
			`# tt = collect_threads_from_url(u, base_arch_dir, mbox)`
			`# threads.append(tt)`


			`return None`

			`def collect_threads_from_url(url, base_arch_dir, mbox):`

			`response = urllib2.urlopen(url)`
			`html = response.read()`
			`soup = BeautifulSoup(html, "html.parser")`

			`# base url`
			`base_url = url`

			`# collect name`
			`threads_name = soup.select('p:nth-of-type(1) title')[0].string`
			`threads_name = threads_name.replace(' ', '_')`

			`# thread data struct`
			`threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}`

			`logging.info("Collecting Threads of: " + threads_name)`

			`# check if archive already exists`
			`file_path = os.path.join(base_arch_dir, threads['name'] + ".json")`
			`if os.path.isfile(file_path):`
			`logging.info("archive already exists. loading from file " + file_path)`
			`with open(file_path, 'r') as fpin:`
			`threads = json.load(fpin)`
			`else:`
			`lists = soup.select('ul:nth-of-type(1) > li')`

			`nbr_threads = str(len(lists))`
			`n = 0`

			`for l in lists:`
			`n += 1`
			`logging.info("> " + str(n) + " / " + nbr_threads)`
archive directory arg/option 2016-06-27 16:08:51 +02:00
			`try:`
			`thread = archive_thread(l, base_url, None)`
			`threads['threads'].append(thread)`
			`except:`
			`ex_type, ex, tb = sys.exc_info()`
			`print ex_type`
			`print ex`
			`traceback.print_tb(tb)`
			`del tb`
			`continue`

haha! commit 2016-06-23 15:12:16 +02:00			`time.sleep(DELAY)`

			`# write`
			`logging.info("writing archive to file " + file_path)`

			`with open(file_path, 'w') as fp:`
			`json.dump(threads, fp, indent=4)`

			`if mbox:`
			`mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")`
			`mbox_path_gz = mbox_path + ".gz"`
			`logging.info("writing mbox " + mbox_path)`
			`if not os.path.isfile(mbox_path):`
			`box = mailbox.mbox(mbox_path)`
			`box.lock()`
			`try:`
			`for t in threads['threads']:`
			`write_mbox_message(t, box)`
			`box.flush()`
			`except:`
			`ex_type, ex, tb = sys.exc_info()`
			`print ex_type`
			`print ex`
			`traceback.print_tb(tb)`
			`del tb`
			`finally:`
			`box.unlock()`

			`with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:`
			`fpout.writelines(fpin)`

			`else:`
			`logging.info("mbox " + mbox_path + " already exists.")`

			`logging.info("done. ")`

			`return threads`



			`def archive_thread(li, base_url, parent_thread_data):`

			`thread_link = li.select('strong a')[0]`
			`thread_url = urlparse.urljoin(base_url, thread_link.get('href'))`
			`thread_id = thread_link.get('name')`
			`thread_title = thread_link.string`
			`thread_author_name = li.select('em')[0].string`

			`message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}`

			`collect_message(thread_url, message)`

			`follow = li.select('ul > li')`
			`if len(follow) > 0:`
			`for f in follow:`
			`follow_link = f.select('strong a')`
			`if len (follow_link) > 0:`
			`archive_thread(f, base_url, message) ## recursion`

			`if parent_thread_data is None:`
			`return message`

			`if u'follow-up' not in parent_thread_data:`
			`parent_thread_data[u'follow-up'] = []`

			`parent_thread_data[u'follow-up'].append(message)`

			`return message`


			`def collect_message(url, message):`

			`print url`

			`response = urllib2.urlopen(url)`
			`html = response.read()`
			`soup = BeautifulSoup(html, "html.parser")`

			`#note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>`

			`message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')`

			`# mhonarc xcomments`
			`# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html`
			`message['subject'] = parse_xcomment(soup, "X-Subject")`
			`message['date'] = parse_xcomment(soup, "X-Date")`
			`message['from'] = parse_xcomment(soup, "X-From-R13") #useless...`
			`message['message-id'] = parse_xcomment(soup, 'X-Message-Id')`
			`message['content-type'] = parse_xcomment(soup, 'X-Content-Type')`

			`# parse what is displayed on the page`

			`info = soup.select('ul:nth-of-type(1) > li')`

			`for i in info:`
			`if i.em == None:`
			`continue`
			`field = i.em.string`
			`if field.lower() in message_labels:`
			`message[field.lower()] = i.text.strip(field + ": ")`

			`## reformat from -- [author_name, email_addr]`

			`# from_addr = email.utils.parseaddr(message['from'])`
			`# message['author_name'] = from_addr[0]`
			`# message['from'] = from_addr[1]`

			`## -- content --`
			`message['content'] = soup.select('pre:nth-of-type(2)')[0].text`

			`# mhonarc xcomments`
			`# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html`
			`def parse_xcomment(soup, xcom):`
			`com = soup.find(text=re.compile(xcom))`
			`if com is not None:`
			`return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()`
			`return com`

			`def to_mbox_message(msg):`
			`mime = MIMEText('', 'plain', _charset='utf8')`
			`mime['From'] = msg['from']`
			`mime['Subject'] = msg['subject']`
			`mime['Message-Id'] = msg['message-id']`
			`mime['Date'] = msg['date']`
			`mime.set_payload(msg['content'], charset='utf8')`
			`mbox_message = mailbox.mboxMessage(mime)`
			`mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))`
			`return mbox_message`

			`# throws exception`
			`def write_mbox_message(msg, mbox):`
			`mbox_msg = to_mbox_message(msg)`
			`mbox.add(mbox_msg) # here`
			`if u'follow-up' in msg:`
			`for f in msg['follow-up']:`
			`write_mbox_message(f, mbox)`