renamed lib to nettime

2016-07-21 16:04:43 +02:00
parent 63181b37f3
commit fd71309119
4 changed files with 0 additions and 0 deletions
@@ -0,0 +1,403 @@
+import numpy as np
+import pandas as pd
+import email, email.parser
+import os, datetime, json, gzip, re
+from random import randint
+
+def format_from(from_str):
+	from_addr = email.utils.parseaddr(from_str)[1]
+	if '{AT}' not in from_addr:
+		tok = from_str.split()
+		try:
+			at = tok.index('{AT}')
+			from_addr = ''.join(tok[at-1:at+2])
+			if from_addr.startswith('<') or from_addr.endswith('>'):
+				from_addr = from_addr.strip('<').strip('>')
+		except ValueError:
+			return None
+	return from_addr.lower()
+
+def format_date(date_str):
+	try:
+		date_tz = email.utils.parsedate_tz(date_str)
+		time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
+	except TypeError:
+		print "Format Date TypeError"
+		print "  > " + date_str
+		return None
+	except ValueError:
+		print "Format Date ValueError"
+		print "  > " + date_str
+		return None
+
+	dt = datetime.datetime.fromtimestamp(time_tz)
+
+	try:
+		pdt = pd.to_datetime(dt)
+		return pdt
+	except pd.tslib.OutOfBoundsDatetime:
+		print 'time out of bound'
+		print dt
+		return None
+
+def message_to_tuple_record(msg, records, references=None):
+
+	# check date first?
+	date_time = format_date(msg['date'])
+	if not date_time:
+		return
+
+	# filter date?
+	nettime_min_date = pd.to_datetime('01/10/1995', format='%d/%m/%Y')
+	nettime_max_date = pd.to_datetime(datetime.datetime.now())
+	if date_time < nettime_min_date or date_time > nettime_max_date:
+		return None
+
+	# check / filter from email address second?
+	from_addr = format_from(msg['from'])
+	if not from_addr:
+		return
+
+	records.append((msg['message-id'],
+						from_addr,
+						msg['author_name'],
+						msg['subject'],
+						date_time,
+						msg['url'],
+						len(msg['content']),
+						0 if not msg.has_key('follow-up') else len(msg['follow-up']),
+						references))
+
+	if msg.has_key('follow-up'):
+		for f in msg['follow-up']:
+			message_to_tuple_record(f, records, references=msg['message-id'])
+
+	return 
+
+def json_data_to_pd_dataframe(json_data):
+
+	records = []
+	for d in json_data:
+		for dd in d['threads']:
+			message_to_tuple_record(dd, records)
+
+	df = pd.DataFrame.from_records(records,
+						index='date',
+						columns=['message-id',
+									'from',
+									'author',
+									'subject',
+									'date',
+									'url',
+									'content-length',
+									'nbr-references',
+									'references'])
+
+	df.index.name = 'date'
+
+	return df
+
+def load_from_file(filename, archive_dir):
+
+	json_data = None
+	if not filename.endswith('.json.gz'):
+		file_path = os.path.join(archive_dir, filename + '.json.gz')
+	else:
+		file_path = os.path.join(archive_dir, filename)
+
+	if os.path.isfile(file_path):
+		with gzip.open(file_path, 'r') as fp:
+			json_data = json.load(fp)
+			return json_data_to_pd_dataframe(json_data['threads'])
+	else:
+		#list of all "filename[...].json.gz" in archive_dir
+		files = sorted([f for f in os.listdir(archive_dir) if os.path.isfile(os.path.join(archive_dir, f)) and f.startswith(filename) and f.endswith('.json.gz')])
+		if files:
+			filename = files[-1] # take the most recent (listed alpha-chronological)
+			file_path = os.path.join(archive_dir, filename)
+			if os.path.isfile(file_path):
+				with gzip.open(file_path, 'r') as fp:
+					json_data = json.load(fp)
+					return json_data_to_pd_dataframe(json_data['threads'])
+		else:
+			#list of all json files in archive_dir/filename
+			dir_path = os.path.join(archive_dir, filename)
+			if not os.path.isdir(dir_path):
+				return None
+
+			files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json')]
+			if not files:
+				return None
+
+			# load all json files
+			threads = []
+			for file_path in files:
+				with open(file_path, 'r') as fp:
+					json_data = json.load(fp)
+					threads.append(json_data)
+
+			return json_data_to_pd_dataframe(threads)
+				
+
+class Archive:
+
+	
+	data = None				# "raw" json data
+	dataframe = None 		# main pd dataframe
+
+	activity = None			# (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
+	content_length = None	# (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
+
+	threads = None
+
+	def __init__(self, data="nettime-l", archive_dir="archives"):
+
+		if isinstance(data, pd.core.frame.DataFrame):
+			self.dataframe = data.copy()
+
+		if isinstance(data, str):
+			self.dataframe = load_from_file(data, archive_dir)
+
+	'''
+	activity
+	'''			
+
+	def _activity(self):
+
+		if self.activity is None:
+			from_index = self.dataframe.reindex(columns=['from'])
+			self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)
+
+		return self.activity
+
+	def activity_from(self, email_address, resolution='M'):
+
+		eaddr = email_address.replace('@', '{at}').lower()
+
+		self._activity()
+		try:
+			if resolution.lower() == 'm':
+				return self.activity[eaddr]
+			elif resolution.lower() == 'y':
+				y = self.activity[eaddr].resample('AS').sum()
+				y.index = y.index.year
+				return y
+			else:
+				return None
+		except KeyError:
+			return None
+
+	def activity_overall(self, resolution='M'):
+
+		self._activity()
+		try:
+			sum_activity_month = self.activity.sum(axis=1)
+			if resolution.lower() == 'm':
+				sum_activity_month.rename
+				return sum_activity_month
+			elif resolution.lower() == 'y':
+				y = sum_activity_month.resample('AS').sum()
+				y.index = y.index.year
+				return y
+			else:
+				return None
+		except:
+			return None
+
+	def activity_from_ranking(self, resolution='M', rank=5, filter_nettime=True):
+		# finish this -- re resolution AND filtering
+		self._activity()
+		afr = self.activity.sum(axis=0).order(ascending=False)
+		if filter_nettime:
+			p = r'^((?!nettime*).)*$'
+			afr = afr[afr.index.str.contains(p)]
+		return afr[:rank]
+
+	def plot_activity_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
+
+		activity_rank = self.activity_from_ranking(rank=rank).keys()
+		series = []
+		for k in activity_rank:
+			series.append(self.activity_from(k, resolution))
+			
+		df = pd.concat(series, axis=1)
+		
+		colors = np.random.rand(len(df),3)
+
+		if figsize:
+			df.plot(colors=colors, figsize=figsize)
+		else:
+			df.plot(colors=colors)
+
+	'''
+	content lenght
+	'''
+
+	def _content_length(self):
+
+		if self.content_length is None:
+			from_content_index = self.dataframe.reindex(columns=['from', 'content-length'])
+			self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()
+			self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)
+
+		return self.content_length
+
+	def content_length_from(self, email_address, resolution='M'):
+
+		eaddr = email_address.replace('@', '{at}').lower()
+
+		self._content_length()
+		try:
+			if resolution.lower() == 'm':
+				return self.content_length[eaddr]
+			elif resolution.lower() == 'y':
+				y = self.content_length[eaddr].resample('AS').sum()
+				y.index = y.index.year
+				return y
+			else:
+				return None
+		except KeyError:
+			return None
+
+	def content_length_overall(self):
+
+		self._content_length()
+		try:
+			sum_content_length_month = self.content_length.sum(axis=1)
+			if resolution.lower() == 'm':
+				return sum_content_length_month
+			elif resolution.lower() == 'y':
+				y = sum_content_length_month.resample('AS').sum()
+				y.index = y.index.year
+				return y
+			else:
+				return None
+		except:
+			return None
+
+	def content_length_from_ranking(self, resolution='M', rank=5, filter_nettime=True):
+		# finish this -- re resolution
+		self._content_length()
+		cfr = self.content_length.sum(axis=0).order(ascending=False)
+		if filter_nettime:
+			p = r'^((?!nettime*).)*$'
+			cfr = cfr[cfr.index.str.contains(p)]
+		return cfr[:rank]
+
+	def plot_content_length_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
+
+		content_rank = self.content_length_from_ranking(rank=rank).keys()
+		series = []
+		for k in content_rank:
+			series.append(self.content_length_from(k, resolution))
+			
+		df = pd.concat(series, axis=1)
+		
+		colors = np.random.rand(len(df),3)
+
+		if figsize:
+			df.plot(colors=colors, figsize=figsize)
+		else:
+			df.plot(colors=colors)
+
+	'''
+	threads
+	'''			
+
+	def _threads(self, thresh=0):
+
+		if self.threads is None:
+			self.threads = self.dataframe[self.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)
+		return self.threads;
+
+	def threads_ranking(self, rank=5):
+
+		self._threads()
+		return self.threads.drop('message-id', axis=1)[:rank]
+
+	def threads_from(self, email_address, resolution='y'):
+
+		freq = 'M'
+		if resolution.lower() == 'y':
+			freq = 'AS'
+		elif resolution.lower() == 'm':
+			freq = 'M'
+		else:
+			return None
+
+		self._threads()
+
+		eaddr = email_address.replace('@', '{at}').lower()
+
+		self._threads()
+		threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
+		threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum()
+		threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
+		return threads_from_ranking[eaddr]
+
+	def threads_from_ranking(self, rank=5, filter_nettime=True):
+
+		self._threads()
+		threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
+		threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq='AS'), 'from']).sum()
+		threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)
+		tfr = threads_from_ranking.sum(axis=0).order(ascending=False)
+
+		if filter_nettime:
+			p = r'^((?!nettime*).)*$'
+			tfr = tfr[tfr.index.str.contains(p)]
+
+		return tfr[:rank]
+
+	def plot_threads_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
+
+		threads_rank = self.threads_from_ranking(rank=rank).keys()
+		series = []
+		for k in threads_rank:
+			series.append(self.threads_from(k, resolution))
+			
+		df = pd.concat(series, axis=1)
+		
+		colors = np.random.rand(len(df),3)
+
+		if figsize:
+			df.plot(colors=colors, figsize=figsize)
+		else:
+			df.plot(colors=colors)
+
+
+	def threads_overall(self, resolution='y', aggregate='sum', tresh=0):
+
+		freq = 'M'
+		if resolution.lower() == 'y':
+			freq = 'AS'
+		elif resolution.lower() == 'm':
+			freq = 'M'
+		else:
+			return None
+
+		agg = aggregate.lower()
+		if not agg in ['sum', 'mean']:
+			return None
+
+		if not self.threads is None:
+			del self.threads
+			self.threads = None
+
+		self._threads(tresh)
+
+		if agg == 'sum':
+			y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()
+		else:
+			y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()
+
+		if freq == 'AS':
+			y.index = y.index.year
+
+		return y
+
+
+
+
+
+
@@ -0,0 +1,252 @@
+import urllib2, urllib, urlparse
+import logging
+from bs4 import BeautifulSoup
+import email, email.parser
+from email.mime.text import MIMEText
+import mailbox
+import time, dateutil, string
+from pprint import pprint as pp
+import sys, os, re, json, gzip
+import traceback
+
+# pause, in seconds, passed to time.sleep() after each archived thread
+DELAY = 0.2
+
+# hack for the mailbox module (re: force mbox.add() encoding to utf8)
+# NOTE(review): reload() / sys.setdefaultencoding() only exist in Python 2, and
+# this runs at import time, i.e. it affects the whole process
+reload(sys) 
+sys.setdefaultencoding('utf8')
+
+
+def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
+
+    response = urllib2.urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, "html.parser")
+
+    # base url 
+    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+
+	#collect name
+    list_name = soup.select('body p:nth-of-type(2) base title')[0].string
+    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+
+    lists = soup.select('ul:nth-of-type(2) li')
+
+    threads = []
+
+    for l in lists:
+
+    	if l.strong is None:
+    		continue
+
+    	name = l.strong.string
+
+    	if name.lower() == sublist_name.lower():
+
+            threads_url_list = []
+            threads_links = l.select('ul li a')
+            for t in threads_links:
+                thread_url = urlparse.urljoin(base_url, t.get('href'))
+                threads_url_list.append(thread_url)
+
+            nbr_threads = str(len(threads_url_list))
+            n = 0
+
+            for u in threads_url_list:
+                n += 1
+                logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+                threads.append(collect_threads_from_url(u, base_arch_dir, mbox))                
+
+            return threads
+
+            # for u in threads_url_list[0:10]:
+            #     print "---------------------------------------"
+            #     tt = collect_threads_from_url(u, base_arch_dir, mbox)
+            #     threads.append(tt)
+                
+
+    return None
+
+def collect_threads_from_url(url, base_arch_dir, mbox):
+
+    response = urllib2.urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, "html.parser")
+
+    # base url 
+    base_url = url
+
+    # collect name
+    threads_name = soup.select('p:nth-of-type(1) title')[0].string
+    threads_name = threads_name.replace(' ', '_')
+
+    # thread data struct
+    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
+
+    logging.info("Collecting Threads of: " + threads_name)
+
+    # check if archive already exists
+    file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
+    if os.path.isfile(file_path):
+        logging.info("archive already exists. loading from file " + file_path)
+        with open(file_path, 'r') as fpin:
+            threads = json.load(fpin)
+    else:
+        lists = soup.select('ul:nth-of-type(1) > li')
+
+        nbr_threads = str(len(lists))
+        n = 0
+
+        for l in lists:
+            n += 1
+            logging.info("> " + str(n) + " / " + nbr_threads)
+
+            try:
+                thread = archive_thread(l, base_url, None)
+                threads['threads'].append(thread)
+            except:
+                ex_type, ex, tb = sys.exc_info()
+                print ex_type
+                print ex
+                traceback.print_tb(tb)
+                del tb                
+                continue
+
+            time.sleep(DELAY)
+
+        # write 
+        logging.info("writing archive to file " + file_path)
+
+        with open(file_path, 'w') as fp:
+            json.dump(threads, fp, indent=4)
+
+    if mbox:
+        mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
+        mbox_path_gz = mbox_path + ".gz"
+        logging.info("writing mbox  " + mbox_path)
+        if not os.path.isfile(mbox_path):
+            box = mailbox.mbox(mbox_path)
+            box.lock()
+            try:
+                for t in threads['threads']:
+                    write_mbox_message(t, box)
+                box.flush()
+            except:
+                ex_type, ex, tb = sys.exc_info()
+                print ex_type
+                print ex
+                traceback.print_tb(tb)
+                del tb
+            finally:
+                box.unlock()
+
+            with open(mbox_path) as fpin, gzip.open(mbox_path + '.gz', 'wb') as fpout:
+                fpout.writelines(fpin)
+
+        else:
+            logging.info("mbox  " + mbox_path + " already exists.")            
+
+    logging.info("done. ")
+
+    return threads
+
+    
+
+def archive_thread(li, base_url, parent_thread_data):
+
+	thread_link = li.select('strong a')[0]
+	thread_url = urlparse.urljoin(base_url, thread_link.get('href'))
+	thread_id = thread_link.get('name')
+	thread_title = thread_link.string
+	thread_author_name = li.select('em')[0].string
+
+	message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+	collect_message(thread_url, message)
+
+	follow = li.select('ul > li')
+	if len(follow) > 0:
+		for f in follow:
+			follow_link = f.select('strong a')
+			if len (follow_link) > 0:
+				archive_thread(f, base_url, message)  ## recursion
+	
+	if parent_thread_data is None:
+		return message
+
+	if u'follow-up' not in parent_thread_data:
+		parent_thread_data[u'follow-up'] = []
+
+	parent_thread_data[u'follow-up'].append(message)
+
+	return message
+
+
+def collect_message(url, message):
+
+    print url
+
+    response = urllib2.urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, "html.parser")
+
+    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
+
+    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')    
+
+    # mhonarc xcomments
+    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+    message['subject'] = parse_xcomment(soup, "X-Subject")
+    message['date'] = parse_xcomment(soup, "X-Date")
+    message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
+    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
+    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')
+
+    # parse what is displayed on the page
+
+    info = soup.select('ul:nth-of-type(1) > li')
+
+    for i in info:
+        if i.em == None:
+            continue
+    	field = i.em.string
+    	if field.lower() in message_labels:
+    		message[field.lower()] = i.text.strip(field + ": ")
+
+    ## reformat from -- [author_name, email_addr]
+
+    # from_addr = email.utils.parseaddr(message['from'])
+    # message['author_name'] = from_addr[0]
+    # message['from'] = from_addr[1]
+
+    ## -- content --
+    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+
+# mhonarc xcomments
+# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+def parse_xcomment(soup, xcom):
+    com = soup.find(text=re.compile(xcom))
+    if com is not None:
+        return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
+    return com
+
+def to_mbox_message(msg):
+    mime = MIMEText('', 'plain', _charset='utf8')
+    mime['From'] = msg['from']
+    mime['Subject'] = msg['subject']
+    mime['Message-Id'] = msg['message-id']
+    mime['Date'] = msg['date']
+    mime.set_payload(msg['content'], charset='utf8')
+    mbox_message = mailbox.mboxMessage(mime)
+    mbox_message.set_from(mime['From'], email.utils.parsedate(mime['Date']))
+    return mbox_message
+
+# throws exception
+def write_mbox_message(msg, mbox):
+    mbox_msg = to_mbox_message(msg)
+    mbox.add(mbox_msg) # here
+    if u'follow-up' in msg:
+        for f in msg['follow-up']:
+            write_mbox_message(f, mbox)
+
+
+
@@ -0,0 +1,26 @@
+import urllib2, urllib, urlparse
+import os, re, json, gzip
+import mhonarccrawl
+import datetime
+
+def archive_from_url(url, sublist_name="nettime-l", archive_dir="archives"):
+    url = url.rstrip()
+    archive_list_dir = check_dir(archive_dir, sublist_name)
+
+    archive_name = sublist_name.lower()
+    archive_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
+    archive = {'name' : sublist_name.lower(), 'url': url, 'date': archive_date, 'threads' : []}
+
+    archive['threads'] = mhonarccrawl.collect_from_url(url, sublist_name, archive_list_dir, mbox=True)
+
+    file_path = os.path.join(archive_dir, archive_name + "_" + archive_date + ".json.gz")
+    with gzip.open(file_path, 'w') as fp:
+        json.dump(archive, fp, indent=4)
+
+    return
+
+def check_dir(base_dir, list_name):
+    arc_dir = os.path.join(base_dir, list_name)
+    if not os.path.exists(arc_dir):
+        os.makedirs(arc_dir)
+    return arc_dir