# listservs/analysis/query.py

import numpy as np
import pandas as pd
import analysis.archive
import logging

class Query:
    """Aggregated, lazily cached views over a mailing-list archive.

    Every query reads ``self.archive.dataframe``. The code relies on that
    frame having a datetime index (rows are binned by month or year) and
    on the columns 'from', 'url', 'content-length', 'nbr-references',
    'references', 'subject' and 'message-id'.

    Methods taking ``resolution`` accept 'y' (yearly bins, labelled '%Y')
    or 'm' (monthly bins, labelled '%Y-%m'), case-insensitively, and
    return None for any other value.

    Methods taking ``series`` return a pandas Series when it is true and
    a one-column DataFrame otherwise.

    Methods taking ``filter_nettime`` drop every sender whose address
    contains the substring 'nettime' when it is true.
    """

    archive = None          # analysis.archive.Archive object
    activity = None         # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    content_length = None   # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    threads = None          # rows with 'nbr-references' > thresh, highest count first
    single_threads = None   # same as threads, restricted to rows whose 'references' == 'X'
    replies = None          # rows whose 'references' != 'X' (columns: from, references)
    non_replies = None      # rows whose 'references' == 'X' (columns: from, references)

    # Thresholds the cached thread frames were built with, so a cache built
    # for one threshold is never served for another.
    _threads_thresh = None
    _single_threads_thresh = None

    _THREAD_COLUMNS = ['from', 'nbr-references', 'subject', 'url', 'message-id']
    _RANKING_COLUMNS = ['subject', 'from', 'nbr-references', 'url']

    def __init__(self, arch=None):
        """Bind the query object to *arch*.

        Raises TypeError (an Exception subclass) when *arch* is not an
        analysis.archive.Archive.
        """
        if not isinstance(arch, analysis.archive.Archive):
            logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")
            raise TypeError("arch must be of type analysis.archive.Archive")

        self.archive = arch

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _period(resolution):
        """Map 'y'/'m' to (offset, strftime pattern, index name), else None."""
        key = resolution.lower()
        if key == 'y':
            return pd.offsets.YearBegin(), '%Y', 'year'
        if key == 'm':
            return pd.offsets.MonthEnd(), '%Y-%m', 'year-month'
        return None

    @staticmethod
    def _label_periods(data, period):
        """Replace the datetime index of *data* by formatted period labels."""
        _, pattern, name = period
        data.index = pd.Index(data.index.strftime(pattern), name=name)
        return data

    @staticmethod
    def _drop_nettime(data):
        """Return *data* without the rows whose index label contains 'nettime'."""
        if len(data.index) == 0:
            return data
        is_nettime = np.asarray(
            data.index.str.contains('nettime', regex=False, na=False), dtype=bool)
        return data.loc[~is_nettime]

    @staticmethod
    def _normalize_address(email_address):
        """Return the address as it appears in the 'from' column lookups."""
        return email_address.replace('@', '{at}').lower()

    @classmethod
    def _binned(cls, values, period, how):
        """Aggregate the Series *values* per period with *how* and label it."""
        grouped = values.groupby(pd.Grouper(freq=period[0]))
        return cls._label_periods(getattr(grouped, how)(), period)

    @staticmethod
    def _as_int_frame(values, column, series):
        """Return *values* itself, or an int-typed one-column frame."""
        if series:
            return values
        return values.to_frame(column).astype(int)

    @staticmethod
    def _monthly_by_sender(frame):
        """Group *frame* by (month of the index, 'from')."""
        # Sort chronologically first so binning does not depend on row order.
        return frame.sort_index().groupby(
            [pd.Grouper(freq=pd.offsets.MonthEnd()), 'from'])

    @staticmethod
    def _ratio_ranking(base, part, labels, rank):
        """Rank senders present in both Series by part / base, descending.

        The result has the columns *labels*: base, part and their ratio.
        """
        shared = base.index.intersection(part.index)
        ratio = part.loc[shared] / base.loc[shared]
        table = pd.concat(
            [base.loc[ratio.index], part.loc[ratio.index], ratio],
            axis=1, keys=labels)
        return table.sort_values(labels[-1], ascending=False).iloc[:rank]

    # ------------------------------------------------------------------
    # activity
    # ------------------------------------------------------------------

    def _activity(self):
        """Build (once) and return the month x sender message-count table."""
        if self.activity is None:
            senders = self.archive.dataframe.reindex(columns=['from'])
            counts = self._monthly_by_sender(senders).size()
            self.activity = counts.unstack('from').fillna(0)

        return self.activity

    def activity_from(self, email_address, resolution='y', series=False):
        """Number of messages sent by *email_address* per period.

        Returns None for an unknown resolution or an unknown sender;
        the frame column is 'nbr-messages'.
        """
        eaddr = self._normalize_address(email_address)

        period = self._period(resolution)
        if period is None:
            return None

        try:
            per_month = self._activity()[eaddr]
        except KeyError:
            return None

        per_period = self._binned(per_month, period, 'sum')
        return self._as_int_frame(per_period, 'nbr-messages', series)

    def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
        """The *rank* senders with the most messages (column 'nbr-messages')."""
        totals = self._activity().sum(axis=0).sort_values(ascending=False)
        if filter_nettime:
            totals = self._drop_nettime(totals)

        return self._as_int_frame(totals.iloc[:rank], 'nbr-messages', series)

    def activity_overall(self, resolution='y', series=False):
        """Number of non-null 'url' entries per period (column 'nbr-messages')."""
        period = self._period(resolution)
        if period is None:
            return None

        per_period = self._binned(self.archive.dataframe['url'], period, 'count')
        return self._as_int_frame(per_period, 'nbr-messages', series)

    def cohort(self, resolution='m', series=False):
        """Number of senders per period, bucketed by their idxmax month.

        Each sender is assigned the month returned by ``idxmax`` on the
        activity table, i.e. the first month holding that sender's
        maximum monthly message count.
        NOTE(review): the frame column is named 'first-messages', which
        suggests the first *active* month may have been intended - confirm.
        """
        period = self._period(resolution)
        if period is None:
            return None

        peak_month = self._activity().idxmax().sort_values()
        members = pd.Series(1, index=pd.DatetimeIndex(peak_month))

        per_period = self._binned(members, period, 'size')
        return self._as_int_frame(per_period, 'first-messages', series)

    # ------------------------------------------------------------------
    # content length
    # ------------------------------------------------------------------

    def _content_length(self):
        """Build (once) and return the month x sender 'content-length' sums."""
        if self.content_length is None:
            sizes = self.archive.dataframe.reindex(columns=['from', 'content-length'])
            sums = self._monthly_by_sender(sizes)['content-length'].sum()
            self.content_length = sums.unstack('from').fillna(0)

        return self.content_length

    def content_length_from(self, email_address, resolution='y', series=False):
        """Summed 'content-length' of *email_address* per period.

        Returns None for an unknown resolution or an unknown sender;
        the frame column is 'nbr-bytes'.
        """
        eaddr = self._normalize_address(email_address)

        period = self._period(resolution)
        if period is None:
            return None

        try:
            per_month = self._content_length()[eaddr]
        except KeyError:
            return None

        per_period = self._binned(per_month, period, 'sum')
        return self._as_int_frame(per_period, 'nbr-bytes', series)

    def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
        """The *rank* senders with the largest summed 'content-length'.

        *resolution* is accepted for signature compatibility but not used.
        """
        totals = self._content_length().sum(axis=0).sort_values(ascending=False)
        if filter_nettime:
            totals = self._drop_nettime(totals)

        return self._as_int_frame(totals.iloc[:rank], 'nbr-bytes', series)

    def content_length_overall(self, resolution='y', series=False):
        """Summed 'content-length' of all senders per period ('nbr-bytes')."""
        period = self._period(resolution)
        if period is None:
            return None

        per_month = self._content_length().sum(axis=1)
        per_period = self._binned(per_month, period, 'sum')
        return self._as_int_frame(per_period, 'nbr-bytes', series)

    # ------------------------------------------------------------------
    # threads
    # ------------------------------------------------------------------

    @classmethod
    def _thread_view(cls, rows):
        """Keep the thread columns of *rows*, highest 'nbr-references' first."""
        return rows.reindex(columns=cls._THREAD_COLUMNS).sort_values(
            'nbr-references', ascending=False)

    def _threads(self, thresh=0):
        """Build the thread caches for *thresh* and return ``self.threads``.

        ``threads`` holds the rows with 'nbr-references' > thresh;
        ``single_threads`` additionally requires 'references' == 'X'.
        A cache is rebuilt when it is empty or was built for another
        threshold.
        """
        logging.debug("doing threads")

        frame = self.archive.dataframe
        above = frame['nbr-references'] > thresh

        if self.threads is None or self._threads_thresh != thresh:
            self.threads = self._thread_view(frame[above])
            self._threads_thresh = thresh

        if self.single_threads is None or self._single_threads_thresh != thresh:
            unreferenced = frame['references'] == 'X'
            self.single_threads = self._thread_view(frame[unreferenced & above])
            self._single_threads_thresh = thresh

        return self.threads

    def threads_ranking(self, rank=5, resolution='y'):
        """Top *rank* threads by 'nbr-references'.

        With ``resolution=None`` a single frame is returned; otherwise a
        dict mapping each period label to the top frame of that period.
        Frames have the columns subject, from, nbr-references, url.
        """
        threads = self._threads()

        if resolution is None:
            return threads.iloc[:rank].reindex(columns=self._RANKING_COLUMNS)

        period = self._period(resolution)
        if period is None:
            return None
        offset, pattern, _ = period

        ranking = {}
        for start, rows in threads.groupby(pd.Grouper(freq=offset)):
            # Time grouping hands the rows back in chronological order, so
            # the ranking has to be re-established inside every period.
            best = rows.sort_values(
                'nbr-references', ascending=False, kind='mergesort').iloc[:rank]
            ranking[start.strftime(pattern)] = best.reindex(columns=self._RANKING_COLUMNS)
        return ranking

    def threads_replies_to(self, email_address, resolution='y', series=False):
        """Summed 'nbr-references' of the threads of *email_address* per period.

        Raises KeyError when the sender has no thread. With ``series``
        the result keeps its datetime index (it is not relabelled, unlike
        the other per-period queries); the frame column is 'nbr-threads'.
        """
        period = self._period(resolution)
        if period is None:
            return None

        eaddr = self._normalize_address(email_address)

        refs = self._threads().reindex(columns=['from', 'nbr-references']).sort_index()
        # sum = adding up nbr references
        sums = refs.groupby([pd.Grouper(freq=period[0]), 'from'])['nbr-references'].sum()
        per_sender = sums.unstack('from').fillna(0)

        if series:
            return per_sender[eaddr]

        result = per_sender[eaddr].to_frame('nbr-threads').astype(int)
        return self._label_periods(result, period)

    def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
        """The *rank* senders with the largest summed 'nbr-references'.

        Returns an int-typed frame with the column 'nbr-references'.
        """
        refs = self._threads().reindex(columns=['from', 'nbr-references'])
        totals = refs.groupby('from').sum().sort_values('nbr-references', ascending=False)

        if filter_nettime:
            totals = self._drop_nettime(totals)

        return totals.iloc[:rank].astype(int)

    def _initiated_threads(self):
        """Number of cached thread rows per sender, largest first."""
        senders = self._threads().reindex(columns=['from'])
        return senders.groupby('from').size().sort_values(ascending=False)

    def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
        """The *rank* senders with the most threads ('nbr-initiated-threads')."""
        counts = self._initiated_threads()
        if filter_nettime:
            counts = self._drop_nettime(counts)

        return self._as_int_frame(counts.iloc[:rank], 'nbr-initiated-threads', series)

    def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True, top=25):
        """Rank senders by threads per message.

        Only the *top* senders by thread count are considered. Columns:
        'messages', 'threads', 'avg.threads'.
        """
        # activity
        messages = self._activity().sum(axis=0).astype(int)
        if filter_nettime:
            messages = self._drop_nettime(messages)

        # initiated threads [top N]
        initiated = self._initiated_threads().iloc[:top]
        if filter_nettime:
            initiated = self._drop_nettime(initiated)

        labels = ['messages', 'threads', 'avg.threads']
        return self._ratio_ranking(messages, initiated, labels, rank)

    def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True, top=25):
        """Rank senders by summed 'nbr-references' per thread.

        Only the *top* senders by summed 'nbr-references' are considered.
        Columns: 'threads', 'replies', 'avg.replies'.
        """
        # initiated
        initiated = self._initiated_threads()
        if filter_nettime:
            initiated = self._drop_nettime(initiated)

        # replies [top N]
        refs = self.threads.reindex(columns=['from', 'nbr-references'])
        replies = refs.groupby('from').sum().sort_values(
            'nbr-references', ascending=False).iloc[:top]
        if filter_nettime:
            replies = self._drop_nettime(replies)
        replies = replies['nbr-references']     # dataframe to series

        labels = ['threads', 'replies', 'avg.replies']
        return self._ratio_ranking(initiated, replies, labels, rank)

    def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):
        """Aggregate 'nbr-references' of the threads per period.

        *aggregate* is 'count' (number of thread rows), 'sum' or 'mean';
        any other value returns None. The thread cache is rebuilt with
        threshold *tresh* on every call. The frame column is 'nbr-threads';
        it is cast to int except for 'mean', which stays float.
        """
        period = self._period(resolution)
        if period is None:
            return None

        agg = aggregate.lower()
        if agg not in ('sum', 'mean', 'count'):
            return None

        self.threads = None
        refs = self._threads(tresh)['nbr-references']

        per_period = self._binned(refs, period, agg)
        if series:
            return per_period

        frame = per_period.to_frame('nbr-threads')
        if agg == 'mean':
            # Periods without threads have no mean (NaN) and cannot be cast.
            return frame
        return frame.astype(int)

    def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):
        """Number of ``single_threads`` rows per period ('nbr-threads').

        *aggregate* is validated ('sum', 'mean' or 'count', else None is
        returned) but not otherwise used: the result is always a count.
        The cache is rebuilt with threshold *tresh* on every call.
        """
        period = self._period(resolution)
        if period is None:
            return None

        agg = aggregate.lower()
        if agg not in ('sum', 'mean', 'count'):
            return None

        self.single_threads = None
        self._threads(tresh)

        per_period = self._binned(self.single_threads['nbr-references'], period, 'count')
        return self._as_int_frame(per_period, 'nbr-threads', series)

    # ------------------------------------------------------------------
    # replies
    # ------------------------------------------------------------------

    def _replies(self):
        """Build (once) ``replies`` / ``non_replies`` and return ``replies``."""
        if self.replies is None:
            frame = self.archive.dataframe
            unreferenced = frame['references'] == 'X'
            columns = ['from', 'references']
            self.replies = frame[~unreferenced].reindex(columns=columns)
            self.non_replies = frame[unreferenced].reindex(columns=columns)
        return self.replies

    @staticmethod
    def _replies_per_sender(rows):
        """Number of rows per sender, largest first."""
        return rows.groupby('from').size().sort_values(ascending=False)

    def replies_ranking(self, rank=5, resolution=None):
        """The *rank* senders with the most replies.

        With ``resolution=None`` one frame is returned (column
        'nbr_replies'); otherwise a dict mapping each period label to a
        frame (column 'nbr-replies'). The two column spellings differ in
        the original code and are kept for compatibility.
        """
        replies = self._replies()

        if resolution is None:
            return self._replies_per_sender(replies).iloc[:rank].to_frame('nbr_replies')

        period = self._period(resolution)
        if period is None:
            return None
        offset, pattern, _ = period

        ranking = {}
        for start, rows in replies.groupby(pd.Grouper(freq=offset)):
            best = self._replies_per_sender(rows).iloc[:rank]
            ranking[start.strftime(pattern)] = best.to_frame('nbr-replies')
        return ranking

    def replies_avg_ranking(self, rank=5, filter_nettime=True, top=25):
        """Rank senders by replies per message.

        Only the *top* senders by reply count are considered. Columns:
        'messages', 'replies', 'avg.replies'.
        """
        # activity
        messages = self._activity().sum(axis=0)
        if filter_nettime:
            messages = self._drop_nettime(messages)

        # replies in thread [top N]
        replies = self._replies_per_sender(self._replies()).iloc[:top]

        labels = ['messages', 'replies', 'avg.replies']
        return self._ratio_ranking(messages, replies, labels, rank)

    def replies_overall(self, resolution='y', series=False):
        """Number of replies per period (column 'nbr-replies').

        The reply cache is rebuilt on every call.
        """
        period = self._period(resolution)
        if period is None:
            return None

        self.replies = None
        references = self._replies()['references']

        per_period = self._binned(references, period, 'count')
        return self._as_int_frame(per_period, 'nbr-replies', series)
many many things... 2017-11-04 13:34:05 +01:00			`import numpy as np`
			`import pandas as pd`
			`import analysis.archive`
			`import logging`

			`class Query:`

			`archive = None # analysis.archive.Archive object`
			`activity = None # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))`
			`content_length = None # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))`
			`threads = None # ...`
			`single_threads = None`
			`replies = None # ...`

			`def __init__(self, arch=None):`

			`if not isinstance(arch, analysis.archive.Archive):`
			`logging.error("Query constructor Error: arch must be of type analysis.archive.Archive")`
			`raise Exception()`

			`self.archive = arch`

			`'''`
			`activity`
			`'''`

			`def _activity(self):`

			`if self.activity is None:`
			`from_index = self.archive.dataframe.reindex(columns=['from'])`
			`self.activity = from_index.groupby([pd.TimeGrouper(freq='M'), 'from']).size().unstack('from').fillna(0)`

			`return self.activity`

			`def activity_from(self, email_address, resolution='y', series=False):`

			`eaddr = email_address.replace('@', '{at}').lower()`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`self._activity()`
			`try:`
			`af = self.activity[eaddr]`
			`except KeyError:`
			`return None`

			`activity_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()`

			`if freq == 'AS':`
			`activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y'))`
			`activity_from.index.name = 'year'`
			`else:`
			`activity_from.index = activity_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`activity_from.index.name = 'year-month'`

			`if series:`
			`return activity_from`

			`return activity_from.to_frame('nbr-messages').astype(int)`

			`def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):`

			`self._activity()`
			`afr = self.activity.sum(axis=0).order(ascending=False)`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`afr = afr[afr.index.str.contains(p)]`

			`if series:`
			`return afr[:rank]`

			`return afr[:rank].to_frame('nbr-messages').astype(int)`


			`# def activity_overall(self, resolution='y', series=False):`

			`# freq = 'M'`
			`# if resolution.lower() == 'y':`
			`# freq = 'AS'`
			`# elif resolution.lower() == 'm':`
			`# freq = 'M'`
			`# else:`
			`# return None`

			`# self._activity()`

			`# y = self.activity.sum(axis=1)`
			`# y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()`

			`# if freq == 'AS':`
			`# y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))`
			`# y.index.name = 'year'`
			`# else:`
			`# y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`# y.index.name = 'year-month'`

			`# if series:`
			`# return y`

			`# return y.to_frame('nbr-messages').astype(int)`

			`def activity_overall(self, resolution='y', series=False):`

			`a = self.archive.dataframe['url']`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`y = self.archive.dataframe['url'].groupby([pd.TimeGrouper(freq=freq)]).count()`

			`if freq == 'AS':`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))`
			`y.index.name = 'year'`
			`else:`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`y.index.name = 'year-month'`

			`if series:`
			`return y`

			`return y.to_frame('nbr-messages').astype(int)`

			`def cohort(self, resolution='m', series=False):`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`self._activity()`

			`c = self.activity.idxmax().order().to_frame('date')`
			`c.index = c['date']`

			`cohort = c.groupby([pd.TimeGrouper(freq=freq)]).size()`

			`if freq == 'AS':`
			`cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y'))`
			`cohort.index.name = 'year'`
			`else:`
			`cohort.index = cohort.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`cohort.index.name = 'year-month'`

			`if series:`
			`return cohort`

			`return cohort.to_frame('first-messages').astype(int)`

			`'''`
			`content lenght`
			`'''`

			`def _content_length(self):`

			`if self.content_length is None:`
			`from_content_index = self.archive.dataframe.reindex(columns=['from', 'content-length'])`
			`self.content_length = from_content_index.groupby([pd.TimeGrouper(freq='M'), 'from']).sum()`
			`self.content_length = self.content_length.reset_index().pivot(columns='from', index='date', values='content-length').fillna(0)`

			`return self.content_length`

			`def content_length_from(self, email_address, resolution='y', series=False):`

			`eaddr = email_address.replace('@', '{at}').lower()`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`self._content_length()`
			`try:`
			`af = self.content_length[eaddr]`
			`except KeyError:`
			`return None`

			`content_length_from = af.groupby([pd.TimeGrouper(freq=freq)]).sum()`

			`if freq == 'AS':`
			`content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y'))`
			`content_length_from.index.name = 'year'`
			`else:`
			`content_length_from.index = content_length_from.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`content_length_from.index.name = 'year-month'`

			`if series:`
			`return content_length_from`

			`return content_length_from.to_frame('nbr-bytes').astype(int)`

			`def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):`

			`self._content_length()`
			`cfr = self.content_length.sum(axis=0).order(ascending=False)`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`cfr = cfr[cfr.index.str.contains(p)]`

			`if series:`
			`return cfr[:rank]`

			`return cfr[:rank].to_frame('nbr-bytes').astype(int)`

			`def content_length_overall(self, resolution='y', series=False):`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`self._content_length()`

			`y = self.content_length.sum(axis=1)`
			`y = y.groupby([pd.TimeGrouper(freq=freq)]).sum()`

			`if freq == 'AS':`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))`
			`y.index.name = 'year'`
			`else:`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`y.index.name = 'year-month'`

			`if series:`
			`return y`

			`return y.to_frame('nbr-bytes').astype(int)`


			`'''`
			`threads`
			`'''`

			`def _threads(self, thresh=0):`

			`print("doing threads")`

			`if self.threads is None:`
			`self.threads = self.archive.dataframe[self.archive.dataframe['nbr-references'] > thresh].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)`

			`if self.single_threads is None:`
			`self.single_threads = self.archive.dataframe[(self.archive.dataframe['references'] == 'X') & (self.archive.dataframe['nbr-references'] > thresh)].reindex(columns=['from','nbr-references','subject', 'url', 'message-id']).sort_values('nbr-references', ascending=False)`

			`return self.threads;`

			`def threads_ranking(self, rank=5, resolution='y'):`

			`self._threads()`

			`if resolution == None:`
			`data = self.threads.drop('message-id', axis=1)[:rank]`
			`return data.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`# get the threads ranking per time resolution`
			`#`
			`data = self.threads.drop('message-id', axis=1)`
			`data = data.groupby([pd.TimeGrouper(freq=freq)])`
			`r = {}`
			`for k, v in data:`
			`if freq == 'AS':`
			`time_key = k.strftime('%Y')`
			`else:`
			`time_key = k.strftime('%Y-%m')`
			`frame = v[:rank]`
			`frame = frame.reindex_axis(['subject', 'from', 'nbr-references', 'url'], axis=1)`
			`r[time_key] = frame`
			`return r`

			`def threads_replies_to(self, email_address, resolution='y', series=False):`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`self._threads()`

			`eaddr = email_address.replace('@', '{at}').lower()`

			`self._threads()`
			`threads_from = self.threads.reindex(columns=['from', 'nbr-references'])`
			`threads_from_ranking = threads_from.groupby([pd.TimeGrouper(freq=freq), 'from']).sum() # <-- sum = adding up nbr references`
			`threads_from_ranking = threads_from_ranking.reset_index().pivot(columns='from', index='date', values='nbr-references').fillna(0)`

			`if series:`
			`return threads_from_ranking[eaddr]`

			`threads_from_ranking = threads_from_ranking[eaddr].to_frame('nbr-threads').astype(int)`

			`if freq == 'AS':`
			`threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y'))`
			`threads_from_ranking.index.name = 'year'`
			`else:`
			`threads_from_ranking.index = threads_from_ranking.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`threads_from_ranking.index.name = 'year-month'`

			`return threads_from_ranking`

			`def threads_replies_to_ranking(self, rank=5, filter_nettime=True):`

			`self._threads()`

			`tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)`

			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`tfr = tfr[tfr.index.str.contains(p)]`

			`tfr = tfr[:rank].astype(int)`
			`return tfr`

			`def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):`

			`self._threads()`
			`tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`tir = tir[tir.index.str.contains(p)]`

			`if series:`
			`return tir[:rank]`

			`return tir[:rank].to_frame('nbr-initiated-threads').astype(int)`

			`def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):`

			`# activity`
			`self._activity()`
			`afr = self.activity.sum(axis=0).astype(int)`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`afr = afr[afr.index.str.contains(p)]`

			`# initiated threads [top 25]`
			`self._threads()`
			`tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)[:25] # <-- top 25`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`tir = tir[tir.index.str.contains(p)]`

			`inter = afr.index.intersection(tir.index)`
			`avg = tir[inter] / afr[inter]`

			`labels = ['messages', 'threads', 'avg.threads']`
			`return pd.concat([afr[avg.index], tir[avg.index], avg], axis=1, keys=labels).sort_values('avg.threads', ascending=False)[:rank]`

			`def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):`

			`self._threads()`

			`#initiated`
			`tir = self.threads.reindex(columns=['from']).groupby('from').size().sort_values(ascending=False)`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`tir = tir[tir.index.str.contains(p)]`

			`#replies [top 25]`
			`tfr = self.threads.reindex(columns=['from', 'nbr-references']).groupby('from').sum().sort_values('nbr-references', ascending=False)[:25] # <-- top 25`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`tfr = tfr[tfr.index.str.contains(p)]`
			`tfr = tfr['nbr-references'] # dataframe to series`


			`inter = tir.index.intersection(tfr.index)`
			`avg = tfr[inter] / tir[inter]`

			`labels = ['threads', 'replies', 'avg.replies']`
			`return pd.concat([tir[avg.index], tfr[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]`


			`def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`agg = aggregate.lower()`
			`if not agg in ['sum', 'mean', 'count']:`
			`return None`

			`if not self.threads is None:`
			`del self.threads`
			`self.threads = None`

			`self._threads(tresh)`

			`if agg == 'sum':`
			`# number of replies total (re: sum all the replies)`
			`y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).sum()`
			`elif agg == 'mean':`
			`y = self.threads.groupby([pd.TimeGrouper(freq=freq)]).mean()`
			`else:`
			`# number of threads (re: msgs with at least one reply)`
			`y = self.threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()`

			`if freq == 'AS':`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))`
			`y.index.name = 'year'`
			`else:`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`y.index.name = 'year-month'`

			`if series:`
			`return y`

			`return y.to_frame('nbr-threads').astype(int)`

			`def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`agg = aggregate.lower()`
			`if not agg in ['sum', 'mean', 'count']:`
			`return None`

			`if not self.single_threads is None:`
			`del self.single_threads`
			`self.single_threads = None`

			`self._threads(tresh)`


			`y = self.single_threads['nbr-references'].groupby([pd.TimeGrouper(freq=freq)]).count()`


			`if freq == 'AS':`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))`
			`y.index.name = 'year'`
			`else:`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`y.index.name = 'year-month'`

			`if series:`
			`return y`

			`return y.to_frame('nbr-threads').astype(int)`


			`'''`
			`replies`
			`'''`

			`def _replies(self):`

			`if self.replies is None:`
			`self.replies = self.archive.dataframe[self.archive.dataframe['references'] != 'X'].reindex(columns=['from','references'])`
			`self.non_replies = self.archive.dataframe[self.archive.dataframe['references'] == 'X'].reindex(columns=['from','references'])`
			`return self.replies;`

			`def replies_ranking(self, rank=5, resolution=None):`

			`self._replies()`

			`if resolution == None:`
			`data = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]`
			`return data.to_frame('nbr_replies')`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`# get the threads ranking per time resolution`
			`#`
			`data = self.replies.groupby([pd.TimeGrouper(freq=freq)])`
			`r = {}`
			`for k, v in data:`
			`if freq == 'AS':`
			`time_key = k.strftime('%Y')`
			`else:`
			`time_key = k.strftime('%Y-%m')`
			`frame = v.groupby('from').size().sort_values(ascending=False)[:rank]`
			`r[time_key] = frame.to_frame('nbr-replies')`
			`return r`

			`def replies_avg_ranking(self, rank=5, filter_nettime=True):`

			`# activity`
			`self._activity()`
			`afr = self.activity.sum(axis=0)`
			`if filter_nettime:`
			`p = r'^((?!nettime).)$'`
			`afr = afr[afr.index.str.contains(p)]`

			`# replies in thread [top 25]`

			`self._replies()`
			`rpl = data = self.replies.groupby('from').size().sort_values(ascending=False)[:25]`

			`inter = afr.index.intersection(rpl.index)`
			`avg = rpl[inter] / afr[inter]`

			`labels = ['messages', 'replies', 'avg.replies']`
			`return pd.concat([afr[avg.index], rpl[avg.index], avg], axis=1, keys=labels).sort_values('avg.replies', ascending=False)[:rank]`

			`def replies_overall(self, resolution='y', series=False):`

			`freq = 'M'`
			`if resolution.lower() == 'y':`
			`freq = 'AS'`
			`elif resolution.lower() == 'm':`
			`freq = 'M'`
			`else:`
			`return None`

			`if not self.replies is None:`
			`del self.replies`
			`self.replies = None`

			`self._replies()`

			`y = self.replies['references'].groupby([pd.TimeGrouper(freq=freq)]).count()`


			`if freq == 'AS':`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y'))`
			`y.index.name = 'year'`
			`else:`
			`y.index = y.index.format(formatter=lambda x: x.strftime('%Y-%m'))`
			`y.index.name = 'year-month'`

			`if series:`
			`return y`

			`return y.to_frame('nbr-replies').astype(int)`