listservs/analysis/query.py

574 lines
15 KiB
Python
Raw Normal View History

2017-11-04 13:34:05 +01:00
import numpy as np
import pandas as pd
import analysis.archive
import logging
class Query:
    """Aggregate statistics over the dataframe of a mailing-list archive.

    The wrapped archive must expose a ``dataframe`` attribute. The methods
    below group that dataframe by its index through ``pd.Grouper(freq=...)``
    and read the columns 'from', 'url', 'content-length', 'nbr-references',
    'references', 'subject' and 'message-id'.
    # NOTE(review): the index is assumed to be a DatetimeIndex (one row per
    # message) -- confirm against analysis.archive.Archive.

    Most public methods take a ``resolution`` of 'y' (yearly) or 'm'
    (monthly), case-insensitive, and return None for any other value. Time
    indexed results are labelled with strings: '%Y' (index name 'year') or
    '%Y-%m' (index name 'year-month').
    """

    archive = None  # analysis.archive.Archive object
    activity = None  # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    content_length = None  # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    threads = None  # rows with 'nbr-references' above the cached threshold
    single_threads = None  # same as threads, restricted to rows whose 'references' == 'X'
    replies = None  # rows whose 'references' != 'X'
    non_replies = None  # rows whose 'references' == 'X'

    # Thresholds the two thread caches were built with (None = not built).
    _threads_thresh = None
    _single_threads_thresh = None

    # resolution key -> (grouping offset, strftime label format, index name).
    # Offset objects are used instead of the 'AS'/'M' string aliases because
    # those aliases were deprecated in recent pandas releases.
    _RESOLUTIONS = {
        'y': (pd.offsets.YearBegin(), '%Y', 'year'),
        'm': (pd.offsets.MonthEnd(), '%Y-%m', 'year-month'),
    }

    # Keeps only labels that do not contain the text matched by 'nettime*'.
    _NETTIME_FILTER = r'^((?!nettime*).)*$'

    _THREAD_COLUMNS = ['from', 'nbr-references', 'subject', 'url', 'message-id']
    _RANKING_COLUMNS = ['subject', 'from', 'nbr-references', 'url']

    def __init__(self, arch=None):
        """Wrap *arch*.

        Raises:
            TypeError: if *arch* is not an ``analysis.archive.Archive``.
                (The error is also logged.)
        """
        if not isinstance(arch, analysis.archive.Archive):
            message = "Query constructor Error: arch must be of type analysis.archive.Archive"
            logging.error(message)
            raise TypeError(message)
        self.archive = arch

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------

    @classmethod
    def _resolve(cls, resolution):
        """Return ``(offset, label_format, index_name)`` or None if unknown."""
        return cls._RESOLUTIONS.get(resolution.lower())

    @staticmethod
    def _label_index(data, label_format, index_name):
        """Replace the datetime index of *data* by formatted string labels."""
        data.index = data.index.strftime(label_format)
        data.index.name = index_name
        return data

    @classmethod
    def _drop_nettime(cls, data):
        """Keep only the entries of *data* whose index label passes the nettime filter."""
        return data[data.index.str.contains(cls._NETTIME_FILTER)]

    @staticmethod
    def _normalise_address(email_address):
        """Convert an address to the form used in the 'from' column."""
        return email_address.replace('@', '{at}').lower()

    @staticmethod
    def _ratio_ranking(denominator, numerator, labels, rank):
        """Rank the senders common to both series by numerator / denominator.

        Returns a dataframe with the columns *labels* (denominator, numerator,
        ratio), sorted by the ratio in descending order and cut to *rank* rows.
        """
        common = denominator.index.intersection(numerator.index)
        ratio = numerator[common] / denominator[common]
        table = pd.concat(
            [denominator[ratio.index], numerator[ratio.index], ratio],
            axis=1, keys=labels)
        return table.sort_values(labels[2], ascending=False)[:rank]

    # ------------------------------------------------------------------
    # activity
    # ------------------------------------------------------------------

    def _activity(self):
        """Build (once) and return the per-month message count of every sender."""
        if self.activity is None:
            senders = self.archive.dataframe.reindex(columns=['from'])
            monthly = pd.Grouper(freq=self._RESOLUTIONS['m'][0])
            counts = senders.groupby([monthly, 'from']).size()
            self.activity = counts.unstack('from').fillna(0)
        return self.activity

    def activity_from(self, email_address, resolution='y', series=False):
        """Return the number of messages sent by *email_address* per period.

        Returns None for an unknown resolution or an unknown sender, a Series
        when *series* is true, otherwise a dataframe with the integer column
        'nbr-messages'.
        """
        eaddr = self._normalise_address(email_address)
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        activity = self._activity()
        try:
            per_month = activity[eaddr]
        except KeyError:
            return None

        result = per_month.groupby(pd.Grouper(freq=freq)).sum()
        result = self._label_index(result, label_format, index_name)
        if series:
            return result
        return result.to_frame('nbr-messages').astype(int)

    def activity_from_ranking(self, rank=5, filter_nettime=True, series=False):
        """Return the *rank* senders with the most messages overall.

        With *filter_nettime* the senders rejected by the nettime filter are
        removed before the cut. Returns a Series when *series* is true,
        otherwise a dataframe with the integer column 'nbr-messages'.
        """
        totals = self._activity().sum(axis=0).sort_values(ascending=False)
        if filter_nettime:
            totals = self._drop_nettime(totals)
        if series:
            return totals[:rank]
        return totals[:rank].to_frame('nbr-messages').astype(int)

    def activity_overall(self, resolution='y', series=False):
        """Return the number of messages per period (count of the 'url' column).

        Returns None for an unknown resolution, a Series when *series* is
        true, otherwise a dataframe with the integer column 'nbr-messages'.
        """
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        counts = self.archive.dataframe['url'].groupby(pd.Grouper(freq=freq)).count()
        counts = self._label_index(counts, label_format, index_name)
        if series:
            return counts
        return counts.to_frame('nbr-messages').astype(int)

    def cohort(self, resolution='m', series=False):
        """Return, per period, the number of senders whose cohort date falls in it.

        The cohort date of a sender is ``activity.idxmax()``, i.e. the month in
        which that sender's monthly message count is highest (the first such
        month on ties).
        # NOTE(review): the result column is called 'first-messages', which
        # suggests the month of the first message was intended rather than
        # the peak month -- confirm before changing.

        Returns None for an unknown resolution, a Series when *series* is
        true, otherwise a dataframe with the integer column 'first-messages'.
        """
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        dates = self._activity().idxmax().sort_values().to_frame('date')
        dates.index = dates['date']
        cohort = dates.groupby(pd.Grouper(freq=freq)).size()
        cohort = self._label_index(cohort, label_format, index_name)
        if series:
            return cohort
        return cohort.to_frame('first-messages').astype(int)

    # ------------------------------------------------------------------
    # content length
    # ------------------------------------------------------------------

    def _content_length(self):
        """Build (once) and return the per-month summed 'content-length' of every sender."""
        if self.content_length is None:
            sizes = self.archive.dataframe.reindex(columns=['from', 'content-length'])
            monthly = pd.Grouper(freq=self._RESOLUTIONS['m'][0])
            summed = sizes.groupby([monthly, 'from'])['content-length'].sum()
            self.content_length = summed.unstack('from').fillna(0)
        return self.content_length

    def content_length_from(self, email_address, resolution='y', series=False):
        """Return the summed 'content-length' of *email_address* per period.

        Returns None for an unknown resolution or an unknown sender, a Series
        when *series* is true, otherwise a dataframe with the integer column
        'nbr-bytes'.
        """
        eaddr = self._normalise_address(email_address)
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        content_length = self._content_length()
        try:
            per_month = content_length[eaddr]
        except KeyError:
            return None

        result = per_month.groupby(pd.Grouper(freq=freq)).sum()
        result = self._label_index(result, label_format, index_name)
        if series:
            return result
        return result.to_frame('nbr-bytes').astype(int)

    def content_length_from_ranking(self, resolution='y', rank=5, filter_nettime=True, series=False):
        """Return the *rank* senders with the largest summed 'content-length'.

        *resolution* is accepted for signature compatibility but is not used.
        Returns a Series when *series* is true, otherwise a dataframe with the
        integer column 'nbr-bytes'.
        """
        totals = self._content_length().sum(axis=0).sort_values(ascending=False)
        if filter_nettime:
            totals = self._drop_nettime(totals)
        if series:
            return totals[:rank]
        return totals[:rank].to_frame('nbr-bytes').astype(int)

    def content_length_overall(self, resolution='y', series=False):
        """Return the summed 'content-length' of all senders per period.

        Returns None for an unknown resolution, a Series when *series* is
        true, otherwise a dataframe with the integer column 'nbr-bytes'.
        """
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        totals = self._content_length().sum(axis=1)
        totals = totals.groupby(pd.Grouper(freq=freq)).sum()
        totals = self._label_index(totals, label_format, index_name)
        if series:
            return totals
        return totals.to_frame('nbr-bytes').astype(int)

    # ------------------------------------------------------------------
    # threads
    # ------------------------------------------------------------------

    def _thread_view(self, rows):
        """Reduce *rows* to the thread columns, highest 'nbr-references' first."""
        view = rows.reindex(columns=self._THREAD_COLUMNS)
        return view.sort_values('nbr-references', ascending=False)

    def _threads(self, thresh=0):
        """Build the thread caches for *thresh* and return ``self.threads``.

        ``threads`` holds the rows with 'nbr-references' > *thresh*;
        ``single_threads`` additionally requires 'references' == 'X'. Each
        cache remembers the threshold it was built with and is rebuilt when a
        different one is requested, so results never depend on which method
        was called before.
        """
        logging.debug("doing threads")
        frame = self.archive.dataframe
        above = frame['nbr-references'] > thresh

        if self.threads is None or self._threads_thresh != thresh:
            self.threads = self._thread_view(frame[above])
            self._threads_thresh = thresh

        if self.single_threads is None or self._single_threads_thresh != thresh:
            unreferenced = frame['references'] == 'X'
            self.single_threads = self._thread_view(frame[unreferenced & above])
            self._single_threads_thresh = thresh

        return self.threads

    def threads_ranking(self, rank=5, resolution='y'):
        """Return the *rank* threads with the most references.

        With ``resolution=None`` a single dataframe over the whole archive is
        returned. With 'y' or 'm' a dict mapping each period label to the
        top-*rank* dataframe of that period is returned; any other resolution
        gives None. The dataframes have the columns 'subject', 'from',
        'nbr-references' and 'url'.
        """
        self._threads()
        data = self.threads.drop('message-id', axis=1)
        if resolution is None:
            return data[:rank].reindex(columns=self._RANKING_COLUMNS)

        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, _ = spec

        ranking = {}
        # The grouper is passed on its own (not inside a list) so that the
        # group keys are plain timestamps.
        for period, group in data.groupby(pd.Grouper(freq=freq)):
            # Time grouping may reorder rows by date, so rank explicitly.
            top = group.sort_values('nbr-references', ascending=False, kind='mergesort')[:rank]
            ranking[period.strftime(label_format)] = top.reindex(columns=self._RANKING_COLUMNS)
        return ranking

    def threads_replies_to(self, email_address, resolution='y', series=False):
        """Return the summed 'nbr-references' of the threads of *email_address* per period.

        Only periods in which at least one thread exists appear in the result.
        Returns None for an unknown resolution or for a sender without
        threads. When *series* is true the Series is returned with its
        datetime index (no string labels); otherwise a dataframe with the
        integer column 'nbr-threads' and string labels is returned.
        """
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        eaddr = self._normalise_address(email_address)
        self._threads()
        references = self.threads.reindex(columns=['from', 'nbr-references'])
        # sum = adding up the number of references per sender and period
        summed = references.groupby([pd.Grouper(freq=freq), 'from'])['nbr-references'].sum()
        per_sender = summed.unstack('from').fillna(0)

        try:
            received = per_sender[eaddr]
        except KeyError:
            return None

        if series:
            return received
        result = received.to_frame('nbr-threads').astype(int)
        return self._label_index(result, label_format, index_name)

    def threads_replies_to_ranking(self, rank=5, filter_nettime=True):
        """Return the *rank* senders whose threads sum up the most references.

        The result is a dataframe with the integer column 'nbr-references'.
        """
        self._threads()
        references = self.threads.reindex(columns=['from', 'nbr-references'])
        totals = references.groupby('from').sum().sort_values('nbr-references', ascending=False)
        if filter_nettime:
            totals = self._drop_nettime(totals)
        return totals[:rank].astype(int)

    def threads_initiated_from_ranking(self, rank=5, filter_nettime=True, series=False):
        """Return the *rank* senders with the most rows in the thread cache.

        Returns a Series when *series* is true, otherwise a dataframe with the
        integer column 'nbr-initiated-threads'.
        """
        self._threads()
        initiated = self.threads.reindex(columns=['from']).groupby('from').size()
        initiated = initiated.sort_values(ascending=False)
        if filter_nettime:
            initiated = self._drop_nettime(initiated)
        if series:
            return initiated[:rank]
        return initiated[:rank].to_frame('nbr-initiated-threads').astype(int)

    def threads_activity_threads_initiated_avg_ranking(self, rank=5, filter_nettime=True):
        """Rank senders by initiated threads per message sent.

        Only the 25 senders with the most initiated threads are considered.
        The result has the columns 'messages', 'threads' and 'avg.threads'.
        """
        # activity
        messages = self._activity().sum(axis=0).astype(int)
        if filter_nettime:
            messages = self._drop_nettime(messages)

        # initiated threads [top 25]
        self._threads()
        initiated = self.threads.reindex(columns=['from']).groupby('from').size()
        initiated = initiated.sort_values(ascending=False)[:25]
        if filter_nettime:
            initiated = self._drop_nettime(initiated)

        labels = ['messages', 'threads', 'avg.threads']
        return self._ratio_ranking(messages, initiated, labels, rank)

    def threads_initiated_replies_avg_ranking(self, rank=5, filter_nettime=True):
        """Rank senders by summed references per initiated thread.

        Only the 25 senders with the highest summed references are
        considered. The result has the columns 'threads', 'replies' and
        'avg.replies'.
        """
        self._threads()

        # initiated
        initiated = self.threads.reindex(columns=['from']).groupby('from').size()
        initiated = initiated.sort_values(ascending=False)
        if filter_nettime:
            initiated = self._drop_nettime(initiated)

        # replies [top 25]
        references = self.threads.reindex(columns=['from', 'nbr-references'])
        received = references.groupby('from')['nbr-references'].sum()
        received = received.sort_values(ascending=False)[:25]
        if filter_nettime:
            received = self._drop_nettime(received)

        labels = ['threads', 'replies', 'avg.replies']
        return self._ratio_ranking(initiated, received, labels, rank)

    def threads_overall(self, resolution='y', aggregate='count', series=False, tresh=0):
        """Aggregate 'nbr-references' of the threads per period.

        *aggregate* is 'count' (number of threads), 'sum' (total number of
        references) or 'mean'; anything else, like an unknown resolution,
        gives None. The thread cache is rebuilt with the threshold *tresh*.

        Returns a Series when *series* is true, otherwise a dataframe with the
        column 'nbr-threads' (integer for 'count' and 'sum'; 'mean' keeps its
        fractional values, with NaN for periods without threads).
        """
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        agg = aggregate.lower()
        if agg not in ('sum', 'mean', 'count'):
            return None

        self.threads = None  # always start from a fresh cache
        self._threads(tresh)

        # Aggregate the numeric column only; the other columns hold text.
        grouped = self.threads['nbr-references'].groupby(pd.Grouper(freq=freq))
        result = getattr(grouped, agg)()
        result = self._label_index(result, label_format, index_name)
        if series:
            return result
        frame = result.to_frame('nbr-threads')
        if agg == 'mean':
            return frame
        return frame.astype(int)

    def single_threads_overall(self, resolution='y', aggregate='sum', series=False, tresh=1):
        """Count, per period, the rows of the single-thread cache.

        *aggregate* must be 'sum', 'mean' or 'count' (otherwise None is
        returned) but the result is always a count. The single-thread cache
        is rebuilt with the threshold *tresh*.

        Returns None for an unknown resolution, a Series when *series* is
        true, otherwise a dataframe with the integer column 'nbr-threads'.
        """
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        agg = aggregate.lower()
        if agg not in ('sum', 'mean', 'count'):
            return None

        self.single_threads = None  # always start from a fresh cache
        self._threads(tresh)

        counts = self.single_threads['nbr-references'].groupby(pd.Grouper(freq=freq)).count()
        counts = self._label_index(counts, label_format, index_name)
        if series:
            return counts
        return counts.to_frame('nbr-threads').astype(int)

    # ------------------------------------------------------------------
    # replies
    # ------------------------------------------------------------------

    def _replies(self):
        """Build (once) the replies / non_replies caches and return ``self.replies``.

        Rows whose 'references' differs from 'X' go to ``replies``, the others
        to ``non_replies``; both keep only the columns 'from' and 'references'.
        """
        if self.replies is None:
            frame = self.archive.dataframe
            columns = ['from', 'references']
            self.replies = frame[frame['references'] != 'X'].reindex(columns=columns)
            self.non_replies = frame[frame['references'] == 'X'].reindex(columns=columns)
        return self.replies

    def replies_ranking(self, rank=5, resolution=None):
        """Return the *rank* senders with the most replies.

        With ``resolution=None`` a single dataframe with the column
        'nbr_replies' is returned. With 'y' or 'm' a dict mapping each period
        label to a dataframe with the column 'nbr-replies' is returned; any
        other resolution gives None.
        """
        self._replies()
        if resolution is None:
            overall = self.replies.groupby('from').size().sort_values(ascending=False)[:rank]
            return overall.to_frame('nbr_replies')

        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, _ = spec

        ranking = {}
        # The grouper is passed on its own (not inside a list) so that the
        # group keys are plain timestamps.
        for period, group in self.replies.groupby(pd.Grouper(freq=freq)):
            top = group.groupby('from').size().sort_values(ascending=False)[:rank]
            ranking[period.strftime(label_format)] = top.to_frame('nbr-replies')
        return ranking

    def replies_avg_ranking(self, rank=5, filter_nettime=True):
        """Rank senders by replies per message sent.

        Only the 25 senders with the most replies are considered. The result
        has the columns 'messages', 'replies' and 'avg.replies'.
        """
        # activity
        messages = self._activity().sum(axis=0)
        if filter_nettime:
            messages = self._drop_nettime(messages)

        # replies in thread [top 25]
        self._replies()
        replies = self.replies.groupby('from').size().sort_values(ascending=False)[:25]

        labels = ['messages', 'replies', 'avg.replies']
        return self._ratio_ranking(messages, replies, labels, rank)

    def replies_overall(self, resolution='y', series=False):
        """Return the number of replies per period.

        The replies cache is rebuilt on every call. Returns None for an
        unknown resolution, a Series when *series* is true, otherwise a
        dataframe with the integer column 'nbr-replies'.
        """
        spec = self._resolve(resolution)
        if spec is None:
            return None
        freq, label_format, index_name = spec

        self.replies = None  # always start from a fresh cache
        self._replies()

        counts = self.replies['references'].groupby(pd.Grouper(freq=freq)).count()
        counts = self._label_index(counts, label_format, index_name)
        if series:
            return counts
        return counts.to_frame('nbr-replies').astype(int)