reporting

This commit is contained in:
gauthiier 2019-12-31 17:52:52 +01:00
parent 9c6ea7e1be
commit 8a92b3b1be
4 changed files with 272 additions and 0 deletions

5
list_all_files.py Normal file
View File

@ -0,0 +1,5 @@
from pathlib import Path
# Collect every Python source file below the current working directory
# (searched recursively) and print one path per line.
m = list(Path('.').glob('**/*.py'))
for f in m:
    print(f)

187
report.py Normal file
View File

@ -0,0 +1,187 @@
import argparse, os, glob, sys, json, re, logging, hashlib
import mysql.connector as mariadb
from report import listsreport
from archive import archive
import config
from datetime import datetime
import collections
# Digests of every message already counted by report_msg (used to skip duplicates).
hashes = []


def hash(m):
    """Return the hex SHA-256 digest identifying message ``m``.

    The digest is computed over the UTF-8 encoding of the message's
    'from', 'subject' and 'date' values concatenated in that order.
    Note that this module-level name shadows the builtin ``hash``.
    """
    fingerprint = m['from'] + m['subject'] + m['date']
    digest = hashlib.sha256(fingerprint.encode("utf-8"))
    return digest.hexdigest()
def report_msg(msg):
    """Measure the text of ``msg`` and, recursively, of its follow-ups.

    A message whose digest (see ``hash``) is already recorded in the
    module-level ``hashes`` list is treated as a duplicate and skipped.

    Returns:
        ``None`` for a duplicate message, otherwise a tuple
        ``(chars, words, lines)`` summed over the message's 'content' and
        the content of every non-duplicate entry under 'follow-up'.
    """
    global hashes
    digest = hash(msg)
    if digest in hashes:
        return None
    hashes.append(digest)

    body = msg["content"]
    chars = len(body)
    words = len(re.findall(r'\w+', body))
    lines = len(body.split('\n'))

    if 'follow-up' in msg:
        for reply in msg['follow-up']:
            sub = report_msg(reply)
            if sub is None:
                # Duplicate follow-up: contributes nothing.
                continue
            chars, words, lines = chars + sub[0], words + sub[1], lines + sub[2]

    return (chars, words, lines)
def year_filename(fn):
    """Return the year (as an int) encoded in an archive file name.

    ``fn`` must look like ``<full month name>_<4-digit year>.json``
    (``strptime`` format ``%B_%Y.json``); anything else raises ``ValueError``.
    """
    parsed = datetime.strptime(fn, "%B_%Y.json")
    return parsed.year
def report_all(l):
    """Print per-year and total character/word/line counts for one archive.

    ``l`` is an archive name or path. When it does not already start with
    ``config.archives`` it is joined onto that directory. Every ``*.json``
    file directly inside the resulting directory is loaded; each entry of
    its 'threads' list is measured with ``report_msg`` and added to the year
    that ``year_filename`` parses from the file's base name.

    The report is written to standard output, labelled with the value of
    ``l`` as it was passed in. Exits via ``sys.exit`` when the resolved
    path is not a directory.
    """
    label = l
    archive_dir = l if l.startswith(config.archives) else os.path.join(config.archives, l)
    if not os.path.isdir(archive_dir):
        sys.exit(archive_dir + ' is not a valid archive. Aborting.')

    per_year = {}
    for path in glob.glob(os.path.join(archive_dir, "*.json")):
        with open(path) as fp:
            data = json.load(fp)
            for thread in data['threads']:
                counts = report_msg(thread)
                year = year_filename(os.path.basename(path))
                if year not in per_year:
                    per_year[year] = {'chars': 0, 'words': 0, 'lines': 0}
                if counts is not None:
                    bucket = per_year[year]
                    bucket['chars'] += counts[0]
                    bucket['words'] += counts[1]
                    bucket['lines'] += counts[2]

    # Years in ascending order, shared by the three per-year listings.
    ordered = sorted(per_year.items())
    sections = (
        ('chars', "[" + label + "] Number of written characters per year:"),
        ('words', "[" + label + "] Number of written words per year:"),
        ('lines', "[" + label + "] Number of written lines per year:"),
    )
    totals = {}
    for key, heading in sections:
        print(heading)
        running = 0
        for year, counts in ordered:
            print(" " + str(year) + ": " + str(counts[key]))
            running += counts[key]
        totals[key] = running

    print("[" + label + "] Total chars: " + str(totals['chars']))
    print("[" + label + "] Total words: " + str(totals['words']))
    print("[" + label + "] Total lines: " + str(totals['lines']))
def format_from(f):
    """Return an obfuscated form of sender string ``f`` followed by "; ".

    The first case-insensitive occurrence of "{at}" is located; every ASCII
    letter and digit before it is replaced by '*', and the remainder of the
    string is kept as is. Returns "" when "{at}" is absent or sits at the
    very start of the string.
    """
    at = f.lower().find("{at}")
    if at <= 0:
        return ""
    local_part, rest = f[:at], f[at:]
    return re.sub('[0-9a-zA-Z]', '*', local_part) + rest + "; "
if __name__ == "__main__":
    # Command-line entry point: gather per-list statistics from the database
    # and print them either as JSON (default) or as a text report (--txt).
    p = argparse.ArgumentParser(description='Mailinglists report')
    p.add_argument('lists', metavar="lists", help="lists to report", nargs="+")
    p.add_argument('--txt', '-t', default=False, help="output as text", action="store_true")
    args = p.parse_args()

    db_con = archive.connect_db(config.db['database'], config.db['host'], config.db['user'], config.db['password'])
    if db_con is None:
        logging.warning("Not connection to database...")
        sys.exit()

    # Bound before the try block so that neither the finally clause nor the
    # output code below can hit an unbound name when cursor creation fails.
    report = {}
    cursor = None
    try:
        cursor = db_con.cursor(buffered=True)
        for li in args.lists:
            entry = report[li] = {}

            # contributors
            entry['contributors'] = listsreport.contributors(li, cursor)
            entry['contributors_year'] = listsreport.contributors(li, cursor, per_year=True)

            # size text
            entry['text_size'] = listsreport.text_size(li, cursor)
            entry['text_size_year'] = listsreport.text_size(li, cursor, per_year=True)

            # contributions
            entry['contributions'] = listsreport.contributions(li, cursor)
            entry['contributions_year'] = listsreport.contributions(li, cursor, per_year=True)
    except mariadb.Error as error:
        logging.error("Error: {}".format(error))
    finally:
        if cursor is not None:
            cursor.close()

    if args.txt:
        for k, v in report.items():
            # contributions
            print("[" + k + "] Total contributions: " + v['contributions'])
            print("[" + k + "] Contributions per year:")
            for c in v['contributions_year']:
                print(" " + c['year'] + ": " + c['val'])

            # Character/word/line counts are printed by report_all from the
            # JSON archive; the database text_size figures are only part of
            # the JSON output.
            report_all(k)

            print("[" + k + "] Total number of disctinct contributors' email address: " + str(len(v['contributors'])))
            print("[" + k + "] Cohort of contributors per year:")
            # Per-year count of distinct contributors (not the per-year
            # contribution counts already printed above).
            for c in v['contributors_year']:
                print(" " + c['year'] + ": " + c['val'])

            print("[" + k + "] List of contributors obfuscated addresses:")
            list_contibutors = "".join(format_from(c) for c in v['contributors'])
            print(list_contibutors)
    else:
        print(json.dumps(report))

0
report/__init__.py Normal file
View File

80
report/listsreport.py Normal file
View File

@ -0,0 +1,80 @@
import mysql.connector as mariadb
# SQL templates. The "{}" placeholder in each query is filled with a table
# name via str.format() before execution.

# contributors
CONTRIBUTORS_QUERY = "SELECT DISTINCT(from_) FROM {} "
CONTRIBUTORS_CNT_PER_YEAR_QUERY = "SELECT YEAR(date_) theyear, COUNT(DISTINCT(from_)) FROM {} GROUP BY theyear"

# contributions
CONTRIBUTIONS_QUERY = "SELECT COUNT(*) FROM {}"
CONTRIBUTIONS_PER_YEAR_QUERY = "SELECT YEAR(date_) theyear, COUNT(*) FROM {} GROUP BY theyear"

# size text
SIZE_TEXT_QUERY = "SELECT SUM(LENGTH(content_)) FROM {}"
SIZE_TEXT_PER_YEAR_QUERY = "SELECT YEAR(date_) theyear, SUM(LENGTH(content_)) FROM {} GROUP BY theyear"

# nbr words
# TODO: no word-count query is defined here; only the size-text queries
# above exist.
def contributors(li, cursor, per_year=False):
    """Query the contributors of table ``li`` through ``cursor``.

    ``li`` is interpolated directly into the SQL text, so it must be a
    trusted table name.

    Returns:
        With ``per_year`` false: a flat list of the values from every row of
        the distinct ``from_`` query.
        With ``per_year`` true: a list of ``{'year': str, 'val': str}``
        dicts, one per row of the per-year distinct-sender count.
    """
    if not per_year:
        cursor.execute(CONTRIBUTORS_QUERY.format(li))
        senders = []
        for row in cursor:
            # Each row is a sequence of column values; flatten it in.
            senders.extend(row)
        return senders

    cursor.execute(CONTRIBUTORS_CNT_PER_YEAR_QUERY.format(li))
    return [{'year': str(year), 'val': str(val)} for (year, val) in cursor]
def contributions(li, cursor, per_year=False):
    """Query the number of contributions (rows) of table ``li``.

    ``li`` is interpolated directly into the SQL text, so it must be a
    trusted table name.

    Returns:
        With ``per_year`` false: the first column of the first result row
        converted to ``str``, or an empty list if the cursor yields no row.
        With ``per_year`` true: a list of ``{'year': str, 'val': str}``
        dicts, one per result row.
    """
    if per_year:
        cursor.execute(CONTRIBUTIONS_PER_YEAR_QUERY.format(li))
        return [{'year': str(year), 'val': str(val)} for (year, val) in cursor]

    cursor.execute(CONTRIBUTIONS_QUERY.format(li))
    for row in cursor:
        # Only the first row is of interest.
        return str(row[0])
    return []
def text_size(li, cursor, per_year=False):
    """Query the summed length of ``content_`` in table ``li``.

    ``li`` is interpolated directly into the SQL text, so it must be a
    trusted table name.

    Returns:
        With ``per_year`` false: the first column of the first result row
        converted to ``str``, or an empty list if the cursor yields no row.
        With ``per_year`` true: a list of ``{'year': str, 'val': str}``
        dicts, one per result row.
    """
    if per_year:
        cursor.execute(SIZE_TEXT_PER_YEAR_QUERY.format(li))
        return [{'year': str(year), 'val': str(val)} for (year, val) in cursor]

    cursor.execute(SIZE_TEXT_QUERY.format(li))
    for row in cursor:
        # Only the first row is of interest.
        return str(row[0])
    return []