# listservs/report.py
# 2019-12-31 17:52:52 +01:00

# 188 lines
# 4.9 KiB
# Python

import argparse, os, glob, sys, json, re, logging, hashlib
import mysql.connector as mariadb
from report import listsreport
from archive import archive
import config
from datetime import datetime
import collections
# Digests of every message counted so far. A set keeps the duplicate check
# O(1) per message. The collection is module-wide, so it persists across
# calls to report_msg.
hashes = set()


def hash(m):
    """Return a SHA-256 hex digest identifying message *m*.

    The digest is computed over the concatenation (without separators) of
    the message's 'from', 'subject' and 'date' values, encoded as UTF-8.
    A missing key raises KeyError.

    NOTE: this function shadows the built-in ``hash`` inside this module;
    the name is kept so that existing references keep working.
    """
    return hashlib.sha256((m['from'] + m['subject'] + m['date']).encode("utf-8")).hexdigest()


def report_msg(msg):
    """Measure the text of *msg* and of its follow-ups, skipping duplicates.

    A message whose digest (see ``hash``) has already been recorded in the
    module-level ``hashes`` set is not counted again, and neither are its
    follow-ups.

    Returns a ``(chars, words, lines)`` tuple covering ``msg['content']``
    plus every not-yet-seen message reachable through ``msg['follow-up']``,
    or ``None`` when *msg* itself is a duplicate.
    """
    h = hash(msg)
    if h in hashes:
        return None
    hashes.add(h)

    content = msg["content"]
    chars = len(content)
    words = len(re.findall(r'\w+', content))
    lines = len(content.split('\n'))

    # 'follow-up' is optional; an absent key or an empty/None value simply
    # means there is nothing to recurse into.
    for f in msg.get('follow-up') or ():
        x = report_msg(f)
        if x is not None:
            chars += x[0]
            words += x[1]
            lines += x[2]
    return (chars, words, lines)
def year_filename(fn):
    """Return the year encoded in an archive file name.

    *fn* must look like ``<full month name>_<4-digit year>.json``; it is
    parsed with ``datetime.strptime``, which raises ValueError for any
    name that does not match that pattern.
    """
    parsed = datetime.strptime(fn, "%B_%Y.json")
    return parsed.year
def report_all(l):
    """Print per-year and total text statistics for the archive of list *l*.

    *l* is either a path that already starts with ``config.archives`` or a
    name that is joined onto it. Every ``*.json`` file in that directory is
    loaded, each entry of its ``'threads'`` value is measured with
    ``report_msg``, and the resulting character/word/line counts are
    accumulated under the year that ``year_filename`` parses from the file
    name.

    The function prints its results and returns None. When the resolved
    directory does not exist the process is terminated through ``sys.exit``
    with an explanatory message.

    NOTE(review): the duplicate-detection state used by ``report_msg`` is
    module-wide and is not reset here, so messages already counted during
    an earlier call are skipped in later calls - confirm this is intended.
    """
    label = l  # the name as given by the caller prefixes every printed line
    if not l.startswith(config.archives):
        l = os.path.join(config.archives, l)
    if not os.path.isdir(l):
        sys.exit(l + ' is not a valid archive. Aborting.')

    # glob returns files in filesystem order; sorting makes it reproducible
    # which copy of a duplicated message gets counted (and under which year).
    files = sorted(glob.glob(os.path.join(l, "*.json")))

    report = {}
    for f in files:
        # JSON archives are read as UTF-8 rather than the platform default.
        with open(f, encoding="utf-8") as fp:
            d = json.load(fp)

        threads = d['threads']
        if not threads:
            # A file without threads contributes no year entry at all.
            continue

        # The year depends only on the file name, so resolve it once per file.
        year = year_filename(os.path.basename(f))
        totals = report.setdefault(year, {'chars': 0, 'words': 0, 'lines': 0})
        for t in threads:
            x = report_msg(t)
            if x is not None:
                totals['chars'] += x[0]
                totals['words'] += x[1]
                totals['lines'] += x[2]

    sorted_report = sorted(report.items())
    sections = (
        ('chars', "characters"),
        ('words', "words"),
        ('lines', "lines"),
    )
    for key, title in sections:
        print("[" + label + "] Number of written " + title + " per year:")
        for year, counts in sorted_report:
            print(" " + str(year) + ": " + str(counts[key]))

    for key, _title in sections:
        grand_total = sum(counts[key] for counts in report.values())
        print("[" + label + "] Total " + key + ": " + str(grand_total))
def format_from(f):
    """Return address *f* with its local part masked, followed by "; ".

    The text before the first case-insensitive occurrence of "{at}" has
    every ASCII letter and digit replaced by "*"; the marker and everything
    after it are kept unchanged. An empty string is returned when the
    marker is missing or sits at the very start of *f*.
    """
    at_pos = f.lower().find("{at}")
    if at_pos <= 0:
        return ""
    local_part = f[:at_pos]
    remainder = f[at_pos:]
    masked = re.sub('[0-9a-zA-Z]', '*', local_part)
    return masked + remainder + "; "
if __name__ == "__main__":
    # Command line: one or more list names, plus --txt/-t to print a
    # human-readable report instead of the default JSON dump.
    p = argparse.ArgumentParser(description='Mailinglists report')
    p.add_argument('lists', metavar="lists", help="lists to report", nargs="+")
    p.add_argument('--txt', '-t', default=False, help="output as text", action="store_true")
    args = p.parse_args()

    db_con = archive.connect_db(config.db['database'], config.db['host'], config.db['user'], config.db['password'])
    if db_con is None:
        logging.warning("Not connection to database...")
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    # Both names are bound before the try block so that neither the finally
    # clause nor the output code below can hit an unbound name when the
    # cursor cannot be created.
    report = {}
    cursor = None
    try:
        cursor = db_con.cursor(buffered=True)
        for li in args.lists:
            entry = report[li] = {}

            # contributors
            entry['contributors'] = listsreport.contributors(li, cursor)
            entry['contributors_year'] = listsreport.contributors(li, cursor, per_year=True)

            # size text
            entry['text_size'] = listsreport.text_size(li, cursor)
            entry['text_size_year'] = listsreport.text_size(li, cursor, per_year=True)

            # contributions
            entry['contributions'] = listsreport.contributions(li, cursor)
            entry['contributions_year'] = listsreport.contributions(li, cursor, per_year=True)
    except mariadb.Error as error:
        logging.error("Error: {}".format(error))
    finally:
        if cursor is not None:
            cursor.close()

    if args.txt:
        for k, v in report.items():
            # contributions (str() keeps this working whether the database
            # layer hands back strings or numbers)
            print("[" + k + "] Total contributions: " + str(v['contributions']))
            print("[" + k + "] Contributions per year:")
            for c in v['contributions_year']:
                print(" " + str(c['year']) + ": " + str(c['val']))

            # Text size is reported from the JSON archive by report_all, not
            # from the database figures stored in v['text_size'].
            report_all(k)

            print("[" + k + "] Total number of disctinct contributors' email address: " + str(len(v['contributors'])))
            print("[" + k + "] Cohort of contributors per year:")
            # The cohort is the per-year contributors data, not the per-year
            # contributions already printed above.
            # NOTE(review): assumes contributors_year entries carry 'year'
            # and 'val' keys like contributions_year does - confirm against
            # listsreport.contributors.
            for c in v['contributors_year']:
                print(" " + str(c['year']) + ": " + str(c['val']))

            print("[" + k + "] List of contributors obfuscated addresses:")
            print("".join(format_from(c) for c in v['contributors']))
    else:
        print(json.dumps(report))