"""Mailing-list report: contribution statistics per list, as text or JSON."""
import argparse
import collections
import glob
import hashlib
import json
import logging
import os
import re
import sys
from datetime import datetime

import mysql.connector as mariadb

from report import listsreport
from archive import archive
import config

# Digests of every message already counted. The set is module-wide and never
# reset, so a message seen once is skipped for the rest of the run, including
# when it shows up again in another list.
hashes = set()


def hash(m):  # NOTE: shadows the builtin `hash`; name kept for existing callers.
    """Return the SHA-256 hex digest of a message's from, subject and date fields."""
    return hashlib.sha256((m['from'] + m['subject'] + m['date']).encode("utf-8")).hexdigest()


def report_msg(msg):
    """Count the text volume of a message and, recursively, of its follow-ups.

    Args:
        msg: message dict with 'from', 'subject', 'date' and 'content' keys and
            an optional 'follow-up' list of message dicts of the same shape.

    Returns:
        A ``(chars, words, lines)`` tuple, or ``None`` when a message with the
        same digest has already been counted (the message and its follow-ups
        are then skipped entirely).
    """
    digest = hash(msg)
    if digest in hashes:
        return None
    hashes.add(digest)

    content = msg["content"]
    chars = len(content)
    words = len(re.findall(r'\w+', content))
    lines = len(content.split('\n'))

    for follow_up in msg.get('follow-up') or ():
        counts = report_msg(follow_up)
        if counts is not None:
            chars += counts[0]
            words += counts[1]
            lines += counts[2]

    return (chars, words, lines)


def year_filename(fn):
    """Return the year parsed from a file name of the form ``<Month>_<Year>.json``.

    The month is matched with ``%B`` (full month name), which strptime
    interprets according to the current locale.

    Raises:
        ValueError: if ``fn`` does not match the ``%B_%Y.json`` pattern.
    """
    return datetime.strptime(fn, "%B_%Y.json").year


def report_all(l):
    """Print per-year and total character, word and line counts for one archive.

    Args:
        l: archive name or path. When it does not already start with
            ``config.archives`` it is joined onto that directory.

    Exits the process (via ``sys.exit`` with a message) when the resulting
    path is not a directory.
    """
    label = l
    path = l if l.startswith(config.archives) else os.path.join(config.archives, l)

    if not os.path.isdir(path):
        sys.exit(path + ' is not a valid archive. Aborting.')

    report = {}
    for filename in glob.glob(os.path.join(path, "*.json")):
        with open(filename) as fp:
            threads = json.load(fp)['threads']

        # A file without threads contributes nothing, not even an empty year
        # entry, and its name is never parsed.
        if not threads:
            continue

        # The year depends only on the file name, so resolve it once per file.
        year = year_filename(os.path.basename(filename))
        year_totals = report.setdefault(year, {'chars': 0, 'words': 0, 'lines': 0})

        for thread in threads:
            counts = report_msg(thread)
            if counts is None:
                continue
            year_totals['chars'] += counts[0]
            year_totals['words'] += counts[1]
            year_totals['lines'] += counts[2]

    sorted_report = sorted(report.items())
    totals = {}
    for key, title in (('chars', 'characters'), ('words', 'words'), ('lines', 'lines')):
        print("[" + label + "] Number of written " + title + " per year:")
        for year, values in sorted_report:
            print(" " + str(year) + ": " + str(values[key]))
        totals[key] = sum(values[key] for _, values in sorted_report)

    print("[" + label + "] Total chars: " + str(totals['chars']))
    print("[" + label + "] Total words: " + str(totals['words']))
    print("[" + label + "] Total lines: " + str(totals['lines']))


def format_from(f):
    """Return an obfuscated form of an address, followed by ``"; "``.

    ASCII letters and digits located before the first ``{at}`` marker (matched
    case-insensitively) are replaced with ``*``; the rest is kept unchanged.
    Returns an empty string when the marker is missing or sits at position 0.
    """
    i = f.lower().find("{at}")
    if i > 0:
        return re.sub('[0-9a-zA-Z]', '*', f[:i]) + f[i:] + "; "
    return ""


def _print_text_report(report):
    """Print the collected report of every list in plain-text form."""
    for k, v in report.items():
        # .format() is used wherever a database-provided value is printed, so
        # the output does not depend on those values already being strings.
        print("[{}] Total contributions: {}".format(k, v['contributions']))
        print("[" + k + "] Contributions per year:")
        for c in v['contributions_year']:
            print(" {}: {}".format(c['year'], c['val']))

        report_all(k)

        print("[" + k + "] Total number of disctinct contributors' email address: " + str(len(v['contributors'])))
        print("[" + k + "] Cohort of contributors per year:")
        # NOTE(review): this prints 'contributions_year' a second time;
        # 'contributors_year' is collected but never shown. Presumably a
        # copy-paste slip - confirm the shape of 'contributors_year' before
        # switching.
        for c in v['contributions_year']:
            print(" {}: {}".format(c['year'], c['val']))

        print("[" + k + "] List of contributors obfuscated addresses:")
        print("".join(format_from(c) for c in v['contributors']))


def main():
    """Parse the command line, query the database and print the report."""
    p = argparse.ArgumentParser(description='Mailinglists report')
    p.add_argument('lists', metavar="lists", help="lists to report", nargs="+")
    p.add_argument('--txt', '-t', default=False, help="output as text", action="store_true")
    args = p.parse_args()

    db_con = archive.connect_db(config.db['database'], config.db['host'],
                                config.db['user'], config.db['password'])
    if db_con is None:
        logging.warning("Not connection to database...")
        sys.exit()

    # Both names are bound before the try block so that a failure while
    # creating the cursor cannot cause a NameError in `finally` or below.
    report = {}
    cursor = None
    try:
        cursor = db_con.cursor(buffered=True)
        for li in args.lists:
            entry = report[li] = {}

            # contributors
            entry['contributors'] = listsreport.contributors(li, cursor)
            entry['contributors_year'] = listsreport.contributors(li, cursor, per_year=True)

            # size text
            entry['text_size'] = listsreport.text_size(li, cursor)
            entry['text_size_year'] = listsreport.text_size(li, cursor, per_year=True)

            # contributions
            entry['contributions'] = listsreport.contributions(li, cursor)
            entry['contributions_year'] = listsreport.contributions(li, cursor, per_year=True)
    except mariadb.Error as error:
        # Best effort: log and carry on with whatever was collected so far.
        logging.error("Error: {}".format(error))
    finally:
        if cursor is not None:
            cursor.close()
        # Nothing below needs the database any more.
        db_con.close()

    if args.txt:
        _print_text_report(report)
    else:
        print(json.dumps(report))


if __name__ == "__main__":
    main()