"""Print per-year character, word and line counts for mailing-list archives.

An archive is a directory containing ``*.json`` files whose names follow the
``<MonthName>_<Year>.json`` pattern. Each file holds a ``threads`` list of
message dicts with ``from``, ``subject``, ``date`` and ``content`` keys and an
optional ``follow-up`` list of replies of the same shape.
"""
import argparse
import collections
import glob
import hashlib
import json
import logging
import os
import re
import sys
from datetime import datetime

import config

ARCH = "archives/"

# Digests of every message counted so far. A set keeps the duplicate check
# O(1). It is module-level state and is never reset, so a message that shows
# up again (in another file or another archive processed in the same run) is
# counted only once.
hashes = set()

# Compiled once; used to count words in message bodies.
_WORD_RE = re.compile(r'\w+')


def hash(m):
    """Return the SHA-256 hex digest identifying message *m*.

    The digest is computed over the concatenation of the message's ``from``,
    ``subject`` and ``date`` fields, encoded as UTF-8.

    NOTE: the name shadows the ``hash`` builtin inside this module; it is
    kept so existing callers of this function keep working.
    """
    identity = m['from'] + m['subject'] + m['date']
    return hashlib.sha256(identity.encode("utf-8")).hexdigest()


def report_msg(msg):
    """Count characters, words and lines of *msg* and all of its follow-ups.

    Returns a ``(chars, words, lines)`` tuple, or ``None`` when the message
    was already counted. A duplicate is skipped silently together with its
    follow-ups; duplicated follow-ups of a new message contribute nothing.
    """
    digest = hash(msg)
    if digest in hashes:
        return None
    hashes.add(digest)

    content = msg["content"]
    chars = len(content)
    words = len(_WORD_RE.findall(content))
    lines = len(content.split('\n'))

    for follow_up in msg.get('follow-up', ()):
        counts = report_msg(follow_up)
        if counts is not None:
            chars += counts[0]
            words += counts[1]
            lines += counts[2]
    return (chars, words, lines)


def year_filename(fn):
    """Return the year encoded in an archive file name.

    *fn* must match ``%B_%Y.json`` (full month name, underscore, four-digit
    year); ``datetime.strptime`` raises ``ValueError`` otherwise.
    """
    return datetime.strptime(fn, "%B_%Y.json").year


def _print_yearly(title, key, rows):
    """Print one ``year: value`` line per row under *title*; return the sum."""
    print(title)
    total = 0
    for year, counts in rows:
        print(" " + str(year) + ": " + str(counts[key]))
        total += counts[key]
    return total


def run(l):
    """Print the yearly and total size statistics of mailing-list archive *l*.

    *l* is an archive directory; when it does not already start with
    ``config.archives`` it is resolved relative to it. Exits the process via
    ``sys.exit`` when the resulting path is not a directory.
    """
    if not l.startswith(config.archives):
        l = os.path.join(config.archives, l)
    if not os.path.isdir(l):
        sys.exit(l + ' is not a valid archive. Aborting.')

    report = {}
    for path in glob.glob(os.path.join(l, "*.json")):
        with open(path, encoding="utf-8") as fp:
            data = json.load(fp)

        threads = data['threads']
        if not threads:
            # A file without threads contributes no row for its year.
            continue

        # The year depends only on the file name, so resolve it once per file.
        year = year_filename(os.path.basename(path))
        totals = report.setdefault(year, {'chars': 0, 'words': 0, 'lines': 0})
        for thread in threads:
            counts = report_msg(thread)
            if counts is not None:
                totals['chars'] += counts[0]
                totals['words'] += counts[1]
                totals['lines'] += counts[2]

    print("\n\n" + l)
    rows = sorted(report.items())
    total_chars = _print_yearly("Number of written characters per year:", 'chars', rows)
    total_words = _print_yearly("Number of written words per year:", 'words', rows)
    total_lines = _print_yearly("Number of written lines per year:", 'lines', rows)
    print("Total chars: " + str(total_chars))
    print("Total words: " + str(total_words))
    print("Total lines: " + str(total_lines))


def main():
    """Parse the command line and report on every archive named there."""
    p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
    # nargs="+" makes argparse itself reject an empty list of archives.
    p.add_argument('list', metavar="list", help="mailinglist to ana", nargs="+")
    args = p.parse_args()
    for l in args.list:
        run(l)
    print("\n\n . . . . ")


if __name__ == "__main__":
    main()