This commit is contained in:
gauthiier 2019-12-31 17:54:52 +01:00
parent dd0d9469ea
commit 18ed080652
3 changed files with 792 additions and 21 deletions

706
book/colophon.txt Normal file

File diff suppressed because one or more lines are too long

5
list_all_files.py Normal file
View File

@ -0,0 +1,5 @@
from pathlib import Path
# Walk the current directory recursively and gather every ``*.py`` path up
# front, so the listing is complete before anything is printed.
python_files = [*Path('.').rglob('*.py')]

# Emit one path per line, in the order the walk produced them.
for path in python_files:
    print(path)

102
stats.py
View File

@ -1,40 +1,100 @@
import argparse, os, sys, glob, json
import argparse, os, sys, glob, re, json, hashlib, logging
from datetime import datetime
import config
import collections
# Hard-coded relative location of the archives directory.
# NOTE(review): run() also consults config.archives for the same purpose --
# confirm whether this constant is still needed.
ARCH = "archives/"
# Digests (as produced by hash()) of every message report_msg() has already
# counted; report_msg() checks and appends to this list to skip duplicates.
# NOTE(review): nothing visible here ever clears the list, so it appears to
# persist for the lifetime of the process -- confirm that is intended.
hashes = []
def hash(m):
    """Return a SHA-256 hex digest that fingerprints message *m*.

    The fingerprint is computed over the UTF-8 encoding of the message's
    ``'from'``, ``'subject'`` and ``'date'`` values, concatenated in that
    order with no separator.

    Note: this function intentionally keeps its name even though it shadows
    the ``hash`` builtin inside this module.
    """
    fingerprint = m['from'] + m['subject'] + m['date']
    digest = hashlib.sha256()
    digest.update(fingerprint.encode("utf-8"))
    return digest.hexdigest()
def report_msg(msg):
    """Return ``(chars, words, lines)`` for *msg* and all of its follow-ups.

    The message is fingerprinted with ``hash()``. If that fingerprint is
    already in the module-level ``hashes`` list the message is a duplicate:
    ``None`` is returned and neither the message nor its follow-ups are
    counted. Otherwise the fingerprint is recorded and the text found under
    ``msg["content"]`` is measured:

    * chars -- length of the text,
    * words -- number of runs of word characters (regex based),
    * lines -- number of pieces obtained by splitting on newlines.

    When the message has a ``'follow-up'`` key, each entry is processed
    recursively and its counts are added to the result.
    """
    global hashes

    digest = hash(msg)
    if digest in hashes:
        # Seen before: duplicates are skipped silently.
        return None
    hashes.append(digest)

    body = msg["content"]
    chars = len(body)
    words = len(re.findall(r'\w+', body))
    lines = len(body.split('\n'))

    if 'follow-up' in msg:
        for reply in msg['follow-up']:
            nested = report_msg(reply)
            if nested is None:
                # Duplicate follow-up; nothing to add.
                continue
            chars += nested[0]
            words += nested[1]
            lines += nested[2]

    return (chars, words, lines)
def year_filename(fn):
    """Return the year (as an int) encoded in an archive file name.

    *fn* must be a base name of the form ``<MonthName>_<YYYY>.json``, for
    example ``December_2019.json``; ``datetime.strptime`` raises
    ``ValueError`` for anything that does not match that pattern.
    """
    stamp = datetime.strptime(fn, "%B_%Y.json")
    return stamp.year
def run(l):
    """Print per-year and overall character/word/line counts for an archive.

    Every ``*.json`` file directly inside the archive directory is loaded and
    each entry of its ``'threads'`` list is handed to ``report_msg()``. The
    resulting counts are grouped by the year that ``year_filename()`` extracts
    from the file's base name, printed year by year in ascending order, and
    finally summed into the overall totals.

    Args:
        l: Archive directory. When it does not already start with
            ``config.archives`` it is joined onto that directory.

    Raises:
        SystemExit: if the resolved path is not an existing directory.

    Note:
        ``report_msg()`` returns ``None`` for a message it has already seen;
        such messages contribute nothing, so duplicates are counted once.
    """
    # Resolve bare archive names against the configured archives directory.
    if not l.startswith(config.archives):
        l = os.path.join(config.archives, l)

    if not os.path.isdir(l):
        sys.exit(l + ' is not a valid archive. Aborting.')

    files = glob.glob(os.path.join(l, "*.json"))

    # year -> {'chars': int, 'words': int, 'lines': int}
    report = {}
    for f in files:
        # JSON text is UTF-8; do not depend on the platform's default codec.
        with open(f, encoding="utf-8") as fp:
            d = json.load(fp)

        threads = d['threads']
        if not threads:
            # A file without threads contributes no year entry at all.
            continue

        # The year depends only on the file name, so resolve it once per
        # file instead of once per thread.
        year = year_filename(os.path.basename(f))
        counts = report.setdefault(year, {'chars': 0, 'words': 0, 'lines': 0})

        for t in threads:
            x = report_msg(t)
            if x is not None:
                counts['chars'] += x[0]
                counts['words'] += x[1]
                counts['lines'] += x[2]

    print("\n\n" + l)

    sorted_report = sorted(report.items())

    # The totals are derived solely from the per-year figures. Each file is
    # read exactly once, so no message is added to the totals twice.
    totals = {}
    sections = (
        ("characters", 'chars'),
        ("words", 'words'),
        ("lines", 'lines'),
    )
    for title, key in sections:
        print("Number of written " + title + " per year:")
        for k, v in sorted_report:
            print(" " + str(k) + ": " + str(v[key]))
        totals[key] = sum(v[key] for _, v in sorted_report)

    print("Total chars: " + str(totals['chars']))
    print("Total words: " + str(totals['words']))
    print("Total lines: " + str(totals['lines']))