colophon

2019-12-31 17:54:52 +01:00
parent dd0d9469ea
commit 18ed080652
3 changed files with 792 additions and 21 deletions
@@ -0,0 +1,5 @@
+from pathlib import Path
+
+# Recursively collect every Python source file below the current directory.
+m = [*Path('.').rglob('*.py')]
+# Emit one path per line, in the order rglob yielded them.
+for f in m:
+	print(f)
@@ -1,40 +1,100 @@
-import argparse, os, sys, glob, json
+import argparse, os, sys, glob, re, json, hashlib, logging
+from datetime import datetime
+import config
+import collections
+

+# Default archive directory. The run() added by this change resolves paths
+# against config.archives and no longer references this constant.
 ARCH = "archives/"

+# SHA-256 digests of the messages already counted by report_msg(); kept at
+# module level so the duplicate check spans every file that gets processed.
+hashes = []
+
+def hash(m):
+	"""Return the SHA-256 hex digest that identifies message `m`.
+
+	The digest covers the message's 'from', 'subject' and 'date' values,
+	concatenated in that order without a separator and encoded as UTF-8.
+	Note that this name shadows the built-in hash() within this module.
+	"""
+	key = m['from'] + m['subject'] + m['date']
+	digest = hashlib.sha256()
+	digest.update(key.encode("utf-8"))
+	return digest.hexdigest()
+
+def report_msg(msg):
+	"""Count the characters, words and lines of `msg` and of its follow-ups.
+
+	Returns a (chars, words, lines) tuple, or None when a message with the
+	same hash() digest has already been counted. A duplicate is skipped
+	silently and its follow-ups are not visited. The digest of every newly
+	counted message is appended to the module-level `hashes` list.
+	"""
+	global hashes
+
+	digest = hash(msg)
+	if digest in hashes:
+		# Already counted elsewhere: contribute nothing.
+		return None
+	hashes.append(digest)
+
+	body = msg["content"]
+	chars = len(body)
+	# A word is any maximal run of \w characters.
+	words = len(re.findall(r'\w+', body))
+	lines = len(body.split('\n'))
+
+	if 'follow-up' in msg:
+		for reply in msg['follow-up']:
+			sub = report_msg(reply)
+			if sub is None:
+				# Duplicate reply: nothing to add.
+				continue
+			chars += sub[0]
+			words += sub[1]
+			lines += sub[2]
+
+	return (chars, words, lines)
+
+def year_filename(fn):
+	"""Return the year (as an int) encoded in an archive file name.
+
+	`fn` is parsed with the strptime pattern "%B_%Y.json": a full month
+	name, an underscore, a four-digit year and the ".json" suffix, for
+	example "January_2019.json". strptime raises ValueError otherwise.
+	"""
+	parsed = datetime.strptime(fn, "%B_%Y.json")
+	return parsed.year
+

 def run(l):
+	"""Print per-year and total character, word and line counts for archive `l`.
+
+	`l` is an archive directory; when it does not already start with
+	config.archives it is joined onto that prefix. Every "*.json" file in
+	the directory is loaded, each entry of its 'threads' list is passed to
+	report_msg(), and the resulting counts are added to the year taken from
+	the file name by year_filename(). Calls sys.exit() with a message when
+	the resolved path is not a directory.
+	"""

-	if not l.startswith(ARCH):
-		l = os.path.join(ARCH, l)
+	# `hashes` is never reset here, so duplicates are also skipped across
+	# successive run() calls. NOTE(review): confirm that this is intended.
+	global hashes
+
+	if not l.startswith(config.archives):
+		l = os.path.join(config.archives, l)

 	if not os.path.isdir(l):
 		sys.exit(l + ' is not a valid archive. Aborting.')

 	files = [f for f in glob.glob(os.path.join(l, "*.json"))]

+	report = {}
+
+	for f in files:
+		with open(f) as fp:
+			d = json.load(fp)
+
+		# Resolve the year bucket first so that each thread can be added to it.
+		year = year_filename(os.path.basename(f))
+		counts = report.setdefault(year, {'chars': 0, 'words': 0, 'lines': 0})
+
+		for t in d['threads']:
+			x = report_msg(t)
+			# Accumulate inside the loop: tallying after it would keep only the
+			# last thread of each file (and fail or reuse a stale value when a
+			# file has no threads at all).
+			if x is not None:
+				counts['chars'] += x[0]
+				counts['words'] += x[1]
+				counts['lines'] += x[2]
+
 	total_chars = 0
 	total_words = 0
 	total_lines = 0
-	for f in files:		
-		with open(f) as fp:
-			d = json.load(fp)
-			# print(d['name'])
-			chars = 0
-			words = 0
-			lines = 0
-			for t in d['threads']:
-				chars += len(t["content"])
-				words += len(t["content"].split())
-				lines += len(t["content"].split('\n'))
-			# print("	chars: " + str(chars))
-			# print("	words: " + str(words))
-			# print("	lines: " + str(lines))
-			total_chars += chars
-			total_words += words
-			total_lines += lines
-
 	print("\n\n" + l)
+	print("Number of written characters per year:")
+	# Sort by year so that every section lists the years in ascending order.
+	sorted_report = collections.OrderedDict(sorted(report.items()))
+	for k, v in sorted_report.items():
+		print("	" + str(k) + ": " + str(v['chars']))
+		total_chars += v['chars']
+	print("Number of written words per year:")
+	for k, v in sorted_report.items():
+		print("	" + str(k) + ": " + str(v['words']))
+		total_words += v['words']
+	print("Number of written lines per year:")
+	for k, v in sorted_report.items():
+		print("	" + str(k) + ": " + str(v['lines']))
+		total_lines += v['lines']
+
+	
 	print("Total chars: " + str(total_chars))
 	print("Total words: " + str(total_words))
 	print("Total lines: " + str(total_lines))