From 893515735add60667310a391dec5e725b53b9cfa Mon Sep 17 00:00:00 2001
From: gauthiier <d@gauthiier.info>
Date: Mon, 2 Dec 2019 17:03:54 +0100
Subject: [PATCH] new stats + export filters

---
 .gitignore    |  1 +
 export_xml.py | 80 +++++++++++++++++++++++++++++++++++++--------------
 pdf_stats.py  | 24 ++++++++++++++++
 3 files changed, 84 insertions(+), 21 deletions(-)
 create mode 100644 pdf_stats.py

diff --git a/.gitignore b/.gitignore
index 7c4a867..b60704e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 
 .DS_Store
 archives/
+export/
 
 
 # ---> Python
diff --git a/export_xml.py b/export_xml.py
index ebe73d4..300aa54 100644
--- a/export_xml.py
+++ b/export_xml.py
@@ -1,17 +1,40 @@
-import argparse, os, glob, sys, json, email.utils
+import argparse, os, glob, sys, json, email.utils, logging
 import xml.etree.ElementTree as et
 from datetime import datetime
 import regex as re
 
+nn = 0
+logging.basicConfig(level=logging.CRITICAL)
+
+xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]') # complement of the XML 1.0 Char ranges
+nl_re = re.compile('\n\n\n')
+ind_re = re.compile('--------------------------------------------------------------------------')
+
 def format_subject(s):
 	return ' '.join(s.split())
 
 def format_content(c):
+
+	c = c.strip()
+
+	# replace each occurrence of three consecutive newlines with a single one
+	c = re.sub(nl_re, '\n', c)
+
+	## archive-specific oddities
+
+	# 1. strip the dash rulers matched by ind_re (original note: InDesign automatic overset, cause unknown)
+	# ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
+	c = re.sub(ind_re, '', c)
+
+	return c
+
+	# NOTE(review): the older `return` that follows is unreachable after the return above
 	return c.strip().replace("\n\n\n", "\n")
 
 # See for Nevejan?s research
-def remove_control_characters(s):
-    return re.sub(r'\p{C}-[\r\n\t]+', '', s)
+def remove_invalid_xml_characters(s):
+	"""Return s with every character matched by the module-level xml_re removed."""
+	return re.sub(xml_re, '', s)
     # return re.sub(r'\p{Cc}-[\r\n\t]+', '', s)
 
 def parse_date_file(fname):
@@ -25,12 +48,12 @@ def parse_date_msg(msg):
 		date_tz = email.utils.parsedate_tz(date_str)
 		time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
 	except TypeError:
-		print("Format Date TypeError")
-		print("  > " + date_str)
+		logging.warning("Format Date TypeError")
+		logging.warning("  > " + date_str)
 		return None
 	except ValueError:
-		print("Format Date ValueError")
-		print("  > " + date_str)
+		logging.warning("Format Date ValueError")
+		logging.warning("  > " + date_str)
 		return None
 	except Exception as ex:
 		print(ex)
@@ -43,12 +66,12 @@ def index_follow_up(msg):
 	if 'follow-up' in msg:
 		for m in msg['follow-up']:
 			d = parse_date_msg(m)
+			if d is None:
+				d = parse_date_msg(msg) # same as parent
 			r.append((d, m))
 			r += index_follow_up(m)
 	return r
 
-nn = 0
-
 def emit_mail_xml(msg, xmlel):
 
 	global nn
@@ -91,7 +114,7 @@ def export_file(f, fout):
 	for t in d['threads']:
 		emit_mail_xml(t, all_mail)		
 
-	fout.write(et.tostring(all_mail))		
+	fout.write(remove_invalid_xml_characters(et.tostring(all_mail).decode('utf-8', 'ignore'))) # same filtering as export_year
 
 def export_year(d, dt, fout):
 
@@ -115,7 +138,7 @@ def export_year(d, dt, fout):
 
 	for d, f in dates:
 
-		print(f)
+		logging.debug(f)
 
 		section = et.SubElement(chapter, 'section')
 		month = et.SubElement(section, 'month')
@@ -129,32 +152,47 @@ def export_year(d, dt, fout):
 			emit_mail_xml(t, mails)		
 
 	# write utf8 to file (et.tostring are bytes)
-	fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
-	# fout.write(remove_control_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
+	# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
+	fout.write(remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
 
 if __name__ == "__main__":
 
 	p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
 	p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+")
 	p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive')
+	p.add_argument('--interval', '-i', metavar='i', type=str, help='years interval')
+	p.add_argument('--output', '-o', metavar='o', type=str, help='output directory')
 
 	args = p.parse_args()
 
+	years = []
+
 	if args.year:
-		dt = datetime.strptime(args.year, '%Y')		
+		years.append(datetime.strptime(args.year, '%Y'))
+
+	if args.interval:
+		r = args.interval.split('-')
+		years = list(map(lambda x: datetime.strptime(str(x), '%Y'), range(int(r[0]), int(r[1]) + 1)))
 
 	if not args.file:
 		sys.exit('No file(s). Aborting.')
 
 # with open("out.xml", "w") as fout:
 	for f in args.file:
-		if args.year:
-			if not os.path.isdir(f):
-				sys.exit(f + ' is not a valid directory. Aborting.')
-			foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml"
-			with open(foutname, "w") as fout:
-				export_year(f, dt, fout)
-				print("nbr of message exported: " + str(nn))
+		if years and not os.path.isdir(f): # a directory is only required for per-year exports; plain files go to the else branch
+			logging.warning(f + ' is not a valid directory.')
+			continue
+		if len(years) > 0:
+			for y in years:
+				yn = y.strftime('%Y')
+				foutname = os.path.basename(os.path.normpath(f)) + "_" + yn + ".xml" # normpath: keep the dir name even with a trailing slash
+				if args.output and os.path.isdir(args.output):
+					foutname = os.path.join(args.output, foutname)
+				with open(foutname, "w") as fout:
+					print("Processing - " + yn, end='', flush=True) # flush so the progress text shows before the export runs
+					export_year(f, y, fout)
+					print(" - nbr of message exported: " + str(nn))
+					nn = 0 # reset the module-level counter for the next year
 		else:
 			if not os.path.isfile(f):
 				sys.exit(f + ' is not a valid file. Aborting.')			
diff --git a/pdf_stats.py b/pdf_stats.py
new file mode 100644
index 0000000..6ff011c
--- /dev/null
+++ b/pdf_stats.py
@@ -0,0 +1,24 @@
+import argparse, os, sys, glob
+from PyPDF2 import PdfFileReader
+
+if __name__ == "__main__":
+	# Print the page count of every PDF in a directory, followed by the grand total.
+	p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
+	p.add_argument('dir', metavar="dir", help="pdf dir")
+
+	args = p.parse_args()
+
+	if not os.path.isdir(args.dir):
+		sys.exit(args.dir + ' is not a valid directory. Aborting.')
+
+	files = sorted(glob.glob(os.path.join(args.dir, "*.pdf"))) # sorted for a stable listing order
+
+	total_pages = 0
+	for f in files:
+		with open(f, 'rb') as fp: # PDFs are binary; hand the open stream to the reader so it is closed afterwards
+			reader = PdfFileReader(fp)
+			n_pages = reader.getNumPages()
+			print(f + " - nbr. pages: " + str(n_pages))
+			total_pages += n_pages
+
+	print(". . . . \n Total pages: " + str(total_pages))