new stats + export filters

This commit is contained in:
gauthiier 2019-12-02 17:03:54 +01:00
parent f2b98963d4
commit 893515735a
3 changed files with 84 additions and 21 deletions

1
.gitignore vendored
View File

@ -2,6 +2,7 @@
.DS_Store .DS_Store
archives/ archives/
export/
# ---> Python # ---> Python

View File

@ -1,17 +1,40 @@
import argparse, os, glob, sys, json, email.utils import argparse, os, glob, sys, json, email.utils, logging
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
from datetime import datetime from datetime import datetime
import regex as re import regex as re
# Running count of exported messages; updated through `global nn` by
# emit_mail_xml and reported/reset by the command-line entry point.
nn = 0

# Only CRITICAL records are shown, so the warning/debug calls in this
# module stay silent unless this level is lowered.
logging.basicConfig(level=logging.CRITICAL)

# Matches every character that is NOT allowed by the XML 1.0 `Char`
# production: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
# [#x10000-#x10FFFF].  The supplementary-plane range is included so that
# valid characters above U+FFFF (e.g. emoji) are not stripped.
xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')

# Exactly three consecutive newlines (collapsed to one by format_content).
nl_re = re.compile('\n\n\n')

# Long ruler line made of dashes (removed by format_content).
ind_re = re.compile('--------------------------------------------------------------------------')


def format_subject(s):
    """Return the subject *s* with every run of whitespace collapsed.

    Leading/trailing whitespace is dropped and any internal run of
    whitespace (including newlines) becomes a single space.
    """
    return ' '.join(s.split())


def format_content(c):
    """Return the message body *c* cleaned up for export.

    The text is stripped, each run of three newlines is replaced by a
    single newline, and dash ruler lines matching ``ind_re`` are removed.
    """
    c = c.strip()

    # new lines
    c = nl_re.sub('\n', c)

    ## weird stuff
    # 1. indesign automatic overset... (? dunno why ?)
    #    ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
    c = ind_re.sub('', c)

    # The former `return c.strip().replace("\n\n\n", "\n")` that followed
    # this return was unreachable and has been dropped.
    return c


# See for Nevejan?s research
# NOTE(review): comment above kept verbatim from the original file; it
# presumably points at a message containing an invalid character -- confirm.
def remove_invalid_xml_characters(s):
    """Return *s* with every character that is illegal in XML 1.0 removed.

    Tab, line feed and carriage return are kept; other control characters,
    U+FFFE and U+FFFF are dropped.
    """
    # Earlier attempt, kept for reference:
    #   re.sub(r'\p{Cc}-[\r\n\t]+', '', s)
    return xml_re.sub('', s)
def parse_date_file(fname): def parse_date_file(fname):
@ -25,12 +48,12 @@ def parse_date_msg(msg):
date_tz = email.utils.parsedate_tz(date_str) date_tz = email.utils.parsedate_tz(date_str)
time_tz = email.utils.mktime_tz(date_tz) #utc timestamp time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
except TypeError: except TypeError:
print("Format Date TypeError") logging.warning("Format Date TypeError")
print(" > " + date_str) loggin.warning(" > " + date_str)
return None return None
except ValueError: except ValueError:
print("Format Date ValueError") loggin.warning("Format Date ValueError")
print(" > " + date_str) loggin.warning(" > " + date_str)
return None return None
except Exception as ex: except Exception as ex:
print(ex) print(ex)
@ -43,12 +66,12 @@ def index_follow_up(msg):
if 'follow-up' in msg: if 'follow-up' in msg:
for m in msg['follow-up']: for m in msg['follow-up']:
d = parse_date_msg(m) d = parse_date_msg(m)
if d is None:
d = parse_date_msg(msg) # same as parent
r.append((d, m)) r.append((d, m))
r += index_follow_up(m) r += index_follow_up(m)
return r return r
nn = 0
def emit_mail_xml(msg, xmlel): def emit_mail_xml(msg, xmlel):
global nn global nn
@ -91,7 +114,7 @@ def export_file(f, fout):
for t in d['threads']: for t in d['threads']:
emit_mail_xml(t, all_mail) emit_mail_xml(t, all_mail)
fout.write(et.tostring(all_mail)) fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))
def export_year(d, dt, fout): def export_year(d, dt, fout):
@ -115,7 +138,7 @@ def export_year(d, dt, fout):
for d, f in dates: for d, f in dates:
print(f) logging.debug(f)
section = et.SubElement(chapter, 'section') section = et.SubElement(chapter, 'section')
month = et.SubElement(section, 'month') month = et.SubElement(section, 'month')
@ -129,32 +152,47 @@ def export_year(d, dt, fout):
emit_mail_xml(t, mails) emit_mail_xml(t, mails)
# write utf8 to file (et.tostring are bytes) # write utf8 to file (et.tostring are bytes)
fout.write(et.tostring(chapter).decode('utf-8', 'ignore')) # fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
# fout.write(remove_control_characters(et.tostring(chapter).decode('utf-8', 'ignore'))) fout.write(remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
if __name__ == "__main__": if __name__ == "__main__":
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!') p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+") p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+")
p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive') p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive')
p.add_argument('--interval', '-i', metavar='i', type=str, help='years interval')
p.add_argument('--output', '-o', metavar='o', type=str, help='output directory')
args = p.parse_args() args = p.parse_args()
years = []
if args.year: if args.year:
dt = datetime.strptime(args.year, '%Y') years.append(datetime.strptime(args.year, '%Y'))
if args.interval:
r = args.interval.split('-')
years = list(map(lambda x: datetime.strptime(str(x), '%Y'), range(int(r[0]), int(r[1]) + 1)))
if not args.file: if not args.file:
sys.exit('No file(s). Aborting.') sys.exit('No file(s). Aborting.')
# with open("out.xml", "w") as fout: # with open("out.xml", "w") as fout:
for f in args.file: for f in args.file:
if args.year:
if not os.path.isdir(f): if not os.path.isdir(f):
sys.exit(f + ' is not a valid directory. Aborting.') logging.warning(f + ' is not a valid directory.')
foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml" continue
if len(years) > 0:
for y in years:
yn = y.strftime('%Y')
foutname = os.path.basename(f) + "_" + yn + ".xml"
if args.output and os.path.isdir(args.output):
foutname = os.path.join(args.output, foutname)
with open(foutname, "w") as fout: with open(foutname, "w") as fout:
export_year(f, dt, fout) sys.stdout.write("Processing - " + yn)
print("nbr of message exported: " + str(nn)) export_year(f, y, fout)
print(" - nbr of message exported: " + str(nn))
nn = 0
else: else:
if not os.path.isfile(f): if not os.path.isfile(f):
sys.exit(f + ' is not a valid file. Aborting.') sys.exit(f + ' is not a valid file. Aborting.')

24
pdf_stats.py Normal file
View File

@ -0,0 +1,24 @@
import argparse, os, sys, glob
from PyPDF2 import PdfFileReader
if __name__ == "__main__":
    # Command-line entry point: print the page count of every "*.pdf" file
    # directly inside the given directory, followed by the grand total.
    parser = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
    parser.add_argument('dir', metavar="dir", help="pdf dir")
    args = parser.parse_args()

    if not os.path.isdir(args.dir):
        # The message previously used an undefined name (`l`), which raised
        # NameError instead of reporting the offending path.
        sys.exit(args.dir + ' is not a valid directory. Aborting.')

    # Sorted so the report order is deterministic (glob order is arbitrary).
    files = sorted(glob.glob(os.path.join(args.dir, "*.pdf")))

    total_pages = 0
    for path in files:
        # PDFs are binary: open in 'rb' and hand the open stream to the
        # reader (the handle used to be opened in text mode and ignored).
        with open(path, 'rb') as fp:
            reader = PdfFileReader(fp)
            num_pages = reader.getNumPages()
        print(path + " - nbr. pages: " + str(num_pages))
        total_pages += num_pages

    print(". . . . \n Total pages: " + str(total_pages))