From 893515735add60667310a391dec5e725b53b9cfa Mon Sep 17 00:00:00 2001 From: gauthiier Date: Mon, 2 Dec 2019 17:03:54 +0100 Subject: [PATCH] new stats + export filters --- .gitignore | 1 + export_xml.py | 80 +++++++++++++++++++++++++++++++++++++-------------- pdf_stats.py | 24 ++++++++++++++++ 3 files changed, 84 insertions(+), 21 deletions(-) create mode 100644 pdf_stats.py diff --git a/.gitignore b/.gitignore index 7c4a867..b60704e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ .DS_Store archives/ +export/ # ---> Python diff --git a/export_xml.py b/export_xml.py index ebe73d4..300aa54 100644 --- a/export_xml.py +++ b/export_xml.py @@ -1,17 +1,40 @@ -import argparse, os, glob, sys, json, email.utils +import argparse, os, glob, sys, json, email.utils, logging import xml.etree.ElementTree as et from datetime import datetime import regex as re +nn = 0 +logging.basicConfig(level=logging.CRITICAL) + +xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]') +nl_re = re.compile('\n\n\n') +ind_re = re.compile('--------------------------------------------------------------------------') + def format_subject(s): return ' '.join(s.split()) def format_content(c): + + c = c.strip() + + # new lines + c = re.sub(nl_re, '\n', c) + + ## weird stuff + + # 1. indesign automatic overset... (? dunno why ?) 
+ # ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime) + c = re.sub(ind_re, '', c) + + return c + + return c.strip().replace("\n\n\n", "\n") # See for Nevejan?s research -def remove_control_characters(s): - return re.sub(r'\p{C}-[\r\n\t]+', '', s) +def remove_invalid_xml_characters(s): + return re.sub(xml_re, '', s) + # return re.sub(r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]', '', s) # return re.sub(r'\p{Cc}-[\r\n\t]+', '', s) def parse_date_file(fname): @@ -25,12 +48,12 @@ def parse_date_msg(msg): date_tz = email.utils.parsedate_tz(date_str) time_tz = email.utils.mktime_tz(date_tz) #utc timestamp except TypeError: - print("Format Date TypeError") - print(" > " + date_str) + logging.warning("Format Date TypeError") + logging.warning(" > " + date_str) return None except ValueError: - print("Format Date ValueError") - print(" > " + date_str) + logging.warning("Format Date ValueError") + logging.warning(" > " + date_str) return None except Exception as ex: print(ex) @@ -43,12 +66,12 @@ def index_follow_up(msg): if 'follow-up' in msg: for m in msg['follow-up']: d = parse_date_msg(m) + if d is None: + d = parse_date_msg(msg) # same as parent r.append((d, m)) r += index_follow_up(m) return r -nn = 0 - def emit_mail_xml(msg, xmlel): global nn @@ -91,7 +114,7 @@ def export_file(f, fout): for t in d['threads']: emit_mail_xml(t, all_mail) - fout.write(et.tostring(all_mail)) + fout.write(et.tostring(all_mail).decode('utf-8', 'ignore')) def export_year(d, dt, fout): @@ -115,7 +138,7 @@ def export_year(d, dt, fout): for d, f in dates: - print(f) + logging.debug(f) section = et.SubElement(chapter, 'section') month = et.SubElement(section, 'month') @@ -129,32 +152,47 @@ def export_year(d, dt, fout): emit_mail_xml(t, mails) # write utf8 to file (et.tostring are bytes) - fout.write(et.tostring(chapter).decode('utf-8', 'ignore')) - # fout.write(remove_control_characters(et.tostring(chapter).decode('utf-8', 'ignore'))) + # 
fout.write(et.tostring(chapter).decode('utf-8', 'ignore')) + fout.write(remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore'))) if __name__ == "__main__": p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!') p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+") p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive') + p.add_argument('--interval', '-i', metavar='i', type=str, help='years interval') + p.add_argument('--output', '-o', metavar='o', type=str, help='output directory') args = p.parse_args() + years = [] + if args.year: - dt = datetime.strptime(args.year, '%Y') + years.append(datetime.strptime(args.year, '%Y')) + + if args.interval: + r = args.interval.split('-') + years = list(map(lambda x: datetime.strptime(str(x), '%Y'), range(int(r[0]), int(r[1]) + 1))) if not args.file: sys.exit('No file(s). Aborting.') # with open("out.xml", "w") as fout: for f in args.file: - if args.year: - if not os.path.isdir(f): - sys.exit(f + ' is not a valid directory. Aborting.') - foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml" - with open(foutname, "w") as fout: - export_year(f, dt, fout) - print("nbr of message exported: " + str(nn)) + if not os.path.isdir(f): + logging.warning(f + ' is not a valid directory.') + continue + if len(years) > 0: + for y in years: + yn = y.strftime('%Y') + foutname = os.path.basename(f) + "_" + yn + ".xml" + if args.output and os.path.isdir(args.output): + foutname = os.path.join(args.output, foutname) + with open(foutname, "w") as fout: + sys.stdout.write("Processing - " + yn) + export_year(f, y, fout) + print(" - nbr of message exported: " + str(nn)) + nn = 0 else: if not os.path.isfile(f): sys.exit(f + ' is not a valid file. 
Aborting.') diff --git a/pdf_stats.py b/pdf_stats.py new file mode 100644 index 0000000..6ff011c --- /dev/null +++ b/pdf_stats.py @@ -0,0 +1,24 @@ +import argparse, os, sys, glob +from PyPDF2 import PdfFileReader + +if __name__ == "__main__": + + p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!') + p.add_argument('dir', metavar="dir", help="pdf dir") + + args = p.parse_args() + + if not os.path.isdir(args.dir): + sys.exit(args.dir + ' is not a valid directory. Aborting.') + + files = [f for f in glob.glob(os.path.join(args.dir, "*.pdf"))] + + total_pages = 0 + for f in files: + with open(f) as fp: + p = PdfFileReader(f) + np = p.getNumPages() + print(f + " - nbr. pages: " + str(np)) + total_pages += np + + print(". . . . \n Total pages: " + str(total_pages))