new stats + export filters
This commit is contained in:
parent
f2b98963d4
commit
893515735a
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
.DS_Store
|
.DS_Store
|
||||||
archives/
|
archives/
|
||||||
|
export/
|
||||||
|
|
||||||
|
|
||||||
# ---> Python
|
# ---> Python
|
||||||
|
|||||||
@ -1,17 +1,40 @@
|
|||||||
import argparse, os, glob, sys, json, email.utils
|
import argparse, os, glob, sys, json, email.utils, logging
|
||||||
import xml.etree.ElementTree as et
|
import xml.etree.ElementTree as et
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
# Module-level message counter: the export loop prints it as the number of
# messages exported and resets it to 0 after each exported year.
# NOTE(review): presumably incremented by emit_mail_xml (it declares
# `global nn`) -- confirm against the full function body.
nn = 0

# Root logger threshold is CRITICAL, so the logging.warning()/logging.debug()
# diagnostics used by the export code are silenced unless this level is lowered.
logging.basicConfig(level=logging.CRITICAL)

# Matches every character that is NOT allowed by the XML 1.0 `Char` production:
#   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# The supplementary-plane range (U+10000-U+10FFFF) is part of that production;
# without it, valid characters such as emoji were stripped from the export.
xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')

# Exactly three consecutive newlines.
nl_re = re.compile('\n\n\n')

# Long dash-only separator line found in some message bodies.
ind_re = re.compile('--------------------------------------------------------------------------')
|
||||||
|
|
||||||
def format_subject(s):
    """Collapse each run of whitespace in ``s`` to one space and trim both ends."""
    words = s.split()
    return ' '.join(words)
|
||||||
|
|
||||||
def format_content(c):
    """Clean up a message body before it is exported.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. replace every match of ``nl_re`` (three consecutive newlines) with a
         single newline;
      3. delete every match of ``ind_re`` (the long dash separator line).

    Returns the cleaned string.
    """
    c = c.strip()

    # new lines
    c = nl_re.sub('\n', c)

    ## weird stuff

    # 1. indesign automatic overset... (? dunno why ?)
    # ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
    c = ind_re.sub('', c)

    # The previous one-line implementation used to follow this return as an
    # unreachable statement; it has been removed.
    return c
|
||||||
|
|
||||||
# See for Nevejan?s research
|
# See for Nevejan?s research
|
||||||
def remove_invalid_xml_characters(s):
    """Return ``s`` with every match of the module-level ``xml_re`` pattern removed."""
    cleaned = xml_re.sub('', s)
    return cleaned
|
||||||
|
|
||||||
def parse_date_file(fname):
|
def parse_date_file(fname):
|
||||||
@ -25,12 +48,12 @@ def parse_date_msg(msg):
|
|||||||
date_tz = email.utils.parsedate_tz(date_str)
|
date_tz = email.utils.parsedate_tz(date_str)
|
||||||
time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
|
time_tz = email.utils.mktime_tz(date_tz) #utc timestamp
|
||||||
except TypeError:
|
except TypeError:
|
||||||
print("Format Date TypeError")
|
logging.warning("Format Date TypeError")
|
||||||
print(" > " + date_str)
|
loggin.warning(" > " + date_str)
|
||||||
return None
|
return None
|
||||||
except ValueError:
|
except ValueError:
|
||||||
print("Format Date ValueError")
|
loggin.warning("Format Date ValueError")
|
||||||
print(" > " + date_str)
|
loggin.warning(" > " + date_str)
|
||||||
return None
|
return None
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
print(ex)
|
print(ex)
|
||||||
@ -43,12 +66,12 @@ def index_follow_up(msg):
|
|||||||
if 'follow-up' in msg:
|
if 'follow-up' in msg:
|
||||||
for m in msg['follow-up']:
|
for m in msg['follow-up']:
|
||||||
d = parse_date_msg(m)
|
d = parse_date_msg(m)
|
||||||
|
if d is None:
|
||||||
|
d = parse_date_msg(msg) # same as parent
|
||||||
r.append((d, m))
|
r.append((d, m))
|
||||||
r += index_follow_up(m)
|
r += index_follow_up(m)
|
||||||
return r
|
return r
|
||||||
|
|
||||||
nn = 0
|
|
||||||
|
|
||||||
def emit_mail_xml(msg, xmlel):
|
def emit_mail_xml(msg, xmlel):
|
||||||
|
|
||||||
global nn
|
global nn
|
||||||
@ -91,7 +114,7 @@ def export_file(f, fout):
|
|||||||
for t in d['threads']:
|
for t in d['threads']:
|
||||||
emit_mail_xml(t, all_mail)
|
emit_mail_xml(t, all_mail)
|
||||||
|
|
||||||
fout.write(et.tostring(all_mail))
|
fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))
|
||||||
|
|
||||||
def export_year(d, dt, fout):
|
def export_year(d, dt, fout):
|
||||||
|
|
||||||
@ -115,7 +138,7 @@ def export_year(d, dt, fout):
|
|||||||
|
|
||||||
for d, f in dates:
|
for d, f in dates:
|
||||||
|
|
||||||
print(f)
|
logging.debug(f)
|
||||||
|
|
||||||
section = et.SubElement(chapter, 'section')
|
section = et.SubElement(chapter, 'section')
|
||||||
month = et.SubElement(section, 'month')
|
month = et.SubElement(section, 'month')
|
||||||
@ -129,32 +152,47 @@ def export_year(d, dt, fout):
|
|||||||
emit_mail_xml(t, mails)
|
emit_mail_xml(t, mails)
|
||||||
|
|
||||||
# write utf8 to file (et.tostring are bytes)
|
# write utf8 to file (et.tostring are bytes)
|
||||||
fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
||||||
# fout.write(remove_control_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
|
fout.write(remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
|
p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
|
||||||
p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+")
|
p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+")
|
||||||
p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive')
|
p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive')
|
||||||
|
p.add_argument('--interval', '-i', metavar='i', type=str, help='years interval')
|
||||||
|
p.add_argument('--output', '-o', metavar='o', type=str, help='output directory')
|
||||||
|
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
|
years = []
|
||||||
|
|
||||||
if args.year:
|
if args.year:
|
||||||
dt = datetime.strptime(args.year, '%Y')
|
years.append(datetime.strptime(args.year, '%Y'))
|
||||||
|
|
||||||
|
if args.interval:
|
||||||
|
r = args.interval.split('-')
|
||||||
|
years = list(map(lambda x: datetime.strptime(str(x), '%Y'), range(int(r[0]), int(r[1]) + 1)))
|
||||||
|
|
||||||
if not args.file:
|
if not args.file:
|
||||||
sys.exit('No file(s). Aborting.')
|
sys.exit('No file(s). Aborting.')
|
||||||
|
|
||||||
# with open("out.xml", "w") as fout:
|
# with open("out.xml", "w") as fout:
|
||||||
for f in args.file:
|
for f in args.file:
|
||||||
if args.year:
|
|
||||||
if not os.path.isdir(f):
|
if not os.path.isdir(f):
|
||||||
sys.exit(f + ' is not a valid directory. Aborting.')
|
logging.warning(f + ' is not a valid directory.')
|
||||||
foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml"
|
continue
|
||||||
|
if len(years) > 0:
|
||||||
|
for y in years:
|
||||||
|
yn = y.strftime('%Y')
|
||||||
|
foutname = os.path.basename(f) + "_" + yn + ".xml"
|
||||||
|
if args.output and os.path.isdir(args.output):
|
||||||
|
foutname = os.path.join(args.output, foutname)
|
||||||
with open(foutname, "w") as fout:
|
with open(foutname, "w") as fout:
|
||||||
export_year(f, dt, fout)
|
sys.stdout.write("Processing - " + yn)
|
||||||
print("nbr of message exported: " + str(nn))
|
export_year(f, y, fout)
|
||||||
|
print(" - nbr of message exported: " + str(nn))
|
||||||
|
nn = 0
|
||||||
else:
|
else:
|
||||||
if not os.path.isfile(f):
|
if not os.path.isfile(f):
|
||||||
sys.exit(f + ' is not a valid file. Aborting.')
|
sys.exit(f + ' is not a valid file. Aborting.')
|
||||||
|
|||||||
24
pdf_stats.py
Normal file
24
pdf_stats.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import argparse, os, sys, glob
|
||||||
|
from PyPDF2 import PdfFileReader
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Print the page count of every *.pdf in a directory, then the grand total.
    parser = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
    parser.add_argument('dir', metavar="dir", help="pdf dir")

    args = parser.parse_args()

    if not os.path.isdir(args.dir):
        # Previously this concatenated an undefined name (`l`), which raised
        # NameError instead of exiting with the intended message.
        sys.exit(args.dir + ' is not a valid directory. Aborting.')

    files = glob.glob(os.path.join(args.dir, "*.pdf"))

    total_pages = 0
    for f in files:
        # PDFs are binary: open in 'rb' and hand the stream to the reader, so
        # the handle we open is the one that is read and then closed.
        with open(f, 'rb') as fp:
            reader = PdfFileReader(fp)
            num_pages = reader.getNumPages()
        print(f + " - nbr. pages: " + str(num_pages))
        total_pages += num_pages

    print(". . . . \n Total pages: " + str(total_pages))
|
||||||
Loading…
x
Reference in New Issue
Block a user