List_server_busy/export_xml.py

164 lines
3.7 KiB
Python
Raw Normal View History

2019-11-30 15:03:44 +01:00
import argparse, os, glob, sys, json, email.utils
2019-11-27 15:45:18 +01:00
import xml.etree.ElementTree as et
from datetime import datetime
2019-11-30 15:03:44 +01:00
import regex as re
2019-11-27 15:45:18 +01:00
def format_subject(s):
    """Collapse every run of whitespace in *s* into a single space.

    Leading and trailing whitespace is dropped as a side effect of
    splitting on whitespace.
    """
    words = s.split()
    return ' '.join(words)
def format_content(c):
    """Strip surrounding whitespace from *c* and shrink triple newlines.

    Each non-overlapping run of three newlines (scanned left to right)
    becomes a single newline.
    """
    trimmed = c.strip()
    # split/join on the triple newline is equivalent to str.replace here
    return "\n".join(trimmed.split("\n\n\n"))
2019-11-30 15:03:44 +01:00
# See for Nevejan's research
def remove_control_characters(s):
    """Return *s* without Unicode "Other" (category C*) characters.

    Carriage return, line feed and tab are kept so the text layout
    survives.  Everything else whose Unicode general category starts with
    "C" (control, format, surrogate, private use, unassigned) is dropped.
    """
    # The earlier regex r'\p{C}-[\r\n\t]+' did not express "C minus CR/LF/TAB":
    # it matched a control character followed by a literal '-' and then
    # CR/LF/TAB, so practically nothing was ever removed.
    import unicodedata  # local import keeps the file's import block untouched

    keep = '\r\n\t'
    return ''.join(
        ch for ch in s
        if ch in keep or not unicodedata.category(ch).startswith('C')
    )
def parse_date_file(fname):
    """Turn an archive file name such as 'November_2019.json' into a datetime.

    The name must be '<full month name>_<4-digit year>.json';
    datetime.strptime raises ValueError for anything else.
    """
    fname_format = '%B_%Y.json'
    return datetime.strptime(fname, fname_format)
2019-11-30 15:03:44 +01:00
# returns utc timestamp
def parse_date_msg(msg):
    """Return the UTC timestamp for ``msg['date']``, or None if unparseable.

    The date string is parsed as an RFC 2822 style date with
    email.utils.parsedate_tz and converted with email.utils.mktime_tz.
    Parsing problems are reported on stdout and never propagate to the
    caller; a missing 'date' key still raises KeyError.
    """
    date_str = msg['date']
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        # parsedate_tz() yields None for input it cannot parse, which makes
        # mktime_tz() raise TypeError -- handled below.
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        print("Format Date TypeError")
        # str() so a non-string date (e.g. None) cannot raise while reporting
        print(" > " + str(date_str))
    except ValueError:
        print("Format Date ValueError")
        print(" > " + str(date_str))
    except Exception as ex:
        # deliberate best effort: one bad date must not abort the export
        print(ex)
    return None
# recursive
def index_follow_up(msg):
    """Flatten the 'follow-up' tree below *msg* into (timestamp, message) pairs.

    Replies are visited depth-first: each reply is listed right before its
    own follow-ups.  The timestamp comes from parse_date_msg.  *msg* itself
    is not part of the result.
    """
    if 'follow-up' not in msg:
        return []

    indexed = []
    for reply in msg['follow-up']:
        indexed.append((parse_date_msg(reply), reply))
        indexed.extend(index_follow_up(reply))
    return indexed
# Running count of <mail> elements emitted so far; printed by the CLI after a year export.
nn = 0
2019-11-27 15:45:18 +01:00
def emit_mail_xml(msg, xmlel):
    """Append *msg* and all of its follow-ups to *xmlel* as <mail> elements.

    The message itself comes first, then every message in its 'follow-up'
    tree (flattened by index_follow_up), ordered by timestamp.  Follow-ups
    whose date could not be parsed (timestamp None) are placed last, in
    their original relative order.

    Args:
        msg: message dict with 'subject', 'from', 'date', 'content' and
            optionally 'to' and 'follow-up'.
        xmlel: parent ElementTree element receiving the <mail> children.
    """
    _emit_single_mail(msg, xmlel)

    if 'follow-up' not in msg:
        return

    followups = index_follow_up(msg)
    # None timestamps cannot be compared with ints, so sort them to the end.
    followups.sort(key=lambda tup: (tup[0] is None, tup[0] or 0))
    for _, followup in followups:
        # index_follow_up() already returned every descendant, so emit each
        # one exactly once; recursing here would duplicate nested replies.
        _emit_single_mail(followup, xmlel)


def _emit_single_mail(msg, xmlel):
    """Append one <mail> element for *msg* alone to *xmlel* and count it."""
    global nn
    nn += 1

    mail = et.SubElement(xmlel, 'mail')
    et.SubElement(mail, 'subject').text = format_subject(msg['subject'])
    # 'to' is the only field treated as optional
    et.SubElement(mail, 'to').text = msg['to'] if 'to' in msg else 'n/a'
    et.SubElement(mail, 'from').text = msg['from']
    et.SubElement(mail, 'date').text = msg['date']
    et.SubElement(mail, 'content').text = format_content(msg['content'])
2019-11-27 15:45:18 +01:00
def export_file(f, fout):
    """Export every thread of the JSON archive *f* as one <all> XML element.

    Args:
        f: path of a JSON file containing a top-level 'threads' list.
        fout: writable file object.  Text-mode objects receive str,
            anything else receives the raw bytes from et.tostring().
    """
    import io  # local import keeps the file's import block untouched

    with open(f, encoding='utf-8') as fp:
        d = json.load(fp)

    all_mail = et.Element('all')
    for t in d['threads']:
        emit_mail_xml(t, all_mail)

    xml_bytes = et.tostring(all_mail)
    if isinstance(fout, io.TextIOBase):
        # et.tostring() returns bytes; a text-mode file only accepts str
        fout.write(xml_bytes.decode('utf-8', 'ignore'))
    else:
        fout.write(xml_bytes)
def export_year(d, dt, fout):
    """Write one <chapter> with all monthly archives of a year to *fout*.

    Every '*.json' file in directory *d* whose name parses as
    '<Month>_<Year>.json' and whose year equals ``dt.year`` becomes a
    <section> (month name plus its mails), in calendar order.

    Args:
        d: directory containing the monthly JSON archives.
        dt: datetime whose year selects the archives to export.
        fout: text-mode writable file object receiving the XML.
    """
    chapter = et.Element('chapter')
    year = et.SubElement(chapter, 'year')
    year.text = dt.strftime('%Y')

    # SORT MONTHS BEFORE WRITING TO XML
    dates = []
    for f in glob.glob(os.path.join(d, "*.json")):
        try:
            fdt = parse_date_file(os.path.basename(f))
        except ValueError:
            # an unrelated .json file must not abort the whole export
            print("Skipping " + f + " (name is not <Month>_<Year>.json)")
            continue
        if fdt.year == dt.year:
            dates.append((fdt, f))
    dates.sort(key=lambda tup: tup[0])

    for month_dt, f in dates:
        print(f)
        section = et.SubElement(chapter, 'section')
        month = et.SubElement(section, 'month')
        month.text = month_dt.strftime('%B')
        with open(f, encoding='utf-8') as fp:
            dj = json.load(fp)
        mails = et.SubElement(section, 'mails')
        for t in dj['threads']:
            emit_mail_xml(t, mails)

    # write utf8 to file (et.tostring are bytes)
    fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
2019-11-27 15:45:18 +01:00
if __name__ == "__main__":
    # Command-line entry point.
    #   with --year: every positional argument is a directory of monthly
    #                archives; one "<dirname>_<year>.xml" is written per directory.
    #   without    : every positional argument is a single JSON archive; all of
    #                them are exported into "out.xml".
    p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
    p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+")
    p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive')
    args = p.parse_args()
    # nargs="+" guarantees at least one positional argument, so no emptiness
    # check on args.file is needed.

    if args.year:
        try:
            dt = datetime.strptime(args.year, '%Y')
        except ValueError:
            # report a usage error instead of a traceback
            p.error('invalid year: ' + args.year)

        for f in args.file:
            if not os.path.isdir(f):
                sys.exit(f + ' is not a valid directory. Aborting.')
            # normpath drops a trailing slash, which would otherwise leave
            # basename() empty and produce "_<year>.xml"
            foutname = os.path.basename(os.path.normpath(f)) + "_" + dt.strftime('%Y') + ".xml"
            with open(foutname, "w") as fout:
                export_year(f, dt, fout)
            print("nbr of message exported: " + str(nn))
    else:
        # validate everything first so a bad argument does not leave a
        # truncated out.xml behind
        for f in args.file:
            if not os.path.isfile(f):
                sys.exit(f + ' is not a valid file. Aborting.')
        # binary mode because et.tostring() produces bytes
        with open("out.xml", "wb") as fout:
            for f in args.file:
                export_file(f, fout)