diff --git a/conda_env.yml b/conda_env.yml new file mode 100644 index 0000000..b1e0836 --- /dev/null +++ b/conda_env.yml @@ -0,0 +1,23 @@ +name: annalen +channels: + - defaults +dependencies: + - ca-certificates=2019.10.16=0 + - certifi=2019.9.11=py37_0 + - libcxx=4.0.1=hcfea43d_1 + - libcxxabi=4.0.1=hcfea43d_1 + - libedit=3.1.20181209=hb402a30_0 + - libffi=3.2.1=h475c297_4 + - ncurses=6.1=h0a44026_1 + - openssl=1.1.1d=h1de35cc_3 + - pip=19.3.1=py37_0 + - python=3.7.5=h359304d_0 + - readline=7.0=h1de35cc_5 + - regex=2019.11.1=py37h1de35cc_0 + - setuptools=42.0.1=py37_0 + - sqlite=3.30.1=ha441bb4_0 + - tk=8.6.8=ha441bb4_0 + - wheel=0.33.6=py37_0 + - xz=5.2.4=h1de35cc_4 + - zlib=1.2.11=h1de35cc_3 + diff --git a/export_xml.py b/export_xml.py index 2ffb6ae..ebe73d4 100644 --- a/export_xml.py +++ b/export_xml.py @@ -1,6 +1,7 @@ -import argparse, os, glob, sys, json +import argparse, os, glob, sys, json, email.utils import xml.etree.ElementTree as et from datetime import datetime +import regex as re def format_subject(s): return ' '.join(s.split()) @@ -8,18 +9,61 @@ def format_subject(s): def format_content(c): return c.strip().replace("\n\n\n", "\n") -def parse_date(fname): +# Strip Unicode "Other" (category C) characters, keeping CR/LF/TAB. See for Nevejan's research +def remove_control_characters(s): + return re.sub(r'[^\P{C}\r\n\t]+', '', s) + # Cc-only alternative: return re.sub(r'[^\P{Cc}\r\n\t]+', '', s) + +def parse_date_file(fname): return datetime.strptime(fname, '%B_%Y.json') +# returns utc timestamp, or None when the date cannot be parsed +def parse_date_msg(msg): + date_str = msg['date'] + time_tz = None + try: + date_tz = email.utils.parsedate_tz(date_str) + time_tz = email.utils.mktime_tz(date_tz) #utc timestamp + except TypeError: + print("Format Date TypeError") + print(" > " + str(date_str)) + return None + except ValueError: + print("Format Date ValueError") + print(" > " + str(date_str)) + return None + except Exception as ex: + print(ex) + finally: + return time_tz + +# recursive: collects (timestamp, message) for every nested follow-up +def index_follow_up(msg): + r = [] + if 'follow-up' in msg: + for m in msg['follow-up']: + d = 
parse_date_msg(m) + r.append((d, m)) + r += index_follow_up(m) + return r + +nn = 0 + def emit_mail_xml(msg, xmlel): + global nn + nn += 1 + mail = et.SubElement(xmlel, 'mail') subject = et.SubElement(mail, 'subject') subject.text = format_subject(msg['subject']) to = et.SubElement(mail, 'to') - to.text = msg['to'] + if 'to' in msg: + to.text = msg['to'] + else: + to.text = 'n/a' from_ = et.SubElement(mail, 'from') from_.text = msg['from'] @@ -28,13 +72,14 @@ def emit_mail_xml(msg, xmlel): date.text = msg['date'] content = et.SubElement(mail, 'content') - ## unescape chars ... content.text = format_content(msg['content']) - if msg['follow-up']: - print('follow-up') - - # **** RECURSIVE "follow-up" **** + # recursive "follow-up": emit every nested reply once, oldest first, undated last + if 'follow-up' in msg: + followups = index_follow_up(msg) + followups.sort(key=lambda tup: (tup[0] is None, tup[0] or 0)) + for d, f in followups: + emit_mail_xml({k: v for k, v in f.items() if k != 'follow-up'}, xmlel) def export_file(f, fout): @@ -60,7 +105,7 @@ def export_year(d, dt, fout): dates = [] for f in dir_files: - fdt = parse_date(os.path.basename(f)) + fdt = parse_date_file(os.path.basename(f)) if dt.year != fdt.year: continue @@ -70,7 +115,6 @@ def export_year(d, dt, fout): for d, f in dates: - print(d) print(f) section = et.SubElement(chapter, 'section') @@ -84,7 +128,9 @@ def export_year(d, dt, fout): for t in dj['threads']: emit_mail_xml(t, mails) - fout.write(et.tostring(chapter)) + # write utf8 to file (et.tostring are bytes) + fout.write(et.tostring(chapter).decode('utf-8', 'ignore')) + # fout.write(remove_control_characters(et.tostring(chapter).decode('utf-8', 'ignore'))) if __name__ == "__main__": @@ -106,8 +152,9 @@ if __name__ == "__main__": if not os.path.isdir(f): sys.exit(f + ' is not a valid directory. 
Aborting.') foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml" - with open(foutname, "wb") as fout: + with open(foutname, "w", encoding="utf-8") as fout: export_year(f, dt, fout) + print("nbr of message exported: " + str(nn)) else: if not os.path.isfile(f): sys.exit(f + ' is not a valid file. Aborting.') diff --git a/setenv b/setenv new file mode 100755 index 0000000..38bb5cb --- /dev/null +++ b/setenv @@ -0,0 +1,2 @@ +# activate conda venv +source activate annalen \ No newline at end of file