env files

This commit is contained in:
gauthiier 2019-11-30 15:03:44 +01:00
parent 6766c86c87
commit f2b98963d4
3 changed files with 84 additions and 12 deletions

23
conda_env.yml Normal file
View File

@ -0,0 +1,23 @@
name: annalen
channels:
- defaults
dependencies:
- ca-certificates=2019.10.16=0
- certifi=2019.9.11=py37_0
- libcxx=4.0.1=hcfea43d_1
- libcxxabi=4.0.1=hcfea43d_1
- libedit=3.1.20181209=hb402a30_0
- libffi=3.2.1=h475c297_4
- ncurses=6.1=h0a44026_1
- openssl=1.1.1d=h1de35cc_3
- pip=19.3.1=py37_0
- python=3.7.5=h359304d_0
- readline=7.0=h1de35cc_5
- regex=2019.11.1=py37h1de35cc_0
- setuptools=42.0.1=py37_0
- sqlite=3.30.1=ha441bb4_0
- tk=8.6.8=ha441bb4_0
- wheel=0.33.6=py37_0
- xz=5.2.4=h1de35cc_4
- zlib=1.2.11=h1de35cc_3

View File

@ -1,6 +1,7 @@
import argparse, os, glob, sys, json import argparse, os, glob, sys, json, email.utils
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
from datetime import datetime from datetime import datetime
import regex as re
def format_subject(s): def format_subject(s):
return ' '.join(s.split()) return ' '.join(s.split())
@ -8,18 +9,61 @@ def format_subject(s):
def format_content(c): def format_content(c):
return c.strip().replace("\n\n\n", "\n") return c.strip().replace("\n\n\n", "\n")
def parse_date(fname): # See for Nevejan?s research
def remove_control_characters(s):
    """Return *s* without Unicode "Other" (category ``C*``) characters.

    Carriage return, line feed and tab are kept so the text layout survives.

    The previous pattern ``\\p{C}-[\\r\\n\\t]+`` was not a set difference: it
    only matched a category-C character followed by a literal ``-`` and a run
    of CR/LF/TAB, so ordinary control characters were never removed.

    Args:
        s: the text to clean.

    Returns:
        A new string with every category-C character removed, except
        ``\\r``, ``\\n`` and ``\\t``.
    """
    # Local import keeps this block self-contained; unicodedata is stdlib.
    import unicodedata

    preserved = '\r\n\t'
    # NOTE: category 'C*' covers control (Cc), format (Cf), surrogate (Cs),
    # private-use (Co) and unassigned (Cn) code points. To strip control
    # codes only, test for ``category(ch) == 'Cc'`` instead.
    return ''.join(
        ch for ch in s
        if ch in preserved or not unicodedata.category(ch).startswith('C')
    )
def parse_date_file(fname):
    """Turn a monthly archive file name into a ``datetime``.

    Args:
        fname: base name made of the full month name, an underscore, the
            four-digit year and the ``.json`` suffix (e.g. ``March_2019.json``).

    Returns:
        The ``datetime`` parsed from the month and year in *fname*.

    Raises:
        ValueError: if *fname* does not match that layout.
    """
    filename_layout = '%B_%Y.json'
    parsed = datetime.strptime(fname, filename_layout)
    return parsed
def parse_date_msg(msg):
    """Convert the ``date`` field of a message into a UTC timestamp.

    The previous version returned from a ``finally`` clause, which silently
    discarded any exception still in flight (including ``KeyboardInterrupt``)
    and could fail inside its own handlers when the date was not a string.

    Args:
        msg: mapping describing one mail; its ``'date'`` entry is expected to
            hold an RFC 2822 style date string.

    Returns:
        Seconds since the epoch (UTC), or ``None`` when the date is missing
        or cannot be parsed.
    """
    date_str = msg.get('date')
    if not date_str:
        # Nothing to parse: report it the same way as an unparseable date.
        print("Format Date TypeError")
        print(" > " + repr(date_str))
        return None

    try:
        # parsedate_tz() returns None for text it cannot parse, which makes
        # mktime_tz() raise TypeError -- handled below.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        print("Format Date TypeError")
    except ValueError:
        print("Format Date ValueError")
    except (AttributeError, IndexError, OverflowError) as ex:
        # Best-effort: odd or out-of-range input is reported, not fatal.
        print(ex)
        return None

    # str() keeps the diagnostic from raising if the value is not a string.
    print(" > " + str(date_str))
    return None
def index_follow_up(msg):
    """Flatten the replies nested under *msg* into a single list.

    Replies are visited recursively, depth-first: each reply is listed
    before the replies nested under it.

    Args:
        msg: mapping describing one mail; its optional ``'follow-up'`` entry
            holds the direct replies, each of the same shape.

    Returns:
        A list of ``(timestamp, reply)`` tuples, where ``timestamp`` is the
        result of ``parse_date_msg(reply)`` and is ``None`` when the reply's
        date cannot be parsed.
    """
    indexed = []
    # ``or ()`` covers a missing key as well as an explicit None/empty value,
    # which would otherwise fail when iterated.
    for reply in msg.get('follow-up') or ():
        indexed.append((parse_date_msg(reply), reply))
        indexed.extend(index_follow_up(reply))
    return indexed
nn = 0
def emit_mail_xml(msg, xmlel): def emit_mail_xml(msg, xmlel):
global nn
nn += 1
mail = et.SubElement(xmlel, 'mail') mail = et.SubElement(xmlel, 'mail')
subject = et.SubElement(mail, 'subject') subject = et.SubElement(mail, 'subject')
subject.text = format_subject(msg['subject']) subject.text = format_subject(msg['subject'])
to = et.SubElement(mail, 'to') to = et.SubElement(mail, 'to')
to.text = msg['to'] if 'to' in msg:
to.text = msg['to']
else:
to.text = 'n/a'
from_ = et.SubElement(mail, 'from') from_ = et.SubElement(mail, 'from')
from_.text = msg['from'] from_.text = msg['from']
@ -28,13 +72,14 @@ def emit_mail_xml(msg, xmlel):
date.text = msg['date'] date.text = msg['date']
content = et.SubElement(mail, 'content') content = et.SubElement(mail, 'content')
## unescape chars ...
content.text = format_content(msg['content']) content.text = format_content(msg['content'])
if msg['follow-up']: # recursive "follow-up"
print('follow-up') if 'follow-up' in msg:
followups = index_follow_up(msg)
# **** RECURSIVE "follow-up" **** followups.sort(key=lambda tup: tup[0])
for d, f in followups:
emit_mail_xml(f, xmlel)
def export_file(f, fout): def export_file(f, fout):
@ -60,7 +105,7 @@ def export_year(d, dt, fout):
dates = [] dates = []
for f in dir_files: for f in dir_files:
fdt = parse_date(os.path.basename(f)) fdt = parse_date_file(os.path.basename(f))
if dt.year != fdt.year: if dt.year != fdt.year:
continue continue
@ -70,7 +115,6 @@ def export_year(d, dt, fout):
for d, f in dates: for d, f in dates:
print(d)
print(f) print(f)
section = et.SubElement(chapter, 'section') section = et.SubElement(chapter, 'section')
@ -84,7 +128,9 @@ def export_year(d, dt, fout):
for t in dj['threads']: for t in dj['threads']:
emit_mail_xml(t, mails) emit_mail_xml(t, mails)
fout.write(et.tostring(chapter)) # write utf8 to file (et.tostring are bytes)
fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
# fout.write(remove_control_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
if __name__ == "__main__": if __name__ == "__main__":
@ -106,8 +152,9 @@ if __name__ == "__main__":
if not os.path.isdir(f): if not os.path.isdir(f):
sys.exit(f + ' is not a valid directory. Aborting.') sys.exit(f + ' is not a valid directory. Aborting.')
foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml" foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml"
with open(foutname, "wb") as fout: with open(foutname, "w") as fout:
export_year(f, dt, fout) export_year(f, dt, fout)
print("nbr of message exported: " + str(nn))
else: else:
if not os.path.isfile(f): if not os.path.isfile(f):
sys.exit(f + ' is not a valid file. Aborting.') sys.exit(f + ' is not a valid file. Aborting.')

2
setenv Executable file
View File

@ -0,0 +1,2 @@
# Activate the `annalen` conda environment (the name declared in conda_env.yml).
# NOTE(review): presumably meant to be sourced (e.g. `. ./setenv`) rather than
# executed, since activation normally affects only the invoking shell -- confirm.
source activate annalen