env files
This commit is contained in:
parent
6766c86c87
commit
f2b98963d4
23
conda_env.yml
Normal file
23
conda_env.yml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
name: annalen
|
||||||
|
channels:
|
||||||
|
- defaults
|
||||||
|
dependencies:
|
||||||
|
- ca-certificates=2019.10.16=0
|
||||||
|
- certifi=2019.9.11=py37_0
|
||||||
|
- libcxx=4.0.1=hcfea43d_1
|
||||||
|
- libcxxabi=4.0.1=hcfea43d_1
|
||||||
|
- libedit=3.1.20181209=hb402a30_0
|
||||||
|
- libffi=3.2.1=h475c297_4
|
||||||
|
- ncurses=6.1=h0a44026_1
|
||||||
|
- openssl=1.1.1d=h1de35cc_3
|
||||||
|
- pip=19.3.1=py37_0
|
||||||
|
- python=3.7.5=h359304d_0
|
||||||
|
- readline=7.0=h1de35cc_5
|
||||||
|
- regex=2019.11.1=py37h1de35cc_0
|
||||||
|
- setuptools=42.0.1=py37_0
|
||||||
|
- sqlite=3.30.1=ha441bb4_0
|
||||||
|
- tk=8.6.8=ha441bb4_0
|
||||||
|
- wheel=0.33.6=py37_0
|
||||||
|
- xz=5.2.4=h1de35cc_4
|
||||||
|
- zlib=1.2.11=h1de35cc_3
|
||||||
|
|
||||||
@ -1,6 +1,7 @@
|
|||||||
import argparse, os, glob, sys, json
|
import argparse, os, glob, sys, json, email.utils
|
||||||
import xml.etree.ElementTree as et
|
import xml.etree.ElementTree as et
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import regex as re
|
||||||
|
|
||||||
def format_subject(s):
    """Return the subject *s* with every run of whitespace collapsed.

    Splitting on whitespace also discards leading and trailing blanks, so
    the result is the words of *s* separated by single spaces.
    """
    words = s.split()
    return ' '.join(words)
|
||||||
@ -8,18 +9,61 @@ def format_subject(s):
|
|||||||
def format_content(c):
    """Return the message body *c* trimmed, with triple newlines squeezed.

    Surrounding whitespace is stripped first; then each occurrence of
    three consecutive newlines is replaced by a single newline.
    """
    trimmed = c.strip()
    return trimmed.replace("\n\n\n", "\n")
|
||||||
|
|
||||||
def parse_date(fname):
|
# See for Nevejan's research
|
||||||
|
def remove_control_characters(s):
    """Return *s* with all Unicode "Other" (category C*) characters removed.

    Carriage return, line feed and tab are kept even though they are
    control characters, because they carry the text layout.

    The former regular expression did not express "category C minus
    CR/LF/TAB": it matched a category-C character followed by a literal
    hyphen and then CR/LF/TAB. Ordinary control characters were therefore
    never removed, while a sequence such as NUL, '-', newline was deleted
    whole. The filter below implements the intended set difference.

    To strip only true control characters (category Cc) instead of the
    whole C group, compare the category against 'Cc' rather than testing
    its first letter.
    """
    # Local import so the block does not depend on the file's import list.
    import unicodedata

    preserved = '\r\n\t'
    return ''.join(
        ch for ch in s
        if ch in preserved or not unicodedata.category(ch).startswith('C')
    )
|
||||||
|
|
||||||
|
def parse_date_file(fname):
    """Turn an archive file name into a datetime.

    *fname* must look like '<full month name>_<four-digit year>.json'
    (strptime directives %B and %Y); strptime raises ValueError for any
    name that does not match that layout.
    """
    name_layout = '%B_%Y.json'
    return datetime.strptime(fname, name_layout)
|
||||||
|
|
||||||
|
# returns utc timestamp
|
||||||
|
def parse_date_msg(msg):
    """Return the UTC timestamp of a message's 'date' header, or None.

    msg: mapping with a 'date' entry holding an RFC 2822 style date
        string (a missing key raises KeyError, as before).

    Returns the value of email.utils.mktime_tz() for the parsed date, or
    None when the date cannot be parsed or converted. Failures are
    reported on stdout and never raised: parsing is best-effort.

    Unlike the earlier version there is no 'return' inside a 'finally'
    clause. That construct discarded every in-flight exception,
    including KeyboardInterrupt/SystemExit and errors raised inside the
    handlers themselves.
    """
    date_str = msg['date']
    try:
        # parsedate_tz() returns None for unparseable input, which makes
        # mktime_tz() raise TypeError -- handled below.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        print("Format Date TypeError")
        # str() keeps the report working when the header value is None.
        print(" > " + str(date_str))
    except ValueError:
        print("Format Date ValueError")
        print(" > " + str(date_str))
    except Exception as ex:
        # Deliberate catch-all: one malformed header must not abort an
        # export run.
        print(ex)
    return None
|
||||||
|
|
||||||
|
# recursive
|
||||||
|
def index_follow_up(msg):
    """Flatten the 'follow-up' tree below *msg* into a list of pairs.

    Each entry is (parse_date_msg(reply), reply). A reply is listed
    before its own follow-ups, which are gathered recursively. A message
    without a 'follow-up' key yields an empty list.
    """
    if 'follow-up' not in msg:
        return []

    flattened = []
    for reply in msg['follow-up']:
        flattened.append((parse_date_msg(reply), reply))
        # Depth-first: the nested replies follow their parent directly.
        flattened.extend(index_follow_up(reply))
    return flattened
|
||||||
|
|
||||||
|
nn = 0
|
||||||
|
|
||||||
def emit_mail_xml(msg, xmlel):
|
def emit_mail_xml(msg, xmlel):
|
||||||
|
|
||||||
|
global nn
|
||||||
|
nn += 1
|
||||||
|
|
||||||
mail = et.SubElement(xmlel, 'mail')
|
mail = et.SubElement(xmlel, 'mail')
|
||||||
|
|
||||||
subject = et.SubElement(mail, 'subject')
|
subject = et.SubElement(mail, 'subject')
|
||||||
subject.text = format_subject(msg['subject'])
|
subject.text = format_subject(msg['subject'])
|
||||||
|
|
||||||
to = et.SubElement(mail, 'to')
|
to = et.SubElement(mail, 'to')
|
||||||
|
if 'to' in msg:
|
||||||
to.text = msg['to']
|
to.text = msg['to']
|
||||||
|
else:
|
||||||
|
to.text = 'n/a'
|
||||||
|
|
||||||
from_ = et.SubElement(mail, 'from')
|
from_ = et.SubElement(mail, 'from')
|
||||||
from_.text = msg['from']
|
from_.text = msg['from']
|
||||||
@ -28,13 +72,14 @@ def emit_mail_xml(msg, xmlel):
|
|||||||
date.text = msg['date']
|
date.text = msg['date']
|
||||||
|
|
||||||
content = et.SubElement(mail, 'content')
|
content = et.SubElement(mail, 'content')
|
||||||
## unescape chars ...
|
|
||||||
content.text = format_content(msg['content'])
|
content.text = format_content(msg['content'])
|
||||||
|
|
||||||
if msg['follow-up']:
|
# recursive "follow-up"
|
||||||
print('follow-up')
|
if 'follow-up' in msg:
|
||||||
|
followups = index_follow_up(msg)
|
||||||
# **** RECURSIVE "follow-up" ****
|
followups.sort(key=lambda tup: tup[0])
|
||||||
|
for d, f in followups:
|
||||||
|
emit_mail_xml(f, xmlel)
|
||||||
|
|
||||||
|
|
||||||
def export_file(f, fout):
|
def export_file(f, fout):
|
||||||
@ -60,7 +105,7 @@ def export_year(d, dt, fout):
|
|||||||
dates = []
|
dates = []
|
||||||
for f in dir_files:
|
for f in dir_files:
|
||||||
|
|
||||||
fdt = parse_date(os.path.basename(f))
|
fdt = parse_date_file(os.path.basename(f))
|
||||||
if dt.year != fdt.year:
|
if dt.year != fdt.year:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -70,7 +115,6 @@ def export_year(d, dt, fout):
|
|||||||
|
|
||||||
for d, f in dates:
|
for d, f in dates:
|
||||||
|
|
||||||
print(d)
|
|
||||||
print(f)
|
print(f)
|
||||||
|
|
||||||
section = et.SubElement(chapter, 'section')
|
section = et.SubElement(chapter, 'section')
|
||||||
@ -84,7 +128,9 @@ def export_year(d, dt, fout):
|
|||||||
for t in dj['threads']:
|
for t in dj['threads']:
|
||||||
emit_mail_xml(t, mails)
|
emit_mail_xml(t, mails)
|
||||||
|
|
||||||
fout.write(et.tostring(chapter))
|
# write UTF-8 text to file (et.tostring returns bytes)
|
||||||
|
fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
||||||
|
# fout.write(remove_control_characters(et.tostring(chapter).decode('utf-8', 'ignore')))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
@ -106,8 +152,9 @@ if __name__ == "__main__":
|
|||||||
if not os.path.isdir(f):
|
if not os.path.isdir(f):
|
||||||
sys.exit(f + ' is not a valid directory. Aborting.')
|
sys.exit(f + ' is not a valid directory. Aborting.')
|
||||||
foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml"
|
foutname = os.path.basename(f) + "_" + dt.strftime('%Y') + ".xml"
|
||||||
with open(foutname, "wb") as fout:
|
with open(foutname, "w") as fout:
|
||||||
export_year(f, dt, fout)
|
export_year(f, dt, fout)
|
||||||
|
print("nbr of message exported: " + str(nn))
|
||||||
else:
|
else:
|
||||||
if not os.path.isfile(f):
|
if not os.path.isfile(f):
|
||||||
sys.exit(f + ' is not a valid file. Aborting.')
|
sys.exit(f + ' is not a valid file. Aborting.')
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user