export

2019-12-26 18:12:49 +01:00
parent 9c8f5b0e5e
commit 0b7564d44b
10 changed files with 366 additions and 5 deletions
@@ -0,0 +1,24 @@
+import os, logging, glob
+import xmltodict, json
+import config
+
+# Default XML dump location: the file named by config.export['xml'] inside
+# the directory config.export['path'].
+xml_dump = os.path.join(config.export['path'], config.export['xml'])
+
+def list_all(dirname=config.export['path'], extension="xml"):
+	"""List the files directly inside *dirname* whose name ends in ".<extension>".
+
+	Returns the list of matching paths found by glob (possibly empty), or
+	None, after logging an error, when *dirname* is not an existing directory.
+	"""
+	if os.path.isdir(dirname):
+		pattern = os.path.join(dirname, "*." + extension)
+		return glob.glob(pattern)
+
+	logging.error(dirname + " is not a valid directory.")
+	return None
+
+def get(fn, extension="xml"):
+	"""Read the export file *fn* and return its parsed content.
+
+	Only extension "xml" is handled: the file text is handed to
+	xmltodict.parse and the result returned. Returns None, after logging an
+	error, when *fn* is not an existing file; any other extension also
+	yields None.
+	"""
+	if not os.path.isfile(fn):
+		logging.error(fn + " is not a valid file.")
+		return None
+
+	if extension != "xml":
+		return None
+
+	with open(fn) as fp:
+		raw_xml = fp.read()
+	# NOTE: the original author flags the parsed structure as odd (mails/mail/...).
+	# Alternative that was tried: json.loads(json.dumps(xmltodict.parse(raw_xml)))
+	return xmltodict.parse(raw_xml)
@@ -0,0 +1,161 @@
+import glob
+import json, os
+import logging
+import xml.etree.ElementTree as et
+from datetime import datetime
+
+import config
+import export.utils
+
+# Module-wide counter, incremented by emit_mail_xml each time it is called.
+nn = 0
+
+# Default paths built from config: the selection dump that is read and the
+# XML dump that is written.
+sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
+xml_dump = os.path.join(config.export['path'], config.export['xml'])
+
+
+def export_generate_path(tag):
+	"""Return the XML output path template for *tag* inside config.export['path'].
+
+	The file name is "<tag>_[now].xml". The literal "[now]" is a placeholder,
+	not a timestamp: export_selection_tag is the function meant to replace it
+	with the current date and time just before writing.
+	"""
+	return os.path.join(config.export['path'], tag + "_[now].xml")
+
+def _emit_single_mail(msg, xmlel):
+	"""Append one <mail> element describing *msg* itself (no follow-ups) to *xmlel*."""
+
+	global nn
+	nn += 1
+
+	mail = et.SubElement(xmlel, 'mail')
+
+	subject = et.SubElement(mail, 'subject')
+	subject.text = export.utils.format_subject(msg['subject'])
+
+	# 'to' is the only optional field; the others raise KeyError when missing
+	to = et.SubElement(mail, 'to')
+	to.text = msg['to'] if 'to' in msg else 'n/a'
+
+	from_ = et.SubElement(mail, 'from')
+	from_.text = msg['from']
+
+	date = et.SubElement(mail, 'date')
+	date.text = msg['date']
+
+	content = et.SubElement(mail, 'content')
+	content.text = export.utils.format_content(msg['content'])
+
+
+def emit_mail_xml(msg, xmlel):
+	"""Append *msg* and all of its follow-ups to *xmlel* as flat <mail> elements.
+
+	msg   -- dict with keys 'subject', 'from', 'date', 'content', optionally
+	         'to' and 'follow-up' (a list of dicts of the same kind).
+	xmlel -- parent ElementTree element that receives the <mail> children.
+
+	The message comes first, then every follow-up at any depth, ordered by
+	the timestamp computed by export.utils.index_follow_up. The module
+	counter nn is incremented once per emitted mail.
+	"""
+	_emit_single_mail(msg, xmlel)
+
+	if 'follow-up' in msg:
+		# index_follow_up already walks the whole reply tree, so each entry is
+		# emitted without recursing. Recursing here wrote nested replies again
+		# for every ancestor they have.
+		followups = export.utils.index_follow_up(msg)
+		# entries without a usable date (None) sort last instead of raising
+		# TypeError when compared with integer timestamps
+		followups.sort(key=lambda tup: (tup[0] is None, tup[0] or 0))
+		for _, f in followups:
+			_emit_single_mail(f, xmlel)
+
+
+#------------------------------------------------------------
+# The following functions parse the selection files 
+#------------------------------------------------------------
+
+def export_single_tag(t, sel, fout):
+	"""Write the selection stored under tag *t* of *sel* to *fout* as one <chapter>.
+
+	t    -- tag name; becomes the chapter <title>.
+	sel  -- mapping of tag name to an entry with keys 'desc' and 'lists'.
+	fout -- writable text file object.
+
+	Returns False, after logging an error, when *t* is not a key of *sel*;
+	otherwise writes the serialized chapter and returns True.
+	"""
+	if t not in sel:
+		logging.error("Tag: " + t + " does not exists.")
+		return False
+
+	entry = sel[t]
+
+	root = et.Element('chapter')
+	et.SubElement(root, 'title').text = t
+	et.SubElement(root, 'desc').text = entry['desc']
+
+	mails_el = et.SubElement(root, 'mails')
+	for mail in entry['lists']:
+		emit_mail_xml(mail, mails_el)
+
+	# et.tostring returns bytes: decode, then drop characters XML does not allow
+	serialized = et.tostring(root).decode('utf-8', 'ignore')
+	fout.write(export.utils.remove_invalid_xml_characters(serialized))
+
+	return True
+
+def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
+	"""Export every tag of the selection file *sel_dump* into the file *xml_out*.
+
+	sel_dump -- path of the JSON selection file to read.
+	xml_out  -- path of the XML file to (over)write.
+
+	Returns True when all tags were exported. On the first tag that fails,
+	an error is logged and False is returned; the chapters written before the
+	failure stay in the file.
+	"""
+	with open(sel_dump) as fin:
+		d = json.load(fin)
+
+	# Honour the caller's xml_out: the module-level xml_dump used to be opened
+	# here, which made the parameter a no-op.
+	with open(xml_out, 'w') as fout:
+		for k in d:
+			if not export_single_tag(k, d, fout):
+				logging.error("Error exporting: " + k)
+				return False
+	return True
+
+def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
+	"""Export the single selection *tag* from *sel_dump* into the file *xml_out*.
+
+	tag      -- tag name to export.
+	sel_dump -- path of the JSON selection file to read.
+	xml_out  -- output path; a literal "[now]" in it is replaced by the
+	            current local time formatted as "%d-%m-%y_%H:%M:%S".
+
+	Returns True on success, False (after logging an error) when the tag
+	could not be exported.
+	"""
+	with open(sel_dump) as fin:
+		d = json.load(fin)
+
+	now = datetime.now()
+	# str.replace returns a new string, so the result must be assigned back;
+	# it was discarded before and the placeholder ended up in the file name.
+	# NOTE: the timestamp contains ':' which some filesystems reject in names.
+	xml_out = xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
+
+	with open(xml_out, 'w') as fout:
+		if not export_single_tag(tag, d, fout):
+			logging.error("Error exporting: " + tag)
+			return False
+	return True
+
+
+
+#------------------------------------------------------------
+# The following functions parse the archive files directly
+#------------------------------------------------------------
+
+def export_file(f, fout):
+	"""Write every thread of the JSON archive file *f* to *fout* as one <all> element.
+
+	f    -- path of a JSON archive file containing a 'threads' list.
+	fout -- writable text file object.
+	"""
+	with open(f) as fp:
+		d = json.load(fp)
+
+	all_mail = et.Element('all')
+	for t in d['threads']:
+		emit_mail_xml(t, all_mail)
+
+	# et.tostring returns bytes: decode, then strip the characters XML does not
+	# allow, as export_single_tag and export_year do before writing.
+	xml_text = et.tostring(all_mail).decode('utf-8', 'ignore')
+	fout.write(export.utils.remove_invalid_xml_characters(xml_text))
+
+def parse_date_file(fname):
+	"""Turn an archive file name of the form "<Month>_<Year>.json" into a datetime.
+
+	*fname* must match the strptime format '%B_%Y.json' (full month name,
+	underscore, four-digit year); strptime raises ValueError otherwise.
+	"""
+	name_format = '%B_%Y.json'
+	return datetime.strptime(fname, name_format)
+
+def export_year(d, dt, fout):
+	"""Write the archive files of dt's year found in directory *d* to *fout*.
+
+	d    -- directory scanned for "*.json" archive files.
+	dt   -- datetime whose year selects the files to export.
+	fout -- writable text file object.
+
+	The output is one <chapter> holding a <year> element and one <section>
+	per selected file, in date order, each with its <month> and <mails>.
+	Every "*.json" name in *d* goes through parse_date_file.
+	"""
+	candidates = glob.glob(os.path.join(d, "*.json"))
+
+	root = et.Element('chapter')
+	et.SubElement(root, 'year').text = dt.strftime('%Y')
+
+	# keep this year's files only, then order them by date before writing
+	monthly = []
+	for path in candidates:
+		stamp = parse_date_file(os.path.basename(path))
+		if stamp.year == dt.year:
+			monthly.append((stamp, path))
+	monthly = sorted(monthly, key=lambda pair: pair[0])
+
+	for stamp, path in monthly:
+
+		logging.debug(path)
+
+		section = et.SubElement(root, 'section')
+		et.SubElement(section, 'month').text = stamp.strftime('%B')
+
+		with open(path) as fp:
+			archive = json.load(fp)
+
+		mails_el = et.SubElement(section, 'mails')
+		for thread in archive['threads']:
+			emit_mail_xml(thread, mails_el)
+
+	# et.tostring returns bytes: decode to text and strip characters that are
+	# not valid in XML before writing
+	text = et.tostring(root).decode('utf-8', 'ignore')
+	fout.write(export.utils.remove_invalid_xml_characters(text))
@@ -0,0 +1,62 @@
+from datetime import datetime
+import regex as re
+import  email.utils, logging
+
+# Matches any single character outside the listed ranges: tab, LF, CR,
+# U+0020-U+D7FF and U+E000-U+FFFD. Used by remove_invalid_xml_characters.
+# NOTE(review): XML 1.0 also permits U+10000-U+10FFFF, which this class does
+# not list, so those characters match as well -- confirm this is intended.
+xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]')
+# Three consecutive newlines; format_content replaces each match with one.
+nl_re = re.compile('\n\n\n')
+# A long run of dashes; format_content deletes each match.
+ind_re = re.compile('--------------------------------------------------------------------------')
+
+def format_subject(s):
+	"""Collapse every run of whitespace in *s* to one space and trim both ends."""
+	words = s.split()
+	return ' '.join(words)
+
+def format_content(c):
+	"""Clean a message body for export.
+
+	Strips surrounding whitespace, replaces each run of three newlines with a
+	single newline, and deletes the long dash separator lines.
+	"""
+	stripped = c.strip()
+
+	# new lines
+	collapsed = nl_re.sub('\n', stripped)
+
+	## weird stuff
+	# 1. the dash rule triggers an automatic overset in InDesign (reason unknown)
+	# ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
+	return ind_re.sub('', collapsed)
+
+# returns utc timestamp
+def parse_date_msg(msg):
+	"""Return msg['date'] converted to a UTC timestamp, or None when it cannot be parsed.
+
+	msg -- mapping with a 'date' entry holding an RFC 2822 style date string.
+
+	Returns the integer timestamp computed by email.utils.mktime_tz. Parsing
+	problems are logged as warnings and give None; nothing raised while
+	parsing reaches the caller. A missing 'date' key raises KeyError.
+	"""
+	date_str = msg['date']
+	try:
+		# parsedate_tz returns None for an unparsable value; mktime_tz then
+		# raises TypeError, which is reported below
+		date_tz = email.utils.parsedate_tz(date_str)
+		return email.utils.mktime_tz(date_tz) #utc timestamp
+	except TypeError:
+		logging.warning("Format Date TypeError")
+	except ValueError:
+		logging.warning("Format Date ValueError")
+	except Exception as ex:
+		# best effort: any other failure also means "no usable date"
+		logging.warning("Format Date error: %s", ex)
+	# lazy %-formatting so a non-string date value cannot break the handler
+	logging.warning("  > %s", date_str)
+	return None
+
+# recursive
+def index_follow_up(msg):
+	"""Flatten the follow-up tree below *msg* into a list of (timestamp, message) pairs.
+
+	Every entry of msg['follow-up'] is listed, immediately followed by its
+	own follow-ups at any depth. The timestamp comes from parse_date_msg; a
+	reply whose date cannot be parsed gets the timestamp of its direct parent
+	(which may itself be None). Returns [] when *msg* has no 'follow-up' key.
+	"""
+	if 'follow-up' not in msg:
+		return []
+
+	pairs = []
+	for reply in msg['follow-up']:
+		stamp = parse_date_msg(reply)
+		if stamp is None:
+			stamp = parse_date_msg(msg) # same as parent
+		pairs.append((stamp, reply))
+		pairs.extend(index_follow_up(reply))
+	return pairs
+
+# See for Nevejan's research
+def remove_invalid_xml_characters(s):
+	"""Return *s* with every character matched by xml_re deleted."""
+	return xml_re.sub('', s)
+    # return re.sub(r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]', '', s)
+    # return re.sub(r'\p{Cc}-[\r\n\t]+', '', s)