2019-12-27 15:30:07 +01:00
|
|
|
import json, os, logging
|
2019-12-26 18:12:49 +01:00
|
|
|
import xml.etree.ElementTree as et
|
2019-12-28 15:58:48 +01:00
|
|
|
from xml.sax.saxutils import unescape, escape
|
2019-12-26 18:12:49 +01:00
|
|
|
import export.utils
|
|
|
|
|
import config
|
|
|
|
|
from datetime import datetime
|
2019-12-28 15:58:48 +01:00
|
|
|
from export import emailreply
|
|
|
|
|
import re
|
2019-12-26 18:12:49 +01:00
|
|
|
|
|
|
|
|
# Running count of messages serialized by emit_mail_xml (incremented once per call).
nn = 0
|
|
|
|
|
|
|
|
|
|
# Default path of the JSON selection dump read by the export_selection_* functions.
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
|
|
|
|
|
# Default path of the XML output file for the export_selection_* functions.
xml_dump = os.path.join(config.export['path'], config.export['xml'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def export_generate_path(tag):
    """Return the export path template for *tag*.

    The file name is ``<tag>_[now].xml`` inside ``config.export['path']``.
    ``[now]`` is returned as a literal placeholder; no timestamp is
    substituted here (``export_selection_tag`` looks for that token in its
    output path).

    Args:
        tag: String used as the file-name prefix.

    Returns:
        The joined path as a string.
    """
    return os.path.join(config.export['path'], tag + "_[now].xml")
|
|
|
|
|
|
2019-12-28 15:58:48 +01:00
|
|
|
def make_xml_element(el, val):
    """Return ``<el>val</el>`` with *val* XML-escaped.

    Args:
        el: Element name. It is inserted verbatim, so it must already be a
            valid XML name.
        val: Element text. ``None`` produces an empty element and any other
            non-string value is converted with ``str()``, so values such as
            JSON nulls or numbers do not crash the export.

    Returns:
        The serialized element as a string, without a trailing newline.
    """
    text = "" if val is None else str(val)
    # escape() replaces "&", "<" and ">" with their XML entities.
    return "<" + el + ">" + escape(text) + "</" + el + ">"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def emit_mail_xml(msg, li):
    """Serialize *msg* and all of its follow-ups as ``<mail>`` XML fragments.

    Each message becomes one ``<mail>`` element with ``<subject>``,
    ``<from>``, ``<to>``, ``<date>`` and ``<content>`` children. Follow-ups
    are emitted recursively and appended *after* the parent's closing tag
    (as siblings, not nested), ordered by the first item of each tuple
    returned by ``export.utils.index_follow_up``.

    Side effect: increments the module-level counter ``nn`` once per
    emitted message.

    Args:
        msg: Mapping with the keys ``subject``, ``from``, ``date`` and
            ``content``; ``to`` and ``follow-up`` are optional.
        li: Label written to the log for every emitted message
            (``export_single_tag`` passes ``m['list']``).

    Returns:
        The concatenated XML fragments as a string.

    Raises:
        KeyError: If a required key is missing from *msg*.
    """
    global nn
    nn += 1

    # Lazy %-formatting: a non-string label no longer raises TypeError, and
    # the message is only rendered when the record is actually emitted.
    logging.info("export xml: %s", li)

    subject = make_xml_element("subject", msg['subject']) + "\n"
    # A missing recipient is exported as the literal 'n/a'.
    to = make_xml_element("to", msg.get('to', 'n/a')) + "\n"
    from_ = make_xml_element("from", msg['from']) + "\n"
    date = make_xml_element("date", msg['date']) + "\n"

    # TODO: filter reply, unescape XML
    # NOTE(review): read() presumably parses the message so that .reply is
    # populated -- confirm against export.emailreply.
    e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
    e.read()

    # Join hard-wrapped lines: a lone newline becomes a space, while runs of
    # two or more newlines are left untouched.
    content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
    content = make_xml_element("content", content_stripped) + "\n"

    parts = ["<mail>\n", subject, from_, to, date, content, "</mail>\n"]

    # Recursive "follow-up" handling.
    if 'follow-up' in msg:
        followups = export.utils.index_follow_up(msg)
        followups.sort(key=lambda tup: tup[0])
        parts.extend(emit_mail_xml(followup, li) for _, followup in followups)

    return "".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def emit_mail_xml(msg, li, xmlel):
|
|
|
|
|
|
|
|
|
|
# global nn
|
|
|
|
|
# nn += 1
|
|
|
|
|
|
|
|
|
|
# logging.info("export xml: " + li)
|
|
|
|
|
|
|
|
|
|
# mail = et.SubElement(xmlel, 'mail')
|
|
|
|
|
|
|
|
|
|
# subject = et.SubElement(mail, 'subject')
|
|
|
|
|
# subject.text = export.utils.format_subject(msg['subject'])
|
|
|
|
|
|
|
|
|
|
# to = et.SubElement(mail, 'to')
|
|
|
|
|
# if 'to' in msg:
|
|
|
|
|
# to.text = msg['to']
|
|
|
|
|
# else:
|
|
|
|
|
# to.text = 'n/a'
|
|
|
|
|
|
|
|
|
|
# from_ = et.SubElement(mail, 'from')
|
|
|
|
|
# from_.text = msg['from']
|
|
|
|
|
|
|
|
|
|
# date = et.SubElement(mail, 'date')
|
|
|
|
|
# date.text = msg['date']
|
|
|
|
|
|
|
|
|
|
# '''
|
|
|
|
|
# todo:
|
|
|
|
|
# - filter reply
|
|
|
|
|
# - unescape XML
|
|
|
|
|
# '''
|
|
|
|
|
# e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
|
|
|
|
|
# e.read()
|
|
|
|
|
|
|
|
|
|
# escape_table = {
|
|
|
|
|
# "&": "&",
|
|
|
|
|
# ">": ">",
|
|
|
|
|
# "<": "<"
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# content_str = "<content>" + escape(e.reply, escape_table) + "</content>"
|
|
|
|
|
|
|
|
|
|
# print(content_str)
|
|
|
|
|
|
|
|
|
|
# content = et.fromstring(content_str)
|
|
|
|
|
# mail.append(content)
|
|
|
|
|
|
|
|
|
|
# # content = et.SubElement(mail, 'content')
|
|
|
|
|
# # content.text = e.reply
|
|
|
|
|
|
|
|
|
|
# # recursive "follow-up"
|
|
|
|
|
# if 'follow-up' in msg:
|
|
|
|
|
# followups = export.utils.index_follow_up(msg)
|
|
|
|
|
# followups.sort(key=lambda tup: tup[0])
|
|
|
|
|
# for d, f in followups:
|
|
|
|
|
# emit_mail_xml(f, li, xmlel)
|
2019-12-26 18:12:49 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
|
# The following functions parse the selection files
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def export_single_tag(t, sel, fout):
    """Write the selection stored under tag *t* to *fout* as a ``<chapter>``.

    The chapter contains ``<title>`` (the tag), ``<desc>`` and a ``<mails>``
    element holding every message of the selection, serialized by
    ``emit_mail_xml``.

    Args:
        t: Tag name to export (a string); also used as the chapter title.
        sel: Mapping of tag name to a selection entry. The entry provides a
            ``desc`` value and a ``lists`` sequence of message mappings,
            each of which has a ``list`` key.
        fout: Binary file-like object; receives the UTF-8 encoded chapter.

    Returns:
        ``True`` on success; ``False`` (after logging an error) when *t* is
        not a key of *sel*, in which case nothing is written.
    """
    if t not in sel:
        logging.error("Tag: %s does not exists.", t)
        return False

    ch = sel[t]

    # Title and description are part of the chapter; they were computed
    # before but never made it into the written output.
    pieces = [
        "<chapter>\n",
        make_xml_element("title", t) + "\n",
        make_xml_element("desc", ch['desc']) + "\n",
        "<mails>\n",
    ]
    pieces.extend(emit_mail_xml(m, m['list']) for m in ch['lists'])
    pieces.append("</mails>\n")
    pieces.append("</chapter>")

    fout.write("".join(pieces).encode('utf-8'))
    return True
|
|
|
|
|
|
2019-12-28 15:58:48 +01:00
|
|
|
|
|
|
|
|
# def export_single_tag(t, sel, fout):
|
|
|
|
|
|
|
|
|
|
# if t not in list(sel.keys()):
|
|
|
|
|
# logging.error("Tag: " + t + " does not exists.")
|
|
|
|
|
# return False
|
|
|
|
|
|
|
|
|
|
# ch = sel[t]
|
|
|
|
|
|
|
|
|
|
# chapter = et.Element('chapter')
|
|
|
|
|
# chapter_title = et.SubElement(chapter, 'title')
|
|
|
|
|
# chapter_title.text = t
|
|
|
|
|
|
|
|
|
|
# chapter_desc = et.SubElement(chapter, 'desc')
|
|
|
|
|
# chapter_desc.text = ch['desc']
|
|
|
|
|
|
|
|
|
|
# chapter_mails = et.SubElement(chapter, 'mails')
|
|
|
|
|
|
|
|
|
|
# for m in ch['lists']:
|
|
|
|
|
# emit_mail_xml(m, m['list'], chapter_mails)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # root = et.ElementTree(chapter)
|
|
|
|
|
# # root.write(fout, encoding="utf-8", xml_declaration=True)
|
|
|
|
|
|
|
|
|
|
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
|
|
|
|
|
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
|
|
|
|
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
|
|
|
|
|
|
|
|
|
|
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
|
|
|
|
|
|
|
|
|
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
|
|
|
|
|
|
|
|
|
# return True
|
|
|
|
|
|
2019-12-26 18:12:49 +01:00
|
|
|
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
    """Export every tag of the selection dump into one XML file.

    Args:
        sel_dump: Path of the JSON selection file (a mapping of tag name to
            selection entry).
        xml_out: Path of the output file. It is created or truncated and
            written in binary mode.

    Returns:
        ``True`` when every tag was exported, ``False`` as soon as one
        export fails (the error is logged).
    """
    with open(sel_dump) as fin:
        d = json.load(fin)

    # Honor the xml_out argument rather than always writing to the
    # module-level default path.
    # NOTE(review): one <chapter> per tag is written back to back with no
    # enclosing root element -- confirm that consumers expect this.
    with open(xml_out, 'wb') as fout:
        for k in d:
            if not export_single_tag(k, d, fout):
                logging.error("Error exporting: " + k)
                return False

    return True
|
|
|
|
|
|
|
|
|
|
def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
    """Export the selection of a single tag to an XML file.

    Args:
        tag: Tag name to export.
        sel_dump: Path of the JSON selection file (a mapping of tag name to
            selection entry).
        xml_out: Path of the output file. Every ``[now]`` in it is replaced
            by the current local time formatted as ``%d-%m-%y_%H:%M:%S``.
            The file is created or truncated and written in binary mode.

    Returns:
        ``True`` on success, ``False`` (after logging an error) when the
        export of *tag* fails.
    """
    with open(sel_dump) as fin:
        d = json.load(fin)

    # str.replace returns a new string, so the result must be rebound for
    # the timestamp to take effect.
    stamp = datetime.now().strftime("%d-%m-%y_%H:%M:%S")
    xml_out = xml_out.replace("[now]", stamp)

    with open(xml_out, 'wb') as fout:
        if not export_single_tag(tag, d, fout):
            logging.error("Error exporting: " + tag)
            return False

    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
|
# The following functions parse the archive files directly
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def export_file(f, fout):
    """Export every thread of one JSON archive file as an ``<all>`` element.

    Args:
        f: Path of a JSON archive file whose top-level object has a
            ``threads`` sequence of message mappings.
        fout: Text-mode file-like object; receives the XML as a string.
    """
    with open(f) as fp:
        d = json.load(fp)

    # emit_mail_xml returns the serialized mails and takes a log label as
    # its second argument; the archive path is used as that label.
    label = str(f)
    pieces = ["<all>\n"]
    pieces.extend(emit_mail_xml(t, label) for t in d['threads'])
    pieces.append("</all>")

    fout.write("".join(pieces))
|
|
|
|
|
|
|
|
|
|
def parse_date_file(fname):
    """Parse an archive file name of the form ``<Month>_<Year>.json``.

    Args:
        fname: Bare file name, e.g. ``"January_2019.json"``.

    Returns:
        A ``datetime`` built from the month name and year in *fname*.

    Raises:
        ValueError: If *fname* does not match the expected format.
    """
    name_format = '%B_%Y.json'
    parsed = datetime.strptime(fname, name_format)
    return parsed
|
|
|
|
|
|
|
|
|
|
def export_year(d, dt, fout):
    """Export all monthly archive files of one year as a ``<chapter>``.

    The chapter holds a ``<year>`` element followed by one ``<section>`` per
    month (in chronological order), each with a ``<month>`` name and a
    ``<mails>`` element containing the serialized threads of that month.

    Args:
        d: Directory containing archive files named ``<Month>_<Year>.json``.
        dt: ``datetime`` whose ``year`` selects the files to export.
        fout: Text-mode file-like object; receives the XML as a string.

    Raises:
        ValueError: If a ``*.json`` file in *d* is not named
            ``<Month>_<Year>.json``.
    """
    # Imported locally: the module's import block does not provide glob.
    import glob

    # Keep only this year's files and sort the months before writing.
    dates = []
    for f in glob.glob(os.path.join(d, "*.json")):
        fdt = parse_date_file(os.path.basename(f))
        if fdt.year == dt.year:
            dates.append((fdt, f))
    dates.sort(key=lambda tup: tup[0])

    pieces = [
        "<chapter>\n",
        make_xml_element("year", dt.strftime('%Y')) + "\n",
    ]

    for month_dt, path in dates:
        logging.debug(path)

        with open(path) as fp:
            dj = json.load(fp)

        pieces.append("<section>\n")
        pieces.append(make_xml_element("month", month_dt.strftime('%B')) + "\n")
        pieces.append("<mails>\n")
        # emit_mail_xml returns the serialized mails and takes a log label
        # as its second argument; the archive path is used as that label.
        pieces.extend(emit_mail_xml(t, str(path)) for t in dj['threads'])
        pieces.append("</mails>\n")
        pieces.append("</section>\n")

    pieces.append("</chapter>")

    fout.write(export.utils.remove_invalid_xml_characters("".join(pieces)))
|