List_server_busy/export/exportxml.py
2019-12-31 09:53:46 +01:00

237 lines
5.9 KiB
Python

import json, os, logging
import xml.etree.ElementTree as et
from xml.sax.saxutils import unescape, escape
import export.utils
import config
from export import emailreply
import re, hashlib, datetime
# Running count of emit_mail_xml invocations (incremented on every call).
nn = 0
# Hashes of the messages already emitted. emit_mail_xml and export_single_tag
# both declare this name ``global``; it is initialised here so that it exists
# even if emit_mail_xml is called before export_single_tag has reset it.
hashes = []
# Default input path (JSON selection dump) and default output path (XML),
# both assembled from the project configuration.
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
xml_dump = os.path.join(config.export['path'], config.export['xml'])
'''
utils
'''
def export_generate_path(tag):
    """Return the export path template for *tag*.

    The result is ``<tag>_[now].xml`` inside ``config.export['path']``.
    ``[now]`` is a literal placeholder, not a timestamp: export_selection_tag
    searches its output path for that marker to substitute the current time.
    """
    # The original computed datetime.now() here and never used it; dropped.
    return os.path.join(config.export['path'], tag + "_[now].xml")
'''
xml export
'''
def hash(m):
    """Return the SHA-256 hex digest used to recognise message *m*.

    The digest covers the message's ``from``, ``subject`` and ``date``
    fields, concatenated in that order and encoded as UTF-8.

    NOTE: within this module the name shadows the builtin ``hash``.
    """
    fingerprint = m['from'] + m['subject'] + m['date']
    digest = hashlib.sha256()
    digest.update(fingerprint.encode("utf-8"))
    return digest.hexdigest()
def make_xml_element(el, val):
    """Return the string ``<el>val</el>`` with *val* XML-escaped.

    Only the text content goes through ``xml.sax.saxutils.escape``;
    the tag name *el* is inserted verbatim.
    """
    opening = "<" + el + ">"
    closing = "</" + el + ">"
    return "".join([opening, escape(val), closing])
'''
This is pretty patched up...........................................................
'''
def to_listserv(li, msg):
    """Return the list address to put in the ``to`` field for list *li*.

    Most lists map to a single fixed address. ``nettime_l`` changed address
    over the years, so its address is chosen from the message date obtained
    via ``export.utils.parse_date_msg(msg)``; when no date is available the
    most recent address is returned. For any other list name a warning is
    logged and ``'n/a'`` is returned.
    """
    fixed_addresses = (
        ('crumb', '<new-media-curating@jiscmail.ac.uk>'),  # patch
        ('spectre', 'spectre@mikrolisten.de'),
        ('empyre', '<empyre@lists.cofa.unsw.edu.au>'),
        ('nettime_bold', 'nettime-bold@nettime.org'),
    )
    for name, address in fixed_addresses:
        if li == name:
            return address

    if not li == 'nettime_l':
        logging.warning("no listserv to...")
        return 'n/a'

    # nettime-l@desk.nl -- June 8 1999
    # mettime-l-temp@material.net -- July 15 1999
    # nettime-l@bbs.thing.net> -- July 19 2007
    # nettime-l@kein.org
    # NOTE(review): 'mettime-l-temp' may be a typo for 'nettime-l-temp';
    # the string is emitted verbatim, so confirm before changing it.
    timestamp = export.utils.parse_date_msg(msg)
    if timestamp is not None:
        sent_on = datetime.datetime.fromtimestamp(timestamp).date()
        # (first day the address was no longer used, address)
        history = (
            (datetime.date(1999, 6, 8), 'nettime-l@desk.nl'),
            (datetime.date(1999, 7, 15), 'mettime-l-temp@material.net'),
            (datetime.date(2007, 7, 19), 'nettime-l@bbs.thing.net'),
        )
        for cutoff, address in history:
            if sent_on < cutoff:
                return address
    return 'nettime-l@kein.org'
def emit_mail_xml(msg, li, thread_nbr, msg_nbr):
    """Serialise *msg* and, recursively, its follow-ups as ``<mail>`` XML text.

    Parameters:
        msg        -- message dict; reads 'from', 'subject', 'date',
                      'author_name', 'content' and, optionally, 'follow-up'.
        li         -- list name, passed to to_listserv for the ``<to>`` field.
        thread_nbr -- thread number; first half of the ``<nbr>`` value.
        msg_nbr    -- message number; second half of the ``<nbr>`` value.

    Returns the XML string. If a message with the same hash was already
    emitted, logs a warning and returns ``''`` without visiting follow-ups.

    Side effects: increments the module global ``nn`` and appends the
    message hash to the module global ``hashes``.
    """
    global nn, hashes

    nn += 1

    digest = hash(msg)  # patch
    if digest in hashes:
        logging.warning(
            "Duplicate: " + msg['from'] + " - " + msg['subject'] + " - " + msg['date'] + ". Skipping..."
        )
        return ''
    hashes.append(digest)

    number_xml = make_xml_element("nbr", ".".join((str(thread_nbr), str(msg_nbr)))) + "\n"
    subject_xml = make_xml_element("subject", msg['subject']) + "\n"
    to_xml = make_xml_element("to", to_listserv(li, msg)) + "\n"  # patch
    from_xml = make_xml_element("from", msg['author_name']) + "\n"
    date_xml = make_xml_element("date", msg['date']) + "\n"

    # todo:
    # - filter reply
    # - unescape XML
    parsed = emailreply.EmailMessage(export.utils.format_content(msg['content']))
    parsed.read()
    # Only the ``reply`` attribute of the parsed message is exported
    # (presumably the body without quoted material -- confirm in emailreply).
    content_xml = make_xml_element("content", parsed.reply) + "\n"

    pieces = ["<mail>\n", number_xml, subject_xml, from_xml, to_xml, date_xml, content_xml, "</mail>\n"]

    # Recurse into follow-ups, ordered by the first item of each tuple.
    if 'follow-up' in msg:
        followups = export.utils.index_follow_up(msg)
        followups.sort(key=lambda tup: tup[0])
        # NOTE(review): each follow-up is numbered msg_nbr + its 1-based
        # position; a nested call numbers its own follow-ups from its own
        # msg_nbr, so numbers can repeat within a thread if follow-ups nest
        # -- confirm whether that is intended.
        for position, (_, followup) in enumerate(followups, start=1):
            pieces.append(emit_mail_xml(followup, li, thread_nbr, msg_nbr + position))

    return "".join(pieces)
def export_single_tag(t, sel, fout):
    """Write the ``<chapter>`` for tag *t* of the selection *sel* to *fout*.

    Parameters:
        t    -- tag name; must be a key of *sel*.
        sel  -- selection mapping; ``sel[t]`` provides 'desc' and 'lists'.
        fout -- writable object accepting bytes (the chapter is UTF-8 encoded).

    Returns False (after logging an error) when *t* is not in *sel*,
    True once the chapter has been written.

    Side effect: resets the module global ``hashes``, so duplicate
    detection in emit_mail_xml starts afresh for each chapter.
    """
    global hashes

    if t not in sel:
        logging.error("Tag: " + t + " does not exists.")
        return False

    logging.info("Exporting tag:" + t)

    entry = sel[t]
    title_xml = make_xml_element("title", t) + "\n"
    desc_xml = make_xml_element("desc", entry['desc']) + "\n"

    hashes = []
    mails = ["<mails>\n"]
    # Each item of 'lists' is emitted as its own thread, numbered from 0.
    for thread_nbr, thread in enumerate(entry['lists']):
        mails.append(emit_mail_xml(thread, thread['list'], thread_nbr, 0))
    mails.append("</mails>\n")

    chapter = "<chapter>\n" + title_xml + desc_xml + "".join(mails) + "</chapter>"
    fout.write(chapter.encode('utf-8'))
    return True
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
    """Export every tag of the JSON selection dump *sel_dump* to *xml_out*.

    Parameters:
        sel_dump -- path of the JSON selection file (defaults to the
                    module-level ``sel_dump``).
        xml_out  -- path of the XML file to write (defaults to the
                    module-level ``xml_dump``).

    Returns True when every tag was exported; returns False, after logging
    an error, as soon as export_single_tag fails for a tag.

    The chapters are written one after another, exactly as
    export_single_tag produces them.
    """
    with open(sel_dump) as fin:
        selection = json.load(fin)

    # Fix: honour the xml_out argument. The original opened the module-level
    # xml_dump here, so a caller-supplied output path was silently ignored.
    with open(xml_out, 'wb') as fout:
        for tag in selection:
            if not export_single_tag(tag, selection, fout):
                logging.error("Error exporting: " + tag)
                return False
    return True
def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
    """Export the single tag *tag* of the selection dump to *xml_out*.

    Parameters:
        tag      -- tag name to export.
        sel_dump -- path of the JSON selection file (defaults to the
                    module-level ``sel_dump``).
        xml_out  -- path of the XML file to write (defaults to the
                    module-level ``xml_dump``). Every ``[now]`` marker in it
                    is replaced by the current time as ``%d-%m-%y_%H:%M:%S``.

    Returns True on success; returns False, after logging an error, when
    export_single_tag fails.
    """
    with open(sel_dump) as fin:
        selection = json.load(fin)

    # Fix: str.replace returns a new string, so the result must be rebound.
    # The original discarded it and wrote to a file literally named "[now]".
    # NOTE(review): the timestamp contains ':' characters, which some file
    # systems reject in file names -- confirm the target platform.
    stamp = datetime.datetime.now().strftime("%d-%m-%y_%H:%M:%S")
    xml_out = xml_out.replace("[now]", stamp)

    with open(xml_out, 'wb') as fout:
        if not export_single_tag(tag, selection, fout):
            logging.error("Error exporting: " + tag)
            return False
    return True
#------------------------------------------------------------
# The following functions parse the archive files directly
#------------------------------------------------------------
def export_file(f, fout):
    """Export the threads of the JSON archive file *f* to *fout*.

    Loads *f*, creates an ``<all>`` element, calls emit_mail_xml once per
    entry of ``'threads'`` and writes the serialised element, decoded as
    UTF-8 (undecodable bytes ignored), to *fout*.
    """
    with open(f) as fp:
        archive = json.load(fp)

    root = et.Element('all')
    # NOTE(review): emit_mail_xml in this module takes
    # (msg, li, thread_nbr, msg_nbr) and returns a string; this two-argument
    # call does not match that signature. Confirm whether this function is
    # still in use before relying on it.
    for thread in archive['threads']:
        emit_mail_xml(thread, root)

    serialised = et.tostring(root)
    fout.write(serialised.decode('utf-8', 'ignore'))
def parse_date_file(fname):
    """Parse an archive file name such as ``June_1999.json`` into a datetime.

    The name must match ``<full month name>_<4-digit year>.json``;
    ``datetime.strptime`` raises ValueError otherwise.
    """
    name_format = '%B_%Y.json'
    return datetime.datetime.strptime(fname, name_format)
def export_year(d, dt, fout):
    """Export one year of monthly JSON archive files from directory *d*.

    Parameters:
        d    -- directory containing ``<Month>_<Year>.json`` archive files.
        dt   -- datetime whose ``year`` selects the files to export.
        fout -- writable text object receiving the resulting XML.

    Builds a ``<chapter>`` element holding a ``<year>`` and one ``<section>``
    per matching file in chronological order, each with a ``<month>`` and a
    ``<mails>`` child. The serialised chapter is passed through
    ``export.utils.remove_invalid_xml_characters`` and written to *fout*.

    Raises ValueError (from parse_date_file) if a ``*.json`` file name in
    *d* does not follow the expected pattern.
    """
    # Fix: ``glob`` is used below but is not among the module's imports, so
    # the original raised NameError. Imported locally to stay self-contained.
    import glob

    chapter = et.Element('chapter')
    year = et.SubElement(chapter, 'year')
    year.text = dt.strftime('%Y')

    # SORT MONTHS BEFORE WRITING TO XML
    dated_files = []
    for path in glob.glob(os.path.join(d, "*.json")):
        file_dt = parse_date_file(os.path.basename(path))
        if file_dt.year != dt.year:
            continue
        dated_files.append((file_dt, path))
    dated_files.sort(key=lambda tup: tup[0])

    # The loop variables no longer reuse the name of the parameter ``d``.
    for file_dt, path in dated_files:
        logging.debug(path)
        section = et.SubElement(chapter, 'section')
        month = et.SubElement(section, 'month')
        month.text = file_dt.strftime('%B')
        with open(path) as fp:
            archive = json.load(fp)
        mails = et.SubElement(section, 'mails')
        # NOTE(review): emit_mail_xml in this module takes
        # (msg, li, thread_nbr, msg_nbr) and returns a string; this
        # two-argument call does not match that signature. Confirm the
        # intended call before relying on this function.
        for thread in archive['threads']:
            emit_mail_xml(thread, mails)

    # write utf8 to file (et.tostring are bytes)
    text = et.tostring(chapter).decode('utf-8', 'ignore')
    fout.write(export.utils.remove_invalid_xml_characters(text))