List_server_busy/export/exportxml.py

370 lines
8.3 KiB
Python
Raw Normal View History

2019-12-27 15:30:07 +01:00
import datetime
import glob
import hashlib
import json
import logging
import os
import re
import xml.etree.ElementTree as et
from xml.sax.saxutils import unescape, escape

import config
import export.utils
from export import emailreply
2019-12-26 18:12:49 +01:00
nn = 0
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
xml_dump = os.path.join(config.export['path'], config.export['xml'])
2019-12-31 09:53:46 +01:00
'''
utils
'''
2019-12-26 18:12:49 +01:00
def export_generate_path(tag):
    """Return a timestamped export path for *tag* under config.export['path'].

    Bug fixed: the original computed ``now`` but returned the literal
    placeholder ``"[now]"`` in the filename without ever substituting it.
    The timestamp format matches the one used by export_selection_tag.
    """
    now = datetime.datetime.now()
    stamp = now.strftime("%d-%m-%y_%H:%M:%S")
    return os.path.join(config.export['path'], tag + "_" + stamp + ".xml")
2020-01-02 22:49:07 +01:00
def sort_sel_dump(tag, sel_dump=sel_dump, sel_out=None):
    """Sort the selected messages of *tag* chronologically and write them back.

    Bug fixed: the ``sel_out`` parameter was accepted but ignored -- the
    sorted dump was always written over ``sel_dump``. It is now honoured
    (``None`` keeps the original overwrite-in-place behaviour).
    """
    with open(sel_dump) as fin:
        d = json.load(fin)
    # sort by parsed message date (epoch seconds)
    d[tag]['lists'] = sorted(d[tag]['lists'],
                             key=lambda m: export.utils.parse_date_msg(m))
    if sel_out is None:
        sel_out = sel_dump
    with open(sel_out, 'w') as fout:
        json.dump(d, fout, indent=4, ensure_ascii=False)
2020-01-12 12:16:10 +01:00
def recursive_sort_by_date(msg):
    """Chronologically sort every 'follow-up' list in the thread rooted at *msg*."""
    if 'follow-up' not in msg:
        return
    msg['follow-up'] = sorted(msg['follow-up'],
                              key=lambda child: export.utils.parse_date_msg(child))
    for child in msg['follow-up']:
        recursive_sort_by_date(child)
def recursive_get_follow_up(msg):
    """Flatten the 'follow-up' tree under *msg* into a single list.

    For each direct reply, its own descendants are collected first; the
    direct replies themselves come last (original traversal order kept).
    *msg* itself is not included in the result.
    """
    collected = []
    children = msg.get('follow-up', [])
    for child in children:
        collected.extend(recursive_get_follow_up(child))
    collected.extend(children)
    return collected
2020-01-02 22:49:07 +01:00
def fix_missing_content(xml_in, xml_out):
    """Fill empty <content> elements of *xml_in* from the selection dump.

    Messages are matched by their <date> text against the (recursive)
    selection entries of the chapter named in <title>; the patched tree is
    written to *xml_out*.

    Bug fixed: the early exit tested ``len(content_less) < 0``, which is
    never true (len() is non-negative); it now returns when there is
    nothing to fix.
    """
    tree = et.parse(xml_in)
    root = tree.getroot()
    tag = root.find('title').text
    content_less = root.findall('.//*[content=""]/.')
    if len(content_less) == 0:
        return
    with open(sel_dump) as fin:
        d = json.load(fin)
    for m in content_less:
        date_str = m.find('date').text
        print(date_str)
        # later matches deliberately overwrite earlier ones (original behaviour)
        for msg in d[tag]["lists"]:
            ml = find_msg_by_date_recursive(msg, date_str)
            if ml is not None:
                m.find('content').text = ml['content']
    with open(xml_out, "w") as fout:
        # et.tostring returns bytes; decode before writing text
        fout.write(et.tostring(root).decode('utf-8', 'ignore'))
def find_msg_by_date_recursive(msg, date_str):
    """Depth-first search the thread rooted at *msg* for a message whose
    'date' field equals *date_str*; return it, or None if absent."""
    if msg['date'] == date_str:
        return msg
    for child in msg.get('follow-up', ()):
        found = find_msg_by_date_recursive(child, date_str)
        if found is not None:
            return found
    return None
2019-12-31 09:53:46 +01:00
'''
xml export
'''
def hash(m):
    """Return a sha256 hex digest identifying message *m* by sender,
    subject and date (used for duplicate detection during export).

    NOTE(review): shadows the builtin ``hash``; name kept for callers.
    """
    key = "{0}{1}{2}".format(m['from'], m['subject'], m['date'])
    return hashlib.sha256(key.encode("utf-8")).hexdigest()
2019-12-28 15:58:48 +01:00
def make_xml_element(el, val):
    """Return '<el>...</el>' with *val* XML-escaped (&, <, >)."""
    return "<{0}>{1}</{0}>".format(el, escape(val))
2019-12-31 09:53:46 +01:00
'''
This is pretty patched up...........................................................
'''
def to_listserv(li, msg):
    """Return the posting address of mailing list *li* for message *msg*.

    This is pretty patched up: most lists map to a fixed address; nettime_l
    changed servers over the years, so its address is picked from the
    message date. Unknown lists log a warning and yield 'n/a'.
    """
    fixed = {
        'crumb': '<new-media-curating@jiscmail.ac.uk>',  # patch
        'spectre': 'spectre@mikrolisten.de',
        'empyre': '<empyre@lists.cofa.unsw.edu.au>',
        'nettime_bold': 'nettime-bold@nettime.org',
        'oldboys': 'oldboys@lists.ccc.de',
    }
    if li in fixed:
        return fixed[li]
    if li == 'nettime_l':
        # nettime-l@desk.nl           -- until June 8 1999
        # nettime-l-temp@material.net -- until July 15 1999
        # nettime-l@bbs.thing.net     -- until July 19 2007
        # nettime-l@kein.org          -- afterwards (and when date unknown)
        dtz = export.utils.parse_date_msg(msg)
        if dtz is not None:
            d = datetime.datetime.fromtimestamp(dtz).date()
            if d < datetime.date(1999, 6, 8):
                return 'nettime-l@desk.nl'
            if d < datetime.date(1999, 7, 15):
                return 'nettime-l-temp@material.net'
            if d < datetime.date(2007, 7, 19):
                return 'nettime-l@bbs.thing.net'
        return 'nettime-l@kein.org'
    #### SYNDICATE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! aarrgghhh
    logging.warning("no listserv to...")
    return 'n/a'
2019-12-28 15:58:48 +01:00
2020-01-12 12:16:10 +01:00
def emit_mail_xml(msg, li, thread_nbr, msg_nbr):
    """Render one message dict as a <mail>...</mail> XML fragment string.

    Duplicates -- same sha256 of from+subject+date already seen during the
    current export run -- yield '' and are skipped. Relies on two module
    globals: ``nn`` (running message counter) and ``hashes`` (duplicate
    list, (re)initialised by export_single_tag before each chapter).

    Cleanup: removed scrape artifacts and the large commented-out
    reply-stripping / recursive follow-up experiments; behaviour unchanged.
    """
    global nn, hashes

    nn += 1

    h = hash(msg)  # patch: dedupe key
    if h in hashes:
        return ''
    hashes.append(h)

    print(msg['date'] + " - " + msg['subject'])

    nbr = make_xml_element("nbr", str(thread_nbr) + "." + str(msg_nbr)) + "\n"
    subject = make_xml_element("subject", msg['subject']) + "\n"
    to = make_xml_element("to", to_listserv(li, msg)) + "\n"  # patch
    from_ = make_xml_element("from", msg['author_name']) + "\n"
    date = make_xml_element("date", msg['date']) + "\n"

    # TODO: filter reply / unescape XML.
    # NOTE(review): e.reply is computed but deliberately unused -- the raw
    # content is kept (reply stripping was abandoned); confirm before removing.
    e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
    e.read()
    content_stripped = msg['content']

    content = make_xml_element("content", content_stripped) + "\n"

    return "<mail>\n" + nbr + subject + from_ + to + date + content + "</mail>\n"
2019-12-26 18:12:49 +01:00
def export_single_tag(t, sel, fout):
    """Write one <chapter> -- all messages selected under tag *t* -- to *fout*.

    *sel* is the parsed selection dump; *fout* must be opened in binary mode
    (the chapter string is encoded as UTF-8). Resets the module-global
    ``hashes`` duplicate list consumed by emit_mail_xml.
    Returns True on success, False when *t* is not a key of *sel*.

    Fixes: dead initial ``chapter = "<chapter>\\n"`` assignment removed
    (it was unconditionally overwritten); membership test simplified from
    ``t not in list(sel.keys())``; the sort lambda no longer shadows the
    loop variable ``m``.
    """
    global hashes

    if t not in sel:
        logging.error("Tag: " + t + " does not exists.")
        return False

    logging.info("Exporting tag:" + t)

    ch = sel[t]

    chapter_title = make_xml_element("title", t) + "\n"
    chapter_desc = make_xml_element("desc", ch['desc']) + "\n"
    chapter_mails = "<mails>\n"

    hashes = []
    thread_nbr = 0
    for m in ch['lists']:
        # NOTE(review): the opening message uses the pre-increment
        # thread_nbr while its follow-ups use the incremented one, so a
        # thread is numbered N.0 and its replies (N+1).0, (N+1).1, ...
        # -- kept as-is; confirm this numbering is intended.
        chapter_mails += emit_mail_xml(m, m['list'], thread_nbr, 0)
        thread_nbr += 1

        msg_nbr = 0
        # flatten the recursive "follow-up" tree and emit replies in date order
        if 'follow-up' in m:
            print('follow-up')
            all_follow = recursive_get_follow_up(m)
            print(str(len(all_follow)))
            all_follow = sorted(all_follow,
                                key=lambda fm: export.utils.parse_date_msg(fm))
            for f in all_follow:
                chapter_mails += emit_mail_xml(f, m['list'], thread_nbr, msg_nbr)
                msg_nbr += 1

    chapter_mails += "</mails>\n"

    chapter = "<chapter>\n" + chapter_title + chapter_desc + chapter_mails + "</chapter>"
    fout.write(chapter.encode('utf-8'))
    return True
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
    """Export every tag of the selection dump into a single XML file.

    Bug fixed: the output file was hard-coded to the module-level
    ``xml_dump``, silently ignoring the ``xml_out`` parameter; it now
    writes to ``xml_out`` (default unchanged, so existing callers behave
    the same). Returns True on success, False on the first failing tag.
    """
    with open(sel_dump) as fin:
        d = json.load(fin)
    with open(xml_out, 'wb') as fout:
        for k in d.keys():
            if not export_single_tag(k, d, fout):
                logging.error("Error exporting: " + k)
                return False
    return True
def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
    """Export a single *tag* of the selection dump to an XML file.

    A ``[now]`` placeholder in ``xml_out`` is replaced with the current
    timestamp. Bug fixed: ``str.replace`` returns a new string and the
    original discarded the result, so the placeholder was never
    substituted; the result is now rebound to ``xml_out``.
    Returns True on success, False on failure.
    """
    with open(sel_dump) as fin:
        d = json.load(fin)

    now = datetime.datetime.now()
    xml_out = xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))

    with open(xml_out, 'wb') as fout:
        if not export_single_tag(tag, d, fout):
            logging.error("Error exporting: " + tag)
            return False
    return True
#------------------------------------------------------------
# The following functions parse the archive files directly
#------------------------------------------------------------
def export_file(f, fout):
    """Parse one archive JSON file *f* and write its threads as XML to *fout*.

    NOTE(review): ``emit_mail_xml(t, all_mail)`` does not match the current
    ``emit_mail_xml(msg, li, thread_nbr, msg_nbr)`` signature, and the
    string it returns is never attached to the tree -- this looks like
    stale code from an earlier ElementTree-based emitter; confirm before use.
    """
    with open(f) as fp:
        d = json.load(fp)
    all_mail = et.Element('all')
    for t in d['threads']:
        emit_mail_xml(t, all_mail)
    # et.tostring returns bytes; decode before writing text
    fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))
def parse_date_file(fname):
    """Parse an archive filename like 'July_1999.json' into a datetime
    (day defaults to the 1st of the month)."""
    fmt = '%B_%Y.json'
    return datetime.datetime.strptime(fname, fmt)
2019-12-26 18:12:49 +01:00
def export_year(d, dt, fout):
    """Write a <chapter> covering year ``dt.year`` from the per-month JSON
    files found in directory *d*, as XML, to *fout*.

    Fixes: ``glob`` was used without being imported anywhere in the file
    (NameError at runtime) -- now covered by the file-level import; the
    month loop variable shadowed the directory parameter ``d`` and is
    renamed.

    NOTE(review): ``emit_mail_xml(t, mails)`` does not match the current
    ``emit_mail_xml(msg, li, thread_nbr, msg_nbr)`` signature -- stale code
    from the earlier ElementTree-based emitter; confirm before use.
    """
    dir_files = glob.glob(os.path.join(d, "*.json"))
    chapter = et.Element('chapter')
    year = et.SubElement(chapter, 'year')
    year.text = dt.strftime('%Y')
    # collect (month-date, path) pairs for the requested year, then
    # sort by month before writing to XML
    dates = []
    for f in dir_files:
        fdt = parse_date_file(os.path.basename(f))
        if dt.year != fdt.year:
            continue
        dates.append((fdt, f))
    dates.sort(key=lambda tup: tup[0])
    for month_dt, f in dates:
        logging.debug(f)
        section = et.SubElement(chapter, 'section')
        month = et.SubElement(section, 'month')
        month.text = month_dt.strftime('%B')
        with open(f) as fp:
            dj = json.load(fp)
        mails = et.SubElement(section, 'mails')
        for t in dj['threads']:
            emit_mail_xml(t, mails)
    # write utf8 to file (et.tostring returns bytes); strip characters
    # that are invalid in XML before writing
    fout.write(export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore')))