# 374 lines, 8.4 KiB, Python

import datetime
import glob
import hashlib
import json
import logging
import os
import re
import xml.etree.ElementTree as et
from xml.sax.saxutils import unescape, escape

import config
import export.utils
from export import emailreply
nn = 0
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
xml_dump = os.path.join(config.export['path'], config.export['xml'])
'''
utils
'''
def export_generate_path(tag):
now = datetime.datetime.now()
return os.path.join(config.export['path'], tag + "_[now].xml")
def sort_sel_dump(tag, sel_dump=sel_dump, sel_out=None):
with open(sel_dump) as fin:
d = json.load(fin)
sorted_list = sorted(d[tag]['lists'], key=lambda m: export.utils.parse_date_msg(m))
d[tag]['lists'] = sorted_list
with open(sel_dump, 'w') as fout:
json.dump(d, fout, indent=4, ensure_ascii=False)
def recursive_sort_by_date(msg):
if 'follow-up' in msg:
msg['follow-up'] = sorted(msg['follow-up'], key=lambda m: export.utils.parse_date_msg(m))
for m in msg['follow-up']:
recursive_sort_by_date(m)
def recursive_get_follow_up(msg):
f = []
if 'follow-up' in msg:
for m in msg['follow-up']:
f += recursive_get_follow_up(m)
f += msg['follow-up']
return f
def fix_missing_content(xml_in, xml_out):
tree = et.parse(xml_in)
root = tree.getroot()
tag = root.find('title').text
content_less = root.findall('.//*[content=""]/.')
if len(content_less) < 0:
return
with open(sel_dump) as fin:
d = json.load(fin)
for m in content_less:
date_str = m.find('date').text
print(date_str)
for msg in d[tag]["lists"]:
ml = find_msg_by_date_recursive(msg, date_str)
if ml is not None:
m.find('content').text = ml['content']
# tree.write(xml_out)
with open(xml_out, "w") as fout:
fout.write(et.tostring(root).decode('utf-8', 'ignore'))
def find_msg_by_date_recursive(msg, date_str):
if msg['date'] == date_str:
return msg
if 'follow-up' in msg:
for m in msg['follow-up']:
r = find_msg_by_date_recursive(m, date_str)
if r is not None:
return r
return None
'''
xml export
'''
def hash(m):
return hashlib.sha256((m['from'] + m['subject'] + m['date']).encode("utf-8")).hexdigest()
def make_xml_element(el, val):
return "<" + el + ">" + escape(val) + "</" + el + ">"
'''
This is pretty patched up...........................................................
'''
def to_listserv(li, msg):
if li == 'crumb': # patch
return '<new-media-curating@jiscmail.ac.uk>'
elif li == 'spectre':
return 'spectre@mikrolisten.de'
elif li == 'empyre':
return '<empyre@lists.cofa.unsw.edu.au>'
elif li == 'nettime_bold':
return 'nettime-bold@nettime.org'
elif li == 'nettime_l':
# nettime-l@desk.nl -- June 8 1999
# mettime-l-temp@material.net -- July 15 1999
# nettime-l@bbs.thing.net> -- July 19 2007
# nettime-l@kein.org
dtz = export.utils.parse_date_msg(msg)
if dtz is not None:
d = datetime.datetime.fromtimestamp(dtz).date()
if d < datetime.date(1999, 6, 8):
return 'nettime-l@desk.nl'
elif d < datetime.date(1999, 7, 15):
return 'nettime-l-temp@material.net'
elif d < datetime.date(2007, 7, 19):
return 'nettime-l@bbs.thing.net'
return 'nettime-l@kein.org'
elif li == 'oldboys':
return 'oldboys@lists.ccc.de'
#### SYNDICATE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! aarrgghhh
else:
logging.warning("no listserv to...")
return 'n/a'
def emit_mail_xml(msg, li, thread_nbr, msg_nbr):
global nn, hashes
nn += 1
h = hash(msg) # patch
if h in hashes:
logging.warning("Duplicate: " + msg['from'] + " - " + msg['subject'] + " - " + msg['date'] + ". Skipping...")
return ''
else:
hashes.append(h)
# print(msg['date'] + " - " + msg['subject'])
nbr = make_xml_element("nbr", str(thread_nbr) + "." + str(msg_nbr)) + "\n"
subject = make_xml_element("subject", msg['subject']) + "\n"
to = make_xml_element("to", to_listserv(li, msg)) + "\n" # patch
from_ = make_xml_element("from", msg['author_name']) + "\n"
date = make_xml_element("date", msg['date']) + "\n"
'''
todo:
- filter reply
- unescape XML
'''
e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
e.read()
# content_stripped = re.sub(r'\n[ ]{2,}', '\n\n', e.reply) # patch
# # content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
# content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', content_stripped)
# # content_stripped = re.sub(r'\n[ ]{2,}\w', '\n\n', content_stripped)
# content_stripped = e.reply
content_stripped = msg['content']
content = make_xml_element("content", content_stripped) + "\n"
mail = "<mail>\n" + nbr + subject + from_ + to + date + content + "</mail>\n"
# content = et.SubElement(mail, 'content')
# content.text = e.reply
# # recursuve "follow-up"
# if 'follow-up' in msg:
# all_follow = recursive_get_follow_up(msg)
# print(str(len(all_follow)))
# all_follow = sorted(all_follow, key=lambda m: export.utils.parse_date_msg(m))
# for f in all_follow:
# mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)
# recursive_sort_by_date(msg)
# for f in msg['follow-up']:
# mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)
# followups = export.utils.index_follow_up(msg)
# followups.sort(key=lambda tup: tup[0]) # sort by date...?
# for d, f in followups:
# msg_nbr += 1
# mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)
return mail
def export_single_tag(t, sel, fout):
global hashes
if t not in list(sel.keys()):
logging.error("Tag: " + t + " does not exists.")
return False
logging.info("Exporting tag:" + t)
ch = sel[t]
chapter = "<chapter>\n"
chapter_title = make_xml_element("title", t) + "\n"
chapter_desc = make_xml_element("desc", ch['desc']) + "\n"
chapter_mails = "<mails>\n"
hashes = []
thread_nbr = 0
for m in ch['lists']:
mail = emit_mail_xml(m, m['list'], thread_nbr, 0)
if mail == '':
continue
chapter_mails += mail
thread_nbr += 1
msg_nbr = 0
# recursuve "follow-up"
if 'follow-up' in m:
print('follow-up')
all_follow = recursive_get_follow_up(m)
print(str(len(all_follow)))
all_follow = sorted(all_follow, key=lambda m: export.utils.parse_date_msg(m))
for f in all_follow:
mail = emit_mail_xml(f, m['list'], thread_nbr, msg_nbr)
if mail != '':
chapter_mails += mail
msg_nbr += 1
chapter_mails += "</mails>\n"
chapter = "<chapter>\n" + chapter_title + chapter_desc + chapter_mails + "</chapter>"
fout.write(chapter.encode('utf-8'))
return True
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
with open(sel_dump) as fin:
d = json.load(fin)
with open(xml_dump, 'wb') as fout:
for k in d.keys():
if not export_single_tag(k, d, fout):
logging.error("Error exporting: " + k)
return False
return True
def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
with open(sel_dump) as fin:
d = json.load(fin)
now = datetime.datetime.now()
xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
with open(xml_out, 'wb') as fout:
if not export_single_tag(tag, d, fout):
logging.error("Error exporting: " + tag)
return False
return True
#------------------------------------------------------------
# The following functions parse the archive files directly
#------------------------------------------------------------
def export_file(f, fout):
with open(f) as fp:
d = json.load(fp)
all_mail = et.Element('all')
for t in d['threads']:
emit_mail_xml(t, all_mail)
fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))
def parse_date_file(fname):
return datetime.datetime.strptime(fname, '%B_%Y.json')
def export_year(d, dt, fout):
dir_files = [f for f in glob.glob(os.path.join(d, "*.json"))]
chapter = et.Element('chapter')
year = et.SubElement(chapter, 'year')
year.text = dt.strftime('%Y')
# SORT MONTHS BEFORE WRITING TO XML
dates = []
for f in dir_files:
fdt = parse_date_file(os.path.basename(f))
if dt.year != fdt.year:
continue
dates.append((fdt, f))
dates.sort(key=lambda tup: tup[0])
for d, f in dates:
logging.debug(f)
section = et.SubElement(chapter, 'section')
month = et.SubElement(section, 'month')
month.text = d.strftime('%B')
with open(f) as fp:
dj = json.load(fp)
mails = et.SubElement(section, 'mails')
for t in dj['threads']:
emit_mail_xml(t, mails)
# write utf8 to file (et.tostring are bytes)
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
fout.write(export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('utf-8', 'ignore')))