import json
import os
import glob
import logging
import xml.etree.ElementTree as et
from xml.sax.saxutils import unescape, escape

import export.utils
import config
from export import emailreply
import re
import hashlib
import datetime

nn = 0
hashes = []

sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
xml_dump = os.path.join(config.export['path'], config.export['xml'])


'''
utils
'''


def export_generate_path(tag):
    # "[now]" is substituted with a timestamp at export time (see export_selection_tag)
    return os.path.join(config.export['path'], tag + "_[now].xml")


def sort_sel_dump(tag, sel_dump=sel_dump, sel_out=None):
    with open(sel_dump) as fin:
        d = json.load(fin)
    sorted_list = sorted(d[tag]['lists'], key=lambda m: export.utils.parse_date_msg(m))
    d[tag]['lists'] = sorted_list
    with open(sel_dump, 'w') as fout:
        json.dump(d, fout, indent=4, ensure_ascii=False)


def recursive_sort_by_date(msg):
    if 'follow-up' in msg:
        msg['follow-up'] = sorted(msg['follow-up'], key=lambda m: export.utils.parse_date_msg(m))
        for m in msg['follow-up']:
            recursive_sort_by_date(m)


def recursive_get_follow_up(msg):
    f = []
    if 'follow-up' in msg:
        for m in msg['follow-up']:
            f += recursive_get_follow_up(m)
        f += msg['follow-up']
    return f


def fix_missing_content(xml_in, xml_out):
    tree = et.parse(xml_in)
    root = tree.getroot()
    tag = root.find('title').text
    content_less = root.findall('.//*[content=""]/.')
    if len(content_less) == 0:
        return
    with open(sel_dump) as fin:
        d = json.load(fin)
    for m in content_less:
        date_str = m.find('date').text
        print(date_str)
        for msg in d[tag]["lists"]:
            ml = find_msg_by_date_recursive(msg, date_str)
            if ml is not None:
                m.find('content').text = ml['content']
    # tree.write(xml_out)
    with open(xml_out, "w") as fout:
        fout.write(et.tostring(root).decode('utf-8', 'ignore'))


def find_msg_by_date_recursive(msg, date_str):
    if msg['date'] == date_str:
        return msg
    if 'follow-up' in msg:
        for m in msg['follow-up']:
            r = find_msg_by_date_recursive(m, date_str)
            if r is not None:
                return r
    return None


'''
xml export
'''


def hash(m):
    return hashlib.sha256((m['from'] + m['subject'] + m['date']).encode("utf-8")).hexdigest()


def make_xml_element(el, val):
    return "<" + el + ">" + escape(val) + "</" + el + ">"


'''
This is pretty patched up...
'''


def to_listserv(li, msg):
    if li == 'crumb':
        # patch
        return ''
    elif li == 'spectre':
        return 'spectre@mikrolisten.de'
    elif li == 'empyre':
        return ''
    elif li == 'nettime_bold':
        return 'nettime-bold@nettime.org'
    elif li == 'nettime_l':
        # nettime-l@desk.nl -- June 8 1999
        # nettime-l-temp@material.net -- July 15 1999
        # nettime-l@bbs.thing.net -- July 19 2007
        # nettime-l@kein.org
        dtz = export.utils.parse_date_msg(msg)
        if dtz is not None:
            d = datetime.datetime.fromtimestamp(dtz).date()
            if d < datetime.date(1999, 6, 8):
                return 'nettime-l@desk.nl'
            elif d < datetime.date(1999, 7, 15):
                return 'nettime-l-temp@material.net'
            elif d < datetime.date(2007, 7, 19):
                return 'nettime-l@bbs.thing.net'
        return 'nettime-l@kein.org'
    elif li == 'oldboys':
        return 'oldboys@lists.ccc.de'
    # TODO: SYNDICATE list address still missing
    else:
        logging.warning("no listserv address for list: " + str(li))
        return 'n/a'
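
# emit_mail_xml and the export functions below read the selection dump by key
# access. A rough sketch of the message shape they assume, inferred purely from
# the fields used in this module (the real schema is defined wherever the dump
# is written, not here):
#
#   {
#       "<tag>": {
#           "desc": "chapter description",
#           "lists": [
#               {
#                   "list": "nettime_l",
#                   "from": "sender address",
#                   "author_name": "sender name",
#                   "subject": "...",
#                   "date": "...",
#                   "content": "...",
#                   "follow-up": [ ... messages of the same shape, nested ... ]
#               }
#           ]
#       }
#   }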
def emit_mail_xml(msg, li, thread_nbr, msg_nbr):
    global nn, hashes
    nn += 1

    h = hash(msg)
    # patch: skip messages that were already emitted
    if h in hashes:
        logging.warning("Duplicate: " + msg['from'] + " - " + msg['subject'] + " - "
                        + msg['date'] + ". Skipping...")
        return ''
    else:
        hashes.append(h)

    # print(msg['date'] + " - " + msg['subject'])
    nbr = make_xml_element("nbr", str(thread_nbr) + "." + str(msg_nbr)) + "\n"
    subject = make_xml_element("subject", msg['subject']) + "\n"
    to = make_xml_element("to", to_listserv(li, msg)) + "\n"  # patch
    from_ = make_xml_element("from", msg['author_name']) + "\n"
    date = make_xml_element("date", msg['date']) + "\n"

    '''
    todo:
    - filter reply
    - unescape XML
    '''
    e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
    e.read()
    # content_stripped = re.sub(r'\n[ ]{2,}', '\n\n', e.reply)  # patch
    content = make_xml_element("content", e.reply) + "\n"
    mail = "<mail>\n" + nbr + subject + from_ + to + date + content + "</mail>\n"
    # content = et.SubElement(mail, 'content')
    # content.text = e.reply

    # recursive "follow-up"
    # if 'follow-up' in msg:
    #     all_follow = recursive_get_follow_up(msg)
    #     print(str(len(all_follow)))
    #     all_follow = sorted(all_follow, key=lambda m: export.utils.parse_date_msg(m))
    #     for f in all_follow:
    #         mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)

    # recursive_sort_by_date(msg)
    # for f in msg['follow-up']:
    #     mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)

    # followups = export.utils.index_follow_up(msg)
    # followups.sort(key=lambda tup: tup[0])  # sort by date...?
    # for d, f in followups:
    #     msg_nbr += 1
    #     mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)

    return mail


def export_single_tag(t, sel, fout):
    global hashes

    if t not in list(sel.keys()):
        logging.error("Tag: " + t + " does not exist.")
        return False

    logging.info("Exporting tag: " + t)

    ch = sel[t]
    chapter_title = make_xml_element("title", t) + "\n"
    chapter_desc = make_xml_element("desc", ch['desc']) + "\n"
    chapter_mails = "<mails>\n"

    hashes = []
    thread_nbr = 0
    for m in ch['lists']:
        mail = emit_mail_xml(m, m['list'], thread_nbr, 0)
        if mail == '':
            continue
        chapter_mails += mail
        msg_nbr = 1  # the thread root is numbered thread_nbr.0

        # recursive "follow-up"
        if 'follow-up' in m:
            print('follow-up')
            all_follow = recursive_get_follow_up(m)
            print(str(len(all_follow)))
            all_follow = sorted(all_follow, key=lambda f: export.utils.parse_date_msg(f))
            for f in all_follow:
                mail = emit_mail_xml(f, m['list'], thread_nbr, msg_nbr)
                if mail != '':
                    chapter_mails += mail
                    msg_nbr += 1

        thread_nbr += 1

    chapter_mails += "</mails>\n"
    chapter = "<chapter>\n" + chapter_title + chapter_desc + chapter_mails + "</chapter>"
    fout.write(chapter.encode('utf-8'))
    return True
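
# For reference, one exported chapter is expected to come out roughly like the
# sketch below -- illustrative only, the exact tags are whatever
# make_xml_element, emit_mail_xml and export_single_tag assemble above:
#
#   <chapter>
#   <title>tag</title>
#   <desc>chapter description</desc>
#   <mails>
#   <mail>
#   <nbr>0.0</nbr>
#   <subject>...</subject>
#   <from>...</from>
#   <to>nettime-l@kein.org</to>
#   <date>...</date>
#   <content>...</content>
#   </mail>
#   ...
#   </mails>
#   </chapter>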
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
    with open(sel_dump) as fin:
        d = json.load(fin)
    with open(xml_out, 'wb') as fout:
        for k in d.keys():
            if not export_single_tag(k, d, fout):
                logging.error("Error exporting: " + k)
                return False
    return True


def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
    with open(sel_dump) as fin:
        d = json.load(fin)
    now = datetime.datetime.now()
    xml_out = xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
    with open(xml_out, 'wb') as fout:
        if not export_single_tag(tag, d, fout):
            logging.error("Error exporting: " + tag)
            return False
    return True


# ------------------------------------------------------------
# The following functions parse the archive files directly.
# Note: they still call emit_mail_xml(msg, parent_element), i.e. they appear
# to expect an older element-based emitter rather than the string-based
# emit_mail_xml above.
# ------------------------------------------------------------

def export_file(f, fout):
    with open(f) as fp:
        d = json.load(fp)
    all_mail = et.Element('all')
    for t in d['threads']:
        emit_mail_xml(t, all_mail)
    fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))


def parse_date_file(fname):
    return datetime.datetime.strptime(fname, '%B_%Y.json')


def export_year(d, dt, fout):
    dir_files = [f for f in glob.glob(os.path.join(d, "*.json"))]
    chapter = et.Element('chapter')
    year = et.SubElement(chapter, 'year')
    year.text = dt.strftime('%Y')

    # sort months before writing to XML
    dates = []
    for f in dir_files:
        fdt = parse_date_file(os.path.basename(f))
        if dt.year != fdt.year:
            continue
        dates.append((fdt, f))
    dates.sort(key=lambda tup: tup[0])

    for d, f in dates:
        logging.debug(f)
        section = et.SubElement(chapter, 'section')
        month = et.SubElement(section, 'month')
        month.text = d.strftime('%B')
        with open(f) as fp:
            dj = json.load(fp)
        mails = et.SubElement(section, 'mails')
        for t in dj['threads']:
            emit_mail_xml(t, mails)

    # write utf-8 to file (et.tostring returns bytes)
    # fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
    fout.write(export.utils.remove_invalid_xml_characters(
        et.tostring(chapter).decode('utf-8', 'ignore')))
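
# Minimal usage sketch, assuming config points at an existing selection dump
# and export directory. The tag name 'interviews' is purely illustrative and
# not taken from any actual selection dump.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # export every tag in the selection dump into one XML file
    export_selection_all()

    # or export a single tag to its own timestamped file:
    # export_selection_tag('interviews', xml_out=export_generate_path('interviews'))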