"""Export mailing-list archives stored as JSON files to XML.

Each JSON file holds a ``threads`` list. Every thread is a message dict with
``subject``, ``from``, ``date`` and ``content`` keys, an optional ``to`` key
and an optional ``follow-up`` list of reply messages of the same shape.
"""
import argparse
import email.utils
import glob
import json
import logging
import os
import sys
import xml.etree.ElementTree as et
from datetime import datetime

try:
    import regex as re
except ImportError:
    # Every pattern compiled below is also valid for the stdlib engine.
    import re

# Number of <mail> elements emitted since the last reset; the command-line
# entry point prints it after each exported year and sets it back to 0.
nn = 0

logging.basicConfig(level=logging.CRITICAL)

# Matches every character outside the ranges this exporter keeps in XML output.
xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]')
# Three consecutive newlines (collapsed into a single one).
nl_re = re.compile('\n\n\n')
# Long dash ruler found in some messages ("indesign automatic overset").
ind_re = re.compile('--------------------------------------------------------------------------')


def format_subject(s):
    """Return subject *s* with every run of whitespace collapsed to one space."""
    return ' '.join(s.split())


def format_content(c):
    """Return message body *c* cleaned up for export.

    The text is stripped, each run of three newlines becomes a single
    newline, and dash rulers matching ``ind_re`` are removed.
    """
    c = c.strip()
    # new lines
    c = nl_re.sub('\n', c)
    # weird stuff
    # 1. indesign automatic overset... (? dunno why ?)
    #    ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
    c = ind_re.sub('', c)
    return c


# NOTE(review): the original note here read "See for Nevejan?s research" --
# presumably an example of a message containing characters that need removal.
def remove_invalid_xml_characters(s):
    """Return *s* without the characters matched by ``xml_re``."""
    return xml_re.sub('', s)


def parse_date_file(fname):
    """Parse an archive file name such as ``November_1997.json``.

    Returns a ``datetime`` for the first day of that month; raises
    ``ValueError`` when *fname* does not follow the ``%B_%Y.json`` format.
    """
    return datetime.strptime(fname, '%B_%Y.json')


def parse_date_msg(msg):
    """Return the UTC timestamp of ``msg['date']``, or None if unparseable.

    Raises ``KeyError`` when the message has no ``date`` key.
    """
    date_str = msg['date']
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        if date_tz is None:
            logging.warning("Format Date TypeError")
            logging.warning(" > %s", date_str)
            return None
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except (TypeError, ValueError, OverflowError) as ex:
        logging.warning("Format Date %s", type(ex).__name__)
        logging.warning(" > %s", date_str)
        return None
    except Exception as ex:
        # Best effort: a malformed date must never abort the export.
        print(ex)
        return None


def index_follow_up(msg):
    """Return a flat list of ``(timestamp, message)`` for all replies to *msg*.

    The ``follow-up`` lists are walked recursively, so replies to replies are
    included. A reply whose own date cannot be parsed takes the timestamp of
    its direct parent, which may itself be None.
    """
    r = []
    for m in msg.get('follow-up', ()):
        d = parse_date_msg(m)
        if d is None:
            d = parse_date_msg(msg)  # same as parent
        r.append((d, m))
        r += index_follow_up(m)
    return r


def _follow_up_sort_key(entry):
    """Order ``(timestamp, message)`` pairs by time, undated entries last."""
    stamp = entry[0]
    return (stamp is None, 0 if stamp is None else stamp)


def _append_mail(msg, xmlel):
    """Append one <mail> element describing *msg* to *xmlel* and count it."""
    global nn
    nn += 1

    mail = et.SubElement(xmlel, 'mail')
    et.SubElement(mail, 'subject').text = format_subject(msg['subject'])
    et.SubElement(mail, 'to').text = msg.get('to', 'n/a')
    et.SubElement(mail, 'from').text = msg['from']
    et.SubElement(mail, 'date').text = msg['date']
    et.SubElement(mail, 'content').text = format_content(msg['content'])


def emit_mail_xml(msg, xmlel):
    """Append *msg* and all of its follow-ups to *xmlel* as <mail> elements.

    The thread is flattened: the message comes first, then every reply at any
    nesting depth, sorted by timestamp. Each message is written exactly once;
    ``index_follow_up`` already returns all descendants, so the replies are
    not recursed into again.
    """
    _append_mail(msg, xmlel)

    followups = index_follow_up(msg)
    followups.sort(key=_follow_up_sort_key)
    for _, follow_up in followups:
        _append_mail(follow_up, xmlel)


def export_file(f, fout):
    """Write every thread of JSON archive *f* to *fout* inside an <all> element.

    *fout* is a writable text stream.
    """
    with open(f, encoding='utf-8') as fp:
        d = json.load(fp)

    all_mail = et.Element('all')
    for t in d['threads']:
        emit_mail_xml(t, all_mail)

    # et.tostring returns bytes; decode before writing to the text stream.
    xml_text = et.tostring(all_mail).decode('utf-8', 'ignore')
    fout.write(remove_invalid_xml_characters(xml_text))


def export_year(d, dt, fout):
    """Write all monthly archives of year ``dt.year`` found in directory *d*.

    The output is one <chapter> holding a <year> element and one <section>
    per month, in chronological order. ``*.json`` files whose name is not of
    the ``Month_Year.json`` form are skipped.
    """
    chapter = et.Element('chapter')
    year = et.SubElement(chapter, 'year')
    year.text = dt.strftime('%Y')

    # SORT MONTHS BEFORE WRITING TO XML
    dates = []
    for path in glob.glob(os.path.join(d, "*.json")):
        try:
            fdt = parse_date_file(os.path.basename(path))
        except ValueError:
            logging.warning("%s is not a monthly archive, skipped.", path)
            continue
        if fdt.year == dt.year:
            dates.append((fdt, path))
    dates.sort(key=lambda tup: tup[0])

    for month_dt, path in dates:
        logging.debug(path)

        section = et.SubElement(chapter, 'section')
        month = et.SubElement(section, 'month')
        month.text = month_dt.strftime('%B')

        with open(path, encoding='utf-8') as fp:
            dj = json.load(fp)

        mails = et.SubElement(section, 'mails')
        for t in dj['threads']:
            emit_mail_xml(t, mails)

    # write text to file (et.tostring returns bytes)
    xml_text = et.tostring(chapter).decode('utf-8', 'ignore')
    fout.write(remove_invalid_xml_characters(xml_text))


def _parse_years(args, parser):
    """Return the list of years (as datetimes) requested on the command line.

    ``--interval`` takes precedence over ``--year`` when both are given.
    Malformed values end the program through ``parser.error``.
    """
    years = []
    try:
        if args.year:
            years.append(datetime.strptime(args.year, '%Y'))
        if args.interval:
            bounds = args.interval.split('-')
            first, last = int(bounds[0]), int(bounds[1])
            years = [datetime.strptime(str(y), '%Y')
                     for y in range(first, last + 1)]
    except (ValueError, IndexError):
        parser.error('--year expects YYYY and --interval expects YYYY-YYYY')
    return years


def main():
    """Command-line entry point.

    With ``--year`` or ``--interval`` each positional argument must be a
    directory of monthly archives; one ``<dir>_<year>.xml`` file is written
    per year. Without them each argument must be a single JSON archive, which
    is exported to ``<name>.xml``.
    """
    global nn

    p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
    p.add_argument('file', metavar="f", help="mailinglist file(s) or dir(s) to export", nargs="+")
    p.add_argument('--year', '-y', metavar='y', type=str, help='year of archive')
    p.add_argument('--interval', '-i', metavar='i', type=str, help='years interval')
    p.add_argument('--output', '-o', metavar='o', type=str, help='output directory')
    args = p.parse_args()

    years = _parse_years(args, p)

    # Output goes to the current directory unless --output names an existing one.
    out_dir = args.output if args.output and os.path.isdir(args.output) else None
    if args.output and out_dir is None:
        logging.warning(args.output + ' is not a valid directory.')

    for f in args.file:
        if years:
            if not os.path.isdir(f):
                logging.warning(f + ' is not a valid directory.')
                continue
            # normpath drops a trailing separator, which would otherwise
            # leave basename() empty.
            base = os.path.basename(os.path.normpath(f))
            for y in years:
                yn = y.strftime('%Y')
                foutname = base + "_" + yn + ".xml"
                if out_dir:
                    foutname = os.path.join(out_dir, foutname)
                with open(foutname, "w", encoding='utf-8') as fout:
                    sys.stdout.write("Processing - " + yn)
                    export_year(f, y, fout)
                    print(" - nbr of message exported: " + str(nn))
                    nn = 0
        else:
            if not os.path.isfile(f):
                sys.exit(f + ' is not a valid file. Aborting.')
            foutname = os.path.splitext(os.path.basename(f))[0] + ".xml"
            if out_dir:
                foutname = os.path.join(out_dir, foutname)
            with open(foutname, "w", encoding='utf-8') as fout:
                export_file(f, fout)


if __name__ == "__main__":
    main()