export
This commit is contained in:
@@ -0,0 +1,24 @@
|
||||
import os, logging, glob
|
||||
import xmltodict, json
|
||||
import config
|
||||
|
||||
# Path of the XML dump file: config.export['xml'] located inside
# config.export['path']. Not referenced by the functions visible in this module.
xml_dump = os.path.join(config.export['path'], config.export['xml'])
|
||||
|
||||
def list_all(dirname=config.export['path'], extension="xml"):
    """Return the paths inside *dirname* whose name ends in ``.<extension>``.

    Args:
        dirname: directory to scan; defaults to ``config.export['path']``.
        extension: file extension to match, without the leading dot.

    Returns:
        A (possibly empty) list of matching paths, or ``None`` after logging
        an error when *dirname* is not an existing directory.
    """
    if os.path.isdir(dirname):
        pattern = os.path.join(dirname, "*." + extension)
        return list(glob.glob(pattern))

    logging.error(dirname + " is not a valid directory.")
    return None
|
||||
|
||||
def get(fn, extension="xml"):
    """Load the export file *fn* and return its parsed content.

    Only ``extension == "xml"`` is handled: the whole file is read and passed
    to ``xmltodict.parse``.

    Args:
        fn: path of the file to load.
        extension: kind of file to parse; anything but ``"xml"`` is ignored.

    Returns:
        The object produced by ``xmltodict.parse`` for an XML file, or
        ``None`` when *fn* is not an existing file (an error is logged) or
        when *extension* is not ``"xml"``.
    """
    if os.path.isfile(fn):
        if extension != "xml":
            return None

        with open(fn) as handle:
            raw_xml = handle.read()

        # NOTE(review): the original author flagged the parsed structure as
        # oddly nested (mails/mail/...); callers presumably have to unwrap it.
        return xmltodict.parse(raw_xml)

    logging.error(fn + " is not a valid file.")
    return None
|
||||
@@ -0,0 +1,161 @@
|
||||
import glob
import json
import logging
import os
import xml.etree.ElementTree as et
from datetime import datetime

import config
import export.utils
|
||||
|
||||
# Module-wide counter, incremented once for every <mail> element emitted by
# emit_mail_xml().
nn = 0

# Default input path: the JSON selection dump read by export_selection_all()
# and export_selection_tag().
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
# Default output path of the XML export (config.export['xml'] inside
# config.export['path']).
xml_dump = os.path.join(config.export['path'], config.export['xml'])
|
||||
|
||||
|
||||
def export_generate_path(tag):
    """Build the output path template for the export of *tag*.

    The file is placed in ``config.export['path']`` and named
    ``<tag>_[now].xml``. ``[now]`` is a literal placeholder, not a timestamp:
    ``export_selection_tag`` looks for that marker in its ``xml_out``
    argument to insert the time of the export.

    Args:
        tag: name of the selection tag being exported (a string).

    Returns:
        The path template as a string.
    """
    # The unused ``now = datetime.now()`` local was dropped: the timestamp is
    # not inserted here.
    return os.path.join(config.export['path'], tag + "_[now].xml")
|
||||
|
||||
def _emit_single_mail(msg, xmlel):
    """Append one ``<mail>`` element describing *msg* (only) to *xmlel*."""
    global nn
    nn += 1

    mail = et.SubElement(xmlel, 'mail')

    subject = et.SubElement(mail, 'subject')
    subject.text = export.utils.format_subject(msg['subject'])

    # 'to' is the only header treated as optional; a placeholder keeps the
    # element present in every <mail>.
    to = et.SubElement(mail, 'to')
    to.text = msg.get('to', 'n/a')

    from_ = et.SubElement(mail, 'from')
    from_.text = msg['from']

    date = et.SubElement(mail, 'date')
    date.text = msg['date']

    content = et.SubElement(mail, 'content')
    content.text = export.utils.format_content(msg['content'])


def _follow_up_sort_key(entry):
    """Sort key for ``(date, message)`` pairs; undated entries go last."""
    stamp = entry[0]
    return (stamp is None, 0 if stamp is None else stamp)


def emit_mail_xml(msg, xmlel):
    """Append *msg* and all of its follow-ups to *xmlel* as ``<mail>`` elements.

    The message itself is written first, followed by every follow-up
    (replies, replies to replies, ...) in ascending date order. All mails are
    siblings under *xmlel*; the thread structure is not nested in the XML.

    Args:
        msg: mapping with the keys 'subject', 'from', 'date' and 'content';
            'to' and 'follow-up' are optional.
        xmlel: parent ``xml.etree.ElementTree`` element receiving the mails.

    Raises:
        KeyError: if a required key is missing from a message.
    """
    _emit_single_mail(msg, xmlel)

    if 'follow-up' in msg:
        # index_follow_up() already returns the *whole* subtree of replies as
        # a flat list. The previous code recursed into emit_mail_xml() for
        # each entry, which flattened the sub-replies a second time and wrote
        # every reply nested two or more levels deep more than once.
        followups = export.utils.index_follow_up(msg)
        # Dates may be None when neither a reply nor its parent has a
        # parseable date; the key keeps the sort from raising on None.
        followups.sort(key=_follow_up_sort_key)
        for _, follow_up in followups:
            _emit_single_mail(follow_up, xmlel)
|
||||
|
||||
|
||||
#------------------------------------------------------------
|
||||
# The following functions parse the selection files
|
||||
#------------------------------------------------------------
|
||||
|
||||
def export_single_tag(t, sel, fout):
    """Write the ``<chapter>`` XML for selection tag *t* to *fout*.

    The chapter holds a ``<title>`` (the tag), a ``<desc>`` taken from
    ``sel[t]['desc']`` and a ``<mails>`` element filled by ``emit_mail_xml``
    for every entry of ``sel[t]['lists']``.

    Args:
        t: tag name to export.
        sel: mapping of tag name to a dict with 'desc' and 'lists' keys.
        fout: writable text file object receiving the serialized chapter.

    Returns:
        ``True`` on success, ``False`` (after logging an error) when *t* is
        not a key of *sel*.
    """
    # Direct dict membership instead of materializing list(sel.keys()).
    if t not in sel:
        logging.error("Tag: %s does not exists.", t)
        return False

    ch = sel[t]

    chapter = et.Element('chapter')
    et.SubElement(chapter, 'title').text = t
    et.SubElement(chapter, 'desc').text = ch['desc']

    chapter_mails = et.SubElement(chapter, 'mails')
    for m in ch['lists']:
        emit_mail_xml(m, chapter_mails)

    # et.tostring() returns bytes; decode to text and strip the characters
    # matched by the utils filter before writing.
    xml_text = et.tostring(chapter).decode('utf-8', 'ignore')
    fout.write(export.utils.remove_invalid_xml_characters(xml_text))

    return True
|
||||
|
||||
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
    """Export every tag of the selection dump into a single XML file.

    Args:
        sel_dump: path of the JSON selection file (tag -> chapter data).
        xml_out: path of the XML file to write; it is overwritten.

    Returns:
        ``True`` when every tag was exported, ``False`` as soon as one tag
        fails (an error is logged and the remaining tags are not written).
    """
    with open(sel_dump) as fin:
        d = json.load(fin)

    # Fix: honour the xml_out argument. The previous code opened the
    # module-level xml_dump here, so a caller-supplied path was ignored.
    with open(xml_out, 'w') as fout:
        for k in d:
            if not export_single_tag(k, d, fout):
                logging.error("Error exporting: %s", k)
                return False
    return True
|
||||
|
||||
def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
    """Export a single tag of the selection dump to an XML file.

    Any ``[now]`` marker in *xml_out* is replaced by the current local time
    formatted as ``%d-%m-%y_%H:%M:%S`` before the file is opened.

    Args:
        tag: name of the tag to export.
        sel_dump: path of the JSON selection file (tag -> chapter data).
        xml_out: path (or path template containing ``[now]``) of the XML
            file to write; it is overwritten.

    Returns:
        ``True`` on success, ``False`` (after logging an error) when the tag
        could not be exported.
    """
    with open(sel_dump) as fin:
        d = json.load(fin)

    # Fix: str.replace() returns a new string; the previous code discarded
    # the result, so the "[now]" marker was never substituted.
    timestamp = datetime.now().strftime("%d-%m-%y_%H:%M:%S")
    xml_out = xml_out.replace("[now]", timestamp)

    with open(xml_out, 'w') as fout:
        if not export_single_tag(tag, d, fout):
            logging.error("Error exporting: %s", tag)
            return False
    return True
|
||||
|
||||
|
||||
|
||||
#------------------------------------------------------------
|
||||
# The following functions parse the archive files directly
|
||||
#------------------------------------------------------------
|
||||
|
||||
def export_file(f, fout):
    """Write every thread of the JSON archive *f* to *fout* as XML.

    The archive's ``'threads'`` entries are emitted with ``emit_mail_xml``
    under a single ``<all>`` root element, which is then serialized and
    written to *fout*.

    Args:
        f: path of a JSON archive file containing a 'threads' list.
        fout: writable text file object receiving the XML.
    """
    with open(f) as source:
        archive = json.load(source)

    root = et.Element('all')
    for thread in archive['threads']:
        emit_mail_xml(thread, root)

    # et.tostring() yields bytes; the file object expects text.
    serialized = et.tostring(root)
    fout.write(serialized.decode('utf-8', 'ignore'))
|
||||
|
||||
def parse_date_file(fname):
    """Parse an archive file name of the form ``<Month>_<Year>.json``.

    Args:
        fname: bare file name, e.g. ``"November_1997.json"``.

    Returns:
        A ``datetime`` for the first day of that month.

    Raises:
        ValueError: if *fname* does not match the expected format.
    """
    name_format = '%B_%Y.json'
    return datetime.strptime(fname, name_format)
|
||||
|
||||
def export_year(d, dt, fout):
    """Write all archive files of one year from directory *d* as one chapter.

    Every ``*.json`` file in *d* whose name parses (via ``parse_date_file``)
    to the same year as *dt* becomes a ``<section>`` holding a ``<month>``
    name and a ``<mails>`` element with the file's threads. Sections are
    ordered by month.

    Args:
        d: directory containing the ``<Month>_<Year>.json`` archive files.
        dt: ``datetime`` whose year selects the files to export.
        fout: writable text file object receiving the serialized chapter.
    """
    chapter = et.Element('chapter')
    year = et.SubElement(chapter, 'year')
    year.text = dt.strftime('%Y')

    # Collect (month date, path) pairs for the requested year so the months
    # can be sorted before writing to XML.
    dated_files = []
    for path in glob.glob(os.path.join(d, "*.json")):
        try:
            fdt = parse_date_file(os.path.basename(path))
        except ValueError:
            # A stray JSON file that is not named <Month>_<Year>.json used to
            # abort the whole export; skip it and say so instead.
            logging.warning("Skipping %s: name is not <Month>_<Year>.json", path)
            continue
        if fdt.year == dt.year:
            dated_files.append((fdt, path))

    dated_files.sort(key=lambda tup: tup[0])

    # Distinct loop names: the previous code reused ``d`` here, shadowing the
    # directory parameter.
    for month_dt, path in dated_files:
        logging.debug(path)

        section = et.SubElement(chapter, 'section')
        month = et.SubElement(section, 'month')
        month.text = month_dt.strftime('%B')

        with open(path) as fp:
            dj = json.load(fp)

        mails = et.SubElement(section, 'mails')
        for t in dj['threads']:
            emit_mail_xml(t, mails)

    # et.tostring() returns bytes; decode to text and strip the characters
    # matched by the utils filter before writing.
    xml_text = et.tostring(chapter).decode('utf-8', 'ignore')
    fout.write(export.utils.remove_invalid_xml_characters(xml_text))
|
||||
@@ -0,0 +1,62 @@
|
||||
from datetime import datetime
|
||||
import regex as re
|
||||
import email.utils, logging
|
||||
|
||||
# Matches any single character outside tab, LF, CR, U+0020-U+D7FF and
# U+E000-U+FFFD; remove_invalid_xml_characters() deletes every match.
xml_re = re.compile('[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]')
# Matches exactly three consecutive newlines; format_content() replaces each
# match with a single newline.
nl_re = re.compile('\n\n\n')
# Matches a long run of dashes (a separator line); format_content() removes it.
ind_re = re.compile('--------------------------------------------------------------------------')
|
||||
|
||||
def format_subject(s):
    """Normalize the whitespace of subject line *s*.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (including newlines) becomes a single space.
    """
    words = s.split()
    return ' '.join(words)
|
||||
|
||||
def format_content(c):
    """Clean up the body text *c* of a mail for export.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. replace every match of ``nl_re`` (three newlines) with one newline;
      3. delete every match of ``ind_re`` (a long dashed separator line).

    Returns:
        The cleaned text.
    """
    stripped = c.strip()

    # new lines
    collapsed = nl_re.sub('\n', stripped)

    # Per the original author's note, the dashed separator triggers InDesign's
    # automatic overset (cause unknown).
    # ex: Sat, 22 Nov 1997 18:23:59 -0500 (The Mattel Crackdown -- Nettime)
    return ind_re.sub('', collapsed)
|
||||
|
||||
# returns utc timestamp
|
||||
def parse_date_msg(msg):
    """Return the UTC timestamp of ``msg['date']``, or ``None`` if unparseable.

    The date string is parsed with ``email.utils.parsedate_tz`` and converted
    with ``email.utils.mktime_tz``. Every parsing failure is logged as a
    warning, together with the offending date string.

    Args:
        msg: mapping with a 'date' key holding an RFC 2822 style date string.

    Returns:
        The UTC timestamp as a number, or ``None`` when the date cannot be
        parsed.

    Raises:
        KeyError: if *msg* has no 'date' key.
    """
    date_str = msg['date']
    try:
        # parsedate_tz() returns None for unparseable input, which makes
        # mktime_tz() raise TypeError -- handled below.
        date_tz = email.utils.parsedate_tz(date_str)
        return email.utils.mktime_tz(date_tz)  # utc timestamp
    except TypeError:
        logging.warning("Format Date TypeError")
    except ValueError:
        logging.warning("Format Date ValueError")
    except Exception as ex:
        # Best effort: any other failure is reported instead of raised.
        logging.warning("Format Date error: %s", ex)

    # Fixes: the handlers used to call the misspelled ``loggin`` (NameError),
    # and a ``return`` inside ``finally`` silently swallowed that error, so
    # the offending date was never logged. %s formatting also copes with a
    # non-string date value.
    logging.warning(" > %s", date_str)
    return None
|
||||
|
||||
# recursive
|
||||
def index_follow_up(msg):
    """Flatten the follow-up tree of *msg* into ``(date, message)`` pairs.

    Every entry of ``msg['follow-up']`` is listed, immediately followed by
    its own follow-ups (depth first). The date comes from ``parse_date_msg``;
    when a follow-up's date cannot be parsed, the date of its direct parent
    *msg* is used instead (which may itself be ``None``).

    Returns:
        A list of ``(date, message)`` tuples; empty when *msg* has no
        'follow-up' key.
    """
    indexed = []
    if 'follow-up' not in msg:
        return indexed

    for reply in msg['follow-up']:
        stamp = parse_date_msg(reply)
        if stamp is None:
            stamp = parse_date_msg(msg)  # same as parent
        indexed.append((stamp, reply))
        indexed.extend(index_follow_up(reply))
    return indexed
|
||||
|
||||
# See for Nevejan's research
|
||||
def remove_invalid_xml_characters(s):
    """Return *s* with every character matched by ``xml_re`` removed.

    ``xml_re`` matches anything outside tab, LF, CR, U+0020-U+D7FF and
    U+E000-U+FFFD.
    """
    cleaned = xml_re.sub('', s)
    return cleaned
|
||||
Reference in New Issue
Block a user