duplication fix + templates

This commit is contained in:
gauthiier 2019-12-31 09:53:46 +01:00
parent e5b7f43567
commit 4ebab083bf
10 changed files with 176 additions and 159 deletions

BIN
book/fonts/Doves Type Imprint OTF v1.9.zip (Stored with Git LFS) Normal file

Binary file not shown.

BIN
book/templates/template-DOVE-NBR-two column range left.indt (Stored with Git LFS) Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

34
export/check.py Normal file
View File

@ -0,0 +1,34 @@
import xml.etree.ElementTree as et
import os, logging, glob, hashlib
logging.basicConfig(level=logging.DEBUG)
def check_duplicates(xml_file):
    """Log every mail in *xml_file* that repeats an earlier mail.

    Two mails count as the same message when their ``from``, ``subject``
    and ``date`` texts are identical. The three texts are concatenated and
    fingerprinted with SHA-256.

    Args:
        xml_file: File name or file object accepted by
            ``xml.etree.ElementTree.parse``. Mails are looked up under
            ``mails/mail`` relative to the document root.

    Returns:
        None. Each repeated mail is reported through ``logging.info``.
    """
    # A set gives constant-time membership tests.
    seen = set()
    root = et.parse(xml_file).getroot()
    for mail in root.findall('mails/mail'):
        # findtext() yields '' for a missing or empty element, whereas
        # find(...).text would be None (or raise) and break the
        # concatenation below.
        sender = mail.findtext('from', default='')
        subject = mail.findtext('subject', default='')
        date = mail.findtext('date', default='')
        fingerprint = hashlib.sha256(
            (sender + subject + date).encode("utf-8")
        ).hexdigest()
        if fingerprint in seen:
            logging.info("* Duplicate: %s - %s - %s", subject, sender, date)
        else:
            seen.add(fingerprint)
if __name__ == "__main__":
    # Scan every XML file directly inside the output directory and report
    # repeated mails file by file.
    out_dir = "out/"
    # glob.glob already returns a list; no extra copy is needed.
    xml_files = glob.glob(os.path.join(out_dir, "*.xml"))
    logging.info("Checking duplicates")
    for path in xml_files:
        logging.info("> " + os.path.basename(path))
        check_duplicates(path)

View File

@ -3,39 +3,83 @@ import xml.etree.ElementTree as et
from xml.sax.saxutils import unescape, escape
import export.utils
import config
from datetime import datetime
from export import emailreply
import re
import re, hashlib, datetime
nn = 0
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
xml_dump = os.path.join(config.export['path'], config.export['xml'])
'''
utils
'''
def export_generate_path(tag):
    """Build the XML output path for *tag* inside ``config.export['path']``.

    The returned file name is ``<tag>_[now].xml``. ``[now]`` is a literal
    placeholder: this function does not substitute a timestamp, so the
    previously computed (and unused) ``now`` local has been dropped. The
    stale ``datetime.now()`` call has also been removed; it fails when
    ``datetime`` is bound to the module rather than the class.

    Args:
        tag: Name used as the file-name prefix.

    Returns:
        The joined path as a string.
    """
    return os.path.join(config.export['path'], tag + "_[now].xml")
'''
xml export
'''
def hash(m):
    """Return a SHA-256 hex fingerprint identifying the mail *m*.

    The fingerprint is computed over the concatenation of the ``from``,
    ``subject`` and ``date`` values, encoded as UTF-8.

    NOTE(review): this name shadows the builtin ``hash`` within the
    module; it is kept because callers use it.

    Args:
        m: Mapping with string values under the keys ``'from'``,
            ``'subject'`` and ``'date'``.

    Returns:
        The hexadecimal digest string.

    Raises:
        KeyError: If one of the three keys is missing.
    """
    identity = m['from'] + m['subject'] + m['date']
    return hashlib.sha256(identity.encode("utf-8")).hexdigest()
def make_xml_element(el, val):
    """Return ``<el>val</el>`` with *val* XML-escaped.

    Args:
        el: Element name, inserted verbatim (not escaped or validated).
        val: Text content; ``&``, ``<`` and ``>`` are escaped via
            ``xml.sax.saxutils.escape``.

    Returns:
        The serialized element as a string, without a trailing newline.
    """
    return "".join(["<", el, ">", escape(val), "</", el, ">"])
'''
This is pretty patched up...........................................................
'''
def emit_mail_xml(msg, li):
def to_listserv(li, msg):
    """Return the list posting address associated with list *li*.

    Most lists map to a single fixed address. For ``'nettime_l'`` the
    address depends on the message date obtained from
    ``export.utils.parse_date_msg(msg)``.

    Args:
        li: List identifier (``'crumb'``, ``'spectre'``, ``'empyre'``,
            ``'nettime_bold'`` or ``'nettime_l'``).
        msg: The message; only consulted for ``'nettime_l'``.

    Returns:
        The address string, or ``'n/a'`` (after logging a warning) when
        *li* is not a known list.
    """
    if li == 'crumb':  # patch
        return '<new-media-curating@jiscmail.ac.uk>'
    elif li == 'spectre':
        return 'spectre@mikrolisten.de'
    elif li == 'empyre':
        return '<empyre@lists.cofa.unsw.edu.au>'
    elif li == 'nettime_bold':
        return 'nettime-bold@nettime.org'
    elif li == 'nettime_l':
        # Address history as noted by the original author:
        #   nettime-l@desk.nl            -- June 8 1999
        #   mettime-l-temp@material.net  -- July 15 1999
        #   nettime-l@bbs.thing.net      -- July 19 2007
        #   nettime-l@kein.org
        # As implemented, each date is an exclusive upper bound for the
        # address on the same line.
        # NOTE(review): 'mettime-l-temp' may be a typo for
        # 'nettime-l-temp' -- confirm before changing the emitted value.
        dtz = export.utils.parse_date_msg(msg)
        if dtz is not None:
            # Presumably dtz is a POSIX timestamp; fromtimestamp() converts
            # it using the local timezone -- TODO confirm.
            d = datetime.datetime.fromtimestamp(dtz).date()
            if d < datetime.date(1999, 6, 8):
                return 'nettime-l@desk.nl'
            elif d < datetime.date(1999, 7, 15):
                return 'mettime-l-temp@material.net'
            elif d < datetime.date(2007, 7, 19):
                return 'nettime-l@bbs.thing.net'
        # No usable date, or a date on/after 2007-07-19.
        return 'nettime-l@kein.org'
    else:
        logging.warning("no listserv to...")
        return 'n/a'
global nn
def emit_mail_xml(msg, li, thread_nbr, msg_nbr):
global nn, hashes
nn += 1
logging.info("export xml: " + li)
h = hash(msg) # patch
if h in hashes:
logging.warning("Duplicate: " + msg['from'] + " - " + msg['subject'] + " - " + msg['date'] + ". Skipping...")
return ''
else:
hashes.append(h)
nbr = make_xml_element("nbr", str(thread_nbr) + "." + str(msg_nbr)) + "\n"
subject = make_xml_element("subject", msg['subject']) + "\n"
if 'to' in msg:
to = make_xml_element("to", msg['to']) + "\n"
else:
to = make_xml_element("to", 'n/a') + "\n"
to = make_xml_element("to", to_listserv(li, msg)) + "\n" # patch
from_ = make_xml_element("from", msg['from']) + "\n"
from_ = make_xml_element("from", msg['author_name']) + "\n"
date = make_xml_element("date", msg['date']) + "\n"
@ -47,11 +91,16 @@ def emit_mail_xml(msg, li):
e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
e.read()
content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
# content_stripped = re.sub(r'\n[ ]{2,}', '\n\n', e.reply) # patch
# # content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
# content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', content_stripped)
# # content_stripped = re.sub(r'\n[ ]{2,}\w', '\n\n', content_stripped)
content_stripped = e.reply
content = make_xml_element("content", content_stripped) + "\n"
mail = "<mail>\n" + subject + from_ + to + date + content + "</mail>\n"
mail = "<mail>\n" + nbr + subject + from_ + to + date + content + "</mail>\n"
# content = et.SubElement(mail, 'content')
# content.text = e.reply
@ -61,79 +110,22 @@ def emit_mail_xml(msg, li):
followups = export.utils.index_follow_up(msg)
followups.sort(key=lambda tup: tup[0])
for d, f in followups:
mail += emit_mail_xml(f, li)
msg_nbr += 1
mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)
return mail
# def emit_mail_xml(msg, li, xmlel):
# global nn
# nn += 1
# logging.info("export xml: " + li)
# mail = et.SubElement(xmlel, 'mail')
# subject = et.SubElement(mail, 'subject')
# subject.text = export.utils.format_subject(msg['subject'])
# to = et.SubElement(mail, 'to')
# if 'to' in msg:
# to.text = msg['to']
# else:
# to.text = 'n/a'
# from_ = et.SubElement(mail, 'from')
# from_.text = msg['from']
# date = et.SubElement(mail, 'date')
# date.text = msg['date']
# '''
# todo:
# - filter reply
# - unescape XML
# '''
# e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
# e.read()
# escape_table = {
# "&": "&amp;",
# ">": "&gt;",
# "<": "&lt;"
# }
# content_str = "<content>" + escape(e.reply, escape_table) + "</content>"
# print(content_str)
# content = et.fromstring(content_str)
# mail.append(content)
# # content = et.SubElement(mail, 'content')
# # content.text = e.reply
# # recursuve "follow-up"
# if 'follow-up' in msg:
# followups = export.utils.index_follow_up(msg)
# followups.sort(key=lambda tup: tup[0])
# for d, f in followups:
# emit_mail_xml(f, li, xmlel)
#------------------------------------------------------------
# The following functions parse the selection files
#------------------------------------------------------------
def export_single_tag(t, sel, fout):
global hashes
if t not in list(sel.keys()):
logging.error("Tag: " + t + " does not exists.")
return False
ch = sel[t]
logging.info("Exporting tag:" + t)
ch = sel[t]
chapter = "<chapter>\n"
@ -143,64 +135,21 @@ def export_single_tag(t, sel, fout):
chapter_mails = "<mails>\n"
for m in ch['lists']:
chapter_mails += emit_mail_xml(m, m['list'])
hashes = []
thread_nbr = 0
for m in ch['lists']:
chapter_mails += emit_mail_xml(m, m['list'], thread_nbr, 0)
thread_nbr += 1
chapter_mails += "</mails>\n"
chapter = "<chapter>\n" + chapter_mails + "</chapter>"
chapter = "<chapter>\n" + chapter_title + chapter_desc + chapter_mails + "</chapter>"
fout.write(chapter.encode('utf-8'))
# # root = et.ElementTree(chapter)
# # root.write(fout, encoding="utf-8", xml_declaration=True)
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
return True
# def export_single_tag(t, sel, fout):
# if t not in list(sel.keys()):
# logging.error("Tag: " + t + " does not exists.")
# return False
# ch = sel[t]
# chapter = et.Element('chapter')
# chapter_title = et.SubElement(chapter, 'title')
# chapter_title.text = t
# chapter_desc = et.SubElement(chapter, 'desc')
# chapter_desc.text = ch['desc']
# chapter_mails = et.SubElement(chapter, 'mails')
# for m in ch['lists']:
# emit_mail_xml(m, m['list'], chapter_mails)
# # root = et.ElementTree(chapter)
# # root.write(fout, encoding="utf-8", xml_declaration=True)
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
# return True
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
with open(sel_dump) as fin:
@ -218,7 +167,7 @@ def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
with open(sel_dump) as fin:
d = json.load(fin)
now = datetime.now()
now = datetime.datetime.now()
xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
with open(xml_out, 'wb') as fout:
@ -245,7 +194,7 @@ def export_file(f, fout):
fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))
def parse_date_file(fname):
    """Parse a monthly archive file name into a ``datetime``.

    Args:
        fname: File name of the form ``<MonthName>_<Year>.json``, for
            example ``June_1999.json``. The month name is matched with
            ``%B``, which follows the current locale.

    Returns:
        A ``datetime.datetime`` for the first day of that month.

    Raises:
        ValueError: If *fname* does not match the expected format.
    """
    # Only the module-qualified call is kept: the earlier
    # ``datetime.strptime`` form fails when ``datetime`` names the module,
    # and it made this line unreachable.
    return datetime.datetime.strptime(fname, '%B_%Y.json')
def export_year(d, dt, fout):

File diff suppressed because one or more lines are too long

View File

@ -112,6 +112,14 @@
{
"list": "nettime_l",
"url": "https://nettime.org/Lists-Archives/nettime-l-0305/msg00078.html"
},
{
"list": "nettime_l",
"url": "https://nettime.org/Lists-Archives/nettime-l-9906/msg00069.html"
},
{
"list": "nettime_l",
"url": "https://nettime.org/Lists-Archives/nettime-l-0707/msg00025.html"
}
],
"desc": "..."