duplication fix + templates
This commit is contained in:
parent
e5b7f43567
commit
4ebab083bf
BIN
book/fonts/Doves Type Imprint OTF v1.9.zip
(Stored with Git LFS)
Normal file
BIN
book/fonts/Doves Type Imprint OTF v1.9.zip
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
book/templates/template-DOVE-NBR-two column range left.indt
(Stored with Git LFS)
Normal file
BIN
book/templates/template-DOVE-NBR-two column range left.indt
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
book/templates/template-GOTHIC-NBR-two column range left.indt
(Stored with Git LFS)
Normal file
BIN
book/templates/template-GOTHIC-NBR-two column range left.indt
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
book/templates/template-GOTHIC-two column range left.indt
(Stored with Git LFS)
BIN
book/templates/template-GOTHIC-two column range left.indt
(Stored with Git LFS)
Binary file not shown.
BIN
book/templates/template-MINUSCULE-NBR-two column range left.indt
(Stored with Git LFS)
Normal file
BIN
book/templates/template-MINUSCULE-NBR-two column range left.indt
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
book/templates/template-MINUSCULE-two column range left.indt
(Stored with Git LFS)
BIN
book/templates/template-MINUSCULE-two column range left.indt
(Stored with Git LFS)
Binary file not shown.
34
export/check.py
Normal file
34
export/check.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
import xml.etree.ElementTree as et
|
||||||
|
import os, logging, glob, hashlib
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
def check_duplicates(xml_file):
    """Log every mail in *xml_file* that repeats an earlier one.

    Mails are read from ``mails/mail`` below the document root. Two mails
    count as duplicates when the concatenation of their ``<from>``,
    ``<subject>`` and ``<date>`` texts is identical; the SHA-256 digest of
    that concatenation is used as the lookup key.

    Args:
        xml_file: A path or file object accepted by ``ElementTree.parse``.

    Returns:
        The number of duplicate mails that were logged.
    """
    seen = set()  # set membership is O(1); a list would rescan per mail
    duplicates = 0
    root = et.parse(xml_file).getroot()

    for mail in root.findall('mails/mail'):
        # findtext() returns the default for a missing child and '' for an
        # empty one, so incomplete mails no longer raise.
        sender = mail.findtext('from', default='')
        subject = mail.findtext('subject', default='')
        date = mail.findtext('date', default='')
        digest = hashlib.sha256(
            (sender + subject + date).encode("utf-8")).hexdigest()

        if digest in seen:
            logging.info("* Duplicate: %s - %s - %s", subject, sender, date)
            duplicates += 1
        else:
            seen.add(digest)

    return duplicates
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Directory that is scanned (non-recursively) for *.xml files.
    out_dir = "out/"

    # glob() returns paths in arbitrary order; sort them so the report is
    # reproducible from run to run.
    xml_files = sorted(glob.glob(os.path.join(out_dir, "*.xml")))

    logging.info("Checking duplicates")

    for xml_path in xml_files:
        logging.info("> " + os.path.basename(xml_path))
        check_duplicates(xml_path)
|
||||||
|
|
||||||
|
|
||||||
@ -3,39 +3,83 @@ import xml.etree.ElementTree as et
|
|||||||
from xml.sax.saxutils import unescape, escape
|
from xml.sax.saxutils import unescape, escape
|
||||||
import export.utils
|
import export.utils
|
||||||
import config
|
import config
|
||||||
from datetime import datetime
|
|
||||||
from export import emailreply
|
from export import emailreply
|
||||||
import re
|
import re, hashlib, datetime
|
||||||
|
|
||||||
nn = 0
|
nn = 0
|
||||||
|
|
||||||
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
|
sel_dump = os.path.join(config.selection['path'], config.selection['sel_dump'])
|
||||||
xml_dump = os.path.join(config.export['path'], config.export['xml'])
|
xml_dump = os.path.join(config.export['path'], config.export['xml'])
|
||||||
|
|
||||||
|
'''
|
||||||
|
utils
|
||||||
|
'''
|
||||||
|
|
||||||
def export_generate_path(tag):
    """Return the export file path template for *tag*.

    The result is ``<config.export['path']>/<tag>_[now].xml``. The
    ``[now]`` part is a literal placeholder: no timestamp is substituted
    here, so the previously computed (and unused) current time was dropped.
    """
    return os.path.join(config.export['path'], tag + "_[now].xml")
|
||||||
|
|
||||||
|
'''
|
||||||
|
xml export
|
||||||
|
'''
|
||||||
|
|
||||||
|
def hash(m):
    """Return the SHA-256 hex digest used to recognise duplicate messages.

    The digest is taken over the UTF-8 encoding of ``m['from']``,
    ``m['subject']`` and ``m['date']`` concatenated in that order.

    Note: this name shadows the builtin ``hash`` inside this module.
    """
    identity = m['from'] + m['subject'] + m['date']
    digest = hashlib.sha256(identity.encode("utf-8"))
    return digest.hexdigest()
|
||||||
|
|
||||||
def make_xml_element(el, val):
    """Return ``<el>val</el>`` with *val* XML-escaped.

    Only the value is escaped (via ``xml.sax.saxutils.escape``); the tag
    name *el* is inserted as given.
    """
    opening = "<" + el + ">"
    closing = "</" + el + ">"
    return opening + escape(val) + closing
|
||||||
|
|
||||||
|
'''
|
||||||
|
This is pretty patched up...........................................................
|
||||||
|
'''
|
||||||
|
|
||||||
def emit_mail_xml(msg, li):
|
def to_listserv(li, msg):
    """Return the list address to emit as the "to" field for list *li*.

    Args:
        li: List identifier ('crumb', 'spectre', 'empyre', 'nettime_bold'
            or 'nettime_l').
        msg: The message; only consulted for 'nettime_l', where it is
            passed to ``export.utils.parse_date_msg`` to pick the address
            that was in use at the time.

    Returns:
        The address string, or 'n/a' (after logging a warning) when *li*
        is not a known list.
    """
    # Lists whose address does not depend on the message date. The angle
    # brackets are kept exactly as in the original patch.
    fixed_addresses = {
        'crumb': '<new-media-curating@jiscmail.ac.uk>',
        'spectre': 'spectre@mikrolisten.de',
        'empyre': '<empyre@lists.cofa.unsw.edu.au>',
        'nettime_bold': 'nettime-bold@nettime.org',
    }
    if li in fixed_addresses:
        return fixed_addresses[li]

    if li == 'nettime_l':
        # Each address applies to messages dated strictly before its
        # cutoff; anything later falls through to the last address.
        # NOTE(review): the 'mettime-l-temp' spelling is copied verbatim
        # from the original -- confirm it is not a typo for 'nettime'.
        cutoffs = (
            (datetime.date(1999, 6, 8), 'nettime-l@desk.nl'),
            (datetime.date(1999, 7, 15), 'mettime-l-temp@material.net'),
            (datetime.date(2007, 7, 19), 'nettime-l@bbs.thing.net'),
        )
        dtz = export.utils.parse_date_msg(msg)
        if dtz is not None:
            # NOTE(review): fromtimestamp() converts using the local time
            # zone, so the cutoff day depends on where this runs.
            sent_on = datetime.datetime.fromtimestamp(dtz).date()
            for cutoff, address in cutoffs:
                if sent_on < cutoff:
                    return address
        # Also used when the message date could not be parsed, so the
        # caller always receives a string.
        return 'nettime-l@kein.org'

    logging.warning("no listserv to...")
    return 'n/a'
|
||||||
|
|
||||||
global nn
|
def emit_mail_xml(msg, li, thread_nbr, msg_nbr):
|
||||||
|
|
||||||
|
global nn, hashes
|
||||||
nn += 1
|
nn += 1
|
||||||
|
|
||||||
logging.info("export xml: " + li)
|
h = hash(msg) # patch
|
||||||
|
if h in hashes:
|
||||||
|
logging.warning("Duplicate: " + msg['from'] + " - " + msg['subject'] + " - " + msg['date'] + ". Skipping...")
|
||||||
|
return ''
|
||||||
|
else:
|
||||||
|
hashes.append(h)
|
||||||
|
|
||||||
|
nbr = make_xml_element("nbr", str(thread_nbr) + "." + str(msg_nbr)) + "\n"
|
||||||
|
|
||||||
subject = make_xml_element("subject", msg['subject']) + "\n"
|
subject = make_xml_element("subject", msg['subject']) + "\n"
|
||||||
|
|
||||||
if 'to' in msg:
|
to = make_xml_element("to", to_listserv(li, msg)) + "\n" # patch
|
||||||
to = make_xml_element("to", msg['to']) + "\n"
|
|
||||||
else:
|
|
||||||
to = make_xml_element("to", 'n/a') + "\n"
|
|
||||||
|
|
||||||
from_ = make_xml_element("from", msg['from']) + "\n"
|
from_ = make_xml_element("from", msg['author_name']) + "\n"
|
||||||
|
|
||||||
date = make_xml_element("date", msg['date']) + "\n"
|
date = make_xml_element("date", msg['date']) + "\n"
|
||||||
|
|
||||||
@ -47,11 +91,16 @@ def emit_mail_xml(msg, li):
|
|||||||
e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
|
e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
|
||||||
e.read()
|
e.read()
|
||||||
|
|
||||||
content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
|
# content_stripped = re.sub(r'\n[ ]{2,}', '\n\n', e.reply) # patch
|
||||||
|
# # content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', e.reply)
|
||||||
|
# content_stripped = re.sub(r'(?<!\n)\n(?!\n)', ' ', content_stripped)
|
||||||
|
# # content_stripped = re.sub(r'\n[ ]{2,}\w', '\n\n', content_stripped)
|
||||||
|
|
||||||
|
content_stripped = e.reply
|
||||||
|
|
||||||
content = make_xml_element("content", content_stripped) + "\n"
|
content = make_xml_element("content", content_stripped) + "\n"
|
||||||
|
|
||||||
mail = "<mail>\n" + subject + from_ + to + date + content + "</mail>\n"
|
mail = "<mail>\n" + nbr + subject + from_ + to + date + content + "</mail>\n"
|
||||||
|
|
||||||
# content = et.SubElement(mail, 'content')
|
# content = et.SubElement(mail, 'content')
|
||||||
# content.text = e.reply
|
# content.text = e.reply
|
||||||
@ -61,79 +110,22 @@ def emit_mail_xml(msg, li):
|
|||||||
followups = export.utils.index_follow_up(msg)
|
followups = export.utils.index_follow_up(msg)
|
||||||
followups.sort(key=lambda tup: tup[0])
|
followups.sort(key=lambda tup: tup[0])
|
||||||
for d, f in followups:
|
for d, f in followups:
|
||||||
mail += emit_mail_xml(f, li)
|
msg_nbr += 1
|
||||||
|
mail += emit_mail_xml(f, li, thread_nbr, msg_nbr)
|
||||||
|
|
||||||
return mail
|
return mail
|
||||||
|
|
||||||
|
|
||||||
# def emit_mail_xml(msg, li, xmlel):
|
|
||||||
|
|
||||||
# global nn
|
|
||||||
# nn += 1
|
|
||||||
|
|
||||||
# logging.info("export xml: " + li)
|
|
||||||
|
|
||||||
# mail = et.SubElement(xmlel, 'mail')
|
|
||||||
|
|
||||||
# subject = et.SubElement(mail, 'subject')
|
|
||||||
# subject.text = export.utils.format_subject(msg['subject'])
|
|
||||||
|
|
||||||
# to = et.SubElement(mail, 'to')
|
|
||||||
# if 'to' in msg:
|
|
||||||
# to.text = msg['to']
|
|
||||||
# else:
|
|
||||||
# to.text = 'n/a'
|
|
||||||
|
|
||||||
# from_ = et.SubElement(mail, 'from')
|
|
||||||
# from_.text = msg['from']
|
|
||||||
|
|
||||||
# date = et.SubElement(mail, 'date')
|
|
||||||
# date.text = msg['date']
|
|
||||||
|
|
||||||
# '''
|
|
||||||
# todo:
|
|
||||||
# - filter reply
|
|
||||||
# - unescape XML
|
|
||||||
# '''
|
|
||||||
# e = emailreply.EmailMessage(export.utils.format_content(msg['content']))
|
|
||||||
# e.read()
|
|
||||||
|
|
||||||
# escape_table = {
|
|
||||||
# "&": "&",
|
|
||||||
# ">": ">",
|
|
||||||
# "<": "<"
|
|
||||||
# }
|
|
||||||
|
|
||||||
# content_str = "<content>" + escape(e.reply, escape_table) + "</content>"
|
|
||||||
|
|
||||||
# print(content_str)
|
|
||||||
|
|
||||||
# content = et.fromstring(content_str)
|
|
||||||
# mail.append(content)
|
|
||||||
|
|
||||||
# # content = et.SubElement(mail, 'content')
|
|
||||||
# # content.text = e.reply
|
|
||||||
|
|
||||||
# # recursuve "follow-up"
|
|
||||||
# if 'follow-up' in msg:
|
|
||||||
# followups = export.utils.index_follow_up(msg)
|
|
||||||
# followups.sort(key=lambda tup: tup[0])
|
|
||||||
# for d, f in followups:
|
|
||||||
# emit_mail_xml(f, li, xmlel)
|
|
||||||
|
|
||||||
|
|
||||||
#------------------------------------------------------------
|
|
||||||
# The following functions parse the selection files
|
|
||||||
#------------------------------------------------------------
|
|
||||||
|
|
||||||
def export_single_tag(t, sel, fout):
|
def export_single_tag(t, sel, fout):
|
||||||
|
|
||||||
|
global hashes
|
||||||
|
|
||||||
if t not in list(sel.keys()):
|
if t not in list(sel.keys()):
|
||||||
logging.error("Tag: " + t + " does not exists.")
|
logging.error("Tag: " + t + " does not exists.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
ch = sel[t]
|
logging.info("Exporting tag:" + t)
|
||||||
|
|
||||||
|
ch = sel[t]
|
||||||
|
|
||||||
chapter = "<chapter>\n"
|
chapter = "<chapter>\n"
|
||||||
|
|
||||||
@ -143,64 +135,21 @@ def export_single_tag(t, sel, fout):
|
|||||||
|
|
||||||
chapter_mails = "<mails>\n"
|
chapter_mails = "<mails>\n"
|
||||||
|
|
||||||
|
hashes = []
|
||||||
|
thread_nbr = 0
|
||||||
|
|
||||||
for m in ch['lists']:
|
for m in ch['lists']:
|
||||||
chapter_mails += emit_mail_xml(m, m['list'])
|
chapter_mails += emit_mail_xml(m, m['list'], thread_nbr, 0)
|
||||||
|
thread_nbr += 1
|
||||||
|
|
||||||
chapter_mails += "</mails>\n"
|
chapter_mails += "</mails>\n"
|
||||||
|
|
||||||
chapter = "<chapter>\n" + chapter_mails + "</chapter>"
|
chapter = "<chapter>\n" + chapter_title + chapter_desc + chapter_mails + "</chapter>"
|
||||||
|
|
||||||
fout.write(chapter.encode('utf-8'))
|
fout.write(chapter.encode('utf-8'))
|
||||||
|
|
||||||
|
|
||||||
# # root = et.ElementTree(chapter)
|
|
||||||
# # root.write(fout, encoding="utf-8", xml_declaration=True)
|
|
||||||
|
|
||||||
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
|
|
||||||
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
|
||||||
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
|
|
||||||
|
|
||||||
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
|
||||||
|
|
||||||
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
# def export_single_tag(t, sel, fout):
|
|
||||||
|
|
||||||
# if t not in list(sel.keys()):
|
|
||||||
# logging.error("Tag: " + t + " does not exists.")
|
|
||||||
# return False
|
|
||||||
|
|
||||||
# ch = sel[t]
|
|
||||||
|
|
||||||
# chapter = et.Element('chapter')
|
|
||||||
# chapter_title = et.SubElement(chapter, 'title')
|
|
||||||
# chapter_title.text = t
|
|
||||||
|
|
||||||
# chapter_desc = et.SubElement(chapter, 'desc')
|
|
||||||
# chapter_desc.text = ch['desc']
|
|
||||||
|
|
||||||
# chapter_mails = et.SubElement(chapter, 'mails')
|
|
||||||
|
|
||||||
# for m in ch['lists']:
|
|
||||||
# emit_mail_xml(m, m['list'], chapter_mails)
|
|
||||||
|
|
||||||
|
|
||||||
# # root = et.ElementTree(chapter)
|
|
||||||
# # root.write(fout, encoding="utf-8", xml_declaration=True)
|
|
||||||
|
|
||||||
# # xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter).decode('unicode-escape', 'ignore'))
|
|
||||||
# xml = export.utils.remove_invalid_xml_characters(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
|
||||||
# xml = '<?xml version="1.0" encoding="UTF-8"?>' + xml
|
|
||||||
|
|
||||||
# print(et.tostring(chapter, encoding="utf-8").decode('utf-8', 'ignore'))
|
|
||||||
|
|
||||||
# fout.write(et.tostring(chapter).decode('utf-8', 'ignore'))
|
|
||||||
|
|
||||||
# return True
|
|
||||||
|
|
||||||
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
|
def export_selection_all(sel_dump=sel_dump, xml_out=xml_dump):
|
||||||
|
|
||||||
with open(sel_dump) as fin:
|
with open(sel_dump) as fin:
|
||||||
@ -218,7 +167,7 @@ def export_selection_tag(tag, sel_dump=sel_dump, xml_out=xml_dump):
|
|||||||
with open(sel_dump) as fin:
|
with open(sel_dump) as fin:
|
||||||
d = json.load(fin)
|
d = json.load(fin)
|
||||||
|
|
||||||
now = datetime.now()
|
now = datetime.datetime.now()
|
||||||
xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
|
xml_out.replace("[now]", now.strftime("%d-%m-%y_%H:%M:%S"))
|
||||||
|
|
||||||
with open(xml_out, 'wb') as fout:
|
with open(xml_out, 'wb') as fout:
|
||||||
@ -245,7 +194,7 @@ def export_file(f, fout):
|
|||||||
fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))
|
fout.write(et.tostring(all_mail).decode('utf-8', 'ignore'))
|
||||||
|
|
||||||
def parse_date_file(fname):
    """Parse a file name of the form ``<MonthName>_<Year>.json``.

    Returns the ``datetime.datetime`` produced by ``strptime`` for that
    month and year; raises ValueError when *fname* does not match.
    """
    name_format = '%B_%Y.json'
    return datetime.datetime.strptime(fname, name_format)
|
||||||
|
|
||||||
def export_year(d, dt, fout):
|
def export_year(d, dt, fout):
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@ -112,6 +112,14 @@
|
|||||||
{
|
{
|
||||||
"list": "nettime_l",
|
"list": "nettime_l",
|
||||||
"url": "https://nettime.org/Lists-Archives/nettime-l-0305/msg00078.html"
|
"url": "https://nettime.org/Lists-Archives/nettime-l-0305/msg00078.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"list": "nettime_l",
|
||||||
|
"url": "https://nettime.org/Lists-Archives/nettime-l-9906/msg00069.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"list": "nettime_l",
|
||||||
|
"url": "https://nettime.org/Lists-Archives/nettime-l-0707/msg00025.html"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"desc": "..."
|
"desc": "..."
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user