# Helper functions that this patch inlines into the ``etherdump`` script.
# They replace the former imports from the local ``et_helpers`` and
# ``linkify`` modules (see the preserved patch text at the end of this block).
# ``ET`` is the script's ElementTree import; ``re`` is assumed to be imported
# earlier in the script (not visible in this hunk -- TODO confirm).


def filename_to_padid (t):
    """Turn an output filename back into a pad id.

    Underscores become spaces and a trailing ".html" is dropped.
    """
    return re.sub(r"\.html$", "", t.replace("_", " "))


def normalize_pad_name (n):
    """Return the pad name without any "?..." or "/..." suffix.

    Question marks are ignored by etherpad, so everything from the first "?"
    is stripped; slashes are stripped as well (e.g. "/timeslider").
    """
    # str.split returns the whole string when the separator is absent, so no
    # membership test is needed first.
    return n.split('?', 1)[0].split('/', 1)[0]


def urlify (t, ext=".html"):
    """Build a filename/URL for a pad id: spaces become "_" and ext is appended."""
    return t.replace(" ", "_") + ext


def linkify (src, urlify=urlify):
    """Normalize every [[...]] link in src and collect the pad names used.

    For each "[[...]]" occurrence the inner text is stripped of tags and
    normalized with normalize_pad_name().

    Returns a tuple (new_src, names) where names lists the normalized pad
    name of every link, in order of appearance (duplicates are kept).
    """
    collect = []

    def s (m):
        contents = normalize_pad_name(strip_tags(m.group(1)))
        collect.append(contents)
        link = urlify(contents)
        # NOTE(review): the template below only references {1}, so the
        # computed link is never emitted -- possibly markup was lost from
        # this string; confirm before changing it.
        return "[[{1}]]".format(link, contents)

    src = re.sub(r"\[\[(.+?)\]\]", s, src)
    return (src, collect)


def strip_tags (text):
    """Remove anything that looks like a <tag> from text (single-line tags only)."""
    return re.sub(r"<.*?>", "", text)


def set_text_contents (element, text):
    """Set the text of the innermost element of a singly wrapped chain.

    Not general: descends while the element has exactly one child, then
    assigns text to the element reached.
    """
    while len(element) == 1:
        element = element[0]
    element.text = text


def text_contents (element):
    """Return the text of element and all descendants, including tail text."""
    inner = ''.join(text_contents(c) for c in element)
    return (element.text or '') + inner + (element.tail or '')


def contents (element, method="html"):
    """Return the inner markup of element: its text plus each serialized child."""
    parts = [element.text or '']
    for c in element:
        markup = ET.tostring(c, method=method)
        if not isinstance(markup, str):
            # On Python 3 tostring() returns bytes (ASCII by default), which
            # cannot be joined with text; on Python 2 it is already str.
            markup = markup.decode("utf-8")
        parts.append(markup)
    return ''.join(parts)


def get_parent(tree, elt):
    """Return the element of tree that directly contains elt, or None."""
    for parent in tree.iter():
        for child in parent:
            if child is elt:
                return parent
    return None


def remove_recursive (tree, elt):
    """Remove element and (any resulting) empty containing elements.

    Text that follows elt (its tail) is kept: it is moved to the previous
    sibling's tail, or to the parent's text when elt is the first child.
    Does nothing when elt has no parent inside tree.
    """
    p = get_parent(tree, elt)
    # Compare against None explicitly: the truth value of an Element depends
    # on its number of children, not on whether it exists.
    if p is None:
        return
    if elt.tail:
        # Element.remove() would drop the tail together with the element.
        idx = list(p).index(elt)
        if idx > 0:
            prev = p[idx - 1]
            prev.tail = (prev.tail or '') + elt.tail
        else:
            p.text = (p.text or '') + elt.tail
    p.remove(elt)
    if len(p) == 0 and (p.text is None or p.text.strip() == ""):
        remove_recursive(tree, p)


def trim_removed_spans (t):
    """Drop span.removed elements (and emptied parents), then leading <br>s of body.

    Safe on documents without a body and on bodies that become empty while
    trimming.
    """
    # remove and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body
    while True:
        body = t.find("./body")
        # body may be missing, or may have been pruned by remove_recursive
        if body is None or len(body) == 0 or body[0].tag != "br":
            break
        remove_recursive(t, body[0])


# ---------------------------------------------------------------------------
# Patch text preserved from the original diff. These hunks touch the import
# section, the start of get_template_env and the body of the main
# ``while len(todo) > 0:`` loop, all of which are only partially visible
# here. The original indentation of these lines was lost, so they are kept
# as comments rather than reconstructed as code.
#
# diff --git a/etherdump b/etherdump
# index 96636d4..4aede89 100755
# --- a/etherdump
# +++ b/etherdump
# @@ -10,13 +10,87 @@
#  from datetime import datetime
#  from xml.etree import cElementTree as ET
#  from urllib import urlencode
#  from urllib2 import urlopen, HTTPError, URLError
# -# local mods
# -from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
# -from linkify import linkify, urlify, filename_to_padid
# +
#  # external dependencies (use pip to install these)
#  import html5lib, jinja2
# + (helper definitions, see above)
#  def get_template_env (tpath=None):
#  paths = []
#  if tpath and os.path.isdir(tpath):
# @@ -114,7 +188,13 @@ while len(todo) > 0:
#  # | | | | | | __/ || (_| |
#  # |_| |_| |_|\___|\__\__,_|
# - meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
# + meta_url = urlify(padid, ext=".json")
# + meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
# + raw_url = urlify(padid, ext=".txt")
# + raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
# + colors_url = urlify(padid, ext=".html")
# + colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
# +
#  if not args.hidepaths:
#  print (meta_out, file=sys.stderr)
#  if not args.pretend:
# @@ -137,7 +217,9 @@ while len(todo) > 0:
#  if args.showurls:
#  print (authors_url, file=sys.stderr)
#  meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
# -
# + meta['colors'] = colors_url
# + meta['raw'] = raw_url
# + meta['meta'] = meta_url
#  with open(meta_out, "w") as f:
#  json.dump(meta, f)
# @@ -146,7 +228,6 @@ while len(todo) > 0:
#  # | | | (_| |\ V V /
#  # |_| \__,_| \_/\_/
# - raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
#  if not args.hidepaths:
#  print (raw_out, file=sys.stderr)
#  text_url = apiurl+"getText?"+urlencode(data)
# @@ -171,7 +252,6 @@ while len(todo) > 0:
#  # | (_| (_) | | (_) | | \__ \
#  # \___\___/|_|\___/|_| |___/
# - colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
#  if not args.hidepaths:
#  print (colors_out, file=sys.stderr)
#  data['startRev'] = "0"
# @@ -272,7 +352,11 @@ while len(todo) > 0:
#  style = style,
#  revision = meta['total_revisions'],
#  padid = padid,
# - timestamp = datetime.now()
# + timestamp = datetime.now(),
# + meta_url = meta_url,
# + raw_url = raw_url,
# + colors_url = colors_url,
# + lastedited = meta['lastedited']
#  ).encode("utf-8"))
#  # _