#!/usr/bin/env python
"""Dump etherpad pads as HTML files, following [[wiki links]] between pads.

Starts from a seed pad id, renders each pad's full revision history as a
diff-highlighted HTML page (etherpad ``createDiffHTML`` API call), strips the
"removed" spans, rewrites [[links]], and queues every linked pad until all
reachable pads are dumped (breadth-first), or --limit is reached.
"""
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
import html5lib
from trim import trim_removed_spans, contents
from linkify import linkify, urlify


p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
args = p.parse_args()

with open(args.padinfo) as f:
    info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# Create the output directory once, up front (previously done inside the loop,
# after the network round-trips).
try:
    os.makedirs(args.path)
except OSError:
    pass  # already exists

todo = [args.padid]   # breadth-first queue of pad ids still to dump
done = set()          # pad ids already processed, to avoid link cycles
count = 0

while todo:
    padid = todo.pop(0)
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    out = "{0}/{1}".format(args.path, urlify(padid))
    print("{0}".format(out), file=sys.stderr)

    try:
        total_revisions = apiurl + 'getRevisionsCount?' + urlencode(data)
        total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
        if args.verbose:
            print(u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

        # Render the whole history (rev 0 -> head) as a diff-highlighted page.
        data['startRev'] = "0"
        requesturl = apiurl + 'createDiffHTML?' + urlencode(data)
        html = json.load(urlopen(requesturl))['data']['html']
    except (HTTPError, URLError) as e:
        # Skip pads the API refuses to serve instead of aborting the whole
        # crawl (HTTPError/URLError were imported but never handled before).
        print(u"  skipping {0}: {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
        continue

    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Collect [[...]] link targets and queue any newly discovered pads.
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print("  link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    with open(out, "w") as f:
        f.write(html.encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break
revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr) + + data['startRev'] = "0" + requesturl = apiurl+'createDiffHTML?'+urlencode(data) + html = json.load(urlopen(requesturl))['data']['html'] + t = html5lib.parse(html, namespaceHTMLElements=False) + trim_removed_spans(t) + html = ET.tostring(t, method="html") + + html, links = linkify(html) + for l in links: + if l not in todo and l not in done: + if args.verbose: + print (" link: {0}".format(l), file=sys.stderr) + todo.append(l) + + try: + os.makedirs(args.path) + except OSError: + pass + with open(out, "w") as f: + f.write(html.encode("utf-8")) + + count += 1 + if args.limit and count >= args.limit: + break diff --git a/linkify.py b/linkify.py new file mode 100644 index 0000000..359a0dd --- /dev/null +++ b/linkify.py @@ -0,0 +1,29 @@ +from __future__ import print_function +import re, sys + + +def urlify (t): + return t.replace(" ", "_") + ".html" + +def linkify (src, urlify=urlify): + + collect = [] + + def s (m): + contents = m.group(1) + collect.append(contents) + link = urlify(contents) + return "[[{1}]]".format(link, contents) + + src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src) + return (src, collect) + + +if __name__ == "__main__": + src = sys.stdin.read() + src, links = linkify(src) + + for l in links: + print (l) + + print (src) diff --git a/trim.py b/trim.py new file mode 100644 index 0000000..085cc96 --- /dev/null +++ b/trim.py @@ -0,0 +1,54 @@ +from __future__ import print_function +import html5lib, sys, re +from xml.etree import cElementTree as ET + + +def contents (element, method="html"): + return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element]) + +def iterparent(tree): + for parent in tree.iter(): + for child in parent: + yield parent, child + +def get_parent(tree, elt): + for parent in tree.iter(): + for child in parent: + if child == elt: + return parent + +def remove_recursive (tree, elt): + """ Remove element and (any resulting) empty 
def contents(element, method="html"):
    """Serialize the text and child elements of *element* (omitting its own tag)."""
    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])


def iterparent(tree):
    """Yield (parent, child) pairs for every element in *tree*."""
    for parent in tree.iter():
        for child in parent:
            yield parent, child


def get_parent(tree, elt):
    """Return the parent of *elt* in *tree*, or None if not found.

    ElementTree keeps no parent links, so this walks the whole tree.
    """
    for parent in tree.iter():
        for child in parent:
            if child == elt:
                return parent


def remove_recursive(tree, elt):
    """Remove *elt* and any (resulting) empty containing elements."""
    p = get_parent(tree, elt)
    # Compare against None explicitly: an Element with no children is falsy,
    # so a bare "if p:" is fragile (and deprecated by ElementTree).
    if p is not None:
        p.remove(elt)
        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
            # the parent is now empty too: remove it as well
            remove_recursive(tree, p)


def trim_removed_spans(t):
    """Strip <span class="removed"> elements (and ancestors emptied by that),
    then any <br> tags left dangling at the start of <body>. Mutates *t*."""
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # Strip leading br's from body. Re-find body each pass and guard against
    # it being empty (or removed outright by remove_recursive): the original
    # indexed t.find("./body")[0] unconditionally and crashed in that case.
    while True:
        body = t.find("./body")
        if body is None or len(body) == 0:
            break
        if body[0].tag != "br":
            break
        remove_recursive(t, body[0])


def trim_removed_spans_src(src):
    """Parse *src* as HTML, trim removed spans, and return the body contents."""
    import html5lib  # local import keeps the pure-ElementTree helpers usable without html5lib
    t = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(t)
    return contents(t.find("./body"))


if __name__ == "__main__":
    src = sys.stdin.read()
    print(trim_removed_spans_src(src).encode("utf-8"))