diff --git a/dump_html.py b/dump_html.py
new file mode 100755
index 0000000..0a380a3
--- /dev/null
+++ b/dump_html.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from argparse import ArgumentParser
+import json, sys, os
+from urllib import urlencode
+from urllib2 import urlopen, HTTPError, URLError
+from xml.etree import cElementTree as ET
+import html5lib
+from trim import trim_removed_spans, contents
+from linkify import linkify, urlify
+
+
# Dump the HTML of an etherpad pad -- and, recursively, any [[linked]] pads --
# via the Etherpad HTTP API, writing one HTML file per pad under --path.
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
args = p.parse_args()

# API connection settings (protocol, hostname, port, apikey, ...) come from a
# JSON config file.
with open(args.padinfo) as f:
    info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# Ensure the output directory exists ONCE, up front (previously re-attempted on
# every loop iteration).
try:
    os.makedirs(args.path)
except OSError:
    # Directory already exists (or is not creatable -- in which case the
    # open() below will fail loudly anyway).
    pass

todo = [args.padid]   # queue of pad ids still to fetch
done = set()          # pad ids already processed -- guards against link cycles
count = 0

while todo:
    padid = todo.pop(0)
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    out = "{0}/{1}".format(args.path, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # Diff against revision 0 so deleted text is marked up with
    # <span class="removed">, which trim_removed_spans() then strips.
    data['startRev'] = "0"
    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Collect [[wiki links]] and queue any pads not yet seen.
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print (" link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    with open(out, "w") as f:
        f.write(html.encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break
diff --git a/linkify.py b/linkify.py
new file mode 100644
index 0000000..359a0dd
--- /dev/null
+++ b/linkify.py
@@ -0,0 +1,29 @@
+from __future__ import print_function
+import re, sys
+
+
def urlify (t):
    """Map a pad title to an output filename: spaces become underscores,
    and a ".html" suffix is appended."""
    return "{0}.html".format(t.replace(" ", "_"))
+
def linkify (src, urlify=urlify):
    """Scan src for [[wiki-style]] links, record each link's inner text, and
    return (rewritten src, list of link texts in order of appearance).

    NOTE(review): urlify() is called for every link but its result is never
    embedded in the output -- the replacement only re-emits "[[text]]"
    unchanged.  Looks like a latent bug or a placeholder; confirm intent
    before changing the format string.
    """
    collected = []

    def replacement(match):
        inner = match.group(1)
        collected.append(inner)
        target = urlify(inner)  # result currently unused -- see NOTE above
        return "[[{1}]]".format(target, inner)

    return (re.sub(r"\[\[([\w_\- ]+?)\]\]", replacement, src), collected)
+
+
if __name__ == "__main__":
    # Filter mode: read a document from stdin, print each discovered link
    # (one per line), then the rewritten document.
    text, found = linkify(sys.stdin.read())

    for link in found:
        print (link)

    print (text)
diff --git a/trim.py b/trim.py
new file mode 100644
index 0000000..085cc96
--- /dev/null
+++ b/trim.py
@@ -0,0 +1,54 @@
+from __future__ import print_function
+import html5lib, sys, re
+from xml.etree import cElementTree as ET
+
+
def contents (element, method="html"):
    """Serialize the *inside* of element: its leading text followed by every
    child subtree rendered via ET.tostring (tails included); the element's
    own tag is excluded."""
    parts = [element.text or '']
    parts.extend(ET.tostring(child, method=method) for child in element)
    return ''.join(parts)
+
def iterparent(tree):
    """Yield (parent, child) for every parent/child edge in the tree,
    in document (depth-first) order of the parents."""
    for node in tree.iter():
        for sub in node:
            yield node, sub
+
def get_parent(tree, elt):
    """Return the parent element of elt within tree, or None when elt is the
    root (or not present at all).

    Linear scan of the whole tree -- ElementTree keeps no parent links.
    Uses an identity test (`is`): we want *this* node, not one that merely
    compares equal.
    """
    for candidate in tree.iter():
        for child in candidate:
            if child is elt:
                return candidate
+
def remove_recursive (tree, elt):
    """ Remove element and (any resulting) empty containing elements """
    p = get_parent(tree, elt)
    # Explicit None test: `if p:` relies on Element truthiness, which is
    # False for a childless element (an ElementTree pitfall, deprecated in
    # Python 3.12).  Here p always holds elt so it happened to work, but the
    # explicit test states the intent and is future-proof.
    if p is not None:
        p.remove(elt)
        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
            # parent now has no children and no text: prune it as well
            remove_recursive(tree, p)
+
+
def trim_removed_spans (t):
    """Drop every <span class="removed"> from the tree (pruning any elements
    emptied as a result), then strip leading <br> tags from <body>."""
    # findall materializes the list first, so mutating the tree while
    # looping is safe; spans already detached by an earlier removal are
    # silently skipped (get_parent returns None for them).
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body.  Guard against body being
    # empty -- or pruned entirely by remove_recursive after its last <br>
    # goes -- where the original's unconditional body[0] raised.
    while True:
        body = t.find("./body")
        if body is None or len(body) == 0:
            break
        tag = body[0]
        if tag.tag == "br":
            remove_recursive(t, tag)
        else:
            break
+
def trim_removed_spans_src (src):
    """Parse an HTML string, strip removed spans and leading <br>s, and
    return the serialized contents of its <body>."""
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(tree)
    return contents(tree.find("./body"))
+
+
if __name__ == "__main__":
    # Filter mode: HTML on stdin -> trimmed body contents on stdout.
    print (trim_removed_spans_src(sys.stdin.read()).encode("utf-8"))
+