diff --git a/etherdump b/etherdump
index 96636d4..4aede89 100755
--- a/etherdump
+++ b/etherdump
@@ -10,13 +10,87 @@ from datetime import datetime
from xml.etree import cElementTree as ET
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
-# local mods
-from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
-from linkify import linkify, urlify, filename_to_padid
+
# external dependencies (use pip to install these)
import html5lib, jinja2
+def filename_to_padid (t):
+ t = t.replace("_", " ")
+ t = re.sub(r"\.html$", "", t)
+ return t
+
+def normalize_pad_name (n):
+ if '?' in n:
+ n = n.split('?', 1)[0]
+ if '/' in n:
+ n = n.split('/', 1)[0]
+ return n
+
+def urlify (t, ext=".html"):
+ return t.replace(" ", "_") + ext
+
+def linkify (src, urlify=urlify):
+
+ collect = []
+
+ def s (m):
+ contents = strip_tags(m.group(1))
+ contents = normalize_pad_name(contents)
+ collect.append(contents)
+ link = urlify(contents)
+ # link = link.split("?", 1)[0]
+ return "[[{1}]]".format(link, contents)
+
+ # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+ ## question marks are ignored by etherpad, so split/strip it
+ ## strip slashes as well!! (/timeslider)
+ src = re.sub(r"\[\[(.+?)\]\]", s, src)
+ return (src, collect)
+
+def strip_tags (text):
+ return re.sub(r"<.*?>", "", text)
+
+def set_text_contents (element, text):
+    """ Not fully general: descends through singly-wrapped elements and sets the innermost one's text """
+ while len(element) == 1:
+ element = element[0]
+ element.text = text
+
+def text_contents (element):
+ return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def contents (element, method="html"):
+ return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
+
+def get_parent(tree, elt):
+ for parent in tree.iter():
+ for child in parent:
+ if child == elt:
+ return parent
+
+def remove_recursive (tree, elt):
+ """ Remove element and (any resulting) empty containing elements """
+ p = get_parent(tree, elt)
+ if p:
+ p.remove(elt)
+ if len(p) == 0 and (p.text == None or p.text.strip() == ""):
+ # print ("empty parent", p, file=sys.stderr)
+ remove_recursive(tree, p)
+
+
+def trim_removed_spans (t):
+    # remove 'removed' spans and any resulting empty parent elements
+ for n in t.findall(".//span[@class='removed']"):
+ remove_recursive(t, n)
+ # then strip any leading br's from body
+ while True:
+ tag = t.find("./body")[0]
+ if tag.tag == "br":
+ remove_recursive(t, tag)
+ else:
+ break
+
def get_template_env (tpath=None):
paths = []
if tpath and os.path.isdir(tpath):
@@ -114,7 +188,13 @@ while len(todo) > 0:
# | | | | | | __/ || (_| |
# |_| |_| |_|\___|\__\__,_|
- meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+ meta_url = urlify(padid, ext=".json")
+ meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+ raw_url = urlify(padid, ext=".txt")
+ raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+ colors_url = urlify(padid, ext=".html")
+ colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
+
if not args.hidepaths:
print (meta_out, file=sys.stderr)
if not args.pretend:
@@ -137,7 +217,9 @@ while len(todo) > 0:
if args.showurls:
print (authors_url, file=sys.stderr)
meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
-
+ meta['colors'] = colors_url
+ meta['raw'] = raw_url
+ meta['meta'] = meta_url
with open(meta_out, "w") as f:
json.dump(meta, f)
@@ -146,7 +228,6 @@ while len(todo) > 0:
# | | | (_| |\ V V /
# |_| \__,_| \_/\_/
- raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
if not args.hidepaths:
print (raw_out, file=sys.stderr)
text_url = apiurl+"getText?"+urlencode(data)
@@ -171,7 +252,6 @@ while len(todo) > 0:
# | (_| (_) | | (_) | | \__ \
# \___\___/|_|\___/|_| |___/
- colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
if not args.hidepaths:
print (colors_out, file=sys.stderr)
data['startRev'] = "0"
@@ -272,7 +352,11 @@ while len(todo) > 0:
style = style,
revision = meta['total_revisions'],
padid = padid,
- timestamp = datetime.now()
+ timestamp = datetime.now(),
+ meta_url = meta_url,
+ raw_url = raw_url,
+ colors_url = colors_url,
+ lastedited = meta['lastedited']
).encode("utf-8"))
# _