diff --git a/etherdump/commands/gethtml.py b/etherdump/commands/gethtml.py index ca0c79a..9a2fc48 100644 --- a/etherdump/commands/gethtml.py +++ b/etherdump/commands/gethtml.py @@ -1,8 +1,12 @@ from __future__ import print_function from argparse import ArgumentParser import json -from urllib import urlencode -from urllib2 import urlopen, HTTPError, URLError +from urllib.parse import urlencode +from urllib.request import urlopen +from urllib.error import HTTPError, URLError + +# from urllib import urlencode +# from urllib2 import urlopen, HTTPError, URLError def main(args): @@ -27,8 +31,8 @@ def main(args): if args.showurl: print (requesturl) else: - results = json.load(urlopen(requesturl))['data'] + results = json.loads(urlopen(requesturl).read().decode("utf-8"))['data'] if args.format == "json": print (json.dumps(results)) else: - print (results['html'].encode("utf-8")) + print (results['html']) diff --git a/etherdump/commands/pull.py b/etherdump/commands/pull.py index 220c193..4b776ab 100644 --- a/etherdump/commands/pull.py +++ b/etherdump/commands/pull.py @@ -244,11 +244,15 @@ def main (args): html = html['data']['html'] ver["path"] = p+".raw.html" ver["url"] = quote(ver["path"]) - doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) - html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links) + # JUN 2016: changing to REALLY save the RAW / unchanged HTML from the API with open(ver["path"], "w") as f: - # f.write(html.encode("utf-8")) - print (ET.tostring(doc, method="html", encoding="unicode"), file=f) + print(html, file=f) + + # doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) + # html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links) + # with open(ver["path"], "w") as f: + # # f.write(html.encode("utf-8")) + # print (ET.tostring(doc, method="html", encoding="unicode"), file=f) # output meta if args.all or args.meta: diff --git a/etherdump/commands/pushhtml.py 
b/etherdump/commands/pushhtml.py new file mode 100644 index 0000000..e8faad7 --- /dev/null +++ b/etherdump/commands/pushhtml.py @@ -0,0 +1,33 @@ +from etherdump.commands.sethtml import sethtml, pushhtml +import argparse +import os, sys +import json + + +def main(args): + p = argparse.ArgumentParser("""Indiscriminantly PUSH the contents of dumped html files to an etherpad, clobbering any existing content!""") + p.add_argument("input", nargs="+", help="Metadata files, e.g. *.meta.json") + p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json") + p.add_argument("--basepath", default=".") + args = p.parse_args(args) + + with open(args.padinfo) as f: + info = json.load(f) + + apiurl = info.get("localapiurl", info["apiurl"]) + apikey = info['apikey'] + for n in args.input: + with open(n) as f: + meta = json.load(f) + for v in meta['versions']: + if v['type'] == 'html': + path = v['path'] + if args.basepath: + path = os.path.join(args.basepath, path) + break + padid = meta['padid'] + with open(path) as f: + htmlsrc = f.read() + print ("Pushing {0} to {1}".format(path, padid), file=sys.stderr) + pushhtml(apiurl, apikey, padid, htmlsrc) + diff --git a/etherdump/commands/sethtml.py b/etherdump/commands/sethtml.py index 7b6a0cf..669ffc3 100644 --- a/etherdump/commands/sethtml.py +++ b/etherdump/commands/sethtml.py @@ -1,15 +1,50 @@ -from __future__ import print_function -from argparse import ArgumentParser -import json, sys -from urllib import urlencode -from urllib2 import urlopen, HTTPError, URLError -import requests +from urllib.request import urlopen +from urllib.parse import urlencode +import json +import re +import argparse -LIMIT_BYTES = 100*1000 +# HOST_PORT="localhost:9001" +# APIKEY="439afe700152ed5f5cdc43e9bf0a6ab0697c422db0e7277d43f2e1af4f155d79" + +def create_pad (apiurl, apikey, padid): + # url = "http://{0}/api/1/createPad".format(hostport) + url = apiurl + "createPad" + data = ( + ('apikey', apikey), 
+ ('padID', padid), + ) + f = urlopen(url, data=urlencode(data).encode("utf-8")) + return json.loads(f.read().decode("utf-8")) + + +def sethtml (apiurl, apikey, padid, html): + # strip the (initial) title tag + html = re.sub(r"