from argparse import ArgumentParser
import sys, json, re, os
from datetime import datetime
from urllib.parse import urlencode, quote
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
from etherdump.commands.common import *
from time import sleep
from etherdump.commands.html5tidy import html5tidy
import html5lib
from xml.etree import ElementTree as ET
from fnmatch import fnmatch
# debugging
# import ElementTree as ET
"""
pull(meta):
Update meta data files for those that have changed.
Check for changed pads by looking at revisions & comparing to existing
todo...
use/prefer public interfaces ? (export functions)
"""
def try_deleting(files):
    for f in files:
        try:
            os.remove(f)
        except OSError:
            pass
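
# main() drives a pull: parse options, enumerate pads (all, by name, or by glob),
# and download the requested formats for every pad whose revision count changed.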
def main(args):
    p = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")
    p.add_argument("padid", nargs="*", default=[])
    p.add_argument("--glob", default=False, help="download pads matching a glob pattern")
    p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    p.add_argument("--pub", default="p", help="folder to store files for public pads, default: p")
    p.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
    p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
    p.add_argument("--text", default=False, action="store_true", help="download text to PADID.txt, default: False")
    p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
    p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.diff.html, default: False")
    p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
    p.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
    p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
    p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
    p.add_argument("--no-raw-ext", default=False, action="store_true", help="save plain text as padname with no (additional) extension")
    p.add_argument("--fix-names", default=False, action="store_true", help="normalize padid's (no spaces, special control chars) for use in file names")
    p.add_argument("--filter-ext", default=None, help="filter pads by extension")
    p.add_argument("--css", default="/styles.css", help="add css url to output pages, default: /styles.css")
    p.add_argument("--script", default="/versions.js", help="add script url to output pages, default: /versions.js")
    p.add_argument("--nopublish", default="__NOPUBLISH__", help="no publish magic word, default: __NOPUBLISH__")
    args = p.parse_args(args)
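
    # Plain-text dumps get a ".raw.txt" suffix unless --no-raw-ext was given.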
    raw_ext = ".raw.txt"
    if args.no_raw_ext:
        raw_ext = ""

    info = loadpadinfo(args.padinfo)
    data = {}
    data['apikey'] = info['apikey']

    if args.padid:
        padids = args.padid
    elif args.glob:
        padids = getjson(info['localapiurl']+'listAllPads?'+urlencode(data))['data']['padIDs']
        padids = [x for x in padids if fnmatch(x, args.glob)]
    else:
        padids = getjson(info['localapiurl']+'listAllPads?'+urlencode(data))['data']['padIDs']
    padids.sort()
    numpads = len(padids)
    count = 0
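
    # Walk every pad: compare the revision count stored in PADID.meta.json with
    # the live count from the API to decide whether the pad needs re-downloading.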
    for i, padid in enumerate(padids):
        print(padid, file=sys.stderr)
        if args.skip is not None and i < args.skip:
            continue
        progressbar(i, numpads, padid)

        data['padID'] = padid
        p = padpath(padid, args.pub, args.group, args.fix_names)
        if args.folder:
            p = os.path.join(p, padid)

        metapath = p + ".meta.json"
        revisions = None
        tries = 1
        skip = False
        # derive the public pad URL base from the API URL, e.g. .../api/1.2.9/ -> .../p/
        padurlbase = re.sub(r"api/1\.2\.9/$", "p/", info["apiurl"])
        meta = {}
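
        # Fetch/refresh metadata, retrying transient HTTPErrors up to 3 times
        # (3 seconds apart); a TypeError from the API marks a phantom pad to skip.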
        while True:
            try:
                if os.path.exists(metapath):
                    with open(metapath) as f:
                        meta.update(json.load(f))
                    revisions = getjson(info['localapiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
                    if meta['revisions'] == revisions and not args.force:
                        skip = True
                        break

                meta['padid'] = padid
                versions = meta["versions"] = []
                versions.append({
                    "url": padurlbase + quote(padid),
                    "type": "pad",
                    "code": 200
                })

                if revisions is None:
                    meta['revisions'] = getjson(info['localapiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
                else:
                    meta['revisions'] = revisions

                if (meta['revisions'] == 0) and (not args.zerorevs):
                    skip = True
                    break

                # todo: load more metadata!
                meta['group'], meta['pad'] = splitpadname(padid)
                meta['pathbase'] = p
                meta['lastedited_raw'] = int(getjson(info['localapiurl']+'getLastEdited?'+urlencode(data))['data']['lastEdited'])
                # lastEdited is in milliseconds since the epoch; convert to seconds
                meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw'])/1000).isoformat()
                meta['author_ids'] = getjson(info['localapiurl']+'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs']
                break
            except HTTPError:
                tries += 1
                if tries > 3:
                    print("Too many failures ({0}), skipping".format(padid), file=sys.stderr)
                    skip = True
                    break
                else:
                    sleep(3)
            except TypeError:
                print("TypeError loading pad {0} (phantom pad?), skipping".format(padid), file=sys.stderr)
                skip = True
                break

        if skip:
            continue
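
        # The pad changed (or --force was given): count it, optionally echo its id,
        # and make sure the target folder exists before writing any files.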
        count += 1
        if args.output:
            print(padid)

        if args.all or (args.meta or args.text or args.html or args.dhtml):
            try:
                os.makedirs(os.path.split(metapath)[0])
            except OSError:
                pass
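
        # Plain-text export; this is also where the __NOPUBLISH__ magic word is enforced.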
        if args.all or args.text:
            text = getjson(info['localapiurl']+'getText?'+urlencode(data))
            ver = {"type": "text"}
            versions.append(ver)
            ver["code"] = text["_code"]
            if text["_code"] == 200:
                text = text['data']['text']

                ##########################################
                ## ENFORCE __NOPUBLISH__ MAGIC WORD
                ##########################################
                if args.nopublish and args.nopublish in text:
                    # NEED TO PURGE ANY EXISTING DOCS
                    print("NOPUBLISH", file=sys.stderr)
                    try_deleting((p+raw_ext, p+".raw.html", p+".diff.html", p+".meta.json"))
                    continue

                ver["path"] = p + raw_ext
                ver["url"] = quote(ver["path"])
                with open(ver["path"], "w") as f:
                    f.write(text)
                # once the content is settled, compute a hash
                # and link it in the metadata!
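
        # Collect <link> elements (stylesheet + alternate formats) that html5tidy
        # will inject into the <head> of the generated HTML pages.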
        links = []
        if args.css:
            links.append({"href": args.css, "rel": "stylesheet"})
        # todo: make this process reflect which files actually were made
        versionbaseurl = quote(padid)
        links.append({"href": versions[0]["url"], "rel": "alternate", "type": "text/html", "title": "Etherpad"})
        if args.all or args.text:
            links.append({"href": versionbaseurl+raw_ext, "rel": "alternate", "type": "text/plain", "title": "Plain text"})
        if args.all or args.html:
            links.append({"href": versionbaseurl+".raw.html", "rel": "alternate", "type": "text/html", "title": "HTML"})
        if args.all or args.dhtml:
            links.append({"href": versionbaseurl+".diff.html", "rel": "alternate", "type": "text/html", "title": "HTML with author colors"})
        if args.all or args.meta:
            links.append({"href": versionbaseurl+".meta.json", "rel": "alternate", "type": "application/json", "title": "Meta data"})
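
        # createDiffHTML renders the pad's whole history (from revision 0) as HTML
        # with per-author colors; tidy it and decorate it with the links above.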
        if args.all or args.dhtml:
            data['startRev'] = "0"
            html = getjson(info['localapiurl']+'createDiffHTML?'+urlencode(data))
            ver = {"type": "diffhtml"}
            versions.append(ver)
            ver["code"] = html["_code"]
            if html["_code"] == 200:
                try:
                    html = html['data']['html']
                    ver["path"] = p + ".diff.html"
                    ver["url"] = quote(ver["path"])
                    doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
                    html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links)
                    with open(ver["path"], "w") as f:
                        print(ET.tostring(doc, method="html", encoding="unicode"), file=f)
                except TypeError:
                    # Malformed / incomplete response: record the message (such as
                    # "internal error") in the metadata and write NO file!
                    ver["message"] = html["message"]
        # Raw HTML export. JUN 2016: changed to save REALLY the RAW / unchanged
        # HTML from the API (the tidied version is produced by createDiffHTML above).
        if args.all or args.html:
            html = getjson(info['localapiurl']+'getHTML?'+urlencode(data))
            ver = {"type": "html"}
            versions.append(ver)
            ver["code"] = html["_code"]
            if html["_code"] == 200:
                html = html['data']['html']
                ver["path"] = p + ".raw.html"
                ver["url"] = quote(ver["path"])
                with open(ver["path"], "w") as f:
                    print(html, file=f)
        # output meta
        if args.all or args.meta:
            ver = {"type": "meta"}
            versions.append(ver)
            ver["path"] = metapath
            ver["url"] = quote(metapath)
            with open(metapath, "w") as f:
                json.dump(meta, f, indent=2)

    print("\n{0} pad(s) loaded".format(count), file=sys.stderr)