utf-8 and p[link] logic
This commit is contained in:
parent
00d7a537c7
commit
3d8dd1fbc1
@ -280,17 +280,18 @@ def main (args):
|
|||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
p['text'] = ''
|
p['text'] = ''
|
||||||
|
|
||||||
# ADD IN LINK TO PAD AS "link"
|
pref_for_link = ["html","pad", "text"] ## making a design decision here (otherwise need to explicitly provide --link in command)
|
||||||
for v in linkversions:
|
for x in pref_for_link:
|
||||||
if v in versions_by_type:
|
if x in versions_by_type:
|
||||||
vdata = versions_by_type[v]
|
vdata = versions_by_type[x]
|
||||||
try:
|
try:
|
||||||
if v == "pad":
|
if v == "pad" or os.path.exists(vdata["path"]):
|
||||||
p["link"] = absurl(vdata["url"], linkbase)
|
p["link"] = absurl(vdata["url"], linkbase)
|
||||||
break
|
break
|
||||||
except KeyError as e:
|
except KeyError as e:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
# Not sure this goes here but fixing relpaths with output is quite nice...
|
# Not sure this goes here but fixing relpaths with output is quite nice...
|
||||||
if args.output:
|
if args.output:
|
||||||
outpath = Path(args.output).parent
|
outpath = Path(args.output).parent
|
||||||
@ -307,7 +308,14 @@ def main (args):
|
|||||||
v["url"] = v["path"]
|
v["url"] = v["path"]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if not p["link"].startswith("http"):
|
||||||
|
vpath = Path(p["link"])
|
||||||
|
try:
|
||||||
|
p["link"] = os.path.join(os.path.relpath(vpath.parent, outpath), vpath.name)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -260,9 +260,9 @@ def main (args):
|
|||||||
# doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
|
# doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
|
||||||
doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
|
doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
|
||||||
html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1")
|
html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1")
|
||||||
with open(ver["path"], "w") as f:
|
with open(ver["path"], "w", encoding="utf-8") as f:
|
||||||
# f.write(html.encode("utf-8"))
|
# f.write(html.encode("utf-8"))
|
||||||
print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)
|
print(ET.tostring(doc, method="html", encoding="unicode"), file=f)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
# Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
|
# Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
|
||||||
ver["message"] = html["message"]
|
ver["message"] = html["message"]
|
||||||
@ -279,15 +279,11 @@ def main (args):
|
|||||||
html = html['data']['html']
|
html = html['data']['html']
|
||||||
ver["path"] = p+".raw.html"
|
ver["path"] = p+".raw.html"
|
||||||
ver["url"] = quote(ver["path"])
|
ver["url"] = quote(ver["path"])
|
||||||
# JUN 2016: changing to save REALLY the RAW / unchanged HTML from the API
|
doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
|
||||||
with open(ver["path"], "w") as f:
|
html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1")
|
||||||
print(html, file=f)
|
with open(ver["path"], "w", encoding="utf-8") as f:
|
||||||
|
# f.write(html.encode("utf-8"))
|
||||||
# doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
|
print (ET.tostring(doc, method="html", encoding="unicode"), file=f)
|
||||||
# html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links)
|
|
||||||
# with open(ver["path"], "w") as f:
|
|
||||||
# # f.write(html.encode("utf-8"))
|
|
||||||
# print (ET.tostring(doc, method="html", encoding="unicode"), file=f)
|
|
||||||
|
|
||||||
# output meta
|
# output meta
|
||||||
if args.all or args.meta:
|
if args.all or args.meta:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user