utf-8 and p[link] logic

This commit is contained in:
gauthiier 2021-12-10 15:16:51 +01:00
parent 00d7a537c7
commit 3d8dd1fbc1
2 changed files with 21 additions and 17 deletions

View File

@ -280,17 +280,18 @@ def main (args):
except FileNotFoundError: except FileNotFoundError:
p['text'] = '' p['text'] = ''
# ADD IN LINK TO PAD AS "link" pref_for_link = ["html","pad", "text"] ## making a design decision here (otherwise need to explicitly provide --link in command)
for v in linkversions: for x in pref_for_link:
if v in versions_by_type: if x in versions_by_type:
vdata = versions_by_type[v] vdata = versions_by_type[x]
try: try:
if v == "pad": if v == "pad" or os.path.exists(vdata["path"]):
p["link"] = absurl(vdata["url"], linkbase) p["link"] = absurl(vdata["url"], linkbase)
break break
except KeyError as e: except KeyError as e:
pass pass
# Not sure this goes here but fixing relpaths with output is quite nice... # Not sure this goes here but fixing relpaths with output is quite nice...
if args.output: if args.output:
outpath = Path(args.output).parent outpath = Path(args.output).parent
@ -307,7 +308,14 @@ def main (args):
v["url"] = v["path"] v["url"] = v["path"]
except Exception as e: except Exception as e:
pass pass
if not p["link"].startswith("http"):
vpath = Path(p["link"])
try:
p["link"] = os.path.join(os.path.relpath(vpath.parent, outpath), vpath.name)
except Exception as e:
pass

View File

@ -260,9 +260,9 @@ def main (args):
# doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False) # doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1") html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1")
with open(ver["path"], "w") as f: with open(ver["path"], "w", encoding="utf-8") as f:
# f.write(html.encode("utf-8")) # f.write(html.encode("utf-8"))
print(ET.tostring(doc, method="html", encoding="utf-8"), file=f) print(ET.tostring(doc, method="html", encoding="unicode"), file=f)
except TypeError: except TypeError:
# Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file! # Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
ver["message"] = html["message"] ver["message"] = html["message"]
@ -279,15 +279,11 @@ def main (args):
html = html['data']['html'] html = html['data']['html']
ver["path"] = p+".raw.html" ver["path"] = p+".raw.html"
ver["url"] = quote(ver["path"]) ver["url"] = quote(ver["path"])
# JUN 2016: chaning to save REALLY the RAW / unchanged HTML from the API doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
with open(ver["path"], "w") as f: html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1")
print(html, file=f) with open(ver["path"], "w", encoding="utf-8") as f:
# f.write(html.encode("utf-8"))
# doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) print (ET.tostring(doc, method="html", encoding="unicode"), file=f)
# html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links)
# with open(ver["path"], "w") as f:
# # f.write(html.encode("utf-8"))
# print (ET.tostring(doc, method="html", encoding="unicode"), file=f)
# output meta # output meta
if args.all or args.meta: if args.all or args.meta: