utf-8 and p[link] logic

This commit is contained in:
gauthiier 2021-12-10 15:16:51 +01:00
parent 00d7a537c7
commit 3d8dd1fbc1
2 changed files with 21 additions and 17 deletions

View File

@@ -280,17 +280,18 @@ def main (args):
except FileNotFoundError:
p['text'] = ''
# ADD IN LINK TO PAD AS "link"
for v in linkversions:
if v in versions_by_type:
vdata = versions_by_type[v]
pref_for_link = ["html","pad", "text"] ## making a design decision here (otherwise need to explicitly provide --link in command)
for x in pref_for_link:
if x in versions_by_type:
vdata = versions_by_type[x]
try:
if v == "pad":
if v == "pad" or os.path.exists(vdata["path"]):
p["link"] = absurl(vdata["url"], linkbase)
break
except KeyError as e:
pass
# Not sure this goes here but fixing relpaths with output is quite nice...
if args.output:
outpath = Path(args.output).parent
@@ -308,6 +309,13 @@ def main (args):
except Exception as e:
pass
if not p["link"].startswith("http"):
vpath = Path(p["link"])
try:
p["link"] = os.path.join(os.path.relpath(vpath.parent, outpath), vpath.name)
except Exception as e:
pass

View File

@@ -260,9 +260,9 @@ def main (args):
# doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1")
with open(ver["path"], "w") as f:
with open(ver["path"], "w", encoding="utf-8") as f:
# f.write(html.encode("utf-8"))
print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)
print(ET.tostring(doc, method="html", encoding="unicode"), file=f)
except TypeError:
# Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
ver["message"] = html["message"]
@@ -279,15 +279,11 @@ def main (args):
html = html['data']['html']
ver["path"] = p+".raw.html"
ver["url"] = quote(ver["path"])
# JUN 2016: chaning to save REALLY the RAW / unchanged HTML from the API
with open(ver["path"], "w") as f:
print(html, file=f)
# doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
# html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links)
# with open(ver["path"], "w") as f:
# # f.write(html.encode("utf-8"))
# print (ET.tostring(doc, method="html", encoding="unicode"), file=f)
doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1")
with open(ver["path"], "w", encoding="utf-8") as f:
# f.write(html.encode("utf-8"))
print (ET.tostring(doc, method="html", encoding="unicode"), file=f)
# output meta
if args.all or args.meta: