From 3d8dd1fbc1496c9f39271e519ef350e43b65f2ee Mon Sep 17 00:00:00 2001 From: gauthiier Date: Fri, 10 Dec 2021 15:16:51 +0100 Subject: [PATCH] utf-8 and p[link] logic --- pppadump/commands/index.py | 20 ++++++++++++++------ pppadump/commands/pull.py | 18 +++++++----------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pppadump/commands/index.py b/pppadump/commands/index.py index 75ed04d..129cfad 100644 --- a/pppadump/commands/index.py +++ b/pppadump/commands/index.py @@ -280,17 +280,18 @@ def main (args): except FileNotFoundError: p['text'] = '' - # ADD IN LINK TO PAD AS "link" - for v in linkversions: - if v in versions_by_type: - vdata = versions_by_type[v] + pref_for_link = ["html","pad", "text"] ## making a design decision here (otherwise need to explicitly provide --link in command) + for x in pref_for_link: + if x in versions_by_type: + vdata = versions_by_type[x] try: - if v == "pad": + if v == "pad" or os.path.exists(vdata["path"]): p["link"] = absurl(vdata["url"], linkbase) break except KeyError as e: pass + # Not sure this goes here but fixing relpaths with output is quite nice... if args.output: outpath = Path(args.output).parent @@ -307,7 +308,14 @@ def main (args): v["url"] = v["path"] except Exception as e: pass - + + if not p["link"].startswith("http"): + vpath = Path(p["link"]) + try: + p["link"] = os.path.join(os.path.relpath(vpath.parent, outpath), vpath.name) + except Exception as e: + pass + diff --git a/pppadump/commands/pull.py b/pppadump/commands/pull.py index f3e4c96..38c9906 100644 --- a/pppadump/commands/pull.py +++ b/pppadump/commands/pull.py @@ -260,9 +260,9 @@ def main (args): # doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False) doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1") - with open(ver["path"], "w") as f: + with open(ver["path"], "w", encoding="utf-8") as f: # f.write(html.encode("utf-8")) - print(ET.tostring(doc, method="html", encoding="utf-8"), file=f) + print(ET.tostring(doc, method="html", encoding="unicode"), file=f) except TypeError: # Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file! ver["message"] = html["message"] @@ -279,15 +279,11 @@ def main (args): html = html['data']['html'] ver["path"] = p+".raw.html" ver["url"] = quote(ver["path"]) - # JUN 2016: chaning to save REALLY the RAW / unchanged HTML from the API - with open(ver["path"], "w") as f: - print(html, file=f) - - # doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) - # html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links) - # with open(ver["path"], "w") as f: - # # f.write(html.encode("utf-8")) - # print (ET.tostring(doc, method="html", encoding="unicode"), file=f) + doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) + html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links, viewport_meta="width=device-width,initial-scale=1") + with open(ver["path"], "w", encoding="utf-8") as f: + # f.write(html.encode("utf-8")) + print (ET.tostring(doc, method="html", encoding="unicode"), file=f) # output meta if args.all or args.meta: