fixed some html decoding issues

2019-12-21 21:57:14 +01:00 · 2019-12-21 21:57:14 +01:00 · 008ba6a9b5
commit 008ba6a9b5
parent 1ef4782fc8
3 changed files with 20 additions and 3 deletions
--- a/crawl.py
+++ b/crawl.py
@@ -24,6 +24,11 @@ if __name__ == "__main__":
    l = []
    if len(args.url) == 1 and  args.url[0] == 'all':
        l = config.lists
    elif args.names == None and args.subnames == None:
        for u in args.url:
            for li in config.lists:
                if u == li['name']:
                    l.append(li)
    elif len(args.url) == len(args.names) == len(args.subnames):
        i = 0
        for u in args.url:
--- a/lists/mhonarc.py
+++ b/lists/mhonarc.py
@@ -159,8 +159,14 @@ def archive_thread(li, base_url, parent_thread_data):
 def collect_message(url, message):
    response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
+
-    # html = response.read()
+    html = response.read()
    try:
        html = html.decode(encoding="utf-8")
    except:
        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
        pass
    soup = BeautifulSoup(html, "html5lib")    
    #note: this should follow an RFC header standard -- MHonArc has header info in the 1st <pre>
--- a/lists/pipermail.py
+++ b/lists/pipermail.py
@@ -69,7 +69,13 @@ def collect_threads_from_url(url, name, base_arch_dir):
 	response = urllib.request.urlopen(url)
-	html = response.read().decode(encoding="utf-8")
+
 	html = response.read()
 	try:
 		html = html.decode(encoding="utf-8")
 	except:
 		logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
 	soup = BeautifulSoup(html, "html5lib")
 	ul = soup.find_all('ul')[1];