Fixed some HTML decoding issues

This commit is contained in:
gauthiier
2019-12-21 21:57:14 +01:00
parent 1ef4782fc8
commit 008ba6a9b5
3 changed files with 20 additions and 3 deletions
+8 -2
View File
@@ -159,8 +159,14 @@ def archive_thread(li, base_url, parent_thread_data):
def collect_message(url, message):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
# html = response.read()
html = response.read()
try:
html = html.decode(encoding="utf-8")
except:
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
pass
soup = BeautifulSoup(html, "html5lib")
#note: this should follow an RFC header standard -- MHonArc has header info in the 1st <pre>
+7 -1
View File
@@ -69,7 +69,13 @@ def collect_threads_from_url(url, name, base_arch_dir):
response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
html = response.read()
try:
html = html.decode(encoding="utf-8")
except:
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
soup = BeautifulSoup(html, "html5lib")
ul = soup.find_all('ul')[1];