Fixed some HTML decoding issues
This commit is contained in:
+8
-2
@@ -159,8 +159,14 @@ def archive_thread(li, base_url, parent_thread_data):
|
||||
def collect_message(url, message):
|
||||
|
||||
response = urllib.request.urlopen(url)
|
||||
html = response.read().decode(encoding="utf-8")
|
||||
# html = response.read()
|
||||
|
||||
html = response.read()
|
||||
try:
|
||||
html = html.decode(encoding="utf-8")
|
||||
except:
|
||||
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
|
||||
pass
|
||||
|
||||
soup = BeautifulSoup(html, "html5lib")
|
||||
|
||||
#note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
|
||||
|
||||
+7
-1
@@ -69,7 +69,13 @@ def collect_threads_from_url(url, name, base_arch_dir):
|
||||
|
||||
|
||||
response = urllib.request.urlopen(url)
|
||||
html = response.read().decode(encoding="utf-8")
|
||||
|
||||
html = response.read()
|
||||
try:
|
||||
html = html.decode(encoding="utf-8")
|
||||
except:
|
||||
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
|
||||
|
||||
soup = BeautifulSoup(html, "html5lib")
|
||||
|
||||
ul = soup.find_all('ul')[1];
|
||||
|
||||
Reference in New Issue
Block a user