Fixed some HTML decoding issues

This commit is contained in:
gauthiier 2019-12-21 21:57:14 +01:00
parent 1ef4782fc8
commit 008ba6a9b5
3 changed files with 20 additions and 3 deletions

View File

@ -24,6 +24,11 @@ if __name__ == "__main__":
l = [] l = []
if len(args.url) == 1 and args.url[0] == 'all': if len(args.url) == 1 and args.url[0] == 'all':
l = config.lists l = config.lists
elif args.names == None and args.subnames == None:
for u in args.url:
for li in config.lists:
if u == li['name']:
l.append(li)
elif len(args.url) == len(args.names) == len(args.subnames): elif len(args.url) == len(args.names) == len(args.subnames):
i = 0 i = 0
for u in args.url: for u in args.url:

View File

@ -159,8 +159,14 @@ def archive_thread(li, base_url, parent_thread_data):
def collect_message(url, message): def collect_message(url, message):
response = urllib.request.urlopen(url) response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
# html = response.read() html = response.read()
try:
html = html.decode(encoding="utf-8")
except:
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
pass
soup = BeautifulSoup(html, "html5lib") soup = BeautifulSoup(html, "html5lib")
#note: this should follow an RFC header standard -- MHonArc has header info in the 1st <pre> #note: this should follow an RFC header standard -- MHonArc has header info in the 1st <pre>

View File

@ -69,7 +69,13 @@ def collect_threads_from_url(url, name, base_arch_dir):
response = urllib.request.urlopen(url) response = urllib.request.urlopen(url)
html = response.read().decode(encoding="utf-8")
html = response.read()
try:
html = html.decode(encoding="utf-8")
except:
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
soup = BeautifulSoup(html, "html5lib") soup = BeautifulSoup(html, "html5lib")
ul = soup.find_all('ul')[1]; ul = soup.find_all('ul')[1];