Fixed some HTML decoding issues
This commit is contained in:
parent
1ef4782fc8
commit
008ba6a9b5
5
crawl.py
5
crawl.py
@ -24,6 +24,11 @@ if __name__ == "__main__":
|
||||
l = []
|
||||
if len(args.url) == 1 and args.url[0] == 'all':
|
||||
l = config.lists
|
||||
elif args.names == None and args.subnames == None:
|
||||
for u in args.url:
|
||||
for li in config.lists:
|
||||
if u == li['name']:
|
||||
l.append(li)
|
||||
elif len(args.url) == len(args.names) == len(args.subnames):
|
||||
i = 0
|
||||
for u in args.url:
|
||||
|
||||
@ -159,8 +159,14 @@ def archive_thread(li, base_url, parent_thread_data):
|
||||
def collect_message(url, message):
|
||||
|
||||
response = urllib.request.urlopen(url)
|
||||
html = response.read().decode(encoding="utf-8")
|
||||
# html = response.read()
|
||||
|
||||
html = response.read()
|
||||
try:
|
||||
html = html.decode(encoding="utf-8")
|
||||
except:
|
||||
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
|
||||
pass
|
||||
|
||||
soup = BeautifulSoup(html, "html5lib")
|
||||
|
||||
#note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
|
||||
|
||||
@ -69,7 +69,13 @@ def collect_threads_from_url(url, name, base_arch_dir):
|
||||
|
||||
|
||||
response = urllib.request.urlopen(url)
|
||||
html = response.read().decode(encoding="utf-8")
|
||||
|
||||
html = response.read()
|
||||
try:
|
||||
html = html.decode(encoding="utf-8")
|
||||
except:
|
||||
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
|
||||
|
||||
soup = BeautifulSoup(html, "html5lib")
|
||||
|
||||
ul = soup.find_all('ul')[1];
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user