fixed some html decoding issues
This commit is contained in:
parent
1ef4782fc8
commit
008ba6a9b5
5
crawl.py
5
crawl.py
@ -24,6 +24,11 @@ if __name__ == "__main__":
|
|||||||
l = []
|
l = []
|
||||||
if len(args.url) == 1 and args.url[0] == 'all':
|
if len(args.url) == 1 and args.url[0] == 'all':
|
||||||
l = config.lists
|
l = config.lists
|
||||||
|
elif args.names == None and args.subnames == None:
|
||||||
|
for u in args.url:
|
||||||
|
for li in config.lists:
|
||||||
|
if u == li['name']:
|
||||||
|
l.append(li)
|
||||||
elif len(args.url) == len(args.names) == len(args.subnames):
|
elif len(args.url) == len(args.names) == len(args.subnames):
|
||||||
i = 0
|
i = 0
|
||||||
for u in args.url:
|
for u in args.url:
|
||||||
|
|||||||
@ -159,8 +159,14 @@ def archive_thread(li, base_url, parent_thread_data):
|
|||||||
def collect_message(url, message):
|
def collect_message(url, message):
|
||||||
|
|
||||||
response = urllib.request.urlopen(url)
|
response = urllib.request.urlopen(url)
|
||||||
html = response.read().decode(encoding="utf-8")
|
|
||||||
# html = response.read()
|
html = response.read()
|
||||||
|
try:
|
||||||
|
html = html.decode(encoding="utf-8")
|
||||||
|
except:
|
||||||
|
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
|
||||||
|
pass
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html5lib")
|
soup = BeautifulSoup(html, "html5lib")
|
||||||
|
|
||||||
#note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
|
#note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
|
||||||
|
|||||||
@ -69,7 +69,13 @@ def collect_threads_from_url(url, name, base_arch_dir):
|
|||||||
|
|
||||||
|
|
||||||
response = urllib.request.urlopen(url)
|
response = urllib.request.urlopen(url)
|
||||||
html = response.read().decode(encoding="utf-8")
|
|
||||||
|
html = response.read()
|
||||||
|
try:
|
||||||
|
html = html.decode(encoding="utf-8")
|
||||||
|
except:
|
||||||
|
logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html5lib")
|
soup = BeautifulSoup(html, "html5lib")
|
||||||
|
|
||||||
ul = soup.find_all('ul')[1];
|
ul = soup.find_all('ul')[1];
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user