diff --git a/crawl.py b/crawl.py index d91861a..a717333 100644 --- a/crawl.py +++ b/crawl.py @@ -24,6 +24,11 @@ if __name__ == "__main__": l = [] if len(args.url) == 1 and args.url[0] == 'all': l = config.lists + elif args.names == None and args.subnames == None: + for u in args.url: + for li in config.lists: + if u == li['name']: + l.append(li) elif len(args.url) == len(args.names) == len(args.subnames): i = 0 for u in args.url: diff --git a/lists/mhonarc.py b/lists/mhonarc.py index 1ba9c35..750c15d 100644 --- a/lists/mhonarc.py +++ b/lists/mhonarc.py @@ -159,8 +159,14 @@ def archive_thread(li, base_url, parent_thread_data): def collect_message(url, message): response = urllib.request.urlopen(url) - html = response.read().decode(encoding="utf-8") - # html = response.read() + + html = response.read() + try: + html = html.decode(encoding="utf-8") + except: + logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).") + pass + soup = BeautifulSoup(html, "html5lib") #note: this should follow an RFC header standard -- MHonArc has header info in the 1th
diff --git a/lists/pipermail.py b/lists/pipermail.py
index c08c68f..f462753 100644
--- a/lists/pipermail.py
+++ b/lists/pipermail.py
@@ -69,7 +69,13 @@ def collect_threads_from_url(url, name, base_arch_dir):
 
 
 	response = urllib.request.urlopen(url)
-	html = response.read().decode(encoding="utf-8")
+
+	html = response.read()
+	try:
+		html = html.decode(encoding="utf-8")
+	except:
+		logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+
 	soup = BeautifulSoup(html, "html5lib")
 
 	ul = soup.find_all('ul')[1];