diff --git a/lists/crawl.py b/lists/crawl.py
index 4aaf016..582052a 100644
--- a/lists/crawl.py
+++ b/lists/crawl.py
@@ -30,6 +30,9 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
             sublist_name = name
         mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)

+    elif "oldboys" in name:
+        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
+
     else:
         print('mhonarc?')
diff --git a/lists/listserv.py b/lists/listserv.py
index c17287b..23713d0 100644
--- a/lists/listserv.py
+++ b/lists/listserv.py
@@ -1,16 +1,19 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
+import lists.util

 DELAY = 0.2

 def collect_from_url(url, name, base_archive_dir):
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+
+    soup = lists.util.request(url)

     threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
     lists = []
@@ -74,10 +77,12 @@ def collect_threads_from_url(url, name, base_arch_dir):
             logging.info("can't open archive " + file_path + "... rearchiving.")

-        response = urllib.request.urlopen(url)
-        #html = response.read().decode(encoding="utf-8")
-        html = response.read()
-        soup = BeautifulSoup(html, "html5lib")
+        # response = urllib.request.urlopen(url)
+        # #html = response.read().decode(encoding="utf-8")
+        # html = response.read()
+        # soup = BeautifulSoup(html, "html5lib")
+
+        soup = lists.util.request(url)

         table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
         lists = []
@@ -135,10 +140,12 @@ def collect_threads_from_url(url, name, base_arch_dir):

 def collect_message(url, message):
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+
+    soup = lists.util.request(url)

     tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
diff --git a/lists/mhonarc.py b/lists/mhonarc.py
index 750c15d..8a88036 100644
--- a/lists/mhonarc.py
+++ b/lists/mhonarc.py
@@ -1,14 +1,17 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from bs4 import BeautifulSoup
+import lists.util

 DELAY = 0.2

 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+
+    soup = lists.util.request(url)

     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -71,10 +74,12 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=

 def collect_threads_from_url(url, base_archive_dir, mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
+
     # base url
     base_url = url
diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
index e5e106b..aeba81e 100644
--- a/lists/mhonarc_nettime.py
+++ b/lists/mhonarc_nettime.py
@@ -2,17 +2,23 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from datetime import datetime
 from bs4 import BeautifulSoup
+import lists.util

 DELAY = 0.2

 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+
+    soup = lists.util.request(url)

     # base url
-    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    try:
+        base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    except IndexError:
+        base_url = url

     logging.debug(base_url)
@@ -27,9 +33,10 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
             os.makedirs(d)

     threads = []
-    lists = soup.select('ul:nth-of-type(2) li')
+    # lists = soup.select('ul:nth-of-type(2) li')
+    li = soup.select('ul li')

-    for l in lists:
+    for l in li:
         if l.strong is None:
             continue
@@ -75,15 +82,18 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     return None

 def new_name(n):
-    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    # ex: nettime-bold_Mar_99
+    dt = datetime.strptime(n[-6:], '%b_%y')
     return dt.strftime('%B_%Y')


 def collect_threads_from_url(url, base_archive_dir, mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+
+    soup = lists.util.request(url)

     logging.debug("collecting: " + url)
@@ -111,12 +121,12 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
         with open(file_path, 'r') as fpin:
             threads = json.load(fpin)
     else:
-        lists = soup.select('ul:nth-of-type(1) > li')
+        li = soup.select('ul:nth-of-type(1) > li')

-        nbr_threads = str(len(lists))
+        nbr_threads = str(len(li))
         n = 0

-        for l in lists:
+        for l in li:
             n += 1
             logging.info("> " + str(n) + " / " + nbr_threads)
@@ -181,10 +191,12 @@ def collect_message(url, message):

     logging.debug("collecting message: " + url)

-    response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
-    # html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read().decode(encoding="utf-8")
+    # # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+
+    soup = lists.util.request(url)

     #note: this should follow an RFC header standard -- MHonArc has header info in the 1th
@@ -214,7 +226,7 @@ def collect_message(url, message):
     # message['content'] = soup.select('pre:nth-of-type(2)')[0].text

     # new way
-    message['content'] = parse_xmessage(html)
+    message['content'] = parse_xmessage(str(soup))

     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
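
Note on the new_name() change above: the format string no longer hard-codes the nettime-l prefix; only the trailing Mon_YY token is parsed, so archive names from other sublists (e.g. nettime-bold) resolve too. A standalone sanity-check sketch, not part of the patch (caveats: %b is locale-dependent, and n[-6:] assumes the name ends in a Mon_YY suffix):

    from datetime import datetime

    def new_name(n):
        # ex: nettime-bold_Mar_99 -> March_1999
        dt = datetime.strptime(n[-6:], '%b_%y')
        return dt.strftime('%B_%Y')

    assert new_name('nettime-l_Mar_99') == 'March_1999'
    assert new_name('nettime-bold_Mar_99') == 'March_1999'
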
diff --git a/lists/pipermail.py b/lists/pipermail.py
index f462753..1f3e33a 100644
--- a/lists/pipermail.py
+++ b/lists/pipermail.py
@@ -2,15 +2,18 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
 import lists.mhonarc
+import lists.util

 DELAY = 0.2

 def collect_from_url(url, name, base_archive_dir):
-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
+
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")

     threads_list = soup.find_all('tr')
     lists = []
@@ -67,16 +70,17 @@ def collect_threads_from_url(url, name, base_arch_dir):
     except:
         logging.info("can't open archive " + file_path + "... rearchiving.")

-        response = urllib.request.urlopen(url)
-        html = response.read()
-        try:
-            html = html.decode(encoding="utf-8")
-        except:
-            logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
-        soup = BeautifulSoup(html, "html5lib")
+        soup = lists.util.request(url)
+        # response = urllib.request.urlopen(url)
+        # html = response.read()
+        # try:
+        #     html = html.decode(encoding="utf-8")
+        # except:
+        #     logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+        # soup = BeautifulSoup(html, "html5lib")

     ul = soup.find_all('ul')[1];
     lists = ul.find_all('li', recursive=False)
@@ -201,10 +205,12 @@ def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
 def collect_message(url, message):
     # logging.info(" + " + url)

-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+
+    soup = lists.util.request(url)

     if lists.mhonarc.test_xcomment(soup):
         logging.info("Mhonarc detected, switching to mhonarc parsing...")
diff --git a/lists/util.py b/lists/util.py
new file mode 100644
index 0000000..6ce64bf
--- /dev/null
+++ b/lists/util.py
@@ -0,0 +1,14 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip, difflib
+from bs4 import BeautifulSoup
+
+def request(url):
+    """Fetch url and return the response parsed as a BeautifulSoup (html5lib) tree."""
+    response = urllib.request.urlopen(url)
+    html = response.read()
+    try:
+        html = html.decode(encoding="utf-8")
+    except UnicodeDecodeError:
+        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+    soup = BeautifulSoup(html, "html5lib")
+    return soup
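
For reference, every fetch site in the patch now goes through this one helper: a single urlopen, a best-effort UTF-8 decode (falling back to the raw bytes, which BeautifulSoup also accepts), and an html5lib parse. A minimal usage sketch; the URL is a placeholder, not one of the crawled archives:

    import logging
    import lists.util

    logging.basicConfig(level=logging.INFO)

    # fetch + decode + parse in one call
    soup = lists.util.request("http://example.org/archive/index.html")

    # callers then walk the parsed tree, e.g. collecting thread links:
    for a in soup.find_all('a', href=True):
        print(a['href'])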