gauthiier 2019-12-22 08:22:20 +01:00
parent a6fa141bbd
commit a8cfaee935
6 changed files with 97 additions and 51 deletions

View File

@@ -30,6 +30,9 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
             sublist_name = name
         mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
+    elif "oldboys" in name:
+        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
     else:
         print('mhonarc?')

View File

@@ -1,16 +1,19 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
+import lists.util
 DELAY = 0.2
 def collect_from_url(url, name, base_archive_dir):
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
     lists = []
@@ -74,10 +77,12 @@ def collect_threads_from_url(url, name, base_arch_dir):
         logging.info("can't open archive " + file_path + "... rearchiving.")
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
     lists = []
@@ -135,10 +140,12 @@ def collect_threads_from_url(url, name, base_arch_dir):
 def collect_message(url, message):
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)

View File

@@ -1,14 +1,17 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from bs4 import BeautifulSoup
+import lists.util
 DELAY = 0.2
 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -71,10 +74,12 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     # base url
     base_url = url

View File

@@ -2,17 +2,23 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from datetime import datetime
 from bs4 import BeautifulSoup
+import lists.util
 DELAY = 0.2
 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     # base url
-    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    try:
+        base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    except:
+        base_url = url
     logging.debug(base_url)
@@ -27,9 +33,10 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
             os.makedirs(d)
     threads = []
-    lists = soup.select('ul:nth-of-type(2) li')
+    # lists = soup.select('ul:nth-of-type(2) li')
+    li = soup.select('ul li')
-    for l in lists:
+    for l in li:
         if l.strong is None:
             continue
@@ -75,15 +82,18 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     return None
 def new_name(n):
-    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    # ex: nettime-bold_Mar_99
+    dt = datetime.strptime(n[-6:], '%b_%y')
     return dt.strftime('%B_%Y')
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     logging.debug("collecting: " + url)
@@ -111,12 +121,12 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
         with open(file_path, 'r') as fpin:
             threads = json.load(fpin)
     else:
-        lists = soup.select('ul:nth-of-type(1) > li')
-        nbr_threads = str(len(lists))
+        li = soup.select('ul:nth-of-type(1) > li')
+        nbr_threads = str(len(li))
         n = 0
-        for l in lists:
+        for l in li:
             n += 1
             logging.info("> " + str(n) + " / " + nbr_threads)
@@ -181,10 +191,12 @@ def collect_message(url, message):
     logging.debug("collecting message: " + url)
-    response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
-    # html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read().decode(encoding="utf-8")
+    # # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
@@ -214,7 +226,7 @@ def collect_message(url, message):
     # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
     # new way
-    message['content'] = parse_xmessage(html)
+    message['content'] = parse_xmessage(str(soup))
     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
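Two behavioural changes in the file above are easy to miss among the refactoring: base_url now falls back to the request URL when the page exposes no <base> element, and new_name() no longer hard-codes the nettime-l prefix but parses only the trailing month/year token, so differently prefixed archives (the diff's own example is nettime-bold_Mar_99) resolve as well. A quick sketch of the expected new_name() behaviour, assuming archive names keep that <list>_<Mon>_<yy> shape; the sample names below are illustrative only:

from datetime import datetime

def new_name(n):
    # the last six characters carry the month/year token, e.g. 'nettime-bold_Mar_99' -> 'Mar_99'
    dt = datetime.strptime(n[-6:], '%b_%y')
    return dt.strftime('%B_%Y')

print(new_name('nettime-bold_Mar_99'))  # -> March_1999
print(new_name('nettime-l_Dec_02'))     # -> December_2002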

View File

@@ -2,15 +2,18 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
 import lists.mhonarc
+import lists.util
 DELAY = 0.2
 def collect_from_url(url, name, base_archive_dir):
-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
     threads_list = soup.find_all('tr')
     lists = []
@@ -67,16 +70,17 @@ def collect_threads_from_url(url, name, base_arch_dir):
     except:
         logging.info("can't open archive " + file_path + "... rearchiving.")
+    soup = lists.util.request(url)
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    try:
-        html = html.decode(encoding="utf-8")
-    except:
-        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # try:
+    #     html = html.decode(encoding="utf-8")
+    # except:
+    #     logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+    # soup = BeautifulSoup(html, "html5lib")
     ul = soup.find_all('ul')[1];
     lists = ul.find_all('li', recursive=False)
@@ -201,10 +205,12 @@ def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
 def collect_message(url, message):
     # logging.info(" + " + url)
-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
     if lists.mhonarc.test_xcomment(soup):
         logging.info("Mhonarc detected, switching to mhonarc parsing...")

lists/util.py (new file, 13 lines)
View File

@@ -0,0 +1,13 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip, difflib
+from bs4 import BeautifulSoup
+
+def request(url):
+    response = urllib.request.urlopen(url)
+    html = response.read()
+    try:
+        html = html.decode(encoding="utf-8")
+    except:
+        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+    soup = BeautifulSoup(html, "html5lib")
+    return soup
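The helper above is what the parser modules in this commit call in place of their repeated urlopen/decode/BeautifulSoup blocks: it fetches the page, attempts a UTF-8 decode but logs a warning and continues with the raw bytes on failure, and returns the parsed soup. A minimal usage sketch; the URL below is only an example, not taken from the repository:

import lists.util

soup = lists.util.request("https://example.org/archive/index.html")  # example URL
for link in soup.find_all('a'):
    print(link.get('href'))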