crawling
parent a6fa141bbd
commit a8cfaee935
@@ -30,6 +30,9 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
        sublist_name = name
        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
 
+    elif "oldboys" in name:
+        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
+
    else:
        print('mhonarc?')
 
@@ -1,16 +1,19 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
+import lists.util
 
 
 DELAY = 0.2
 
 def collect_from_url(url, name, base_archive_dir):
 
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
     lists = []
@@ -74,10 +77,12 @@ def collect_threads_from_url(url, name, base_arch_dir):
        logging.info("can't open archive " + file_path + "... rearchiving.")
 
 
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
     lists = []
@@ -135,10 +140,12 @@ def collect_threads_from_url(url, name, base_arch_dir):
 
 def collect_message(url, message):
 
-    response = urllib.request.urlopen(url)
-    #html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # #html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     tr = soup.find_all('table', {'class': 'tableframe'})[3].find_all('tbody', recursive=False)[0].find_all('tr', recursive=False)
 
@@ -1,14 +1,17 @@
 import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from bs4 import BeautifulSoup
+import lists.util
 
 DELAY = 0.2
 
 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
@@ -71,10 +74,12 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
 
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     # base url
     base_url = url
 
@@ -2,17 +2,23 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip
 from datetime import datetime
 from bs4 import BeautifulSoup
+import lists.util
 
 DELAY = 0.2
 
 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     # base url
-    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    try:
+        base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    except:
+        base_url = url
 
     logging.debug(base_url)
 
@@ -27,9 +33,10 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
        os.makedirs(d)
 
    threads = []
-    lists = soup.select('ul:nth-of-type(2) li')
+    # lists = soup.select('ul:nth-of-type(2) li')
+    li = soup.select('ul li')
 
-    for l in lists:
+    for l in li:
 
        if l.strong is None:
            continue
@@ -75,15 +82,18 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
        return None
 
 def new_name(n):
-    dt = datetime.strptime(n, 'nettime-l_%b_%y')
+    # ex: nettime-bold_Mar_99
+    dt = datetime.strptime(n[-6:], '%b_%y')
     return dt.strftime('%B_%Y')
 
 
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     logging.debug("collecting: " + url)
 
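The new_name() change above drops the hard-coded 'nettime-l_' prefix and parses the month/year from the last six characters of the archive name, so sublists such as nettime-bold_Mar_99 resolve as well. A quick standalone check of the new behaviour (the sample names are illustrative only, not part of the commit):

    from datetime import datetime

    def new_name(n):
        # ex: nettime-bold_Mar_99
        dt = datetime.strptime(n[-6:], '%b_%y')
        return dt.strftime('%B_%Y')

    print(new_name('nettime-bold_Mar_99'))  # -> March_1999
    print(new_name('nettime-l_Jan_02'))     # -> January_2002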
@@ -111,12 +121,12 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
-        lists = soup.select('ul:nth-of-type(1) > li')
+        li = soup.select('ul:nth-of-type(1) > li')
 
-        nbr_threads = str(len(lists))
+        nbr_threads = str(len(li))
        n = 0
 
-        for l in lists:
+        for l in li:
            n += 1
            logging.info("> " + str(n) + " / " + nbr_threads)
 
@@ -181,10 +191,12 @@ def collect_message(url, message):
 
     logging.debug("collecting message: " + url)
 
-    response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
-    # html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # html = response.read().decode(encoding="utf-8")
+    # # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
 
@@ -214,7 +226,7 @@ def collect_message(url, message):
     # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
     # new way
-    message['content'] = parse_xmessage(html)
+    message['content'] = parse_xmessage(str(soup))
 
     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -2,15 +2,18 @@ import urllib.request, urllib.parse
 import logging, os, sys, traceback, re, time, json, gzip, difflib
 from bs4 import BeautifulSoup
 import lists.mhonarc
+import lists.util
 
 DELAY = 0.2
 
 def collect_from_url(url, name, base_archive_dir):
 
-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    soup = lists.util.request(url)
+
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
     threads_list = soup.find_all('tr')
     lists = []
@@ -67,16 +70,17 @@ def collect_threads_from_url(url, name, base_arch_dir):
    except:
        logging.info("can't open archive " + file_path + "... rearchiving.")
 
+    soup = lists.util.request(url)
 
-    response = urllib.request.urlopen(url)
+    # response = urllib.request.urlopen(url)
 
-    html = response.read()
-    try:
-        html = html.decode(encoding="utf-8")
-    except:
-        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+    # html = response.read()
+    # try:
+    #     html = html.decode(encoding="utf-8")
+    # except:
+    #     logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
 
-    soup = BeautifulSoup(html, "html5lib")
+    # soup = BeautifulSoup(html, "html5lib")
 
    ul = soup.find_all('ul')[1];
    lists = ul.find_all('li', recursive=False)
@@ -201,10 +205,12 @@ def archive_thread_hybrid_mhonarc(li, base_url, parent_thread_data):
 def collect_message(url, message):
     # logging.info(" + " + url)
 
-    response = urllib.request.urlopen(url)
-    # html = response.read().decode(encoding="utf-8")
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+    # response = urllib.request.urlopen(url)
+    # # html = response.read().decode(encoding="utf-8")
+    # html = response.read()
+    # soup = BeautifulSoup(html, "html5lib")
 
+    soup = lists.util.request(url)
+
     if lists.mhonarc.test_xcomment(soup):
         logging.info("Mhonarc detected, switching to mhonarc parsing...")
13  lists/util.py  Normal file
@@ -0,0 +1,13 @@
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, re, time, json, gzip, difflib
+from bs4 import BeautifulSoup
+
+def request(url):
+    response = urllib.request.urlopen(url)
+    html = response.read()
+    try:
+        html = html.decode(encoding="utf-8")
+    except:
+        logging.warning("Error decoding(utf-8): " + url + "... Continuing (non-utf).")
+    soup = BeautifulSoup(html, "html5lib")
+    return soup
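The new lists/util.py gathers the fetch-and-parse boilerplate behind a single request() helper: it downloads the page, attempts a UTF-8 decode (logging a warning and keeping the raw bytes if that fails), and returns the html5lib-parsed BeautifulSoup tree. A minimal usage sketch of how the crawler modules now call it (the URL below is a placeholder, not taken from the commit):

    import lists.util

    # one call replaces the per-module urlopen / decode / BeautifulSoup block
    soup = lists.util.request("https://example.org/archives/index.html")
    for li in soup.find_all('li'):
        print(li.get_text(strip=True))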