updated nettime crawler
commit e560cb4cdb
parent 62fea815d1
@@ -13,9 +13,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
 
-    #collect name
-    list_name = soup.select('body p:nth-of-type(2) title')[0].string
-    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+    logging.debug(base_url)
+
+    # #collect name
+    # list_name = soup.select('body p:nth-of-type(2) title')[0].string
+    # logging.info("Getting " + list_name + " list archive for " + sublist_name)
 
     # create (main) directory
     # this is where all temp files will be created
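Review note: soup.select(...)[0] raises IndexError when an archive page lacks the expected <base> element in the second paragraph. A minimal defensive sketch, not part of this commit; resolve_base_url and the fallback to the page's own URL are assumptions:

# Hypothetical helper, not in this commit: guard against archive pages
# that have no <base> tag by falling back to the page's own URL.
def resolve_base_url(soup, page_url):
    tags = soup.select('body p:nth-of-type(2) base')
    if tags and tags[0].get('href'):
        return tags[0].get('href')
    return page_url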
@@ -26,20 +28,19 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     threads = []
     lists = soup.select('ul:nth-of-type(2) li')
 
-
     for l in lists:
 
         if l.strong is None:
             continue
 
         name = l.strong.string
-        print(name + " - " + sublist_name)
-
         if name.lower() == sublist_name.lower():
 
+            logging.debug(name)
+
             threads_url_list = []
             threads_links = l.select('ul li a')
 
             for t in threads_links:
                 thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                 threads_url_list.append(thread_url)
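Review note: the relative hrefs collected in this loop only resolve correctly because they are joined against the base URL extracted earlier. A quick, self-contained illustration of the urljoin call; the URL below is made up, not a real nettime archive path:

import urllib.parse

# urljoin resolves a relative href against the archive's <base> URL.
base_url = 'https://example.org/Lists-Archives/nettime-l-1901/'
print(urllib.parse.urljoin(base_url, 'msg00042.html'))
# -> https://example.org/Lists-Archives/nettime-l-1901/msg00042.html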
@@ -56,7 +57,7 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
         except KeyboardInterrupt:
             sys.exit(0)
         except:
-            logging.warning("Error archiving: " + l[1] + "... Continuing.")
+            logging.warning("Error archiving: " + name + "... Continuing.")
             ex_t, ex, tb = sys.exc_info()
             print(ex_t)
             traceback.print_tb(tb)
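Review note: the fix is right, name is the string the loop matched on, so the warning now reports something meaningful. A hypothetical alternative the commit does not take: logging.exception logs the message plus the active traceback in one call, replacing the manual sys.exc_info() / print_tb pair:

import logging

# Sketch only; archive_all and the demo failure are invented for illustration.
def archive_all(items):
    for name in items:
        try:
            raise RuntimeError("demo failure for " + name)
        except KeyboardInterrupt:
            raise
        except Exception:
            # Appends the current traceback to the log record automatically.
            logging.exception("Error archiving: " + name + "... Continuing.")

archive_all(["nettime-l"])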
@@ -78,17 +79,17 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     html = response.read()
     soup = BeautifulSoup(html, "html5lib")
 
+    logging.debug("collecting: " + url)
+
     # base url
     base_url = url
 
     # collect name
-    e = soup.select('p:nth-of-type(1) title')
-    print(soup)
-
-
-    threads_name = soup.select('p:nth-of-type(1) title')[0].string
+    threads_name = soup.select('head title')[0].string
     threads_name = threads_name.replace(' ', '_')
 
+    logging.debug(threads_name)
+
     # thread data struct
     threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
 
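Review note: the thread-list name is now taken from head title, and the old p:nth-of-type(1) title selector plus the print(soup) debugging are dropped. A small self-contained check of the new selector; the sample HTML is invented:

from bs4 import BeautifulSoup

# Invented sample page; only the <title> matters for this check.
html = "<html><head><title>nettime-l 1901</title></head><body></body></html>"
soup = BeautifulSoup(html, "html5lib")
threads_name = soup.select('head title')[0].string.replace(' ', '_')
print(threads_name)  # nettime-l_1901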
@@ -110,14 +111,18 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
         n += 1
         logging.info("> " + str(n) + " / " + nbr_threads)
 
-        try:
-            thread = archive_thread(l, base_url, None)
-            threads['threads'].append(thread)
-        except:
-            ex_type, ex, tb = sys.exc_info()
-            traceback.print_tb(tb)
-            del tb
-            continue
+        thread = archive_thread(l, base_url, None)
+        threads['threads'].append(thread)
+
+        # try:
+        #     thread = archive_thread(l, base_url, None)
+        #     threads['threads'].append(thread)
+        # except:
+        #     ex_type, ex, tb = sys.exc_info()
+        #     traceback.print_tb(tb)
+        #     del tb
+        #     continue
+
 
         time.sleep(DELAY)
 
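Review note: with the try/except commented out, one failing thread now aborts the whole collection run instead of being logged and skipped, which is a reasonable trade while debugging but worth revisiting for long crawls. A minimal runnable sketch of the difference; archive_one and the DELAY value are stand-ins, not names from this file:

import time

DELAY = 0.1  # stand-in; the crawler defines its own DELAY constant

def archive_one(i):
    # stand-in for archive_thread(l, base_url, None)
    if i == 2:
        raise RuntimeError("simulated fetch error")
    return i

threads = {'threads': []}
for i in range(4):
    try:
        # As committed (no handler), the error at i == 2 would abort here.
        threads['threads'].append(archive_one(i))
    except Exception:
        # The commented-out handler's behavior: skip and keep crawling.
        continue
    time.sleep(DELAY)

print(threads['threads'])  # [0, 1, 3]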
@@ -165,6 +170,8 @@ def archive_thread(li, base_url, parent_thread_data):
 
 def collect_message(url, message):
 
+    logging.debug("collecting message: " + url)
+
     response = urllib.request.urlopen(url)
     html = response.read().decode(encoding="utf-8")
     # html = response.read()
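Review note: decode(encoding="utf-8") raises UnicodeDecodeError on the occasional archive page that is not valid UTF-8. A tolerant variant as a sketch; the errors="replace" choice is an assumption, not something this commit adds:

import urllib.request

def fetch_text(url):
    # errors="replace" swaps undecodable bytes for U+FFFD instead of
    # raising UnicodeDecodeError mid-crawl.
    response = urllib.request.urlopen(url)
    return response.read().decode(encoding="utf-8", errors="replace")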
@@ -193,21 +200,12 @@ def collect_message(url, message):
         if field.lower() in message_labels:
             message[field.lower()] = i.text.strip(field + ": ")
 
-    ## reformat from -- [author_name, email_addr]
-
-    # from_addr = email.utils.parseaddr(message['from'])
-    # message['author_name'] = from_addr[0]
-    # message['from'] = from_addr[1]
+    # old way
+    # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
-    ## -- content --
-    # test
-    # c1 = soup.select('pre:nth-of-type(1)')
-    # if len(c1) > 0:
-    #     message['content'] = c1[0].text
-    # else:
-    #     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
-
-    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+    # new way
+    message['content'] = parse_xmessage(html)
+
 
     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
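Review note: the header loop above survives the commit, and its i.text.strip(field + ": ") is subtly lossy: str.strip removes a set of characters from both ends, not a literal prefix, so characters that happen to appear in the field name can also be eaten off the end of the value. A one-line demonstration with an invented address:

# str.strip treats its argument as a character set, so the trailing
# "om" of the address is removed too ('o' and 'm' occur in "From: "):
print("From: someone@example.com".strip("From: "))  # someone@example.c

On Python 3.9+, removeprefix("From: ") would strip only the literal label; noted here as a possible follow-up, since the commit does not touch that line.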
@@ -217,5 +215,9 @@ def parse_xcomment(soup, xcom):
         return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
     return com
 
-def test_xcomment(soup):
-    return soup.find(text=re.compile('X-Message-Id')) is not None
+# (edit 21.12.2019): this is the new way as of 2018 -- when no more moderation on Nettime...
+def parse_xmessage(html):
+    rr = r'<!--X-Body-of-Message-->.*?<!--X-Body-of-Message-End-->'
+    s = re.search(rr, html, re.DOTALL)
+    se = BeautifulSoup(s.group(), "html5lib")
+    return se.get_text()
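Review note: parse_xmessage relies on MHonArc's X-Body-of-Message comment markers; if they are absent, re.search returns None and s.group() raises AttributeError. A self-contained usage sketch with an invented message body (the markers themselves are the real ones the function searches for):

import re
from bs4 import BeautifulSoup

# Invented MHonArc-style fragment.
html = """<html><body>
<!--X-Body-of-Message-->
<pre>Hello nettime,</pre>
<!--X-Body-of-Message-End-->
</body></html>"""

rr = r'<!--X-Body-of-Message-->.*?<!--X-Body-of-Message-End-->'
s = re.search(rr, html, re.DOTALL)   # None if the markers are missing
se = BeautifulSoup(s.group(), "html5lib")
print(se.get_text().strip())         # Hello nettime,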