updated nettime crawler

gauthiier 2019-12-21 14:35:37 +01:00
parent 62fea815d1
commit e560cb4cdb
2 changed files with 37 additions and 35 deletions


@@ -13,9 +13,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+    logging.debug(base_url)
 
-    #collect name
-    list_name = soup.select('body p:nth-of-type(2) title')[0].string
-    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+    # #collect name
+    # list_name = soup.select('body p:nth-of-type(2) title')[0].string
+    # logging.info("Getting " + list_name + " list archive for " + sublist_name)
 
     # create (main) directory
     # this is where all temp files will be created
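
Note: the base_url read here is the <base href="..."> element that MHonArc writes into its index pages; the relative thread links gathered in the next hunk are resolved against it with urllib.parse.urljoin. A minimal sketch of that resolution step, using made-up markup (and a simplified selector) in place of a real Nettime index page:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Hypothetical markup standing in for an MHonArc-generated index page.
html = """
<html><body>
<p><base href="https://example.org/archives/nettime-l/"></p>
<ul><li><strong>nettime-l</strong>
  <ul><li><a href="2019/threads.html">December 2019</a></li></ul>
</li></ul>
</body></html>
"""

soup = BeautifulSoup(html, "html5lib")
base_url = soup.find("base").get("href")              # simplified selector for this sketch
href = soup.select("ul li ul li a")[0].get("href")    # a relative thread link
print(urljoin(base_url, href))                        # https://example.org/archives/nettime-l/2019/threads.html
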
@@ -26,20 +28,19 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     threads = []
     lists = soup.select('ul:nth-of-type(2) li')
 
     for l in lists:
 
         if l.strong is None:
             continue
 
         name = l.strong.string
-        print(name + " - " + sublist_name)
 
         if name.lower() == sublist_name.lower():
+            logging.debug(name)
 
             threads_url_list = []
             threads_links = l.select('ul li a')
 
             for t in threads_links:
                 thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                 threads_url_list.append(thread_url)
@@ -56,7 +57,7 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
             except KeyboardInterrupt:
                 sys.exit(0)
             except:
-                logging.warning("Error archiving: " + l[1] + "... Continuing.")
+                logging.warning("Error archiving: " + name + "... Continuing.")
                 ex_t, ex, tb = sys.exc_info()
                 print(ex_t)
                 traceback.print_tb(tb)
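
Note: the fix here replaces l[1] (indexing the bs4 <li> tag, which would itself raise inside the handler) with the name string already extracted from l.strong. A self-contained sketch of the same log-and-continue pattern, with a hypothetical archive() standing in for the crawler's per-sublist work:

import logging
import sys
import traceback

logging.basicConfig(level=logging.DEBUG)

def archive(name):
    # hypothetical stand-in for the per-sublist archiving work
    if name == "broken":
        raise ValueError("could not parse archive page")

for name in ["nettime-l", "broken", "nettime-ann"]:
    try:
        archive(name)
    except KeyboardInterrupt:
        sys.exit(0)
    except:
        logging.warning("Error archiving: " + name + "... Continuing.")
        ex_t, ex, tb = sys.exc_info()
        print(ex_t)
        traceback.print_tb(tb)
        del tb      # drop the traceback reference so the frame can be freed
        continue
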
@@ -78,17 +79,17 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     html = response.read()
     soup = BeautifulSoup(html, "html5lib")
+    logging.debug("collecting: " + url)
 
     # base url
     base_url = url
 
     # collect name
-    threads_name = soup.select('head title')[0].string
-    e = soup.select('p:nth-of-type(1) title')
-    print(soup)
+    threads_name = soup.select('p:nth-of-type(1) title')[0].string
     threads_name = threads_name.replace(' ', '_')
+    logging.debug(threads_name)
 
     # thread data struct
     threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
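
Note: for orientation, threads is a plain nested dict keyed by name, url and threads; a sketch of the shape it ends up with (keys from the line above, values hypothetical):

threads = {
    "name": "nettime-l_December_2019",
    "url": "https://example.org/archives/nettime-l/threads.html",
    "threads": [
        # filled by archive_thread() further down, one entry per top-level thread
    ],
}
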
@@ -110,14 +111,18 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
         n += 1
         logging.info("> " + str(n) + " / " + nbr_threads)
 
-        try:
-            thread = archive_thread(l, base_url, None)
-            threads['threads'].append(thread)
-        except:
-            ex_type, ex, tb = sys.exc_info()
-            traceback.print_tb(tb)
-            del tb
-            continue
+        thread = archive_thread(l, base_url, None)
+        threads['threads'].append(thread)
+
+        # try:
+        #     thread = archive_thread(l, base_url, None)
+        #     threads['threads'].append(thread)
+        # except:
+        #     ex_type, ex, tb = sys.exc_info()
+        #     traceback.print_tb(tb)
+        #     del tb
+        #     continue
 
         time.sleep(DELAY)
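
Note: with the inner try/except commented out, an exception in archive_thread now propagates out of this loop and is only caught by the handler in collect_from_url (see the hunk at -56,7 above). The time.sleep(DELAY) call is the politeness delay between thread fetches; a minimal sketch of the same throttled-loop pattern, with a hypothetical DELAY value and a stub in place of the real fetch:

import time

DELAY = 2   # hypothetical value; the real DELAY is defined elsewhere in the crawler

def fetch_thread(url):
    # hypothetical stand-in for archive_thread()'s HTTP and parsing work
    print("fetching", url)

for url in ["https://example.org/archives/msg00001.html",
            "https://example.org/archives/msg00002.html"]:
    fetch_thread(url)
    time.sleep(DELAY)   # throttle requests so the archive server is not hammered
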
@@ -165,6 +170,8 @@ def archive_thread(li, base_url, parent_thread_data):
 def collect_message(url, message):
 
+    logging.debug("collecting message: " + url)
+
     response = urllib.request.urlopen(url)
     html = response.read().decode(encoding="utf-8")
     # html = response.read()
@ -193,21 +200,12 @@ def collect_message(url, message):
if field.lower() in message_labels: if field.lower() in message_labels:
message[field.lower()] = i.text.strip(field + ": ") message[field.lower()] = i.text.strip(field + ": ")
## reformat from -- [author_name, email_addr]
# from_addr = email.utils.parseaddr(message['from']) # old way
# message['author_name'] = from_addr[0] # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# message['from'] = from_addr[1]
## -- content -- # new way
# test message['content'] = parse_xmessage(html)
# c1 = soup.select('pre:nth-of-type(1)')
# if len(c1) > 0:
# message['content'] = c1[0].text
# else:
# message['content'] = soup.select('pre:nth-of-type(2)')[0].text
message['content'] = soup.select('pre:nth-of-type(2)')[0].text
# mhonarc xcomments # mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -217,5 +215,9 @@ def parse_xcomment(soup, xcom):
         return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
     return com
 
-def test_xcomment(soup):
-    return soup.find(text=re.compile('X-Message-Id')) is not None
+# (edit 21.12.2019): this is the new way as of 2018 -- when no more moderation on Nettime...
+def parse_xmessage(html):
+    rr = r'<!--X-Body-of-Message-->.*?<!--X-Body-of-Message-End-->'
+    s = re.search(rr, html, re.DOTALL)
+    se = BeautifulSoup(s.group(), "html5lib")
+    return se.get_text()
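
Note: a quick usage sketch of the new parse_xmessage, run against a fragment shaped like an MHonArc message page. The X-Body-of-Message comment markers are the ones the regex targets; the rest of the sample markup is made up, and the function body is copied from the hunk above:

import re
from bs4 import BeautifulSoup

def parse_xmessage(html):
    # same function as added in this commit
    rr = r'<!--X-Body-of-Message-->.*?<!--X-Body-of-Message-End-->'
    s = re.search(rr, html, re.DOTALL)
    se = BeautifulSoup(s.group(), "html5lib")
    return se.get_text()

# Hypothetical fragment standing in for an MHonArc-generated message page.
sample = """
<!--X-Message-Id: 20191221120000.GA1234@example.org -->
<!--X-Body-of-Message-->
<pre>Hello Nettime,

this is the body of the message.</pre>
<!--X-Body-of-Message-End-->
"""

print(parse_xmessage(sample))
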