From e560cb4cdb68218fcd5eb2d977c41c5ed3186a28 Mon Sep 17 00:00:00 2001
From: gauthiier
Date: Sat, 21 Dec 2019 14:35:37 +0100
Subject: [PATCH] updated nettime crawler

---
 archive.py => crawl.py   |  0
 lists/mhonarc_nettime.py | 72 +++++++++++++++++++++-------------------
 2 files changed, 37 insertions(+), 35 deletions(-)
 rename archive.py => crawl.py (100%)

diff --git a/archive.py b/crawl.py
similarity index 100%
rename from archive.py
rename to crawl.py
diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
index 6380c75..5fc08cf 100644
--- a/lists/mhonarc_nettime.py
+++ b/lists/mhonarc_nettime.py
@@ -13,9 +13,11 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     # base url
     base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
 
-    #collect name
-    list_name = soup.select('body p:nth-of-type(2) title')[0].string
-    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+    logging.debug(base_url)
+
+    # #collect name
+    # list_name = soup.select('body p:nth-of-type(2) title')[0].string
+    # logging.info("Getting " + list_name + " list archive for " + sublist_name)
 
     # create (main) directory
     # this is where all temp files will be created
@@ -26,20 +28,19 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
     threads = []
 
     lists = soup.select('ul:nth-of-type(2) li')
-
     for l in lists:
 
         if l.strong is None:
             continue
 
         name = l.strong.string
-        print(name + " - " + sublist_name)
 
         if name.lower() == sublist_name.lower():
 
+            logging.debug(name)
+
             threads_url_list = []
             threads_links = l.select('ul li a')
-
             for t in threads_links:
                 thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                 threads_url_list.append(thread_url)
@@ -56,7 +57,7 @@ def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=
                 except KeyboardInterrupt:
                     sys.exit(0)
                 except:
-                    logging.warning("Error archiving: " + l[1] + "... Continuing.")
+                    logging.warning("Error archiving: " + name + "... Continuing.")
                     ex_t, ex, tb = sys.exc_info()
                     print(ex_t)
                     traceback.print_tb(tb)
@@ -78,17 +79,17 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
     html = response.read()
     soup = BeautifulSoup(html, "html5lib")
 
+    logging.debug("collecting: " + url)
+
     # base url
     base_url = url
 
     # collect name
-
-    e = soup.select('p:nth-of-type(1) title')
-    print(soup)
-
-    threads_name = soup.select('p:nth-of-type(1) title')[0].string
+    threads_name = soup.select('head title')[0].string
     threads_name = threads_name.replace(' ', '_')
 
+    logging.debug(threads_name)
+
     # thread data struct
     threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
 
@@ -110,14 +111,18 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
         n += 1
         logging.info("> " + str(n) + " / " + nbr_threads)
 
-        try:
-            thread = archive_thread(l, base_url, None)
-            threads['threads'].append(thread)
-        except:
-            ex_type, ex, tb = sys.exc_info()
-            traceback.print_tb(tb)
-            del tb
-            continue
+        thread = archive_thread(l, base_url, None)
+        threads['threads'].append(thread)
+
+
+        # try:
+        #     thread = archive_thread(l, base_url, None)
+        #     threads['threads'].append(thread)
+        # except:
+        #     ex_type, ex, tb = sys.exc_info()
+        #     traceback.print_tb(tb)
+        #     del tb
+        #     continue
 
         time.sleep(DELAY)
 
@@ -165,6 +170,8 @@ def archive_thread(li, base_url, parent_thread_data):
 
 def collect_message(url, message):
 
+    logging.debug("collecting message: " + url)
+
     response = urllib.request.urlopen(url)
     html = response.read().decode(encoding="utf-8")
     # html = response.read()
@@ -193,21 +200,12 @@ def collect_message(url, message):
         if field.lower() in message_labels:
             message[field.lower()] = i.text.strip(field + ": ")
 
-    ## reformat from -- [author_name, email_addr]
-    # from_addr = email.utils.parseaddr(message['from'])
-    # message['author_name'] = from_addr[0]
-    # message['from'] = from_addr[1]
+    # old way
+    # message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
-    ## -- content --
-    # test
-    # c1 = soup.select('pre:nth-of-type(1)')
-    # if len(c1) > 0:
-    #     message['content'] = c1[0].text
-    # else:
-    #     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
-
-    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+    # new way
+    message['content'] = parse_xmessage(html)
 
     # mhonarc xcomments
     # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
@@ -217,5 +215,9 @@ def parse_xcomment(soup, xcom):
             return com.strip('').strip(xcom + ":").strip()
     return com
 
-def test_xcomment(soup):
-    return soup.find(text=re.compile('X-Message-Id')) is not None
+# (edit 21.12.2019): this is the new way as of 2018 -- when no more moderation on Nettime...
+def parse_xmessage(html):
+    rr = r'.*?'
+    s = re.search(rr, html, re.DOTALL)
+    se = BeautifulSoup(s.group(), "html5lib")
+    return se.get_text()