diff --git a/conda_env.yml b/conda_env.yml
index 6e3ab94..11f9052 100644
--- a/conda_env.yml
+++ b/conda_env.yml
@@ -2,8 +2,7 @@ name: listservs
 channels:
 - defaults
 dependencies:
-- ca-certificates=2019.5.15=0
-- openssl=1.0.2s=h1de35cc_0
+- openssl=1.0.2
 - pip=9.0.1=py34_1
 - python=3.4.5=0
 - readline=6.2=2
@@ -11,8 +10,8 @@ dependencies:
 - sqlite=3.13.0=0
 - tk=8.5.18=0
 - wheel=0.29.0=py34_0
-- xz=5.2.4=h1de35cc_4
-- zlib=1.2.11=h1de35cc_3
+- xz=5.2.2
+- zlib=1.2.8
 - pip:
   - beautifulsoup4==4.7.1
   - click==7.0
diff --git a/lists/crawl.py b/lists/crawl.py
index 582052a..2acba64 100644
--- a/lists/crawl.py
+++ b/lists/crawl.py
@@ -25,13 +25,16 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
 
     # special case -- nettime.
     # the name should be the sublist_name (i.e nettime-l)
-    elif "nettime" in name:
+    elif "nettime" in url:
         if sublist_name is None:
             sublist_name = name
         mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
 
-    elif "oldboys" in name:
-        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
+    # elif "oldboys" in name:
+    #     mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
+
+    # elif "oldboys" in name:
+    #     mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
 
     else:
         print('mhonarc?')
diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
index aeba81e..e230594 100644
--- a/lists/mhonarc_nettime.py
+++ b/lists/mhonarc_nettime.py
@@ -131,7 +131,8 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
         logging.info("> " + str(n) + " / " + nbr_threads)
 
         thread = archive_thread(l, base_url, None)
-        threads['threads'].append(thread)
+        if thread is not None:
+            threads['threads'].append(thread)
 
 
         # try:
@@ -159,7 +160,11 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
 
 def archive_thread(li, base_url, parent_thread_data):
 
-    thread_link = li.select('strong a')[0]
+    thread_link = li.select('strong a')
+    if len(thread_link) == 0: # usually ""
+        return None
+
+    thread_link = thread_link[0]
     thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
     thread_id = thread_link.get('name')
     thread_title = thread_link.string
@@ -169,12 +174,15 @@ def archive_thread(li, base_url, parent_thread_data):
 
     collect_message(thread_url, message)
 
-    follow = li.select('ul > li')
-    if len(follow) > 0:
-        for f in follow:
-            follow_link = f.select('strong a')
-            if len (follow_link) > 0:
-                archive_thread(f, base_url, message) ## recursion
+    # follow = li.select('ul:nth-of-type(1) > li')
+    ul = li.findChildren("ul" , recursive=False)
+    if len(ul) > 0:
+        follow = ul[0].findChildren("li" , recursive=False)
+        if len(follow) > 0:
+            for f in follow:
+                follow_link = f.select('strong a')
+                if len (follow_link) > 0:
+                    archive_thread(f, base_url, message) ## recursion
 
     if parent_thread_data is None:
         return message
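Note (not part of the patch): a minimal, hypothetical sketch of why `archive_thread` now walks direct children with `findChildren(..., recursive=False)` instead of `li.select('ul > li')`. In BeautifulSoup the CSS child combinator is evaluated against all descendants of `li`, so `<li>` items of nested reply lists match as well, and a reply-to-a-reply would be archived once via the selector and again when the recursion descends into its parent; restricting the walk to direct children visits each sub-thread exactly once. The thread HTML below is made up for illustration and only mimics the `<li><strong><a>` structure the scraper looks for.

```python
# Hypothetical sketch (not from the repository) of the traversal change.
from bs4 import BeautifulSoup

html = """
<li><strong><a href="msg00001.html" name="00001">Top post</a></strong>
  <ul>
    <li><strong><a href="msg00002.html" name="00002">Re: Top post</a></strong>
      <ul>
        <li><strong><a href="msg00003.html" name="00003">Re: Re: Top post</a></strong></li>
      </ul>
    </li>
  </ul>
</li>
"""

li = BeautifulSoup(html, "html.parser").li

# Old approach: 'ul > li' matches <li> elements under *any* descendant <ul>,
# so the nested reply (msg00003) is returned here too -- and it would be
# visited a second time when the recursion descends into msg00002.
print(len(li.select("ul > li")))  # 2

# New approach: recursive=False limits the search to direct children, so only
# the immediate replies are returned; deeper replies are reached solely by the
# recursive archive_thread call.
ul = li.findChildren("ul", recursive=False)
print(len(ul[0].findChildren("li", recursive=False)))  # 1
```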