fixed mhonarc threads archiving...

This commit is contained in:
gauthiier 2019-12-22 17:14:17 +01:00
parent 3c86e39f4e
commit 904c8c1e2d
3 changed files with 25 additions and 15 deletions

View File

@ -2,8 +2,7 @@ name: listservs
channels: channels:
- defaults - defaults
dependencies: dependencies:
- ca-certificates=2019.5.15=0 - openssl=1.0.2
- openssl=1.0.2s=h1de35cc_0
- pip=9.0.1=py34_1 - pip=9.0.1=py34_1
- python=3.4.5=0 - python=3.4.5=0
- readline=6.2=2 - readline=6.2=2
@ -11,8 +10,8 @@ dependencies:
- sqlite=3.13.0=0 - sqlite=3.13.0=0
- tk=8.5.18=0 - tk=8.5.18=0
- wheel=0.29.0=py34_0 - wheel=0.29.0=py34_0
- xz=5.2.4=h1de35cc_4 - xz=5.2.2
- zlib=1.2.11=h1de35cc_3 - zlib=1.2.8
- pip: - pip:
- beautifulsoup4==4.7.1 - beautifulsoup4==4.7.1
- click==7.0 - click==7.0

View File

@ -25,13 +25,16 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
# special case -- nettime. # special case -- nettime.
# the name should be the sublist_name (i.e. nettime-l) # the name should be the sublist_name (i.e. nettime-l)
elif "nettime" in name: elif "nettime" in url:
if sublist_name is None: if sublist_name is None:
sublist_name = name sublist_name = name
mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir) mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
elif "oldboys" in name: # elif "oldboys" in name:
mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir) # mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
# elif "oldboys" in name:
# mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
else: else:
print('mhonarc?') print('mhonarc?')

View File

@ -131,7 +131,8 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
logging.info("> " + str(n) + " / " + nbr_threads) logging.info("> " + str(n) + " / " + nbr_threads)
thread = archive_thread(l, base_url, None) thread = archive_thread(l, base_url, None)
threads['threads'].append(thread) if thread is not None:
threads['threads'].append(thread)
# try: # try:
@ -159,7 +160,11 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
def archive_thread(li, base_url, parent_thread_data): def archive_thread(li, base_url, parent_thread_data):
thread_link = li.select('strong a')[0] thread_link = li.select('strong a')
if len(thread_link) == 0: # usually "<Possible follow-ups>"
return None
thread_link = thread_link[0]
thread_url = urllib.parse.urljoin(base_url, thread_link.get('href')) thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
thread_id = thread_link.get('name') thread_id = thread_link.get('name')
thread_title = thread_link.string thread_title = thread_link.string
@ -169,12 +174,15 @@ def archive_thread(li, base_url, parent_thread_data):
collect_message(thread_url, message) collect_message(thread_url, message)
follow = li.select('ul > li') # follow = li.select('ul:nth-of-type(1) > li')
if len(follow) > 0: ul = li.findChildren("ul" , recursive=False)
for f in follow: if len(ul) > 0:
follow_link = f.select('strong a') follow = ul[0].findChildren("li" , recursive=False)
if len (follow_link) > 0: if len(follow) > 0:
archive_thread(f, base_url, message) ## recursion for f in follow:
follow_link = f.select('strong a')
if len (follow_link) > 0:
archive_thread(f, base_url, message) ## recursion
if parent_thread_data is None: if parent_thread_data is None:
return message return message