Fixed MHonArc thread archiving: skip list items that have no thread link and follow only direct child replies.
This commit is contained in:
parent
3c86e39f4e
commit
904c8c1e2d
@ -2,8 +2,7 @@ name: listservs
|
|||||||
channels:
|
channels:
|
||||||
- defaults
|
- defaults
|
||||||
dependencies:
|
dependencies:
|
||||||
- ca-certificates=2019.5.15=0
|
- openssl=1.0.2
|
||||||
- openssl=1.0.2s=h1de35cc_0
|
|
||||||
- pip=9.0.1=py34_1
|
- pip=9.0.1=py34_1
|
||||||
- python=3.4.5=0
|
- python=3.4.5=0
|
||||||
- readline=6.2=2
|
- readline=6.2=2
|
||||||
@ -11,8 +10,8 @@ dependencies:
|
|||||||
- sqlite=3.13.0=0
|
- sqlite=3.13.0=0
|
||||||
- tk=8.5.18=0
|
- tk=8.5.18=0
|
||||||
- wheel=0.29.0=py34_0
|
- wheel=0.29.0=py34_0
|
||||||
- xz=5.2.4=h1de35cc_4
|
- xz=5.2.2
|
||||||
- zlib=1.2.11=h1de35cc_3
|
- zlib=1.2.8
|
||||||
- pip:
|
- pip:
|
||||||
- beautifulsoup4==4.7.1
|
- beautifulsoup4==4.7.1
|
||||||
- click==7.0
|
- click==7.0
|
||||||
|
|||||||
@ -25,13 +25,16 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
|
|||||||
|
|
||||||
# special case -- nettime.
|
# special case -- nettime.
|
||||||
# the name should be the sublist_name (i.e nettime-l)
|
# the name should be the sublist_name (i.e nettime-l)
|
||||||
elif "nettime" in name:
|
elif "nettime" in url:
|
||||||
if sublist_name is None:
|
if sublist_name is None:
|
||||||
sublist_name = name
|
sublist_name = name
|
||||||
mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
|
mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
|
||||||
|
|
||||||
elif "oldboys" in name:
|
# elif "oldboys" in name:
|
||||||
mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
|
# mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
|
||||||
|
|
||||||
|
# elif "oldboys" in name:
|
||||||
|
# mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print('mhonarc?')
|
print('mhonarc?')
|
||||||
|
|||||||
@ -131,7 +131,8 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
|
|||||||
logging.info("> " + str(n) + " / " + nbr_threads)
|
logging.info("> " + str(n) + " / " + nbr_threads)
|
||||||
|
|
||||||
thread = archive_thread(l, base_url, None)
|
thread = archive_thread(l, base_url, None)
|
||||||
threads['threads'].append(thread)
|
if thread is not None:
|
||||||
|
threads['threads'].append(thread)
|
||||||
|
|
||||||
|
|
||||||
# try:
|
# try:
|
||||||
@ -159,7 +160,11 @@ def collect_threads_from_url(url, base_archive_dir, mbox=False):
|
|||||||
|
|
||||||
def archive_thread(li, base_url, parent_thread_data):
|
def archive_thread(li, base_url, parent_thread_data):
|
||||||
|
|
||||||
thread_link = li.select('strong a')[0]
|
thread_link = li.select('strong a')
|
||||||
|
if len(thread_link) == 0: # usually "<Possible follow-ups>"
|
||||||
|
return None
|
||||||
|
|
||||||
|
thread_link = thread_link[0]
|
||||||
thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
|
thread_url = urllib.parse.urljoin(base_url, thread_link.get('href'))
|
||||||
thread_id = thread_link.get('name')
|
thread_id = thread_link.get('name')
|
||||||
thread_title = thread_link.string
|
thread_title = thread_link.string
|
||||||
@ -169,12 +174,15 @@ def archive_thread(li, base_url, parent_thread_data):
|
|||||||
|
|
||||||
collect_message(thread_url, message)
|
collect_message(thread_url, message)
|
||||||
|
|
||||||
follow = li.select('ul > li')
|
# follow = li.select('ul:nth-of-type(1) > li')
|
||||||
if len(follow) > 0:
|
ul = li.findChildren("ul" , recursive=False)
|
||||||
for f in follow:
|
if len(ul) > 0:
|
||||||
follow_link = f.select('strong a')
|
follow = ul[0].findChildren("li" , recursive=False)
|
||||||
if len (follow_link) > 0:
|
if len(follow) > 0:
|
||||||
archive_thread(f, base_url, message) ## recursion
|
for f in follow:
|
||||||
|
follow_link = f.select('strong a')
|
||||||
|
if len (follow_link) > 0:
|
||||||
|
archive_thread(f, base_url, message) ## recursion
|
||||||
|
|
||||||
if parent_thread_data is None:
|
if parent_thread_data is None:
|
||||||
return message
|
return message
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user