archive work
This commit is contained in:
+8
-8
@@ -16,11 +16,11 @@ def collect_from_url(url, name, base_archive_dir):
|
||||
soup = lists.util.request(url)
|
||||
|
||||
threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
|
||||
lists = []
|
||||
li = []
|
||||
for t in threads_list:
|
||||
thread_label = t.text.strip()
|
||||
thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
|
||||
lists.append((thread_label, thread_url))
|
||||
li.append((thread_label, thread_url))
|
||||
|
||||
# create (main) directory
|
||||
# this is where all temp files will be created
|
||||
@@ -29,9 +29,9 @@ def collect_from_url(url, name, base_archive_dir):
|
||||
os.makedirs(d)
|
||||
|
||||
threads = []
|
||||
nbr_threads = str(len(lists))
|
||||
nbr_threads = str(len(li))
|
||||
n = 0
|
||||
for l in lists: ### change this
|
||||
for l in li: ### change this
|
||||
n += 1
|
||||
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
|
||||
try:
|
||||
@@ -85,18 +85,18 @@ def collect_threads_from_url(url, name, base_arch_dir):
|
||||
soup = lists.util.request(url)
|
||||
|
||||
table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
|
||||
lists = []
|
||||
li = []
|
||||
for tr in table:
|
||||
if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
|
||||
lists.append(tr)
|
||||
li.append(tr)
|
||||
|
||||
# the thread structure here is flat -- re: non-hierarchical, unlike pipermail
|
||||
# hence the thread parsing algorithm will also be flat -- re: a single loop
|
||||
|
||||
nbr_msgs = str(len(lists))
|
||||
nbr_msgs = str(len(li))
|
||||
n = 0
|
||||
last_message = None
|
||||
for tr in lists:
|
||||
for tr in li:
|
||||
n += 1
|
||||
logging.info(" > " + str(n) + "/" + nbr_msgs)
|
||||
td = tr.find_all('td')
|
||||
|
||||
+8
-8
@@ -16,7 +16,7 @@ def collect_from_url(url, name, base_archive_dir):
|
||||
# soup = BeautifulSoup(html, "html5lib")
|
||||
|
||||
threads_list = soup.find_all('tr')
|
||||
lists = []
|
||||
li = []
|
||||
for t in threads_list[1:]:
|
||||
cols = t.find_all('td')
|
||||
if len(cols) < 2:
|
||||
@@ -25,7 +25,7 @@ def collect_from_url(url, name, base_archive_dir):
|
||||
thread_url = cols[1].select('a:nth-of-type(1)')[0].get('href') # this is relative
|
||||
url = (url + "/") if not url.endswith('/') else url
|
||||
thread_url = urllib.parse.urljoin(url, thread_url)
|
||||
lists.append((thread_label, thread_url)) # list of tuples
|
||||
li.append((thread_label, thread_url)) # list of tuples
|
||||
|
||||
# create (main) directory
|
||||
# this is where all temp files will be created
|
||||
@@ -34,9 +34,9 @@ def collect_from_url(url, name, base_archive_dir):
|
||||
os.makedirs(d)
|
||||
|
||||
threads = []
|
||||
nbr_threads = str(len(lists))
|
||||
nbr_threads = str(len(li))
|
||||
n = 0
|
||||
for l in lists: ### change this
|
||||
for l in li: ### change this
|
||||
n += 1
|
||||
logging.info("## " + str(n) + " / " + nbr_threads + " ##")
|
||||
try:
|
||||
@@ -83,14 +83,14 @@ def collect_threads_from_url(url, name, base_arch_dir):
|
||||
# soup = BeautifulSoup(html, "html5lib")
|
||||
|
||||
ul = soup.find_all('ul')[1];
|
||||
lists = ul.find_all('li', recursive=False)
|
||||
li = ul.find_all('li', recursive=False)
|
||||
|
||||
is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
|
||||
|
||||
#lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
|
||||
nbr_msgs = str(len(lists))
|
||||
#li = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
|
||||
nbr_msgs = str(len(li))
|
||||
n = 0
|
||||
for li in lists:
|
||||
for li in li:
|
||||
n += 1
|
||||
logging.info(" > " + str(n) + "/" + nbr_msgs)
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user