archive work

2019-12-31 17:53:47 +01:00
parent 8a92b3b1be
commit 7892f1fa64
4 changed files with 33 additions and 21 deletions
@@ -16,11 +16,11 @@ def collect_from_url(url, name, base_archive_dir):
 	soup = lists.util.request(url)

 	threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
-	lists = []
+	li = []
 	for t in threads_list:
 		thread_label = t.text.strip()
 		thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
-		lists.append((thread_label, thread_url))
+		li.append((thread_label, thread_url))

 	# create (main) directory 
 	# this is where all temp files will be created
@@ -29,9 +29,9 @@ def collect_from_url(url, name, base_archive_dir):
 		os.makedirs(d)		

 	threads = []
-	nbr_threads = str(len(lists))
+	nbr_threads = str(len(li))
 	n = 0
-	for l in lists: ### change this
+	for l in li: ### change this
 		n += 1
 		logging.info("## " + str(n) + " / " + nbr_threads + " ##")
 		try:
@@ -85,18 +85,18 @@ def collect_threads_from_url(url, name, base_arch_dir):
 	soup = lists.util.request(url)

 	table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
-	lists = []
+	li = []
 	for tr in table:
 		if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
-			lists.append(tr)
+			li.append(tr)

 	# the thread structure here is flat (i.e. non-hierarchical), unlike pipermail;
 	# hence the thread parsing algorithm is also flat (i.e. a single loop)

-	nbr_msgs = str(len(lists))
+	nbr_msgs = str(len(li))
 	n = 0	
 	last_message = None	
-	for tr in lists:
+	for tr in li:
 		n += 1
 		logging.info("	> " + str(n) + "/" + nbr_msgs)
 		td = tr.find_all('td')
@@ -16,7 +16,7 @@ def collect_from_url(url, name, base_archive_dir):
 	# soup = BeautifulSoup(html, "html5lib")

 	threads_list = soup.find_all('tr')
-	lists = []
+	li = []
 	for t in threads_list[1:]:
 		cols = t.find_all('td')
 		if len(cols) < 2:
@@ -25,7 +25,7 @@ def collect_from_url(url, name, base_archive_dir):
 		thread_url = cols[1].select('a:nth-of-type(1)')[0].get('href') 	# this is relative
 		url = (url + "/") if not url.endswith('/') else url
 		thread_url = urllib.parse.urljoin(url, thread_url)
-		lists.append((thread_label, thread_url)) 						# list of tuples
+		li.append((thread_label, thread_url)) 						# list of tuples

 	# create (main) directory 
 	# this is where all temp files will be created
@@ -34,9 +34,9 @@ def collect_from_url(url, name, base_archive_dir):
 		os.makedirs(d)

 	threads = []
-	nbr_threads = str(len(lists))
+	nbr_threads = str(len(li))
 	n = 0
-	for l in lists: ### change this
+	for l in li: ### change this
 		n += 1
 		logging.info("## " + str(n) + " / " + nbr_threads + " ##")
 		try:
@@ -83,14 +83,14 @@ def collect_threads_from_url(url, name, base_arch_dir):
 	# soup = BeautifulSoup(html, "html5lib")

 	ul = soup.find_all('ul')[1];
-	lists = ul.find_all('li', recursive=False)
+	li = ul.find_all('li', recursive=False)

 	is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None

-	#lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
-	nbr_msgs = str(len(lists))
+	#li = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
+	nbr_msgs = str(len(li))
 	n = 0		
-	for li in lists:
+	for li in li:
 		n += 1
 		logging.info("	> " + str(n) + "/" + nbr_msgs)
 		try: