archive work

gauthiier 2019-12-31 17:53:47 +01:00
parent 8a92b3b1be
commit 7892f1fa64
4 changed files with 33 additions and 21 deletions

View File

@@ -133,8 +133,7 @@ class Archive:
         logging.info(" - done.")

-    def insert_db(self, config=None):
+    def insert_db(self, config=None, file=None):
         if self.db_con is None:
             if config is not None:
@@ -143,12 +142,24 @@ class Archive:
         if self.db_con is None:
             return
+        tag = file
+        if file is not None:
+            tag = file.replace("_", " ")
         try:
             cursor = self.db_con.cursor()
             progress = terminal.progress.ProgressBar(self.archive_name, len(self.data), fmt=terminal.progress.ProgressBar.FULL)
             for t in self.data:
+                # print(tag)
+                print(t['name'])
+                if file is not None and t['name'] != tag:
+                    continue
                 logging.info("inserting " + t['name'])
                 n_inserted = self.recursive_insert_db(cursor, t["threads"])
                 logging.info(" - " + str(n_inserted))

View File

@@ -9,7 +9,7 @@ logging.basicConfig(level=logging.DEBUG)
 def list_archives(archives_dir):
     return [d for d in os.listdir(archives_dir) if os.path.isdir(os.path.join(archives_dir, d))]

-def run(lists, archives):
+def run(lists, archives, file=None):
     logging.debug("indexing: " + str(lists) + " from " + archives)
     lists_db = archive.list_tables_db_config(config.db)
@@ -21,12 +21,13 @@ def run(lists, archives):
         else:
             logging.info("Table not created. Aborting.")
             return
-        ar.insert_db(config.db)
+        ar.insert_db(config.db, file=file)

 if __name__ == "__main__":
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('list', metavar="list", help="list(s) to index", nargs="+")
+    p.add_argument('--file', metavar="file", help="archive file to index")
     p.add_argument('--archives', '-a', help="path to archives directory (default='archives')", default=config.archives)
     args = p.parse_args()
@@ -37,6 +38,6 @@ if __name__ == "__main__":
     if len(args.list) == 1 and args.list[0] == "all":
         args.list = list_archives(args.archives)
-    run(args.list, args.archives)
+    run(args.list, args.archives, args.file)
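
With the flag wired through run() into insert_db, a single archive file can be re-indexed without re-inserting the whole list. A hypothetical invocation; the script name, list name, and file name below are placeholders, since the diff does not show them:

    # index one list in full
    python index.py nettime-l

    # index only one archive file of that list (new in this commit)
    python index.py nettime-l --file 2019_December

    # index every list found under a custom archives directory
    python index.py all -a /path/to/archives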

View File

@@ -16,11 +16,11 @@ def collect_from_url(url, name, base_archive_dir):
     soup = lists.util.request(url)
     threads_list = soup.find_all('tr', {'class': 'normalgroup'})[0].find_all('li')
-    lists = []
+    li = []
     for t in threads_list:
         thread_label = t.text.strip()
         thread_url = urllib.parse.urljoin(url, t.select('a')[0].get('href'))
-        lists.append((thread_label, thread_url))
+        li.append((thread_label, thread_url))

     # create (main) directory
     # this is where all temp files will be created
@@ -29,9 +29,9 @@ def collect_from_url(url, name, base_archive_dir):
         os.makedirs(d)
     threads = []
-    nbr_threads = str(len(lists))
+    nbr_threads = str(len(li))
     n = 0
-    for l in lists: ### change this
+    for l in li: ### change this
         n += 1
         logging.info("## " + str(n) + " / " + nbr_threads + " ##")
         try:
@@ -85,18 +85,18 @@ def collect_threads_from_url(url, name, base_arch_dir):
     soup = lists.util.request(url)
     table = soup.find_all('table', {'class': 'tableframe'})[1].find_all('tr')
-    lists = []
+    li = []
     for tr in table:
         if tr.has_attr('class') and (tr['class'][0] == 'normalgroup' or tr['class'][0] == 'emphasizedgroup'):
-            lists.append(tr)
+            li.append(tr)

     # the thread structure here is flat -- re: non-hierarchical, unlike pipermail
     # hence the thread parsing algorithm will also be flat -- re: a single loop
-    nbr_msgs = str(len(lists))
+    nbr_msgs = str(len(li))
     n = 0
     last_message = None
-    for tr in lists:
+    for tr in li:
         n += 1
         logging.info(" > " + str(n) + "/" + nbr_msgs)
         td = tr.find_all('td')
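
The lists to li rename in this file is not cosmetic. The same functions call lists.util.request(url), so "lists" is also the name of an imported package; assigning to lists anywhere inside a function makes that name local to the entire function body, and the earlier package access then raises UnboundLocalError at call time. A minimal sketch of the failure mode and the fix; the package name follows the diff, the function bodies are illustrative:

    import lists.util  # the project's own package, as used above

    def broken(url):
        soup = lists.util.request(url)  # UnboundLocalError at runtime: the
        lists = []                      # assignment below makes "lists" local
        return soup, lists              # to the *whole* function

    def fixed(url):
        soup = lists.util.request(url)  # "lists" unambiguously names the package
        li = []                         # the renamed local, as in this commit
        return soup, li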

View File

@@ -16,7 +16,7 @@ def collect_from_url(url, name, base_archive_dir):
     # soup = BeautifulSoup(html, "html5lib")
     threads_list = soup.find_all('tr')
-    lists = []
+    li = []
     for t in threads_list[1:]:
         cols = t.find_all('td')
         if len(cols) < 2:
@@ -25,7 +25,7 @@ def collect_from_url(url, name, base_archive_dir):
         thread_url = cols[1].select('a:nth-of-type(1)')[0].get('href') # this is relative
         url = (url + "/") if not url.endswith('/') else url
         thread_url = urllib.parse.urljoin(url, thread_url)
-        lists.append((thread_label, thread_url)) # list of tuples
+        li.append((thread_label, thread_url)) # list of tuples

     # create (main) directory
     # this is where all temp files will be created
@@ -34,9 +34,9 @@ def collect_from_url(url, name, base_archive_dir):
         os.makedirs(d)
     threads = []
-    nbr_threads = str(len(lists))
+    nbr_threads = str(len(li))
     n = 0
-    for l in lists: ### change this
+    for l in li: ### change this
         n += 1
         logging.info("## " + str(n) + " / " + nbr_threads + " ##")
         try:
@@ -83,14 +83,14 @@ def collect_threads_from_url(url, name, base_arch_dir):
     # soup = BeautifulSoup(html, "html5lib")
     ul = soup.find_all('ul')[1];
-    lists = ul.find_all('li', recursive=False)
+    li = ul.find_all('li', recursive=False)
     is_mhonarc_hybrid = soup.find(text=re.compile('MHonArc')) is not None
-    #lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
-    nbr_msgs = str(len(lists))
+    #li = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
+    nbr_msgs = str(len(li))
     n = 0
-    for li in lists:
+    for li in li:
         n += 1
         logging.info(" > " + str(n) + "/" + nbr_msgs)
         try:
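
One wrinkle survives the rename in the last hunk: "for li in li:" is legal Python, because iter(li) is taken before the loop name is rebound, but after the loop "li" refers to the last list-item element and the original result set is no longer reachable under any name. A short demonstration, followed by a hypothetical variant that keeps the two names distinct (not what this commit does):

    items = ['a', 'b', 'c']
    for items in items:   # same shape as "for li in li:" -- iterates fine
        pass
    print(items)          # prints 'c'; the original list is gone

    # a clearer split, with hypothetical names:
    # rows = ul.find_all('li', recursive=False)
    # for row in rows:
    #     ...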