pipermail init

2017-07-14 10:54:56 +02:00
parent a3a3be5237
commit cca498d887
4 changed files with 186 additions and 5 deletions
@@ -1,3 +1,7 @@
+# mailinglists specific
+archives/
+setenv
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -1,5 +1,5 @@
 import sys, logging, argparse
-import lists
+import lists.crawl

 logging.basicConfig(level=logging.DEBUG)

@@ -8,9 +8,15 @@ def run(args):
    if not args.url:
        sys.exit('No url(s). Aborting.')

+    if not args.names:
+        args.names = []
+
    ## check valid url?... hmm... nej
+    i = 0
    for u in args.url:
-        lists.crawl.crawl(u)
+        name = args.names[i] if i < len(args.names) else None
+        lists.crawl.crawl(u, name, args.arch)
+        i = i + 1
    
    sys.exit()

@@ -18,6 +24,7 @@ if __name__ == "__main__":

    p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
    p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
+    p.add_argument('--names', help="mailinglists' names", nargs="+")
    p.add_argument('--arch', help="path to archives directory (default='archives')", default="archives")

    args = p.parse_args()
@@ -1,4 +1,21 @@
-# crawl dispatch
+from urllib.parse import urlparse
+import lists.pipermail as pipermail
+
+DELAY = 0.2
+
+def crawl(url, name, archive_dir):
+	"""Dispatch `url` to the matching archive collector (only pipermail so far)."""
+	parsed = urlparse(url)
+	if 'pipermail' not in parsed.path:
+		print('mhonarc?')
+		return
+
+	if name is None:
+		# default the name to the trailing path element: /pipermail/xyz/ -> 'xyz'
+		path = parsed.path[:-1] if parsed.path.endswith('/') else parsed.path
+		name = path.strip().rsplit('/', 1)[-1]
+
+	pipermail.collect_from_url(url, name, archive_dir)
+
 	
-def crawl(url, archive_dir):
 	return
@@ -1 +1,154 @@
-# pipermail
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, time, json, gzip
+from bs4 import BeautifulSoup
+
+DELAY = 0.2
+
+def collect_from_url(url, name, base_archive_dir):
+	"""Archive every entry of the index table of the page at `url`.
+
+	Each table row after the first yields a label (first cell) and a link
+	(second cell); every link is collected into <base_archive_dir>/<name>/
+	by collect_threads_from_url(). Returns the list of collected archives.
+	"""
+	with urllib.request.urlopen(url) as response:
+		html = response.read().decode(encoding="utf-8")
+	soup = BeautifulSoup(html, "html5lib")
+
+	# relative links must be resolved against a directory-style base url
+	base_url = url if url.endswith('/') else url + "/"
+	lists = []
+	for row in soup.find_all('tr')[1:]:
+		cols = row.find_all('td')
+		links = cols[1].select('a:nth-of-type(1)') if len(cols) >= 2 else []
+		if not links:
+			continue	# no usable link in this row
+		label = cols[0].text.strip()[:-1]	# drop the last character of the label
+		lists.append((label, urllib.parse.urljoin(base_url, links[0].get('href'))))
+
+	# main directory: this is where all archive files will be created
+	d = os.path.join(base_archive_dir, name)
+	os.makedirs(d, exist_ok=True)
+
+	threads = []
+	for n, (label, thread_url) in enumerate(lists, start=1):
+		logging.info("## " + str(n) + " / " + str(len(lists)) + " ##")
+		threads.append(collect_threads_from_url(name=label, url=thread_url, base_arch_dir=d))
+	return threads
+		
+def collect_threads_from_url(url, name, base_arch_dir):
+	"""Collect all threads linked from the thread index at `url`.
+
+	The result is a dict {'name', 'url', 'threads'} that is also written to
+	<base_arch_dir>/<name with spaces as underscores>.json. When that file
+	already exists and parses as json, it is returned without any download.
+	"""
+	threads = {'name' : name, 'url' : url, 'threads' : []}
+
+	logging.info("Collecting threads of: " + name)
+
+	arch_name = name.replace(' ', '_')
+
+	# check if archive already exists
+	file_path = os.path.join(base_arch_dir, arch_name + '.json')
+	if os.path.isfile(file_path):
+		logging.info("archive " + name + " already exists. loading from file " + file_path)
+		try:
+			with open(file_path, 'r') as fin:
+				return json.load(fin)
+		except (OSError, ValueError):
+			# unreadable or corrupt archive (json errors are ValueErrors): redo it
+			logging.info("can't open archive " + file_path + "... rearchiving.")
+
+	with urllib.request.urlopen(url) as response:
+		html = response.read().decode(encoding="utf-8")
+	soup = BeautifulSoup(html, "html5lib")
+
+	# the messages are the direct <li> children of the second <ul> of the page
+	# NOTE(review): a page with fewer than two <ul> raises IndexError here
+	lists = soup.find_all('ul')[1].find_all('li', recursive=False)
+
+	base_url = url.replace('thread.html', '')
+	nbr_msgs = str(len(lists))
+	for n, li in enumerate(lists, start=1):
+		logging.info("	> " + str(n) + "/" + nbr_msgs)
+		try:
+			threads['threads'].append(archive_thread(li, base_url, None))
+		except KeyboardInterrupt:
+			sys.exit(0)
+		except Exception as ex:
+			# best effort: report the failure and move on to the next thread
+			print(type(ex))
+			traceback.print_tb(ex.__traceback__)
+			continue
+
+		time.sleep(DELAY)	# pause after each successfully archived thread
+
+	logging.info("writing archive to file " + file_path)
+
+	with open(file_path, 'w') as fp:
+		json.dump(threads, fp, indent=4)
+
+	logging.info("done.")
+	return threads
+
+def archive_thread(li, base_url, parent_thread_data):
+	"""Archive the message of list item `li` and, recursively, its replies.
+
+	`li` is an <li> element of a thread index and `base_url` the url its
+	links are relative to. The message dict is completed by
+	collect_message(). Replies are the direct <li> children of the first
+	<ul> found under `li`; each one adds itself to this message's
+	'follow-up' list. When `parent_thread_data` is not None, the message
+	is appended to the parent's 'follow-up' list as well.
+	Returns the message dict.
+	"""
+	# the first link of the item points at the message page
+	anchor = li.select('a:nth-of-type(1)')[0]
+	root = base_url if base_url.endswith('/') else (base_url + "/")
+	msg_url = urllib.parse.urljoin(root, anchor.get("href"))
+
+	# 'id' is the name attribute of the item's second link
+	message = {
+		u'id': li.select('a:nth-of-type(2)')[0].get("name"),
+		u'subject': anchor.text.strip(),
+		u'url': msg_url,
+		u'author_name': li.select('i')[0].text.strip(),
+	}
+	collect_message(msg_url, message)
+
+	# descend into the replies nested below this message, if any
+	nested = li.find_all('ul')
+	if len(nested) != 0:
+		for reply in nested[0].find_all('li', recursive=False):
+			if len(reply.select('a')) > 0:
+				archive_thread(reply, base_url, message)
+
+	# hook the message into its parent (top-level messages have none)
+	if parent_thread_data is not None:
+		if u'follow-up' not in parent_thread_data:
+			parent_thread_data[u'follow-up'] = []
+		parent_thread_data[u'follow-up'].append(message)
+
+	return message
+
+
+def collect_message(url, message):
+	"""Download the message page at `url` and copy its fields into `message`.
+
+	Sets 'subject', 'author_name', 'from', 'date', 'message-id' (a copy of
+	message['id']), 'content-type' ('n/a') and 'content', in that order.
+	Raises IndexError when the page lacks one of the expected elements.
+	"""
+	with urllib.request.urlopen(url) as response:
+		html = response.read().decode(encoding="utf-8")
+	soup = BeautifulSoup(html, "html5lib")
+
+	# each field is the stripped text of the first match of its selector
+	for key, tag in (('subject', 'h1'), ('author_name', 'b'), ('from', 'a'), ('date', 'i')):
+		message[key] = soup.select(tag + ':nth-of-type(1)')[0].text.strip()
+	message['message-id'] = message['id']
+	message['content-type'] = 'n/a'
+	message['content'] = soup.select('pre:nth-of-type(1)')[0].text	# kept unstripped
+
+