diff --git a/.gitignore b/.gitignore
index 7bbc71c..9953ea5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+# mailinglists specific
+archives/
+setenv
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/archive.py b/archive.py
index c78646d..18698bf 100644
--- a/archive.py
+++ b/archive.py
@@ -1,5 +1,5 @@
 import sys, logging, argparse
-import lists
+import lists.crawl
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -8,9 +8,15 @@ def run(args):
     if not args.url:
         sys.exit('No url(s). Aborting.')
 
+    if not args.names:
+        args.names = []
+
+    ## check valid url?... hmm... no
+    i = 0
     for u in args.url:
-        lists.crawl.crawl(u)
+        name = args.names[i] if i < len(args.names) else None
+        lists.crawl.crawl(u, name, args.arch)
+        i = i + 1
 
     sys.exit()
 
@@ -18,6 +24,7 @@ if __name__ == "__main__":
 
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
+    p.add_argument('--names', help="mailinglists' names", nargs="+")
    p.add_argument('--arch', help="path to archives directory (default='archives')", default="archives")
 
     args = p.parse_args()
diff --git a/lists/crawl.py b/lists/crawl.py
index a71f636..d70b382 100644
--- a/lists/crawl.py
+++ b/lists/crawl.py
@@ -1,4 +1,21 @@
-# crawl dispatch
+from urllib.parse import urlparse
+import lists.pipermail as pipermail
 
 
-def crawl(url, archive_dir):
+DELAY = 0.2
+
+def crawl(url, name, archive_dir):
+    u = urlparse(url)
+
+    if 'pipermail' in u.path:
+        # if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')
+        if name is None:
+            path = u.path if not u.path.endswith('/') else u.path[:len(u.path) - 1]
+            name = path.strip().split('/')[-1]
+
+        pipermail.collect_from_url(url, name, archive_dir)
+
+    else:
+        print('mhonarc?')
+
+    return
\ No newline at end of file
diff --git a/lists/pipermail.py b/lists/pipermail.py
index 626f86d..c139611 100644
--- a/lists/pipermail.py
+++ b/lists/pipermail.py
@@ -1 +1,154 @@
-# pipermail
\ No newline at end of file
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, time, json, gzip
+from bs4 import BeautifulSoup
+
+DELAY = 0.2
+
+def collect_from_url(url, name, base_archive_dir):
+
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
+
+    threads_list = soup.find_all('tr')
+    lists = []
+    for t in threads_list[1:]:
+        cols = t.find_all('td')
+        if len(cols) < 2:
+            continue
+        thread_label = cols[0].text.strip()[:-1]
+        thread_url = cols[1].select('a:nth-of-type(1)')[0].get('href') # this is relative
+        url = (url + "/") if not url.endswith('/') else url
+        thread_url = urllib.parse.urljoin(url, thread_url)
+        lists.append((thread_label, thread_url)) # list of tuples
+
+    # create (main) directory
+    # this is where all temp files will be created
+    d = os.path.join(base_archive_dir, name)
+    if not os.path.exists(d):
+        os.makedirs(d)
+
+    threads = []
+    nbr_threads = str(len(lists))
+    n = 0
+    for l in lists: ### change this
+        n += 1
+        logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+        threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+
+def collect_threads_from_url(url, name, base_arch_dir):
+
+
+    threads = {'name' : name, 'url' : url, 'threads' : []}
+
+    logging.info("Collecting threads of: " + name)
+
+    arch_name = name.replace(' ', '_')
+
+    # check if archive already exists
+    file_path = os.path.join(base_arch_dir, arch_name + '.json')
+    if os.path.isfile(file_path):
+        logging.info("archive " + name + " already exists. loading from file " + file_path)
+        with open(file_path, 'r') as fin:
+            try:
+                threads = json.load(fin)
+                return threads
+            except:
+                logging.info("can't open archive " + file_path + "... rearchiving.")
+
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
+
+    ul = soup.find_all('ul')[1];
+    lists = ul.find_all('li', recursive=False)
+
+    #lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
+    nbr_msgs = str(len(lists))
+    n = 0
+    for li in lists:
+        n += 1
+        logging.info(" > " + str(n) + "/" + nbr_msgs)
+        try:
+            thread = archive_thread(li, url.replace('thread.html', ''), None)
+            threads['threads'].append(thread)
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue
+
+        time.sleep(DELAY)
+
+    logging.info("writing archive to file " + file_path)
+
+    with open(file_path, 'w') as fp:
+        json.dump(threads, fp, indent=4)
+
+    logging.info("done.")
+
+    return threads
+
+def archive_thread(li, base_url, parent_thread_data):
+
+    thread_a = li.select('a:nth-of-type(1)')[0]
+    url = (base_url + "/") if not base_url.endswith('/') else base_url
+    thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+    thread_id = li.select('a:nth-of-type(2)')[0].get("name")
+    thread_title = thread_a.text.strip()
+    thread_author_name = li.select('i')[0].text.strip()
+
+    message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+    collect_message(thread_url, message)
+
+
+    ul = li.find_all('ul');
+    if len(ul) == 0:
+        if parent_thread_data is None:
+            return message
+
+        if u'follow-up' not in parent_thread_data:
+            parent_thread_data[u'follow-up'] = []
+        parent_thread_data[u'follow-up'].append(message)
+        return message
+
+
+    follow = ul[0].find_all('li', recursive=False)
+    if len(follow) > 0:
+        for f in follow:
+            follow_a = f.select('a')
+            if len(follow_a) > 0:
+                archive_thread(f, base_url, message)
+
+    if parent_thread_data is None:
+        return message
+
+    if u'follow-up' not in parent_thread_data:
+        parent_thread_data[u'follow-up'] = []
+    parent_thread_data[u'follow-up'].append(message)
+    return message
+
+
+def collect_message(url, message):
+    # logging.info(" + " + url)
+
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
+
+    #message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')
+
+    message['subject'] = soup.select('h1:nth-of-type(1)')[0].text.strip()
+    message['author_name'] = soup.select('b:nth-of-type(1)')[0].text.strip()
+    message['from'] = soup.select('a:nth-of-type(1)')[0].text.strip()
+    message['date'] = soup.select('i:nth-of-type(1)')[0].text.strip()
+    message['message-id'] = message['id']
+    message['content-type'] = 'n/a'
+
+    message['content'] = soup.select('pre:nth-of-type(1)')[0].text
+
+
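
Usage note (not part of the patch): with the options added above, a run can pass explicit list names via --names, or omit them and let crawl() derive the name from a /pipermail/ URL's trailing path segment. A minimal programmatic sketch of the same flow; the URL below is an illustrative placeholder, not taken from the patch:

    # Roughly equivalent to: python archive.py <pipermail-url> --names example-list --arch archives
    import lists.crawl

    # Passing name=None makes crawl() fall back to the URL's last path element
    # ('example-list' here) before dispatching to pipermail.collect_from_url().
    lists.crawl.crawl('https://mail.example.org/pipermail/example-list/',
                      None,          # name: derive from the URL
                      'archives')    # archive_dir: matches the --arch default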
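For reference, each thread-index file that collect_threads_from_url() writes to <arch>/<list name>/<label>.json (spaces in the label replaced by underscores) has roughly the shape below. The keys come from the patch; every value here is an invented example, and 'follow-up' only appears on messages that have recorded replies.

    # Sketch of one archived thread index (illustrative values only).
    example_archive = {
        'name': 'June 2015',                    # label taken from the pipermail index table
        'url': 'https://.../2015-June/thread.html',
        'threads': [
            {
                'id': '012345',
                'subject': 'Example subject',
                'url': 'https://.../2015-June/012345.html',
                'author_name': 'Jane Doe',
                'from': 'jane at example.org',
                'date': 'Mon Jun 1 12:00:00 CEST 2015',
                'message-id': '012345',         # currently just mirrors 'id'
                'content-type': 'n/a',          # hard-coded placeholder in collect_message()
                'content': '...message body...',
                'follow-up': [                  # replies, same message structure, nested recursively
                    # ...
                ],
            },
        ],
    }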