pipermail init

parent a3a3be5237
commit cca498d887

.gitignore (vendored): 4 changes
@@ -1,3 +1,7 @@
+# mailinglists specific
+archives/
+setenv
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

archive.py: 11 changes
@@ -1,5 +1,5 @@
 import sys, logging, argparse
-import lists
+import lists.crawl
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -8,9 +8,15 @@ def run(args):
     if not args.url:
         sys.exit('No url(s). Aborting.')
 
+    if not args.names:
+        args.names = []
+
     ## check valid url?... hmm... nej
+    i = 0
     for u in args.url:
-        lists.crawl.crawl(u)
+        name = args.names[i] if i < len(args.names) else None
+        lists.crawl.crawl(u, name, args.arch)
+        i = i + 1
 
     sys.exit()
 
@@ -18,6 +24,7 @@ if __name__ == "__main__":
 
     p = argparse.ArgumentParser(description='Mailinglists are dead. Long live mailinglists!')
     p.add_argument('url', metavar="url", help="mailinglist urls to archive", nargs="+")
+    p.add_argument('--names', help="mailinglists' names", nargs="+")
     p.add_argument('--arch', help="path to archives directory (default='archives')", default="archives")
 
     args = p.parse_args()
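
Usage sketch for the new arguments (the URL and list name below are placeholders, not taken from the repository): each positional url can be paired with an entry from --names, and --arch selects the output directory.

    python archive.py http://example.org/pipermail/some-list/ --names some-list --arch archives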

lists/crawl.py
@@ -1,4 +1,21 @@
-# crawl dispatch
-
-def crawl(url, archive_dir):
+from urllib.parse import urlparse
+import lists.pipermail as pipermail
+
+DELAY = 0.2
+
+def crawl(url, name, archive_dir):
+    u = urlparse(url)
+
+    if 'pipermail' in u.path:
+        # if no name, get the trailing path element (re: /pipermail/xyz -- 'xyz')
+        if name is None:
+            path = u.path if not u.path.endswith('/') else u.path[:len(u.path) - 1]
+            name = path.strip().split('/')[-1]
+
+        pipermail.collect_from_url(url, name, archive_dir)
+
+    else:
+        print('mhonarc?')
+
+
     return
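
A small sketch of the name derivation above, with a placeholder URL: when no --names entry is supplied for a Pipermail URL, the list name is the last path segment after stripping a trailing slash; URLs that do not look like Pipermail currently just print 'mhonarc?'.

    from urllib.parse import urlparse

    # placeholder URL, illustrative only
    u = urlparse('http://example.org/pipermail/some-list/')
    path = u.path if not u.path.endswith('/') else u.path[:len(u.path) - 1]
    print(path.strip().split('/')[-1])  # -> 'some-list'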

lists/pipermail.py
@@ -1 +1,154 @@
-# pipermail
+import urllib.request, urllib.parse
+import logging, os, sys, traceback, time, json, gzip
+from bs4 import BeautifulSoup
+
+DELAY = 0.2
+
+def collect_from_url(url, name, base_archive_dir):
+
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
+
+    threads_list = soup.find_all('tr')
+    lists = []
+    for t in threads_list[1:]:
+        cols = t.find_all('td')
+        if len(cols) < 2:
+            continue
+        thread_label = cols[0].text.strip()[:-1]
+        thread_url = cols[1].select('a:nth-of-type(1)')[0].get('href') # this is relative
+        url = (url + "/") if not url.endswith('/') else url
+        thread_url = urllib.parse.urljoin(url, thread_url)
+        lists.append((thread_label, thread_url)) # list of tuples
+
+    # create (main) directory
+    # this is where all temp files will be created
+    d = os.path.join(base_archive_dir, name)
+    if not os.path.exists(d):
+        os.makedirs(d)
+
+    threads = []
+    nbr_threads = str(len(lists))
+    n = 0
+    for l in lists: ### change this
+        n += 1
+        logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+        threads.append(collect_threads_from_url(name=l[0], url=l[1], base_arch_dir=d))
+
+def collect_threads_from_url(url, name, base_arch_dir):
+
+
+    threads = {'name' : name, 'url' : url, 'threads' : []}
+
+    logging.info("Collecting threads of: " + name)
+
+    arch_name = name.replace(' ', '_')
+
+    # check if archive already exists
+    file_path = os.path.join(base_arch_dir, arch_name + '.json')
+    if os.path.isfile(file_path):
+        logging.info("archive " + name + " already exists. loading from file " + file_path)
+        with open(file_path, 'r') as fin:
+            try:
+                threads = json.load(fin)
+                return threads
+            except:
+                logging.info("can't open archive " + file_path + "... rearchiving.")
+
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
+
+    ul = soup.find_all('ul')[1];
+    lists = ul.find_all('li', recursive=False)
+
+    #lists = soup.select('ul:nth-of-type(2)')[0].find_all('li', recursive=False)
+    nbr_msgs = str(len(lists))
+    n = 0
+    for li in lists:
+        n += 1
+        logging.info(" > " + str(n) + "/" + nbr_msgs)
+        try:
+            thread = archive_thread(li, url.replace('thread.html', ''), None)
+            threads['threads'].append(thread)
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except:
+            ex_t, ex, tb = sys.exc_info()
+            print(ex_t)
+            traceback.print_tb(tb)
+            del tb
+            continue
+
+        time.sleep(DELAY)
+
+    logging.info("writing archive to file " + file_path)
+
+    with open(file_path, 'w') as fp:
+        json.dump(threads, fp, indent=4)
+
+    logging.info("done.")
+
+    return threads
+
+def archive_thread(li, base_url, parent_thread_data):
+
+    thread_a = li.select('a:nth-of-type(1)')[0]
+    url = (base_url + "/") if not base_url.endswith('/') else base_url
+    thread_url = urllib.parse.urljoin(url, thread_a.get("href"))
+    thread_id = li.select('a:nth-of-type(2)')[0].get("name")
+    thread_title = thread_a.text.strip()
+    thread_author_name = li.select('i')[0].text.strip()
+
+    message = {u'id': thread_id, u'subject': thread_title, u'url': thread_url, u'author_name': thread_author_name}
+
+    collect_message(thread_url, message)
+
+
+    ul = li.find_all('ul');
+    if len(ul) == 0:
+        if parent_thread_data is None:
+            return message
+
+        if u'follow-up' not in parent_thread_data:
+            parent_thread_data[u'follow-up'] = []
+        parent_thread_data[u'follow-up'].append(message)
+        return message
+
+
+    follow = ul[0].find_all('li', recursive=False)
+    if len(follow) > 0:
+        for f in follow:
+            follow_a = f.select('a')
+            if len(follow_a) > 0:
+                archive_thread(f, base_url, message)
+
+    if parent_thread_data is None:
+        return message
+
+    if u'follow-up' not in parent_thread_data:
+        parent_thread_data[u'follow-up'] = []
+    parent_thread_data[u'follow-up'].append(message)
+    return message
+
+
+def collect_message(url, message):
+    # logging.info(" + " + url)
+
+    response = urllib.request.urlopen(url)
+    html = response.read().decode(encoding="utf-8")
+    soup = BeautifulSoup(html, "html5lib")
+
+    #message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')
+
+    message['subject'] = soup.select('h1:nth-of-type(1)')[0].text.strip()
+    message['author_name'] = soup.select('b:nth-of-type(1)')[0].text.strip()
+    message['from'] = soup.select('a:nth-of-type(1)')[0].text.strip()
+    message['date'] = soup.select('i:nth-of-type(1)')[0].text.strip()
+    message['message-id'] = message['id']
+    message['content-type'] = 'n/a'
+
+    message['content'] = soup.select('pre:nth-of-type(1)')[0].text
+
+
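
For orientation, a sketch of the JSON that collect_threads_from_url writes for each row of the Pipermail index (one <label>.json file under the per-list directory created by collect_from_url); every field value below is a placeholder, and 'follow-up' appears only on messages that have replies:

    {
        "name": "June 2015",
        "url": "http://example.org/pipermail/some-list/2015-June/thread.html",
        "threads": [
            {
                "id": "000123",
                "subject": "Example subject",
                "url": "http://example.org/pipermail/some-list/2015-June/000123.html",
                "author_name": "Jane Doe",
                "from": "jane.doe at example.org",
                "date": "Mon Jun 1 12:00:00 CEST 2015",
                "message-id": "000123",
                "content-type": "n/a",
                "content": "message body ...",
                "follow-up": [
                    { "id": "000124", "subject": "Re: Example subject" }
                ]
            }
        ]
    }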