index +search info
This commit is contained in:
+3
-1
@@ -26,7 +26,9 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
|
||||
# special case -- nettime.
|
||||
# the name should be the sublist_name (i.e nettime-l)
|
||||
elif "nettime" in name:
|
||||
mhonarc_nettime.collect_from_url(url, name, name, archive_dir)
|
||||
if sublist_name is None:
|
||||
sublist_name = name
|
||||
mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
|
||||
|
||||
else:
|
||||
print('mhonarc?')
|
||||
|
||||
+132
-125
@@ -6,125 +6,132 @@ DELAY = 0.2
|
||||
|
||||
def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
    """Archive every thread index of one sublist listed on a MHonArc index page.

    Args:
        url: Address of the list's top-level index page.
        name: Archive name; used as the sub-directory of ``base_archive_dir``
            into which the per-index archives are written.
        sublist_name: Name of the sublist to collect. It is compared
            case-insensitively against the ``<strong>`` label of each entry
            of the index.
        base_archive_dir: Root directory holding all archives.
        mbox: Passed through unchanged to ``collect_threads_from_url``.

    Returns:
        A list with one thread structure per successfully archived index
        page of the first entry matching ``sublist_name``, or ``None`` when
        no entry matches.
    """
    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')

    # collect name
    list_name = soup.select('body p:nth-of-type(2) title')[0].string
    # %-style arguments keep this line from crashing if the title is None.
    logging.info("Getting %s list archive for %s", list_name, sublist_name)

    # create (main) directory
    # this is where all temp files will be created
    d = os.path.join(base_archive_dir, name)
    os.makedirs(d, exist_ok=True)

    threads = []

    for entry in soup.select('ul:nth-of-type(2) li'):

        if entry.strong is None:
            continue

        # Kept separate from the `name` parameter, which the original loop
        # variable shadowed.
        entry_name = entry.strong.string
        if entry_name is None:
            # <strong> holds nested markup instead of a single string.
            continue

        print(entry_name + " - " + sublist_name)

        if entry_name.lower() != sublist_name.lower():
            continue

        threads_url_list = [
            urllib.parse.urljoin(base_url, link.get('href'))
            for link in entry.select('ul li a')
        ]
        nbr_threads = str(len(threads_url_list))

        for n, u in enumerate(threads_url_list, start=1):
            time.sleep(DELAY)
            logging.info("## " + str(n) + " / " + nbr_threads + " ##")
            try:
                threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
            except KeyboardInterrupt:
                sys.exit(0)
            except Exception:
                # Report the URL that failed. The original indexed the tag
                # (`l[1]`), which raises KeyError inside this handler and
                # aborts the whole crawl.
                logging.warning("Error archiving: " + u + "... Continuing.")
                ex_t, ex, tb = sys.exc_info()
                print(ex_t)
                traceback.print_tb(tb)
                del tb
                continue

        # Only the first matching entry is archived.
        return threads

    return None
|
||||
|
||||
def collect_threads_from_url(url, base_archive_dir, mbox=False):
    """Archive all threads listed on one MHonArc thread-index page.

    The result is cached as ``<page title>.json`` (spaces replaced by
    underscores) inside ``base_archive_dir``. When that file already exists
    it is loaded and returned without crawling the individual threads.

    Args:
        url: Address of the thread-index page; also used as the base URL
            handed to ``archive_thread``.
        base_archive_dir: Existing directory in which the JSON file is
            read or written.
        mbox: Unused by this function.

    Returns:
        A dict with the keys ``'name'``, ``'url'`` and ``'threads'``, or
        whatever structure the cached JSON file contains.
    """
    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
    base_url = url

    # collect name
    threads_name = soup.select('p:nth-of-type(1) title')[0].string
    threads_name = threads_name.replace(' ', '_')

    # thread data struct
    threads = {'name': threads_name, 'url': base_url, 'threads': []}

    logging.info("Collecting Threads of: " + threads_name)

    # check if archive already exists
    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
        return threads

    lists = soup.select('ul:nth-of-type(1) > li')
    nbr_threads = str(len(lists))

    for n, item in enumerate(lists, start=1):
        logging.info("> " + str(n) + " / " + nbr_threads)

        try:
            thread = archive_thread(item, base_url, None)
            threads['threads'].append(thread)
        except Exception:
            # Best effort: report the failure and move on to the next
            # thread. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the caller can stop the crawl.
            traceback.print_exc()
            continue

        # Throttle only after a successful fetch, as the original did.
        time.sleep(DELAY)

    # write
    logging.info("writing archive to file " + file_path)

    with open(file_path, 'w') as fp:
        json.dump(threads, fp, indent=4)

    logging.info("done. ")

    return threads
|
||||
|
||||
|
||||
|
||||
def archive_thread(li, base_url, parent_thread_data):
|
||||
|
||||
@@ -158,57 +165,57 @@ def archive_thread(li, base_url, parent_thread_data):
|
||||
|
||||
def collect_message(url, message):
    """Download one MHonArc message page and fill ``message`` in place.

    Header values are first taken from the MHonArc X-comments embedded in
    the page, then overridden by any matching field shown in the first
    ``<ul>`` of the page. The body is taken from the second ``<pre>``.

    Args:
        url: Address of the message page.
        message: Dict updated in place with the keys ``'subject'``,
            ``'date'``, ``'from'``, ``'message-id'``, ``'content-type'``,
            ``'content'`` and, when the page lists it, ``'to'``.

    Raises:
        IndexError: If the page has no second ``<pre>`` element.
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode(encoding="utf-8")
    soup = BeautifulSoup(html, "html5lib")

    # note: this should follow an RFC header standard -- MHonArc has header
    # info in the first <pre>

    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    # Marked "useless" by the original author (presumably because the value
    # is ROT13-obfuscated -- confirm); overridden below when the page shows
    # a "From" field.
    message['from'] = parse_xcomment(soup, "X-From-R13")
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page
    for item in soup.select('ul:nth-of-type(1) > li'):
        if item.em is None:
            continue

        field = item.em.string
        if field is None:
            # <em> holds nested markup instead of a single label string.
            continue

        key = field.lower()
        if key not in message_labels:
            continue

        # Remove the leading "<Field>:" label as a prefix. str.strip() with
        # an argument strips a *set of characters* from both ends, which
        # mangled values such as "From: Foo Bar" into "Bar".
        value = item.text.strip()
        label = field + ":"
        if value.startswith(label):
            value = value[len(label):]
        message[key] = value.strip()

    ## -- content --
    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
|
||||
|
||||
# mhonarc xcomments
|
||||
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
|
||||
def parse_xcomment(soup, xcom):
    """Return the value of a MHonArc X-comment found in ``soup``.

    Args:
        soup: Parsed page; must provide ``find(text=<compiled regex>)``.
        xcom: Name of the X-comment, e.g. ``"X-Subject"``. It is used as a
            regular expression to locate the text node.

    Returns:
        The text following ``"<xcom>:"`` with surrounding whitespace
        removed, or ``None`` when no text node matches ``xcom``.
    """
    com = soup.find(text=re.compile(xcom))
    if com is None:
        return None

    value = com.strip()

    # Drop the comment delimiters when they are part of the matched text.
    if value.startswith("<!--"):
        value = value[len("<!--"):]
    if value.endswith("-->"):
        value = value[:-len("-->")]
    value = value.strip()

    # Remove the "<xcom>:" label as a prefix. str.strip() with an argument
    # removes a *set of characters* from both ends, so the original chain
    # turned "X-Subject: test subject" into "test s".
    label = xcom + ":"
    if value.startswith(label):
        value = value[len(label):]

    return value.strip()
|
||||
|
||||
def test_xcomment(soup):
|
||||
return soup.find(text=re.compile('X-Message-Id')) is not None
|
||||
return soup.find(text=re.compile('X-Message-Id')) is not None
|
||||
|
||||
Reference in New Issue
Block a user