renamed lib to nettime
This commit is contained in:
@@ -0,0 +1,403 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import email, email.parser
|
||||
import os, datetime, json, gzip, re
|
||||
from random import randint
|
||||
|
||||
def format_from(from_str):
    """Extract a lower-cased sender address from a raw ``From`` value.

    The address is first taken from ``email.utils.parseaddr``. When that
    result does not contain the obfuscated ``{AT}`` marker, the raw string is
    split on whitespace and the tokens surrounding a stand-alone ``{AT}``
    token are glued together instead.

    :param from_str: raw ``From`` header value; may be ``None`` or empty.
    :return: the lower-cased address, or ``None`` when no address can be
        extracted.
    """
    # A message without a From value cannot be attributed to anyone.
    if not from_str:
        return None

    from_addr = email.utils.parseaddr(from_str)[1]
    if '{AT}' not in from_addr:
        tokens = from_str.split()
        try:
            at = tokens.index('{AT}')
        except ValueError:
            return None

        # '{AT}' needs a token on both sides; with at == 0 the slice
        # tokens[-1:2] would silently pick up unrelated tokens.
        if at == 0 or at + 1 >= len(tokens):
            return None

        from_addr = ''.join(tokens[at - 1:at + 2])
        if from_addr.startswith('<') or from_addr.endswith('>'):
            from_addr = from_addr.strip('<').strip('>')

    return from_addr.lower()
|
||||
|
||||
def format_date(date_str):
    """Convert an RFC 2822 style date string into a pandas timestamp.

    The string is parsed with ``email.utils.parsedate_tz``, turned into a
    UTC epoch value with ``email.utils.mktime_tz`` and then converted with
    ``datetime.datetime.fromtimestamp`` (i.e. a naive datetime in the
    machine's local time) before being handed to ``pd.to_datetime``.

    :param date_str: raw date value; may be ``None`` or unparsable.
    :return: a pandas timestamp, or ``None`` when the value cannot be parsed
        or is outside the range pandas can represent. Failures are reported
        on stdout.
    """
    try:
        date_tz = email.utils.parsedate_tz(date_str)
        time_tz = email.utils.mktime_tz(date_tz)  # utc timestamp
    except (TypeError, ValueError, AttributeError, OverflowError) as err:
        # parsedate_tz yields None for garbage, which makes mktime_tz raise
        # TypeError; a None input may raise AttributeError instead.
        print("Format Date " + type(err).__name__)
        # %-formatting so that a None value is reported instead of raising.
        print(" > %s" % (date_str,))
        return None

    try:
        dt = datetime.datetime.fromtimestamp(time_tz)
    except (ValueError, OverflowError, OSError):
        print('time out of bound')
        print(time_tz)
        return None

    try:
        return pd.to_datetime(dt)
    except ValueError:
        # pandas' OutOfBoundsDatetime derives from ValueError, so this works
        # whichever module the pandas version exposes it from.
        print('time out of bound')
        print(dt)
        return None
|
||||
|
||||
def message_to_tuple_record(msg, records, references=None):
    """Append one tuple per usable message of a thread to ``records``.

    A message is skipped, together with all of its follow-ups, when its date
    cannot be parsed, when the date lies before 1 October 1995 or after the
    current time, or when no sender address can be extracted.

    The appended tuple is ``(message-id, from, author_name, subject, date,
    url, content length, number of follow-ups, references)``.

    :param msg: message mapping with the keys ``date``, ``from``,
        ``message-id``, ``author_name``, ``subject``, ``url``, ``content``
        and optionally ``follow-up`` (a list of messages of the same shape).
    :param records: list that receives the tuples; it is mutated in place.
    :param references: ``message-id`` of the parent message, ``None`` for a
        thread root.
    :return: always ``None``.
    """
    # check date first?
    date_time = format_date(msg['date'])
    if not date_time:
        return None

    # filter date?
    nettime_min_date = pd.to_datetime('01/10/1995', format='%d/%m/%Y')
    nettime_max_date = pd.to_datetime(datetime.datetime.now())
    if date_time < nettime_min_date or date_time > nettime_max_date:
        return None

    # check / filter from email address second?
    from_addr = format_from(msg['from'])
    if not from_addr:
        return None

    # `in` replaces the Python-2-only dict.has_key(); look the list up once.
    follow_ups = msg['follow-up'] if 'follow-up' in msg else []

    records.append((msg['message-id'],
                    from_addr,
                    msg['author_name'],
                    msg['subject'],
                    date_time,
                    msg['url'],
                    len(msg['content']),
                    len(follow_ups),
                    references))

    for reply in follow_ups:
        message_to_tuple_record(reply, records, references=msg['message-id'])

    return None
|
||||
|
||||
def json_data_to_pd_dataframe(json_data):
    """Flatten archived thread data into a date-indexed DataFrame.

    :param json_data: iterable of mappings, each holding a ``'threads'``
        list of message mappings.
    :return: DataFrame built from the tuples produced by
        ``message_to_tuple_record``, indexed by the ``date`` column.
    """
    column_names = ['message-id',
                    'from',
                    'author',
                    'subject',
                    'date',
                    'url',
                    'content-length',
                    'nbr-references',
                    'references']

    rows = []
    for archive in json_data:
        for thread in archive['threads']:
            message_to_tuple_record(thread, rows)

    frame = pd.DataFrame.from_records(rows, index='date', columns=column_names)
    frame.index.name = 'date'
    return frame
|
||||
|
||||
def _load_gz_archive(file_path):
    """Load one gzipped JSON archive and return its threads as a DataFrame."""
    with gzip.open(file_path, 'rb') as fp:
        json_data = json.load(fp)
    return json_data_to_pd_dataframe(json_data['threads'])


def load_from_file(filename, archive_dir):
    """Load an archive from ``archive_dir`` as a DataFrame.

    Lookup order:

    1. ``archive_dir/<filename>.json.gz`` (the suffix is added if missing);
    2. the alphabetically last ``<filename>*.json.gz`` in ``archive_dir``;
    3. every ``*.json`` file inside the directory ``archive_dir/<filename>``.

    :param filename: archive name, with or without the ``.json.gz`` suffix.
    :param archive_dir: directory that holds the archives.
    :return: the DataFrame, or ``None`` when nothing matching is found
        (including when ``archive_dir`` itself does not exist).
    """
    if filename.endswith('.json.gz'):
        file_path = os.path.join(archive_dir, filename)
    else:
        file_path = os.path.join(archive_dir, filename + '.json.gz')

    if os.path.isfile(file_path):
        return _load_gz_archive(file_path)

    # os.listdir() would raise on a missing directory; report "not found".
    if not os.path.isdir(archive_dir):
        return None

    # list of all "filename[...].json.gz" in archive_dir
    candidates = sorted(
        f for f in os.listdir(archive_dir)
        if os.path.isfile(os.path.join(archive_dir, f))
        and f.startswith(filename)
        and f.endswith('.json.gz'))
    if candidates:
        # take the most recent (listed alpha-chronological)
        return _load_gz_archive(os.path.join(archive_dir, candidates[-1]))

    # list of all json files in archive_dir/filename
    dir_path = os.path.join(archive_dir, filename)
    if not os.path.isdir(dir_path):
        return None

    # Sorted so that the row order does not depend on directory order.
    files = sorted(
        os.path.join(dir_path, f) for f in os.listdir(dir_path)
        if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.json'))
    if not files:
        return None

    # load all json files
    threads = []
    for path in files:
        with open(path, 'r') as fp:
            threads.append(json.load(fp))

    return json_data_to_pd_dataframe(threads)
|
||||
|
||||
|
||||
class Archive:
    """Statistics over a mailing-list archive held in a pandas DataFrame.

    The DataFrame is expected to be indexed by a datetime index named
    ``date`` and to carry at least the columns ``from``, ``content-length``,
    ``nbr-references``, ``subject``, ``url`` and ``message-id`` (the layout
    the methods below read).

    Derived tables are computed lazily and cached on the instance.
    """

    data = None  # "raw" json data
    dataframe = None  # main pd dataframe

    # (very) sparse dataframe (index=date(month), columns=from, values=activity(month))
    activity = None
    # (very) sparse dataframe (index=date(month), columns=from, values=content-length(month in bytes))
    content_length = None

    threads = None
    # threshold the cached ``threads`` frame was built with
    _threads_thresh = None

    # pattern used to keep only the addresses that pass the "nettime" filter
    _NOT_NETTIME = r'^((?!nettime*).)*$'

    def __init__(self, data="nettime-l", archive_dir="archives"):
        """Build the archive from a DataFrame or from an archive on disk.

        :param data: a DataFrame (copied) or the name of an archive that is
            loaded with ``load_from_file``.
        :param archive_dir: directory searched when ``data`` is a name.
        """
        if isinstance(data, pd.DataFrame):
            self.dataframe = data.copy()
        elif isinstance(data, str):
            self.dataframe = load_from_file(data, archive_dir)

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalise_address(email_address):
        """Return the address in the lower-cased ``{at}`` form used as column key."""
        return email_address.replace('@', '{at}').lower()

    @staticmethod
    def _freq(resolution):
        """Map ``'y'``/``'m'`` (any case) to a pandas frequency, else ``None``."""
        return {'y': 'AS', 'm': 'M'}.get(resolution.lower())

    @staticmethod
    def _at_resolution(series, resolution):
        """Return a monthly series as is (``'m'``) or summed per year (``'y'``).

        Any other resolution yields ``None``.
        """
        res = resolution.lower()
        if res == 'm':
            return series
        if res == 'y':
            yearly = series.resample('AS').sum()
            yearly.index = yearly.index.year
            return yearly
        return None

    def _top(self, totals, rank, filter_nettime):
        """Sort ``totals`` descending, optionally filter, keep ``rank`` rows."""
        # Series.order() was removed from pandas; sort_values() replaces it.
        ranked = totals.sort_values(ascending=False)
        if filter_nettime:
            ranked = ranked[ranked.index.str.contains(self._NOT_NETTIME)]
        return ranked[:rank]

    @staticmethod
    def _plot_series(series, figsize):
        """Plot the given series side by side, one random colour per series."""
        df = pd.concat(series, axis=1)
        # One colour per plotted column (the palette used to be sized by the
        # number of rows, which is unrelated to the number of lines drawn).
        colors = [tuple(c) for c in np.random.rand(len(df.columns), 3)]
        if figsize:
            df.plot(color=colors, figsize=figsize)
        else:
            df.plot(color=colors)

    # ------------------------------------------------------------------
    # activity
    # ------------------------------------------------------------------

    def _activity(self):
        """Return (and cache) the number of messages per month and sender."""
        if self.activity is None:
            from_index = self.dataframe.reindex(columns=['from'])
            grouped = from_index.groupby([pd.Grouper(freq='M'), 'from'])
            self.activity = grouped.size().unstack('from').fillna(0)
        return self.activity

    def activity_from(self, email_address, resolution='M'):
        """Return the message counts of one sender.

        :param email_address: sender address (``@`` or ``{at}`` form).
        :param resolution: ``'M'`` for monthly, ``'Y'`` for yearly counts.
        :return: a Series, or ``None`` for an unknown sender or resolution.
        """
        eaddr = self._normalise_address(email_address)
        self._activity()
        try:
            return self._at_resolution(self.activity[eaddr], resolution)
        except KeyError:
            return None

    def activity_overall(self, resolution='M'):
        """Return the message counts summed over all senders.

        :param resolution: ``'M'`` for monthly, ``'Y'`` for yearly counts.
        :return: a Series, or ``None`` when it cannot be computed.
        """
        self._activity()
        try:
            return self._at_resolution(self.activity.sum(axis=1), resolution)
        except (AttributeError, TypeError, ValueError):
            return None

    def activity_from_ranking(self, resolution='M', rank=5, filter_nettime=True):
        """Return the ``rank`` most active senders with their message totals.

        ``resolution`` is accepted but not used yet.
        """
        # finish this -- re resolution AND filtering
        self._activity()
        return self._top(self.activity.sum(axis=0), rank, filter_nettime)

    def plot_activity_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
        """Plot the activity of the ``rank`` most active senders."""
        addresses = self.activity_from_ranking(rank=rank).keys()
        series = [self.activity_from(k, resolution) for k in addresses]
        self._plot_series(series, figsize)

    # ------------------------------------------------------------------
    # content length
    # ------------------------------------------------------------------

    def _content_length(self):
        """Return (and cache) the summed content length per month and sender."""
        if self.content_length is None:
            from_content_index = self.dataframe.reindex(
                columns=['from', 'content-length'])
            summed = from_content_index.groupby(
                [pd.Grouper(freq='M'), 'from']).sum()
            self.content_length = summed.reset_index().pivot(
                columns='from', index='date',
                values='content-length').fillna(0)
        return self.content_length

    def content_length_from(self, email_address, resolution='M'):
        """Return the content length written by one sender.

        :param email_address: sender address (``@`` or ``{at}`` form).
        :param resolution: ``'M'`` for monthly, ``'Y'`` for yearly sums.
        :return: a Series, or ``None`` for an unknown sender or resolution.
        """
        eaddr = self._normalise_address(email_address)
        self._content_length()
        try:
            return self._at_resolution(self.content_length[eaddr], resolution)
        except KeyError:
            return None

    def content_length_overall(self, resolution='M'):
        """Return the content length summed over all senders.

        ``resolution`` used to be read without being defined, which made the
        method always return ``None``; it is now a parameter with the same
        default as ``activity_overall``.

        :param resolution: ``'M'`` for monthly, ``'Y'`` for yearly sums.
        :return: a Series, or ``None`` when it cannot be computed.
        """
        self._content_length()
        try:
            return self._at_resolution(
                self.content_length.sum(axis=1), resolution)
        except (AttributeError, TypeError, ValueError):
            return None

    def content_length_from_ranking(self, resolution='M', rank=5, filter_nettime=True):
        """Return the ``rank`` senders with the largest total content length.

        ``resolution`` is accepted but not used yet.
        """
        # finish this -- re resolution
        self._content_length()
        return self._top(self.content_length.sum(axis=0), rank, filter_nettime)

    def plot_content_length_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
        """Plot the content length of the ``rank`` top senders."""
        addresses = self.content_length_from_ranking(rank=rank).keys()
        series = [self.content_length_from(k, resolution) for k in addresses]
        self._plot_series(series, figsize)

    # ------------------------------------------------------------------
    # threads
    # ------------------------------------------------------------------

    def _threads(self, thresh=0):
        """Return (and cache) messages with more than ``thresh`` follow-ups.

        The cache remembers its threshold, so a call with a different
        ``thresh`` rebuilds the table instead of returning a stale one.
        """
        if self.threads is None or self._threads_thresh != thresh:
            selected = self.dataframe[self.dataframe['nbr-references'] > thresh]
            self.threads = selected.reindex(
                columns=['from', 'nbr-references', 'subject', 'url',
                         'message-id']
            ).sort_values('nbr-references', ascending=False)
            self._threads_thresh = thresh
        return self.threads

    def _threads_per_sender(self, freq):
        """Return follow-up counts pivoted as period (rows) x sender (columns)."""
        threads_from = self.threads.reindex(columns=['from', 'nbr-references'])
        summed = threads_from.groupby([pd.Grouper(freq=freq), 'from']).sum()
        return summed.reset_index().pivot(
            columns='from', index='date', values='nbr-references').fillna(0)

    def threads_ranking(self, rank=5):
        """Return the ``rank`` messages with the most follow-ups."""
        self._threads()
        return self.threads.drop('message-id', axis=1)[:rank]

    def threads_from(self, email_address, resolution='y'):
        """Return the follow-up counts of the threads started by one sender.

        :param email_address: sender address (``@`` or ``{at}`` form).
        :param resolution: ``'y'`` for yearly, ``'m'`` for monthly sums.
        :return: a Series, or ``None`` for an unknown resolution.
        :raises KeyError: when the sender started no thread.
        """
        freq = self._freq(resolution)
        if freq is None:
            return None

        eaddr = self._normalise_address(email_address)
        self._threads()
        return self._threads_per_sender(freq)[eaddr]

    def threads_from_ranking(self, rank=5, filter_nettime=True):
        """Return the ``rank`` senders whose threads drew the most follow-ups."""
        self._threads()
        totals = self._threads_per_sender('AS').sum(axis=0)
        return self._top(totals, rank, filter_nettime)

    def plot_threads_from_ranking(self, resolution='y', rank=5, figsize=(8, 7)):
        """Plot the follow-up counts of the ``rank`` top thread starters."""
        addresses = self.threads_from_ranking(rank=rank).keys()
        series = [self.threads_from(k, resolution) for k in addresses]
        self._plot_series(series, figsize)

    def threads_overall(self, resolution='y', aggregate='sum', tresh=0):
        """Aggregate the follow-up counts of all threads per period.

        :param resolution: ``'y'`` for yearly, ``'m'`` for monthly periods.
        :param aggregate: ``'sum'`` or ``'mean'``.
        :param tresh: only messages with more follow-ups than this count.
        :return: a DataFrame with the ``nbr-references`` column, or ``None``
            for an unknown resolution or aggregate.
        """
        freq = self._freq(resolution)
        if freq is None:
            return None

        agg = aggregate.lower()
        if agg not in ('sum', 'mean'):
            return None

        self._threads(tresh)

        # Aggregate the numeric column only: the text columns cannot be
        # averaged and were dropped from the result by older pandas anyway.
        grouped = self.threads[['nbr-references']].groupby(
            [pd.Grouper(freq=freq)])
        y = grouped.sum() if agg == 'sum' else grouped.mean()

        if freq == 'AS':
            y.index = y.index.year

        return y
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,252 @@
|
||||
import urllib2, urllib, urlparse
|
||||
import logging
|
||||
from bs4 import BeautifulSoup
|
||||
import email, email.parser
|
||||
from email.mime.text import MIMEText
|
||||
import mailbox
|
||||
import time, dateutil, string
|
||||
from pprint import pprint as pp
|
||||
import sys, os, re, json, gzip
|
||||
import traceback
|
||||
|
||||
# Pause, in seconds, passed to time.sleep() after each archived thread.
DELAY = 0.2
|
||||
|
||||
# hack for the mailbox module (re: force mbox.add() encoding to utf8)
# NOTE(review): reload(sys) + sys.setdefaultencoding() only exist on
# Python 2 and change the default encoding for the whole process.
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf8')
|
||||
|
||||
|
||||
def collect_from_url(url, sublist_name, base_arch_dir="archives", mbox=False):
    """Crawl the archive index at ``url`` and archive one of its sub-lists.

    The index page is searched for a list entry whose ``<strong>`` label
    equals ``sublist_name`` (case-insensitive); every link below that entry
    is then handed to ``collect_threads_from_url``.

    :param url: URL of the archive index page.
    :param sublist_name: name of the sub-list to archive.
    :param base_arch_dir: directory passed on to ``collect_threads_from_url``.
    :param mbox: passed on to ``collect_threads_from_url``.
    :return: list with one result per thread-index page, or ``None`` when no
        entry matches ``sublist_name``.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Do not keep the connection open for the whole crawl.
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    # base url
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')

    # collect name
    list_name = soup.select('body p:nth-of-type(2) base title')[0].string
    logging.info("Getting %s list archive for %s", list_name, sublist_name)

    wanted = sublist_name.lower()

    for entry in soup.select('ul:nth-of-type(2) li'):
        if entry.strong is None:
            continue

        # .string is None when <strong> has nested markup; skip such
        # entries instead of failing on None.lower().
        name = entry.strong.string
        if name is None or name.lower() != wanted:
            continue

        thread_urls = [urlparse.urljoin(base_url, link.get('href'))
                       for link in entry.select('ul li a')]
        total = len(thread_urls)

        threads = []
        for n, thread_url in enumerate(thread_urls, 1):
            logging.info("## %d / %d ##", n, total)
            threads.append(
                collect_threads_from_url(thread_url, base_arch_dir, mbox))

        return threads

    return None
|
||||
|
||||
def _write_mbox_archive(threads, base_arch_dir):
    """Write the threads to ``<name>.txt`` (mbox) and gzip it to ``.txt.gz``.

    Nothing is written when the mbox file already exists.
    """
    mbox_path = os.path.join(base_arch_dir, threads['name'] + ".txt")
    mbox_path_gz = mbox_path + ".gz"
    logging.info("writing mbox %s", mbox_path)

    if os.path.isfile(mbox_path):
        logging.info("mbox %s already exists.", mbox_path)
        return

    box = mailbox.mbox(mbox_path)
    box.lock()
    try:
        for thread in threads['threads']:
            write_mbox_message(thread, box)
        box.flush()
    except Exception:
        # Best effort, as before: report the failure and keep what was written.
        logging.exception("could not write mbox %s", mbox_path)
    finally:
        box.unlock()
        box.close()

    # Binary read: the gzip stream is opened in binary mode as well.
    with open(mbox_path, 'rb') as fpin, gzip.open(mbox_path_gz, 'wb') as fpout:
        fpout.writelines(fpin)


def collect_threads_from_url(url, base_arch_dir, mbox):
    """Archive all threads listed on one thread-index page.

    The result is cached as ``<base_arch_dir>/<page title>.json``: when that
    file exists it is loaded instead of crawling the threads again.

    :param url: URL of the thread-index page.
    :param base_arch_dir: directory for the JSON (and mbox) files.
    :param mbox: when true, also write the threads as an mbox file.
    :return: dict with the keys ``name``, ``url`` and ``threads``.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    # base url
    base_url = url

    # collect name
    threads_name = soup.select('p:nth-of-type(1) title')[0].string
    threads_name = threads_name.replace(' ', '_')

    # thread data struct
    threads = {'name': threads_name, 'url': base_url, 'threads': []}

    logging.info("Collecting Threads of: %s", threads_name)

    # check if archive already exists
    file_path = os.path.join(base_arch_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file %s", file_path)
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
        items = soup.select('ul:nth-of-type(1) > li')
        total = len(items)

        for n, item in enumerate(items, 1):
            logging.info("> %d / %d", n, total)

            try:
                threads['threads'].append(archive_thread(item, base_url, None))
            except Exception:
                # One broken thread must not abort the crawl, but Ctrl-C
                # still has to (a bare except would swallow it).
                logging.exception("could not archive thread %d / %d", n, total)
                continue

            time.sleep(DELAY)

        # write
        logging.info("writing archive to file %s", file_path)
        with open(file_path, 'w') as fp:
            json.dump(threads, fp, indent=4)

    if mbox:
        _write_mbox_archive(threads, base_arch_dir)

    logging.info("done. ")

    return threads
|
||||
|
||||
|
||||
|
||||
def archive_thread(li, base_url, parent_thread_data):
    """Archive the message of one thread-index entry and its follow-ups.

    :param li: list-item tag of the thread index; its first ``strong a``
        link and first ``em`` tag describe the message.
    :param base_url: URL the message link is resolved against.
    :param parent_thread_data: message dict of the parent, or ``None`` for a
        thread root; the new message is appended to its ``follow-up`` list.
    :return: the message dict.
    """
    anchor = li.select('strong a')[0]
    thread_url = urlparse.urljoin(base_url, anchor.get('href'))

    message = {
        u'id': anchor.get('name'),
        u'subject': anchor.string,
        u'url': thread_url,
        u'author_name': li.select('em')[0].string,
    }

    collect_message(thread_url, message)

    # NOTE(review): presumably 'ul > li' also matches replies nested deeper
    # than one level, which would attach them here as well as to their
    # direct parent -- confirm against a real index page.
    for reply in li.select('ul > li'):
        if reply.select('strong a'):
            archive_thread(reply, base_url, message)  # recursion

    if parent_thread_data is not None:
        if u'follow-up' not in parent_thread_data:
            parent_thread_data[u'follow-up'] = []
        parent_thread_data[u'follow-up'].append(message)

    return message
|
||||
|
||||
|
||||
def collect_message(url, message):
    """Download one archived message page and fill ``message`` from it.

    Header values are first read from the MHonArc X-comments and then
    overridden by the header list displayed on the page; the text of the
    second ``<pre>`` element is stored as ``content``.

    :param url: URL of the message page (also printed to stdout).
    :param message: dict that is updated in place with the keys
        ``subject``, ``date``, ``from``, ``message-id``, ``content-type``,
        ``content`` and, when displayed, ``to``.
    :raises IndexError: when the page has no second ``<pre>`` element.
    """
    print(url)

    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    # note: this should follow an RFC header standard -- MHonArc has header
    # info in the 1st <pre>
    message_labels = ('to', 'subject', 'from', 'date', 'message-id',
                      'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    message['from'] = parse_xcomment(soup, "X-From-R13")  # useless...
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page
    for item in soup.select('ul:nth-of-type(1) > li'):
        if item.em is None:
            continue
        field = item.em.string
        if field is None:
            continue
        label = field.lower()
        if label not in message_labels:
            continue

        # Remove the "<Field>:" prefix only. str.strip(chars) treats its
        # argument as a set of characters and would also eat matching
        # characters from the end of the value.
        text = item.text.strip()
        prefix = field + ":"
        if text.startswith(prefix):
            text = text[len(prefix):]
        message[label] = text.strip()

    # -- content --
    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
|
||||
|
||||
# mhonarc xcomments
|
||||
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
|
||||
def parse_xcomment(soup, xcom):
    """Return the value of the MHonArc X-comment named ``xcom``.

    :param soup: parsed page; only ``soup.find(text=<regex>)`` is used.
    :param xcom: comment name, e.g. ``"X-Subject"`` (used as a regex).
    :return: the text after ``"<xcom>:"`` with surrounding whitespace
        removed, or ``None`` when no text node matches.
    """
    com = soup.find(text=re.compile(xcom))
    if com is None:
        return None

    # Remove delimiters and the name as prefix/suffix. str.strip(chars)
    # works on a character set and damaged values such as "<id@host>".
    value = com.strip()
    if value.startswith('<!--'):
        value = value[len('<!--'):]
    if value.endswith('-->'):
        value = value[:-len('-->')]
    value = value.strip()

    prefix = xcom + ":"
    if value.startswith(prefix):
        value = value[len(prefix):]

    return value.strip()
|
||||
|
||||
def to_mbox_message(msg):
    """Build a ``mailbox.mboxMessage`` from an archived message dict.

    :param msg: mapping with the keys ``from``, ``subject``, ``message-id``,
        ``date`` and ``content``.
    :return: the mbox message; its envelope line is set from the ``From``
        header and the parsed ``Date`` header.
    """
    header_keys = (('From', 'from'),
                   ('Subject', 'subject'),
                   ('Message-Id', 'message-id'),
                   ('Date', 'date'))

    mime = MIMEText('', 'plain', _charset='utf8')
    for header, key in header_keys:
        mime[header] = msg[key]
    mime.set_payload(msg['content'], charset='utf8')

    boxed = mailbox.mboxMessage(mime)
    envelope_time = email.utils.parsedate(mime['Date'])
    boxed.set_from(mime['From'], envelope_time)
    return boxed
|
||||
|
||||
# throws exception
|
||||
def write_mbox_message(msg, mbox):
    """Add ``msg`` and, recursively, all of its follow-ups to ``mbox``.

    Exceptions raised while converting or adding a message are not caught.

    :param msg: message mapping; an optional ``follow-up`` entry lists the
        replies.
    :param mbox: object whose ``add`` method receives each mbox message.
    """
    mbox.add(to_mbox_message(msg))  # here

    replies = msg['follow-up'] if u'follow-up' in msg else ()
    for reply in replies:
        write_mbox_message(reply, mbox)
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
import urllib2, urllib, urlparse
|
||||
import os, re, json, gzip
|
||||
import mhonarccrawl
|
||||
import datetime
|
||||
|
||||
def archive_from_url(url, sublist_name="nettime-l", archive_dir="archives"):
    """Crawl a sub-list and store it as a dated, gzipped JSON archive.

    The per-thread files go to ``archive_dir/<sublist_name>``; the combined
    archive is written to ``archive_dir/<name>_<YYYY-MM-DD>.json.gz``.

    :param url: URL of the archive index page (trailing whitespace removed).
    :param sublist_name: name of the sub-list to archive.
    :param archive_dir: base directory of the archives.
    """
    url = url.rstrip()
    list_dir = check_dir(archive_dir, sublist_name)

    name = sublist_name.lower()
    stamp = datetime.datetime.now().strftime('%Y-%m-%d')

    archive = {
        'name': name,
        'url': url,
        'date': stamp,
        'threads': mhonarccrawl.collect_from_url(
            url, sublist_name, list_dir, mbox=True),
    }

    target = os.path.join(archive_dir, name + "_" + stamp + ".json.gz")
    with gzip.open(target, 'w') as fp:
        json.dump(archive, fp, indent=4)

    return
|
||||
|
||||
def check_dir(base_dir, list_name):
    """Return ``base_dir/list_name``, creating the directory if needed.

    :param base_dir: parent directory.
    :param list_name: name of the sub-directory.
    :return: the joined path.
    :raises OSError: when the directory cannot be created.
    """
    arc_dir = os.path.join(base_dir, list_name)
    try:
        # Create first and inspect afterwards: checking beforehand races
        # with another process creating the same directory.
        os.makedirs(arc_dir)
    except OSError:
        if not os.path.exists(arc_dir):
            raise
    return arc_dir
|
||||
Reference in New Issue
Block a user