diff --git a/README b/README index ef216a8..d9e8043 100644 --- a/README +++ b/README @@ -4,5 +4,6 @@ Options: -h, --help show this help message and exit -u URL, --url=URL nettime url -l LIST, --list=LIST nettime's list name (ex: nettime-l) + -a ARCH, --arch=ARCH path to archive directory Dependencies: bs4 \ No newline at end of file diff --git a/archive_nettime.py b/archive_nettime.py index 3f36fac..101ba3c 100644 --- a/archive_nettime.py +++ b/archive_nettime.py @@ -14,7 +14,7 @@ def run(options): ## check valid url?... nej - nettime.archive_from_url(options.url, options.list) + nettime.archive_from_url(options.url, options.list, options.arch) sys.exit() if __name__ == "__main__": @@ -22,6 +22,7 @@ if __name__ == "__main__": p = OptionParser(); p.add_option('-u', '--url', action="store", help="nettime url", default="http://www.nettime.org/archives.php") p.add_option('-l', '--list', action="store", help="nettime's list name (ex: nettime-l)", default="nettime-l") + p.add_option('-a', '--arch', action="store", help="path to archive directory", default="archives") options, args = p.parse_args() diff --git a/lib/mhonarccrawl.py b/lib/mhonarccrawl.py index 2391faf..7500425 100644 --- a/lib/mhonarccrawl.py +++ b/lib/mhonarccrawl.py @@ -9,7 +9,7 @@ from pprint import pprint as pp import sys, os, re, json, gzip import traceback -DELAY = 0.5 +DELAY = 0.2 # hack for the mailbox module (re: force mbox.add() encoding to utf8) reload(sys) @@ -99,8 +99,18 @@ def collect_threads_from_url(url, base_arch_dir, mbox): for l in lists: n += 1 logging.info("> " + str(n) + " / " + nbr_threads) - thread = archive_thread(l, base_url, None) - threads['threads'].append(thread) + + try: + thread = archive_thread(l, base_url, None) + threads['threads'].append(thread) + except: + ex_type, ex, tb = sys.exc_info() + print ex_type + print ex + traceback.print_tb(tb) + del tb + continue + time.sleep(DELAY) # write