diff --git a/archive.py b/archive.py
index 18698bf..4dc0652 100644
--- a/archive.py
+++ b/archive.py
@@ -15,7 +15,7 @@ def run(args):
     i = 0
     for u in args.url:
         name = args.names[i] if i < len(args.names) else None
-        lists.crawl.crawl(u, name, args.arch)
+        lists.crawl.crawl(url=u, name=name, sublist_name=name, archive_dir=args.arch) #<-- sublist for nettime
         i = i + 1
     sys.exit()
diff --git a/lists/crawl.py b/lists/crawl.py
index 73529e2..4aaf016 100644
--- a/lists/crawl.py
+++ b/lists/crawl.py
@@ -26,7 +26,9 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
     # special case -- nettime.
     # the name should be the sublist_name (i.e nettime-l)
     elif "nettime" in name:
-        mhonarc_nettime.collect_from_url(url, name, name, archive_dir)
+        if sublist_name is None:
+            sublist_name = name
+        mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
     else:
         print('mhonarc?')
diff --git a/lists/mhonarc_nettime.py b/lists/mhonarc_nettime.py
index 5476f37..6380c75 100644
--- a/lists/mhonarc_nettime.py
+++ b/lists/mhonarc_nettime.py
@@ -6,125 +6,132 @@ DELAY = 0.2
 
 def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+	response = urllib.request.urlopen(url)
+	html = response.read()
+	soup = BeautifulSoup(html, "html5lib")
 
-    # base url
-    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
+	# base url
+	base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')
 
 #collect name
-    list_name = soup.select('body p:nth-of-type(2) title')[0].string
-    logging.info("Getting " + list_name + " list archive for " + sublist_name)
+	list_name = soup.select('body p:nth-of-type(2) title')[0].string
+	logging.info("Getting " + list_name + " list archive for " + sublist_name)
 
-    # create (main) directory
-    # this is where all temp files will be created
-    d = os.path.join(base_archive_dir, name)
-    if not os.path.exists(d):
-        os.makedirs(d)
+	# create (main) directory
+	# this is where all temp files will be created
+	d = os.path.join(base_archive_dir, name)
+	if not os.path.exists(d):
+		os.makedirs(d)
 
-    threads = []
-    lists = soup.select('ul:nth-of-type(2) li')
+	threads = []
+	lists = soup.select('ul:nth-of-type(2) li')
 
-    for l in lists:
-        if l.strong is None:
-            continue
+	for l in lists:
 
-        name = l.strong.string
+		if l.strong is None:
+			continue
 
-        if name.lower() == sublist_name.lower():
+		name = l.strong.string
+		print(name + " - " + sublist_name)
 
-            threads_url_list = []
-            threads_links = l.select('ul li a')
-            for t in threads_links:
-                thread_url = urllib.parse.urljoin(base_url, t.get('href'))
-                threads_url_list.append(thread_url)
+		if name.lower() == sublist_name.lower():
 
-            nbr_threads = str(len(threads_url_list))
-            n = 0
+			threads_url_list = []
+			threads_links = l.select('ul li a')
 
-            for u in threads_url_list:
-                time.sleep(DELAY)
-                n += 1
-                logging.info("## " + str(n) + " / " + nbr_threads + " ##")
-                try:
-                    threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
-                except KeyboardInterrupt:
-                    sys.exit(0)
-                except:
-                    logging.warning("Error archiving: " + l[1] + "... Continuing.")
-                    ex_t, ex, tb = sys.exc_info()
-                    print(ex_t)
-                    traceback.print_tb(tb)
-                    del tb
-                    continue
+			for t in threads_links:
+				thread_url = urllib.parse.urljoin(base_url, t.get('href'))
+				threads_url_list.append(thread_url)
 
-            return threads
+			nbr_threads = str(len(threads_url_list))
+			n = 0
 
-            # for u in threads_url_list[0:10]:
-            # print "---------------------------------------"
-            # tt = collect_threads_from_url(u, base_archive_dir, mbox)
-            # threads.append(tt)
+			for u in threads_url_list:
+				time.sleep(DELAY)
+				n += 1
+				logging.info("## " + str(n) + " / " + nbr_threads + " ##")
+				try:
+					threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
+				except KeyboardInterrupt:
+					sys.exit(0)
+				except:
+					logging.warning("Error archiving: " + l[1] + "... Continuing.")
+					ex_t, ex, tb = sys.exc_info()
+					print(ex_t)
+					traceback.print_tb(tb)
+					del tb
+					continue
 
-    return None
+			return threads
+
+	# for u in threads_url_list[0:10]:
+	# print "---------------------------------------"
+	# tt = collect_threads_from_url(u, base_archive_dir, mbox)
+	# threads.append(tt)
+
+	return None
 
 def collect_threads_from_url(url, base_archive_dir, mbox=False):
-    response = urllib.request.urlopen(url)
-    html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+	response = urllib.request.urlopen(url)
+	html = response.read()
+	soup = BeautifulSoup(html, "html5lib")
 
-    # base url
-    base_url = url
+	# base url
+	base_url = url
 
-    # collect name
-    threads_name = soup.select('p:nth-of-type(1) title')[0].string
-    threads_name = threads_name.replace(' ', '_')
+	# collect name
 
-    # thread data struct
-    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
+	e = soup.select('p:nth-of-type(1) title')
+	print(soup)
 
-    logging.info("Collecting Threads of: " + threads_name)
+	threads_name = soup.select('p:nth-of-type(1) title')[0].string
+	threads_name = threads_name.replace(' ', '_')
 
-    # check if archive already exists
-    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
-    if os.path.isfile(file_path):
-        logging.info("archive already exists. loading from file " + file_path)
-        with open(file_path, 'r') as fpin:
-            threads = json.load(fpin)
-    else:
-        lists = soup.select('ul:nth-of-type(1) > li')
+	# thread data struct
+	threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}
 
-        nbr_threads = str(len(lists))
-        n = 0
+	logging.info("Collecting Threads of: " + threads_name)
 
-        for l in lists:
-            n += 1
-            logging.info("> " + str(n) + " / " + nbr_threads)
+	# check if archive already exists
+	file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
+	if os.path.isfile(file_path):
+		logging.info("archive already exists. loading from file " + file_path)
+		with open(file_path, 'r') as fpin:
+			threads = json.load(fpin)
+	else:
+		lists = soup.select('ul:nth-of-type(1) > li')
 
-            try:
-                thread = archive_thread(l, base_url, None)
-                threads['threads'].append(thread)
-            except:
-                ex_type, ex, tb = sys.exc_info()
-                traceback.print_tb(tb)
-                del tb
-                continue
+		nbr_threads = str(len(lists))
+		n = 0
 
-            time.sleep(DELAY)
+		for l in lists:
+			n += 1
+			logging.info("> " + str(n) + " / " + nbr_threads)
 
-    # write
-    logging.info("writing archive to file " + file_path)
+			try:
+				thread = archive_thread(l, base_url, None)
+				threads['threads'].append(thread)
+			except:
+				ex_type, ex, tb = sys.exc_info()
+				traceback.print_tb(tb)
+				del tb
+				continue
 
-    with open(file_path, 'w') as fp:
-        json.dump(threads, fp, indent=4)
+			time.sleep(DELAY)
 
-    logging.info("done. ")
+	# write
+	logging.info("writing archive to file " + file_path)
 
-    return threads
+	with open(file_path, 'w') as fp:
+		json.dump(threads, fp, indent=4)
 
-    
+	logging.info("done. ")
+
+	return threads
+
+
 
 def archive_thread(li, base_url, parent_thread_data):
@@ -158,57 +165,57 @@ def archive_thread(li, base_url, parent_thread_data):
 
 def collect_message(url, message):
-    response = urllib.request.urlopen(url)
-    html = response.read().decode(encoding="utf-8")
-    # html = response.read()
-    soup = BeautifulSoup(html, "html5lib")
+	response = urllib.request.urlopen(url)
+	html = response.read().decode(encoding="utf-8")
+	# html = response.read()
+	soup = BeautifulSoup(html, "html5lib")
 
-    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th 
+	#note: this should follow an RFC header standard -- MHonArc has header info in the 1th 
 
-    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')    
+	message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')    
 
-    # mhonarc xcomments
-    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
-    message['subject'] = parse_xcomment(soup, "X-Subject")
-    message['date'] = parse_xcomment(soup, "X-Date")
-    message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
-    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
-    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')
+	# mhonarc xcomments
+	# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
+	message['subject'] = parse_xcomment(soup, "X-Subject")
+	message['date'] = parse_xcomment(soup, "X-Date")
+	message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
+	message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
+	message['content-type'] = parse_xcomment(soup, 'X-Content-Type')
 
-    # parse what is displayed on the page
+	# parse what is displayed on the page
 
-    info = soup.select('ul:nth-of-type(1) > li')
+	info = soup.select('ul:nth-of-type(1) > li')
 
-    for i in info:
-        if i.em == None:
-            continue
-        field = i.em.string
-        if field.lower() in message_labels:
-        	message[field.lower()] = i.text.strip(field + ": ")
+	for i in info:
+		if i.em == None:
+			continue
+		field = i.em.string
+		if field.lower() in message_labels:
+			message[field.lower()] = i.text.strip(field + ": ")
 
-    ## reformat from -- [author_name, email_addr]
+	## reformat from -- [author_name, email_addr]
 
-    # from_addr = email.utils.parseaddr(message['from'])
-    # message['author_name'] = from_addr[0]
-    # message['from'] = from_addr[1]
+	# from_addr = email.utils.parseaddr(message['from'])
+	# message['author_name'] = from_addr[0]
+	# message['from'] = from_addr[1]
 
-    ## -- content --
-    # test
-    # c1 = soup.select('pre:nth-of-type(1)')
-    # if len(c1) > 0:
-    #     message['content'] = c1[0].text
-    # else:
-    #     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+	## -- content --
+	# test
+	# c1 = soup.select('pre:nth-of-type(1)')
+	# if len(c1) > 0:
+	#     message['content'] = c1[0].text
+	# else:
+	#     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
-    message['content'] = soup.select('pre:nth-of-type(2)')[0].text
+	message['content'] = soup.select('pre:nth-of-type(2)')[0].text
 
 # mhonarc xcomments
 # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
 def parse_xcomment(soup, xcom):
-    com = soup.find(text=re.compile(xcom))
-    if com is not None:
-        return com.strip('').strip(xcom + ":").strip()
-    return com
+	com = soup.find(text=re.compile(xcom))
+	if com is not None:
+		return com.strip('').strip(xcom + ":").strip()
+	return com
 
 def test_xcomment(soup):
-    return soup.find(text=re.compile('X-Message-Id')) is not None
+	return soup.find(text=re.compile('X-Message-Id')) is not None
diff --git a/www/archives.py b/www/archives.py
deleted file mode 100644
index 7402376..0000000
--- a/www/archives.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import logging, os, json
-import search.archive
-
-class Singleton(type):
-    _instances = {}
-    def __call__(cls, *args, **kwargs):
-        if cls not in cls._instances:
-            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
-            logging.info('**** new Singleton instance')
-        return cls._instances[cls]
-
-class Archives(metaclass=Singleton):
-
-	def __init__(self, archives_dir=None):
-
-		if archives_dir==None:
-			from www import config
-			self.archives_dir = config.ARCHIVES_PATH
-		else:
-			self.archives_dir = archives_dir
-
-		self.data = {}
-		self.loaded = False
-
-		logging.info('loading archives...')
-
-		self.load()
-
-		logging.info('done.')
-
-	def load(self):
-
-		if self.loaded:
-			return
-
-		if not os.path.isdir(self.archives_dir):
-			logging.error("Archives:: the path - " + self.archives_dir + " - is not a valid directory. Aborting.")
-			logging.error(" -- current cwd is: " + os.getcwd())
-
-			return
-
-		arch = [d for d in os.listdir(self.archives_dir) if os.path.isdir(os.path.join(self.archives_dir, d))]
-
-		self.data = {}
-		for a in arch:
-
-			logging.info("loading " + a)
-
-			# archive_path = os.path.join(self.archives_dir, a)
-			self.data[a] = self.load_archive(self.archives_dir, a)
-
-			logging.info("done.")
-
-		self.loaded = True
-		
-
-	def load_archive(self, archive_dir, archive_name):
-
-		if not os.path.isdir(archive_dir):
-			logging.error("Archives:: the path - " + archive_dir + " - is not a valid directory. Aborting.")
-			return
-
-		archive = search.archive.Archive(archive_dir)
-		archive.load(archive_name)
-		return archive
-
-		# # -- shoudl use Archive in searh module here....
-
-		# files = [f for f in os.listdir(archive_dir) if f.endswith('.json')]
-
-		# arch = {}
-		# for f in files:
-		# 	file_path = os.path.join(archive_dir, f)
-		# 	with open(file_path) as fdata:
-		# 		arch[f.replace('.json', '')] = json.load(fdata)
-
-		# return arch	
-
-
diff --git a/www/routes.py b/www/routes.py
index f2b33be..952e80f 100644
--- a/www/routes.py
+++ b/www/routes.py
@@ -11,6 +11,11 @@ import logging
 def index():
 	return render_template("index.html")
 
+@app.route('/favicon.ico')
+def favicon():
+    return send_from_directory(os.path.join(app.root_path, 'static'),
+                               'favicon.ico', mimetype='image/vnd.microsoft.icon')	
+
 @app.route('/search')
 def searh():
 	
diff --git a/www/static/CYBERPLA.GIF b/www/static/CYBERPLA.GIF
new file mode 100644
index 0000000..f6e0f05
Binary files /dev/null and b/www/static/CYBERPLA.GIF differ
diff --git a/www/static/cover.gif b/www/static/cover.gif
new file mode 100644
index 0000000..dab2539
Binary files /dev/null and b/www/static/cover.gif differ
diff --git a/www/static/favicon.ico b/www/static/favicon.ico
new file mode 100644
index 0000000..4d55fa3
Binary files /dev/null and b/www/static/favicon.ico differ
diff --git a/www/static/ooo.js b/www/static/ooo.js
new file mode 100644
index 0000000..f05f5ac
--- /dev/null
+++ b/www/static/ooo.js
@@ -0,0 +1,17 @@
+var c = 255
+var x = 0
+function b() {
+	c -= 1;
+	document.body.style.backgroundColor = 'rgb(' + c + ',' + c + ',' + c + ')';
+}
+function m() {
+	x += 0.7
+	s = document.getElementById('search');
+	s.style.left = x + 'px'
+	s.style.top = x + 'px'
+}
+window.onload = function () {
+	// setInterval(b, 500);
+	setInterval(m, 200);
+};
+
diff --git a/www/static/search.js b/www/static/search.js
index 47db7c3..4403987 100644
--- a/www/static/search.js
+++ b/www/static/search.js
@@ -2,6 +2,11 @@
 $(document).ready(function(){
 	$('#loading').hide()
 
+	$('#info').click( function() {
+		console.log("click");
+		$('#info-search').toggle();
+	});
+
 	$('#search').submit(function(e) {
 		e.preventDefault();
 		args = $(this).serialize();	
@@ -38,7 +43,7 @@ function search_result_archive(a) {
 		
 		$.each(r.hits, function(j, h){
 			// console.log(h)
-			let hit = ' • ' + h.subject + ' -- ' + h.author_name + ' • ';
+			let hit = ' • ' + h.subject + ' -- ' + h.author_name + ' • ';
 			hits += hit;
 		});
 		hits += "";
diff --git a/www/templates/index.html b/www/templates/index.html
index 646c4a0..155ed56 100644
--- a/www/templates/index.html
+++ b/www/templates/index.html
@@ -1,6 +1,11 @@
-
-
-
 
     ---> SEARCH <---
 
+
+Times of Nettime
+
+
+
+
\ No newline at end of file
diff --git a/www/templates/search.html b/www/templates/search.html
index c5b5715..ee42174 100644
--- a/www/templates/search.html
+++ b/www/templates/search.html
@@ -1,5 +1,6 @@
 
+Times of Nettime
 
@@ -21,8 +22,21 @@
 {% endfor %}
 
+
     Loading...
+