index +search info

gauthiier 2019-07-17 12:55:47 +02:00
parent f06a720e55
commit 07a026e93f
12 changed files with 186 additions and 210 deletions

View File

@@ -15,7 +15,7 @@ def run(args):
    i = 0
    for u in args.url:
        name = args.names[i] if i < len(args.names) else None
-       lists.crawl.crawl(u, name, args.arch)
+       lists.crawl.crawl(url=u, name=name, sublist_name=name, archive_dir=args.arch) #<-- sublist for nettime
        i = i + 1
    sys.exit()

View File

@@ -26,7 +26,9 @@ def crawl(url, name, sublist_name=None, archive_dir="archives"):
    # special case -- nettime.
    # the name should be the sublist_name (i.e nettime-l)
    elif "nettime" in name:
-       mhonarc_nettime.collect_from_url(url, name, name, archive_dir)
+       if sublist_name is None:
+           sublist_name = name
+       mhonarc_nettime.collect_from_url(url, name, sublist_name, archive_dir)
    else:
        print('mhonarc?')
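As the comments above note, nettime is archived per sub-list, so the caller passes the sub-list name (e.g. nettime-l) alongside the archive name. A hypothetical invocation, not part of this commit, with a placeholder URL and directory; only "nettime-l" is taken from the comment above:

import lists.crawl

# Placeholder values for illustration; crawl() falls back to sublist_name = name when it is omitted.
lists.crawl.crawl(
    url="https://example.org/nettime-archive/",
    name="nettime",
    sublist_name="nettime-l",
    archive_dir="archives",
)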

View File

@@ -6,125 +6,132 @@ DELAY = 0.2

def collect_from_url(url, name, sublist_name, base_archive_dir="archives", mbox=False):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
    base_url = soup.select('body p:nth-of-type(2) base')[0].get('href')

    # collect name
    list_name = soup.select('body p:nth-of-type(2) title')[0].string

    logging.info("Getting " + list_name + " list archive for " + sublist_name)

    # create (main) directory
    # this is where all temp files will be created
    d = os.path.join(base_archive_dir, name)
    if not os.path.exists(d):
        os.makedirs(d)

    threads = []
    lists = soup.select('ul:nth-of-type(2) li')

    for l in lists:

        if l.strong is None:
            continue

        name = l.strong.string

        if name.lower() == sublist_name.lower():
-           print(name + " - " + sublist_name)

            threads_url_list = []
            threads_links = l.select('ul li a')
            for t in threads_links:
                thread_url = urllib.parse.urljoin(base_url, t.get('href'))
                threads_url_list.append(thread_url)

            nbr_threads = str(len(threads_url_list))
            n = 0
            for u in threads_url_list:
                time.sleep(DELAY)
                n += 1
                logging.info("## " + str(n) + " / " + nbr_threads + " ##")
                try:
                    threads.append(collect_threads_from_url(u, base_archive_dir=d, mbox=mbox))
                except KeyboardInterrupt:
                    sys.exit(0)
                except:
                    logging.warning("Error archiving: " + l[1] + "... Continuing.")
                    ex_t, ex, tb = sys.exc_info()
                    print(ex_t)
                    traceback.print_tb(tb)
                    del tb
                    continue

            return threads

    # for u in threads_url_list[0:10]:
    #     print "---------------------------------------"
    #     tt = collect_threads_from_url(u, base_archive_dir, mbox)
    #     threads.append(tt)

    return None


def collect_threads_from_url(url, base_archive_dir, mbox=False):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    # base url
    base_url = url

    # collect name
+   e = soup.select('p:nth-of-type(1) title')
+   print(soup)
    threads_name = soup.select('p:nth-of-type(1) title')[0].string
    threads_name = threads_name.replace(' ', '_')

    # thread data struct
    threads = {'name' : threads_name, 'url' : base_url, 'threads' : []}

    logging.info("Collecting Threads of: " + threads_name)

    # check if archive already exists
    file_path = os.path.join(base_archive_dir, threads['name'] + ".json")
    if os.path.isfile(file_path):
        logging.info("archive already exists. loading from file " + file_path)
        with open(file_path, 'r') as fpin:
            threads = json.load(fpin)
    else:
        lists = soup.select('ul:nth-of-type(1) > li')

        nbr_threads = str(len(lists))
        n = 0

        for l in lists:
            n += 1
            logging.info("> " + str(n) + " / " + nbr_threads)

            try:
                thread = archive_thread(l, base_url, None)
                threads['threads'].append(thread)
            except:
                ex_type, ex, tb = sys.exc_info()
                traceback.print_tb(tb)
                del tb
                continue

            time.sleep(DELAY)

        # write
        logging.info("writing archive to file " + file_path)

        with open(file_path, 'w') as fp:
            json.dump(threads, fp, indent=4)

        logging.info("done. ")

    return threads


def archive_thread(li, base_url, parent_thread_data):
@@ -158,57 +165,57 @@ def archive_thread(li, base_url, parent_thread_data):

def collect_message(url, message):
    response = urllib.request.urlopen(url)
    html = response.read().decode(encoding="utf-8")
    # html = response.read()
    soup = BeautifulSoup(html, "html5lib")

    #note: this should follow an RFC header standard -- MHonArc has header info in the 1th <pre>
    message_labels = ('to', 'subject', 'from', 'date', 'message-id', 'content-type')

    # mhonarc xcomments
    # ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
    message['subject'] = parse_xcomment(soup, "X-Subject")
    message['date'] = parse_xcomment(soup, "X-Date")
    message['from'] = parse_xcomment(soup, "X-From-R13") #useless...
    message['message-id'] = parse_xcomment(soup, 'X-Message-Id')
    message['content-type'] = parse_xcomment(soup, 'X-Content-Type')

    # parse what is displayed on the page
    info = soup.select('ul:nth-of-type(1) > li')
    for i in info:
        if i.em == None:
            continue
        field = i.em.string
        if field.lower() in message_labels:
            message[field.lower()] = i.text.strip(field + ": ")

    ## reformat from -- [author_name, email_addr]
    # from_addr = email.utils.parseaddr(message['from'])
    # message['author_name'] = from_addr[0]
    # message['from'] = from_addr[1]

    ## -- content --
    # test
    # c1 = soup.select('pre:nth-of-type(1)')
    # if len(c1) > 0:
    #     message['content'] = c1[0].text
    # else:
    #     message['content'] = soup.select('pre:nth-of-type(2)')[0].text
    message['content'] = soup.select('pre:nth-of-type(2)')[0].text


# mhonarc xcomments
# ref: http://www.schlaubert.de/MHonArc/doc/resources/printxcomments.html
def parse_xcomment(soup, xcom):
    com = soup.find(text=re.compile(xcom))
    if com is not None:
        return com.strip('<!-- ').strip(' -->').strip(xcom + ":").strip()
    return com


def test_xcomment(soup):
    return soup.find(text=re.compile('X-Message-Id')) is not None
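The X-comment parsing above relies on MHonArc embedding message headers as HTML comments (see the printxcomments reference). A minimal sketch of that convention, not part of this commit; the page snippet and subject value are invented, and BeautifulSoup with html5lib is assumed to be installed:

import re
from bs4 import BeautifulSoup

# Invented MHonArc-style page fragment: the header lives in an HTML comment.
sample = '<!--X-Subject: nettime digest, vol 1 --><html><body><pre>body text</pre></body></html>'
soup = BeautifulSoup(sample, "html5lib")

com = soup.find(text=re.compile("X-Subject"))                          # matches the comment node
value = com.strip('<!-- ').strip(' -->').strip("X-Subject:").strip()   # same strip chain as parse_xcomment()
print(value)   # nettime digest, vol 1

Note that str.strip() here removes a set of characters, not a literal prefix, so header values that begin or end with letters drawn from "X-Subject:" can lose characters with this chain.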

View File

@@ -1,79 +0,0 @@
import logging, os, json
import search.archive


class Singleton(type):
    _instances = {}
    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
            logging.info('**** new Singleton instance')
        return cls._instances[cls]


class Archives(metaclass=Singleton):

    def __init__(self, archives_dir=None):
        if archives_dir==None:
            from www import config
            self.archives_dir = config.ARCHIVES_PATH
        else:
            self.archives_dir = archives_dir
        self.data = {}
        self.loaded = False
        logging.info('loading archives...')
        self.load()
        logging.info('done.')

    def load(self):
        if self.loaded:
            return
        if not os.path.isdir(self.archives_dir):
            logging.error("Archives:: the path - " + self.archives_dir + " - is not a valid directory. Aborting.")
            logging.error(" -- current cwd is: " + os.getcwd())
            return
        arch = [d for d in os.listdir(self.archives_dir) if os.path.isdir(os.path.join(self.archives_dir, d))]
        self.data = {}
        for a in arch:
            logging.info("loading " + a)
            # archive_path = os.path.join(self.archives_dir, a)
            self.data[a] = self.load_archive(self.archives_dir, a)
        logging.info("done.")
        self.loaded = True

    def load_archive(self, archive_dir, archive_name):
        if not os.path.isdir(archive_dir):
            logging.error("Archives:: the path - " + archive_dir + " - is not a valid directory. Aborting.")
            return
        archive = search.archive.Archive(archive_dir)
        archive.load(archive_name)
        return archive

        # # -- shoudl use Archive in searh module here....
        # files = [f for f in os.listdir(archive_dir) if f.endswith('.json')]
        # arch = {}
        # for f in files:
        #     file_path = os.path.join(archive_dir, f)
        #     with open(file_path) as fdata:
        #         arch[f.replace('.json', '')] = json.load(fdata)
        # return arch
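For reference, the Singleton metaclass removed in this file caches one instance per class, so every call to Archives() returned the same loaded object. A self-contained sketch of that behaviour, not part of this commit, using a hypothetical stand-in class:

import logging

class Singleton(type):
    _instances = {}
    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
            logging.info('**** new Singleton instance')
        return cls._instances[cls]

class Cache(metaclass=Singleton):   # hypothetical stand-in for Archives
    def __init__(self):
        self.data = {}

a = Cache()
b = Cache()
print(a is b)   # True -- the second call returns the cached instance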

View File

@@ -11,6 +11,11 @@ import logging
def index():
    return render_template("index.html")

+@app.route('/favicon.ico')
+def favicon():
+    return send_from_directory(os.path.join(app.root_path, 'static'),
+                               'favicon.ico', mimetype='image/vnd.microsoft.icon')
+
@app.route('/search')
def searh():

BIN  www/static/CYBERPLA.GIF  Normal file (binary file not shown; 527 B)

BIN  www/static/cover.gif  Normal file (binary file not shown; 4.7 KiB)

BIN  www/static/favicon.ico  Normal file (binary file not shown; 894 B)
www/static/ooo.js  Normal file (+17 lines)
View File

@@ -0,0 +1,17 @@
var c = 255
var x = 0
function b() {
    c -= 1;
    document.body.style.backgroundColor = 'rgb(' + c + ',' + c + ',' + c + ')';
}
function m() {
    x += 0.7
    s = document.getElementById('search');
    s.style.left = x + 'px'
    s.style.top = x + 'px'
}
window.onload = function () {
    // setInterval(b, 500);
    setInterval(m, 200);
};

View File

@@ -2,6 +2,11 @@
$(document).ready(function(){
    $('#loading').hide()

+   $('#info').click( function() {
+       console.log("click");
+       $('#info-search').toggle();
+   });

    $('#search').submit(function(e) {
        e.preventDefault();
        args = $(this).serialize();
@@ -38,7 +43,7 @@ function search_result_archive(a) {
        $.each(r.hits, function(j, h){
            // console.log(h)
-           let hit = '<li><a href="' + h.url+ '">' + h.subject + '</a> -- <i>' + h.author_name + '</i></li>';
+           let hit = '<li><a href="' + h.url+ '" target="_blank">' + h.subject + '</a> -- <i>' + h.author_name + '</i></li>';
            hits += hit;
        });
        hits += "</ul>";

View File

@@ -1,6 +1,11 @@
<html>
-<head></head>
-<body>
-<a href="/search"><h3>---> SEARCH <---</h3></a>
+<head>
+    <title>Times of Nettime</title>
+    <script type="text/javascript" src="{{ url_for('static',filename='ooo.js') }}"></script>
+</head>
+<body bgcolor="blue">
+    <div id="search" style="position: absolute;">
+        <a href="/search"><h3><img src="{{ url_for('static',filename='CYBERPLA.GIF') }}" width="150"></h3></a>
+    </div>
</body>
</html>

View File

@@ -1,5 +1,6 @@
<html>
<head>
+    <title>Times of Nettime</title>
    <link rel="stylesheet" type="text/css" href="{{ url_for('static',filename='c3.min.css') }}">
    <script type=text/javascript src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.2.1/jquery.min.js"></script>
    <script type="text/javascript" src="{{ url_for('static',filename='d3.min.js') }}" charset="utf-8"></script>
@@ -21,8 +22,21 @@
        {% endfor %}
    </select>
    <input type="submit" value="search" id="submit">
+   <input type="button" value=" ? " id="info">
    <div id="loading">Loading...</div>
</form>
+<div id="info-search" style="display: none">
+   <table><tbody><tr><th>Operator</th><th> </th></tr>
+   <tr><td>+</td><td>The word is mandatory in all text returned.</td></tr>
+   <tr><td>-</td><td>The word cannot appear in any text returned.</td></tr>
+   <tr><td>&lt;</td><td>The word that follows has a lower relevance than other words, although text containing it will still match.</td></tr>
+   <tr><td>&gt;</td><td>The word that follows has a higher relevance than other words.</td></tr>
+   <tr><td>()</td><td>Used to group words into subexpressions.</td></tr>
+   <tr><td>~</td><td>The word that follows contributes negatively to the relevance of the text (unlike the '-' operator, which excludes the word entirely, and the '&lt;' operator, which still lets the word contribute positively to the relevance).</td></tr>
+   <tr><td>*</td><td>The wildcard, indicating zero or more characters. It can only appear at the end of a word.</td></tr>
+   <tr><td>"</td><td>Anything enclosed in the double quotes is taken as a whole (so you can match phrases, for example).</td></tr>
+   </tbody></table>
+</div>
<div id="graph"></div>
<div id="results"></div>
</body>
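The operator table added above matches MySQL-style boolean full-text search syntax. Assuming the /search endpoint passes the query string through to such an index, queries could be composed as sketched below; all of the search terms are invented examples, not taken from the archive:

# Invented example queries built from the operators documented in the table above.
queries = [
    '+tactical +media',       # both words are mandatory
    '+nettime -digest',       # 'nettime' required, 'digest' excluded
    '>art <spam',             # raise the relevance of 'art', lower that of 'spam'
    '+(net culture) ~list',   # group words; 'list' counts against relevance
    'cyber*',                 # trailing wildcard: cyberspace, cybernetics, ...
    '"times of nettime"',     # exact phrase
]
for q in queries:
    print(q)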