cv_filters + stats + htmlcards

2015-07-09 20:49:06 +02:00 · 2015-07-09 20:49:06 +02:00 · caa8b3ddb5
commit caa8b3ddb5
parent a5004449a5
7 changed files with 276 additions and 58 deletions
--- a/webpagetest/html/+++/lestyle.css
+++ b/webpagetest/html/+++/lestyle.css
@ -0,0 +1,38 @@
+/* Page fills the viewport with no default margins. */
+body {
+	width: 100%;
+	height: 100%;
+	margin: 0;
+	padding: 0;
+}
+
+/* Shared look of the user-agent box (.agent) and of each per-site card (.card). */
+.agent,
+.card {
+	padding: 1.5em;
+	float: left;
+	margin: 1em;
+	width: 20em;
+	background-color: #eeeeee;
+}
+
+.card {
+	display: inline;
+}
+
+/* Wrapper around the table of each card section. */
+.data {
+	margin-left: 0.5em;
+}
+
+/* Heading line of a card. */
+.name {
+	font-weight: bold;
+}
+
+table {
+	width: 100%;
+	/* border: 1px solid red;  -- presumably a layout-debugging border; kept disabled */
+}
+
+
--- a/webpagetest/html/index_template.html
+++ b/webpagetest/html/index_template.html
@ -0,0 +1,10 @@
+<html lang="en">
+<head>
+	<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+	<link rel="stylesheet" type="text/css" href="+++/lestyle.css"/>
+	<title>Cards - Micro-Temporalities [dmi15]</title>
+</head>
+<body><!-- the marker on the next line is substituted with the generated cards by stats_to_htmlcards.py -->
+		[[content]]
+</body>
+</html>
--- a/webpagetest/html/stats_to_htmlcards.py
+++ b/webpagetest/html/stats_to_htmlcards.py
@ -0,0 +1,113 @@
+import sys, csv, json, os, re
+
+def emit_header():
+	"""Return the static 'agent' box: the hard-coded test location and browser User-Agent."""
+	user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64)<br>AppleWebKit/537.36 (KHTML, like Gecko)<br>Chrome/43.0.2357.132 Safari/537.36 PTST/221\n"
+	return ''.join([
+		'<div class="agent">',
+		'<b>Location</b>:<br>', 'Amsterdam<br>\n', '<br>\n',
+		'<b>User-Agent</b>:<br>', user_agent,
+		'</div>\n',
+	])
+
+
+def emit_name(name, date, time):
+	"""Return the card heading: one div each (classes name, date, time) wrapping the given strings."""
+	fields = (('name', name), ('date', date), ('time', time))
+	# Plain concatenation (not %-formatting), so a non-string argument still raises TypeError.
+	return ''.join('<div class="' + css + '">' + text + '</div>\n' for css, text in fields)
+
+def string_format_percentage(pct):
+	"""Return the fraction pct (e.g. 0.25) as a whole-percent string ('25').
+
+	int() truncates toward zero, so the value is not rounded.
+	"""
+	return str(int(pct * 100))
+
+
+def emit_table_row(elem, index, total, label=None):
+	"""Return one <tr>: label, integer value of elem[index] and its share of total.
+
+	elem  -- dict of numeric values keyed by category
+	index -- key of the category to render
+	total -- non-zero divisor for the percentage column
+	label -- text of the first cell; defaults to index, as before this parameter existed
+	"""
+	if label is None:
+		label = index
+	value = elem[index]
+	# float() keeps the share fractional even when both operands are Python 2 ints.
+	return ('<tr><td>' + label + ': </td><td>' + str(int(value)) + '</td><td>'
+		+ string_format_percentage(value / float(total)) + '%</td></tr>\n')
+
+
+# Categories shown in every table, in display order, as (stats key, row label).
+# NOTE(review): 'privacy' is counted in the size/timing totals but has no row of
+# its own, so the listed percentages may not add up to 100 -- confirm intended.
+_TABLE_ROWS = (
+	('ad', 'ads'),
+	('analytics', 'analytics'),
+	('tracker', 'trackers'),
+	('widget', 'widgets'),
+	('-', 'other'),
+)
+
+
+def _emit_section(css_class, title, values, total):
+	"""Return <div class=css_class> with the title and the per-category table."""
+	if total == 0:
+		total = 1  # nothing to report: avoid dividing by zero
+	# Explicit labels: str.replace() on a finished row would also rewrite the '-' of negative values.
+	rows = [emit_table_row(values, key, total, label) for key, label in _TABLE_ROWS]
+	return ('<div class="' + css_class + '">' + '<h4>' + title + '</h4>\n'
+		+ '<div class="data">\n' + '<table>\n' + ''.join(rows)
+		+ '</table>\n' + '</div>\n' + '</div>\n')
+
+
+def emit_size(size):
+	"""Return the 'Objects Size (bytes)' section; shares are relative to the summed sizes."""
+	total = sum(size[k] for k in ('widget', 'ad', 'privacy', '-', 'analytics', 'tracker'))
+	return _emit_section('size', 'Objects Size (bytes)', size, total)
+
+
+def emit_item(item):
+	"""Return the 'Page Http Request Elements' section; shares are relative to item['total']."""
+	return _emit_section('items', 'Page Http Request Elements', item, item['total'])
+
+
+def emit_time(time):
+	"""Return the '(Micro) Timing (ms)' section; shares are relative to the summed timings."""
+	total = sum(time[k] for k in ('widget', 'ad', 'privacy', '-', 'analytics', 'tracker'))
+	return _emit_section('times', '(Micro) Timing (ms)', time, total)
+
+if __name__ == '__main__':
+
+	# The statistics arrive as JSON on stdin: a list of objects that each carry a 'stats' entry.
+	try:
+		all_stats = json.loads(sys.stdin.read())
+	except ValueError:
+		sys.exit('Error loading data... Aborting.')
+
+	# Read-only access suffices; the template is looked up in the current working directory.
+	try:
+		with open(os.path.join('.', 'index_template.html'), 'r') as template:
+			page = template.read()
+	except IOError:
+		# sys.exit(message) reports on stderr with status 1, keeping a redirected stdout clean.
+		sys.exit('error opening template file. aborting...')
+
+	# One card per entry: heading, request counts, object sizes and timings.
+	cards = [emit_header()]
+	for entry in all_stats:
+		stats = entry['stats']
+		cards.append("<div class='card'>\n"
+			+ "\t\t\t" + emit_name(stats['host'], stats['date'], stats['time'])
+			+ "\t\t\t" + emit_item(stats['items'])
+			+ "\t\t\t" + emit_size(stats['sizes'])
+			+ "\t\t\t" + emit_time(stats['times'])
+			+ "</div>\n")
+
+	# print(x) with a single argument behaves the same on Python 2 and 3.
+	print(page.replace('[[content]]', ''.join(cards)))
+	
+
--- a/webpagetest/wpt_csv_filter.py
+++ b/webpagetest/wpt_csv_filter.py
@ -1,13 +1,14 @@
 #!/usr/bin/python2.7

-import sys, csv, json, os
+import sys, csv, json, os, re
 from optparse import OptionParser

 # list of fields from the wpt csv file to keep
 csv_fields = [
 'Date',
 'Time',
-'tracker_type',
+'bug_type',
+'bug_name',
 'Sequence Number',
 'Host',
 'IP Address', 
@ -40,14 +41,24 @@ csv_fields = [
 'Initiator Line',
 'Expires',
 'Cached',
-'Cookie Count(out)',
+'Cookie Count(out)'
 ]

-def filter_fields(wpt_row, type):
+# matches a given url to all possible bugs
+def match(url, bugs):
+	"""Return {'name', 'type'} of the first bug whose regex 'pattern' is found in url, else None."""
+	for bug in bugs:
+		if re.search(bug['pattern'], url):
+			return dict(name=bug['name'], type=bug['type'])
+	return None
+
+# filters a csv row (deletes the columns not listed in csv_fields) and adds the bug type and bug name
+def filter_fields(wpt_row, type, name):
 	for k in wpt_row.keys():
 		if k not in csv_fields:
 			del wpt_row[k]
-	wpt_row['tracker_type'] = type
+	wpt_row['bug_type'] = type
+	wpt_row['bug_name'] = name

 def run(options):

@ -72,8 +83,6 @@ def run(options):
 	except Exception, ee:	
 		sys.exit('Error loading bugs data... Aborting.')

-	stats = {'total' : 0.0, 'ads': 0.0, 'trackers': 0.0, 'analytics': 0.0, 'widgets': 0.0, 'privacy': 0.0, 'blank': 0.0}
-
 	# write output
 	fname, ext = os.path.splitext(os.path.basename(csv_file))

@ -87,67 +96,31 @@ def run(options):
 	writer.writeheader()

 	last_seq = 0
+
 	for r in wpt_data:
 		seq = int(r['Sequence Number'])
 		if  seq < last_seq:
 			break
 		last_seq = seq
-		stats['total'] += 1
-		host = r['Host']
-		if any(a in host for a in bugs_data['ads']):
-#			print "ads: " + host
-			filter_fields(r, 'ad')
+
+		print str(seq)
+
+		url = r['Host'] + r['URL']
+		bug = match(url, bugs_data['bugs'])
+		if bug:
+			filter_fields(r, bug['type'], bug['name'])
 			writer.writerow(r);
-			stats['ads'] += 1
-			continue
-		if any(a in host for a in bugs_data['trackers']):
-#			print "trackers: " + host
-			filter_fields(r, 'tracker')
-			writer.writerow(r);
-			stats['trackers'] += 1
-			continue
-		if any(a in host for a in bugs_data['analytics']):
-#			print "analytics: " + host
-			filter_fields(r, 'analytics')
-			writer.writerow(r);
-			stats['analytics'] += 1
-			continue
-		if any(a in host for a in bugs_data['widgets']):
-#			print "widgets: " + host
-			filter_fields(r, 'widget')
-			writer.writerow(r);
-			stats['widgets'] += 1
-			continue
-		if any(a in host for a in bugs_data['privacy']):
-#			print "privacy: " + host
-			filter_fields(r, 'privacy')
-			writer.writerow(r);
-			stats['privacy'] += 1
-			continue			
+		else:
 			if options.keep:
-			stats['blank'] += 1
-			filter_fields(r, '-')
+				filter_fields(r, '-', '-')
 				writer.writerow(r);

-
-	if options.stats:
-		print "----- Stats: " + fname + ext + " -----"
-		print "total (elements): " + str(stats['total'])
-		print "ads: " + str(stats['ads']) + ' - ' + str(stats['ads'] / stats['total']) + '%'
-		print "trackers: " + str(stats['trackers']) + ' - ' + str(stats['trackers'] / stats['total']) + '%'
-		print "analytics: " + str(stats['analytics']) + ' - ' + str(stats['analytics'] / stats['total']) + '%'
-		print "widgets: " + str(stats['widgets']) + ' - ' + str(stats['widgets'] / stats['total']) + '%'
-		print "privacy: " + str(stats['privacy']) + ' - ' + str(stats['privacy'] / stats['total']) + '%'
-		print "..............."
-		print "* JUNK RATIO * " + str((stats['ads'] + stats['trackers'] + stats['analytics'] + stats['widgets'] + stats['privacy']) / stats['total']) + '%'			
-
 if __name__ == '__main__':

 	p = OptionParser();
 	p.add_option('-f', '--file', action="store", help="wpt csv input file")
 	p.add_option('-b', '--bugs', action="store", help="ghostery (formated) bugs input file")
 	p.add_option('-k', '--keep', action="store_true", help="keeps the non bugs html element")
-	p.add_option('-s', '--stats', action="store_true", help="prints basic stats")
 	p.add_option('-o', '--outputdir', action="store", help="output directory", default="")


--- a/webpagetest/wpt_csv_filter_batch.py
+++ b/webpagetest/wpt_csv_filter_batch.py
@ -1,6 +1,6 @@
 #!/usr/bin/python2.7

-import sys, csv, json, os
+import sys, csv, json, os, time
 from optparse import OptionParser
 import wpt_csv_filter as wptf

@ -30,11 +30,15 @@ def run(options):
 				csv_files.append(os.path.join(dirpath, fn))
 		break

+	i = 0
 	for f in csv_files:
+		i += 1
 		options.file = f
-		print "processing - " + f
+		print str(i) + "/" + str(len(csv_files)) + " - " + f
+		start_time = time.time()
 		wptf.run(options)
-		print ".......done........"
+		duration = time.time() - start_time
+		print "done - " + time.strftime('%H:%M:%S', time.gmtime(duration))

 if __name__ == '__main__':

@ -42,7 +46,6 @@ if __name__ == '__main__':
 	p.add_option('-i', '--inputdir', action="store", help="input directory (where all the wpt csv files reside)")
 	p.add_option('-b', '--bugs', action="store", help="ghostery (formated) bugs input file")
 	p.add_option('-k', '--keep', action="store_true", help="keeps the non bugs html element")
-	p.add_option('-s', '--stats', action="store_true", help="prints basic stats")
 	p.add_option('-o', '--outputdir', action="store", help="output directory (where all the filtered csv files will be placed)", default="")

 	options, args = p.parse_args()
--- a/webpagetest/wpt_csv_stats.py
+++ b/webpagetest/wpt_csv_stats.py
@ -0,0 +1,47 @@
+import sys, csv, json, os, re
+from optparse import OptionParser
+
+def run(csv_file_path):
+	"""Aggregate one filtered wpt csv file into per-category statistics.
+
+	Returns a dict with the 'Host', 'Date' and 'Time' of the first data row (None if
+	there are no rows) and three dicts keyed by 'bug_type': 'items' (row counts plus
+	'total'), 'sizes' (summed 'Object Size') and 'times' (summed 'Time to Load (ms)').
+	Exits the process if the file does not exist.
+	"""
+	if not os.path.exists(csv_file_path):
+		sys.exit('Input file does not exists. Aborting.')
+
+	categories = ('ad', 'tracker', 'analytics', 'widget', 'privacy', '-')
+	stats = {
+		'date': None, 'time': None, 'host': None,
+		'items': dict.fromkeys(categories + ('total',), 0.0),
+		'sizes': dict.fromkeys(categories, 0.0),
+		'times': dict.fromkeys(categories, 0.0),
+	}
+	items, sizes, times = stats['items'], stats['sizes'], stats['times']
+	with open(csv_file_path) as csv_file:
+		for d in csv.DictReader(csv_file):
+			# 'is None' (not truthiness): an empty Host in row one must not be replaced by a later row.
+			if stats['host'] is None:
+				stats['host'], stats['date'], stats['time'] = d['Host'], d['Date'], d['Time']
+			items['total'] += 1
+			bug_type = d['bug_type']
+			items[bug_type] += 1
+			sizes[bug_type] += int(d['Object Size'])
+			times[bug_type] += int(d['Time to Load (ms)'])
+
+	return stats
+
+if __name__ == '__main__':
+
+	# Command line: -f/--file names the wpt csv file to summarise.
+	p = OptionParser()
+	p.add_option('-f', '--file', action="store", help="wpt csv input file")
+	options, args = p.parse_args()
+
+	if not options.file:
+		sys.exit('No wpt csv input file specified. Aborting.')
+
+	# Single-argument print(...) is valid on Python 2 and 3 alike.
+	print(json.dumps(run(options.file), indent=2, separators=(',',':')))
--- a/webpagetest/wpt_csv_stats_batch.py
+++ b/webpagetest/wpt_csv_stats_batch.py
@ -0,0 +1,34 @@
+import sys, csv, json, os, re
+from optparse import OptionParser
+import wpt_csv_stats
+
+def run(input_dir):
+	"""Return [{'name': ..., 'stats': ...}] for every '.csv' file directly inside input_dir.
+
+	'name' is the file name without extension, 'stats' the result of wpt_csv_stats.run();
+	entries are ordered by file name, sub-directories are skipped; exits if input_dir is missing.
+	"""
+	if not os.path.exists(input_dir):
+		sys.exit('Input directory does not exists. Aborting.')
+	stats = []
+	for (dirpath, dirnames, filenames) in os.walk(input_dir):
+		for fn in sorted(filenames):  # os.walk yields file names in arbitrary order
+			fname, ext = os.path.splitext(fn)
+			if ext == '.csv':
+				stats.append({'name': fname, 'stats': wpt_csv_stats.run(os.path.join(dirpath, fn))})
+		break  # only the top-level directory is processed
+
+	return stats
+
+if __name__ == '__main__':
+
+	# Command line: -i/--inputdir names the directory holding the wpt csv files.
+	p = OptionParser()
+	p.add_option('-i', '--inputdir', action="store", help="input directory (where all the wpt csv files reside)")
+	options, args = p.parse_args()
+
+	if not options.inputdir:
+		sys.exit('No input directory specified. Aborting.')
+
+	# Single-argument print(...) is valid on Python 2 and 3 alike.
+	print(json.dumps(run(options.inputdir), indent=2, separators=(',',':')))