cv_filters + stats + htmlcards

This commit is contained in:
gauthiier 2015-07-09 20:49:06 +02:00
parent a5004449a5
commit caa8b3ddb5
7 changed files with 276 additions and 58 deletions

View File

@ -0,0 +1,38 @@
/* Styles for the page body and for the "agent", "card", "data" and "name"
   elements, plus the tables placed inside them. */

/* Let the page fill the viewport with no default margin or padding. */
body {
    width: 100%;
    height: 100%;
    margin: 0;
    padding: 0;
}

/* Grey floated box, fixed width. */
.agent {
    padding: 1.5em;
    float: left;
    margin: 1em;
    width: 20em;
    background-color: #eeeeee;
}

/* Same grey floated box as .agent, with an additional display: inline. */
.card {
    padding: 1.5em;
    float: left;
    display: inline;
    margin: 1em;
    width: 20em;
    background-color: #eeeeee;
}

/* Slight indent for the block that wraps a table. */
.data {
    margin-left: 0.5em;
}

.name {
    font-weight: bold;
}

table {
    width: 100%;
    /* Debug outline, disabled. It was previously written as the invalid
       property "dborder", which browsers silently ignore. */
    /* border: 1px solid red; */
}

View File

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<link rel="stylesheet" type="text/css" href="+++/lestyle.css"/>
<title>Cards - Micro-Temporalities [dmi15]</title>
<!-- The [[content]] marker in the body is a placeholder that is meant to be
     substituted with generated markup before the page is served. -->
</head>
<body>
[[content]]
</body>
</html>

View File

@ -0,0 +1,113 @@
import sys, csv, json, os, re
def emit_header():
    """Return the hard-coded "agent" box: test location and browser user agent."""
    fragments = (
        '<div class="agent">',
        '<b>Location</b>:<br>',
        'Amsterdam<br>\n',
        '<br>\n',
        '<b>User-Agent</b>:<br>',
        "Mozilla/5.0 (Windows NT 6.1; WOW64)<br>AppleWebKit/537.36 (KHTML, like Gecko)<br>Chrome/43.0.2357.132 Safari/537.36 PTST/221\n",
        '</div>\n',
    )
    return ''.join(fragments)
def emit_name(name, date, time):
    """Return three <div> lines (classes name/date/time) wrapping the given strings.

    The values are inserted verbatim; no HTML escaping is applied.
    """
    return ''.join([
        '<div class="name">', name, '</div>\n',
        '<div class="date">', date, '</div>\n',
        '<div class="time">', time, '</div>\n',
    ])
def string_format_percentage(pct):
    """Return the fraction *pct* (e.g. 0.25) as a whole-percent string ('25').

    The value is truncated towards zero, but only after binary floating-point
    noise has been rounded away, so that 29/100 is reported as '29' and not
    '28' (0.29 * 100 evaluates to 28.999999999999996).
    """
    return str(int(round(pct * 100, 6)))


def emit_table_row(elem, index, total, label=None):
    """Return one three-cell table row: label, integer value, share of total.

    elem  -- mapping holding the value under key *index*
    index -- key to read from *elem*; also the row label unless *label* is given
    total -- denominator for the percentage cell (must be non-zero)
    label -- optional text shown in the first cell instead of *index*
    """
    if label is None:
        label = index
    value = elem[index]
    # float() keeps the ratio fractional on Python 2 when both operands are ints.
    share = string_format_percentage(float(value) / total)
    return ('<tr><td>' + label + ': </td><td>' + str(int(value))
            + '</td><td>' + share + '%</td></tr>\n')


# (key in the stats mapping, label shown in the table), in display order.
# 'privacy' may count towards a section total but has no row of its own.
_SECTION_ROWS = (
    ('ad', 'ads'),
    ('analytics', 'analytics'),
    ('tracker', 'trackers'),
    ('widget', 'widgets'),
    ('-', 'other'),
)

# Keys summed to obtain the denominator for the size and timing sections.
_TOTAL_KEYS = ('widget', 'ad', 'privacy', '-', 'analytics', 'tracker')


def _emit_section(css_class, title, data, total):
    """Return a titled <div> wrapping a table with one row per _SECTION_ROWS entry."""
    if total == 0:
        total = 1  # avoid dividing by zero; every share then renders as 0%
    parts = [
        '<div class="' + css_class + '">',
        '<h4>' + title + '</h4>\n',
        '<div class="data">\n',
        '<table>\n',
    ]
    # The display label is passed explicitly rather than patched in with
    # str.replace() on the finished row, which also rewrote any '-' sign
    # appearing in the numeric cells.
    for key, label in _SECTION_ROWS:
        parts.append(emit_table_row(data, key, total, label))
    parts.append('</table>\n')
    parts.append('</div>\n')
    parts.append('</div>\n')
    return ''.join(parts)


def emit_size(size):
    """Return the "Objects Size (bytes)" section for a per-category size mapping.

    Shares are relative to the sum over all six categories.
    """
    total = sum(size[key] for key in _TOTAL_KEYS)
    return _emit_section('size', 'Objects Size (bytes)', size, total)


def emit_item(item):
    """Return the "Page Http Request Elements" section for a per-category count mapping.

    Shares are relative to item['total'].
    """
    return _emit_section('items', 'Page Http Request Elements', item, item['total'])


def emit_time(time):
    """Return the "(Micro) Timing (ms)" section for a per-category timing mapping.

    Shares are relative to the sum over all six categories.
    """
    total = sum(time[key] for key in _TOTAL_KEYS)
    return _emit_section('times', '(Micro) Timing (ms)', time, total)
def _render_cards(entries):
    """Return the page body: the agent header followed by one card per entry.

    Each entry is expected to carry a 'stats' mapping with 'host', 'date',
    'time', 'items', 'sizes' and 'times' keys.
    """
    pieces = [emit_header()]
    for entry in entries:
        site = entry['stats']
        pieces.append(
            "<div class='card'>\n"
            + "\t\t\t" + emit_name(site['host'], site['date'], site['time'])
            + "\t\t\t" + emit_item(site['items'])
            + "\t\t\t" + emit_size(site['sizes'])
            + "\t\t\t" + emit_time(site['times'])
            + "</div>\n"
        )
    return ''.join(pieces)


def main():
    """Read a JSON stats list from stdin and print the filled-in HTML template."""
    try:
        stats = json.loads(sys.stdin.read())
    except (ValueError, IOError):
        sys.exit('Error loading data... Aborting.')

    template_path = os.path.join('.', 'index_template.html')
    try:
        # Read-only access is sufficient, and the handle is closed on exit.
        with open(template_path, 'r') as template:
            page = template.read()
    except IOError:
        # Report on stderr with a non-zero status so that the message cannot
        # end up inside redirected HTML output and callers can detect failure.
        sys.exit('error opening template file. aborting...')

    # The parenthesised form is valid on both Python 2 and Python 3.
    print(page.replace('[[content]]', _render_cards(stats)))


if __name__ == '__main__':
    main()

View File

@ -1,13 +1,14 @@
#!/usr/bin/python2.7
import sys, csv, json, os
import sys, csv, json, os, re
from optparse import OptionParser
# list of fields from the wpt csv file to keep
csv_fields = [
'Date',
'Time',
'tracker_type',
'bug_type',
'bug_name',
'Sequence Number',
'Host',
'IP Address',
@ -40,14 +41,24 @@ csv_fields = [
'Initiator Line',
'Expires',
'Cached',
'Cookie Count(out)',
'Cookie Count(out)'
]
def filter_fields(wpt_row, type):
# matches a given url to all possible bugs

# Compiled patterns keyed by their source text. The bug list can be far larger
# than the re module's internal cache, so relying on re.compile() alone would
# recompile every pattern for every url.
_COMPILED_BUG_PATTERNS = {}


def match(url, bugs):
    """Return {'name', 'type'} of the first bug whose pattern is found in *url*.

    url  -- string to search
    bugs -- iterable of mappings with 'pattern' (regular expression source),
            'name' and 'type' keys
    Returns None when no pattern matches.
    """
    for b in bugs:
        source = b['pattern']
        compiled = _COMPILED_BUG_PATTERNS.get(source)
        if compiled is None:
            compiled = _COMPILED_BUG_PATTERNS[source] = re.compile(source)
        if compiled.search(url):
            return {'name': b['name'], 'type': b['type']}
    return None
# filters the csv row (drops the columns that are not kept) and tags it with
# the bug type and name
def filter_fields(wpt_row, type, name):
    """Strip *wpt_row* in place down to the csv_fields columns and tag it.

    wpt_row -- mutable mapping for one csv row; modified in place
    type    -- value stored under both 'tracker_type' and 'bug_type'
    name    -- value stored under 'bug_name'
    """
    # Iterate over a snapshot of the keys: deleting entries while iterating the
    # live key view raises RuntimeError on Python 3.
    for k in list(wpt_row):
        if k not in csv_fields:
            del wpt_row[k]
    wpt_row['tracker_type'] = type
    wpt_row['bug_type'] = type
    wpt_row['bug_name'] = name
def run(options):
@ -72,8 +83,6 @@ def run(options):
except Exception, ee:
sys.exit('Error loading bugs data... Aborting.')
stats = {'total' : 0.0, 'ads': 0.0, 'trackers': 0.0, 'analytics': 0.0, 'widgets': 0.0, 'privacy': 0.0, 'blank': 0.0}
#write ouput
fname, ext = os.path.splitext(os.path.basename(csv_file))
@ -87,67 +96,31 @@ def run(options):
writer.writeheader()
last_seq = 0
for r in wpt_data:
seq = int(r['Sequence Number'])
if seq < last_seq:
break
last_seq = seq
stats['total'] += 1
host = r['Host']
if any(a in host for a in bugs_data['ads']):
# print "ads: " + host
filter_fields(r, 'ad')
print str(seq)
url = r['Host'] + r['URL']
bug = match(url, bugs_data['bugs'])
if bug:
filter_fields(r, bug['type'], bug['name'])
writer.writerow(r);
stats['ads'] += 1
continue
if any(a in host for a in bugs_data['trackers']):
# print "trackers: " + host
filter_fields(r, 'tracker')
writer.writerow(r);
stats['trackers'] += 1
continue
if any(a in host for a in bugs_data['analytics']):
# print "analytics: " + host
filter_fields(r, 'analytics')
writer.writerow(r);
stats['analytics'] += 1
continue
if any(a in host for a in bugs_data['widgets']):
# print "widgets: " + host
filter_fields(r, 'widget')
writer.writerow(r);
stats['widgets'] += 1
continue
if any(a in host for a in bugs_data['privacy']):
# print "privacy: " + host
filter_fields(r, 'privacy')
writer.writerow(r);
stats['privacy'] += 1
continue
else:
if options.keep:
stats['blank'] += 1
filter_fields(r, '-')
filter_fields(r, '-', '-')
writer.writerow(r);
if options.stats:
print "----- Stats: " + fname + ext + " -----"
print "total (elements): " + str(stats['total'])
print "ads: " + str(stats['ads']) + ' - ' + str(stats['ads'] / stats['total']) + '%'
print "trackers: " + str(stats['trackers']) + ' - ' + str(stats['trackers'] / stats['total']) + '%'
print "analytics: " + str(stats['analytics']) + ' - ' + str(stats['analytics'] / stats['total']) + '%'
print "widgets: " + str(stats['widgets']) + ' - ' + str(stats['widgets'] / stats['total']) + '%'
print "privacy: " + str(stats['privacy']) + ' - ' + str(stats['privacy'] / stats['total']) + '%'
print "..............."
print "* JUNK RATIO * " + str((stats['ads'] + stats['trackers'] + stats['analytics'] + stats['widgets'] + stats['privacy']) / stats['total']) + '%'
if __name__ == '__main__':
p = OptionParser();
p.add_option('-f', '--file', action="store", help="wpt csv input file")
p.add_option('-b', '--bugs', action="store", help="ghostery (formated) bugs input file")
p.add_option('-k', '--keep', action="store_true", help="keeps the non bugs html element")
p.add_option('-s', '--stats', action="store_true", help="prints basic stats")
p.add_option('-o', '--outputdir', action="store", help="output directory", default="")

View File

@ -1,6 +1,6 @@
#!/usr/bin/python2.7
import sys, csv, json, os
import sys, csv, json, os, time
from optparse import OptionParser
import wpt_csv_filter as wptf
@ -30,11 +30,15 @@ def run(options):
csv_files.append(os.path.join(dirpath, fn))
break
i = 0
for f in csv_files:
i += 1
options.file = f
print "processing - " + f
print str(i) + "/" + str(len(csv_files)) + " - " + f
start_time = time.time()
wptf.run(options)
print ".......done........"
duration = time.time() - start_time
print "done - " + time.strftime('%H:%M:%S', time.gmtime(duration))
if __name__ == '__main__':
@ -42,7 +46,6 @@ if __name__ == '__main__':
p.add_option('-i', '--inputdir', action="store", help="input directory (where all the wpt csv files reside)")
p.add_option('-b', '--bugs', action="store", help="ghostery (formated) bugs input file")
p.add_option('-k', '--keep', action="store_true", help="keeps the non bugs html element")
p.add_option('-s', '--stats', action="store_true", help="prints basic stats")
p.add_option('-o', '--outputdir', action="store", help="output directory (where all the filtered csv files will be placed)", default="")
options, args = p.parse_args()

View File

@ -0,0 +1,47 @@
import sys, csv, json, os, re
from optparse import OptionParser
def run(csv_file_path):
    """Aggregate per-category counts, sizes and load times from one csv file.

    Reads the 'Host', 'Date', 'Time', 'bug_type', 'Object Size' and
    'Time to Load (ms)' columns. Returns a dict with 'date', 'time' and 'host'
    (taken from the first row that has a non-empty host) plus the 'items',
    'sizes' and 'times' mappings keyed by bug type; 'items' additionally
    carries a 'total' row count. Exits the process if the file does not exist.
    """
    if not os.path.exists(csv_file_path):
        sys.exit('Input file does not exists. Aborting.')

    categories = ('ad', 'tracker', 'analytics', 'widget', 'privacy', '-')
    counts = dict.fromkeys(categories, 0.0)
    counts['total'] = 0.0
    byte_sums = dict.fromkeys(categories, 0.0)
    load_ms = dict.fromkeys(categories, 0.0)

    stats = {
        'date': None,
        'time': None,
        'host': None,
        'items': counts,
        'sizes': byte_sums,
        'times': load_ms,
    }

    with open(csv_file_path) as handle:
        for row in csv.DictReader(handle):
            if not stats['host']:
                # Page-level metadata comes from the first row.
                stats['host'] = row['Host']
                stats['date'] = row['Date']
                stats['time'] = row['Time']
            counts['total'] += 1
            kind = row['bug_type']
            counts[kind] += 1
            byte_sums[kind] += int(row['Object Size'])
            load_ms[kind] += int(row['Time to Load (ms)'])

    return stats
if __name__ == '__main__':
    # Command-line entry point: aggregate one csv file and print the result as JSON.
    p = OptionParser()
    p.add_option('-f', '--file', action="store", help="wpt csv input file")
    options, args = p.parse_args()
    if not options.file:
        sys.exit('No wpt csv input file specified. Aborting.')
    stats = run(options.file)
    # The parenthesised form is valid on both Python 2 and Python 3.
    print(json.dumps(stats, indent=2, separators=(',', ':')))

View File

@ -0,0 +1,34 @@
import sys, csv, json, os, re
from optparse import OptionParser
import wpt_csv_stats
def run(input_dir):
    """Collect statistics for every '.csv' file directly inside *input_dir*.

    Only the top level of the directory is scanned; sub-directories are not
    descended into. Returns a list of {'name': <file name without extension>,
    'stats': <result of wpt_csv_stats.run>} dicts, ordered by file name so the
    output is reproducible. Exits the process if *input_dir* does not exist.
    """
    if not os.path.exists(input_dir):
        sys.exit('Input directory does not exists. Aborting.')

    # The first os.walk() entry describes input_dir itself; the default covers
    # the case where the path exists but yields nothing (e.g. it is a file).
    dirpath, _, filenames = next(os.walk(input_dir), (input_dir, [], []))

    stats = []
    for fn in sorted(filenames):
        fname, ext = os.path.splitext(fn)
        if ext == '.csv':
            filepath = os.path.join(dirpath, fn)
            stats.append({'name': fname, 'stats': wpt_csv_stats.run(filepath)})
    return stats
if __name__ == '__main__':
    # Command-line entry point: aggregate a directory of csv files and print
    # the resulting list as JSON.
    p = OptionParser()
    p.add_option('-i', '--inputdir', action="store", help="input directory (where all the wpt csv files reside)")
    options, args = p.parse_args()
    if not options.inputdir:
        sys.exit('No input directory specified. Aborting.')
    stats = run(options.inputdir)
    # The parenthesised form is valid on both Python 2 and Python 3.
    print(json.dumps(stats, indent=2, separators=(',', ':')))