From c3962aa5adf57beab31819b3be483c35a0cf184b Mon Sep 17 00:00:00 2001
From: gauthiier
Date: Mon, 25 Aug 2014 12:54:27 +0200
Subject: [PATCH] inject from skim

---
Review notes (everything up to the first "diff --git" is commentary):
- Line breaks and indentation were restored by hand. The hunks against
  nnnew.sh, parse-skim.py and parse.py are otherwise unchanged, but their
  context may need "git apply --ignore-whitespace" to match the files.
- The "index" lines still carry the blob ids of the original commit.
- compare.py reports new entries in the order of the annotation file
  instead of in set order.
- inject.py only treats a line that starts with "page" as a page marker
  (as parse.py now does), no longer fails on a marker without a number,
  emits pages in ascending order and notes in input order, and emits every
  pending entry once.
- inject.sh / parse-skim.sh: '$f' in single quotes was never empty, so the
  "no file" guard could not fire; the guard now tests for a real file,
  reports on stderr and exits with status 1.

 nnnew.sh            |   2 +
 parse/compare.py    |  62 ++++++++++
 parse/compare.sh    |  19 +++
 parse/inject.py     | 244 ++++++++++++++++++++++++++++++++++++++++++++
 parse/inject.sh     |  18 +++
 parse/parse-skim.py |  10 +--
 parse/parse-skim.sh |  21 ++++
 parse/parse.py      |  20 +++--
 8 files changed, 383 insertions(+), 13 deletions(-)
 create mode 100755 parse/compare.py
 create mode 100755 parse/compare.sh
 create mode 100755 parse/inject.py
 create mode 100755 parse/inject.sh
 mode change 100644 => 100755 parse/parse-skim.py
 create mode 100755 parse/parse-skim.sh

diff --git a/nnnew.sh b/nnnew.sh
index 9bdbf98..3678500 100755
--- a/nnnew.sh
+++ b/nnnew.sh
@@ -63,6 +63,8 @@ cp -aR $WHERE/template/* $TITLE
 
 cd $TITLE
 
+touch notes.mmd
+
 # file exists? copy to output
 [ -z "$FILE" ] && echo "No file to copy. Make sure to reference in text in the note!" || curl -O $FILE
 
diff --git a/parse/compare.py b/parse/compare.py
new file mode 100755
index 0000000..902eed3
--- /dev/null
+++ b/parse/compare.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+"""Print the annotations found in a second JSON document but not in a first.
+
+Usage: compare.py FIRST [SECOND]
+
+Each document is a JSON object with 'QUOTES' and 'NOTES' lists. Entries are
+matched on their 'quote' / 'note' text. SECOND is read from stdin when no
+second path is given. The result is written to stdout as JSON.
+"""
+import sys, os, string, json
+
+fp1 = ''
+fp2 = ''
+
+def missing_entries(known, incoming, key):
+    """Return the entries of `incoming` whose `key` text is not in `known`.
+
+    Entries keep the order they have in `incoming` (a set difference would
+    shuffle them). When several incoming entries share the same text only
+    the last one is kept, as with a dict keyed on that text.
+    """
+    seen = set(i[key] for i in known)
+    latest = {i[key]: i for i in incoming}
+    out = []
+    for i in incoming:
+        k = i[key]
+        if k not in seen:
+            seen.add(k)
+            out.append(latest[k])
+    return out
+
+def difference(data1, data2):
+    """Return the quotes and notes of `data2` that `data1` does not have."""
+    return {
+        'QUOTES' : missing_entries(data1['QUOTES'], data2['QUOTES'], 'quote'),
+        'NOTES' : missing_entries(data1['NOTES'], data2['NOTES'], 'note'),
+    }
+
+def open_file(p):
+    """Open `p` for reading, or exit with a message if it does not exist."""
+    if not os.path.exists(p):
+        sys.exit('File %s does not exists... Aborting.' % p)
+    return open(p, 'rb')
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        sys.exit('No input file... Aborting.')
+    fp1 = open_file(sys.argv[1])
+    if len(sys.argv) < 3:
+        fp2 = sys.stdin
+    else:
+        fp2 = open_file(sys.argv[2])
+
+    data1 = json.load(fp1)
+    data2 = json.load(fp2)
+
+    fp1.close()
+    fp2.close()
+
+    data = difference(data1, data2)
+
+    json.dump(data, sys.stdout)
diff --git a/parse/compare.sh b/parse/compare.sh
new file mode 100755
index 0000000..773a62b
--- /dev/null
+++ b/parse/compare.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Print what parse-skim.sh extracts and the index does not contain yet.
+
+INDXFILE='.indx'
+
+# keep the last markdown source found in the current directory
+for i in *.mmd; do
+    f=$i
+done
+
+# run "make index" when the index is missing or older than the source
+if [[ ! -f $INDXFILE ]]; then
+    make --quiet index
+elif [[ $f -nt $INDXFILE ]]; then
+    rm "$INDXFILE"
+    make --quiet index
+fi
+
+parse-skim.sh | compare.py "$INDXFILE"
diff --git a/parse/inject.py b/parse/inject.py
new file mode 100755
index 0000000..5e54c46
--- /dev/null
+++ b/parse/inject.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python
+"""Inject new quotes and notes (JSON) into a markdown source file.
+
+Usage: inject.py SOURCE [DATA]
+
+DATA is a JSON object with 'QUOTES' and 'NOTES' lists; it is read from stdin
+when no path is given. SOURCE is first copied to SOURCE.bak, then the copy is
+read line by line while SOURCE is rewritten with the new entries added to its
+'## ... QUOTES' and '## ... NOTES' sections.
+"""
+from statemachine import StateMachine
+import sys, os, string, json, shutil, codecs
+
+quote_nbr = sys.maxint
+fileout = ''
+fileref = ''
+notes = []
+quotes = []
+
+def error(c):
+    """End state: report a line that could not be identified."""
+    fp, l = c
+    sys.stderr.write('Unidentifiable line:\n'+ l)
+
+def eof(c):
+    """End state: the reference file has been read to its end."""
+    print "eof"
+    return
+
+def parse(c):
+    """Copy lines to the output until a '##' heading opens a section."""
+    fp, l = c
+    while 1:
+        line = fp.readline()
+        if not line: return eof, (fp, line)
+        if line[:2] == '##': return section(line), (fp, line)
+        else:
+            emit_line(line)
+            continue
+
+def QUOTES(c):
+    """Copy the quotes section, handing each 'page' line to process_quote.
+
+    Quotes that found no place before a 'page' line are emitted when the
+    section (or the file) ends.
+    """
+    fp, l = c
+    while 1:
+        line = fp.readline()
+        if not line:
+            emit_remaining_quotes()
+            return eof, (fp, line)
+        # the marker has to start the line (as in parse.py): a quote that
+        # merely contains the word "page" is copied like any other text
+        elif line.strip().upper().startswith('PAGE'): return process_quote, (fp, line)
+        elif line[:2] == '##':
+            emit_remaining_quotes()
+            return section(line), (fp, line)
+        else:
+            emit_line(line)
+            continue
+
+def NOTES(c):
+    """Copy the notes section; the new notes are added where it ends."""
+    fp, l = c
+    while 1:
+        line = fp.readline()
+        if not line:
+            emit_line('\n')
+            emit_remaining_notes()
+            return eof, (fp, line)
+        elif line.strip().upper().startswith('NOTE'): return process_note, (fp, line)
+        elif line[:2] == '##':
+            emit_line('\n')
+            emit_remaining_notes()
+            return section(line), (fp, line)
+        else:
+            emit_line(line)
+            continue
+
+def process_quote(c):
+    """Emit the new quotes of lower page numbers, then the 'page' line."""
+    fp, l = c
+    ppnbr = page_number(string.rsplit(l)[-1])
+    # a 'page' line that does not end in a number is only copied
+    if ppnbr is not None:
+        for i in sorted_pages(quotes):
+            n = page_number(i)
+            if n is not None and n < ppnbr:
+                emit_quotes(quotes[i])
+    emit_line(l)
+    return QUOTES(c)
+
+def process_note(c):
+    """Copy a 'note' line and carry on with the notes section."""
+    fp, l = c
+    emit_line(l)
+    return NOTES(c)
+
+
+####################
+
+def section(line):
+    """Emit a '##' heading and return the state for the section it opens.
+
+    A section with nothing left to receive is copied through by `parse`.
+    """
+    emit_line(line)
+    line = string.upper(line)
+    if string.find(line, 'NOTES') >= 0:
+        if not notes:
+            return parse
+        return NOTES
+    elif string.find(line, 'QUOTES') >= 0:
+        if not quotes:
+            return parse
+        return QUOTES
+    elif string.find(line, 'REFERENCE') >= 0: return parse
+    else: return parse
+
+def page_number(pp):
+    """Return `pp` as an int, or None when it is not a plain number."""
+    try:
+        return int(pp)
+    except (TypeError, ValueError):
+        return None
+
+def sorted_pages(entries):
+    """Return the keys of `entries`, numbered pages first and ascending."""
+    def order(pp):
+        n = page_number(pp)
+        return (n is None, n, pp)
+    return sorted(entries, key=order)
+
+def emit_remaining_quotes():
+    """Emit every quote still waiting, page by page, then forget them."""
+    for i in sorted_pages(quotes):
+        emit_quotes(quotes[i])
+    quotes.clear()
+
+def emit_quotes(entries):
+    """Emit and consume `entries`, last element first.
+
+    `reoder` stores every page in reverse, so this restores input order.
+    """
+    while entries:
+        emit_quote(entries.pop())
+
+def emit_quote(data):
+    """Write one quote: a 'page N' line followed by the quoted text."""
+    emit_line("page %s\n\n" % data['pp'])
+    emit_line(">" + data['quote'] + "\n")
+    emit_line('\n')
+
+def emit_remaining_notes():
+    """Emit every note still waiting, page by page, then forget them."""
+    for i in sorted_pages(notes):
+        pending = notes[i]
+        # pop from the end: `reoder` stored the page in reverse
+        while pending:
+            emit_note(pending.pop())
+    notes.clear()
+
+def emit_note(data):
+    """Write one note: a 'note: N' line followed by its text."""
+    emit_line("note: %s\n\n" % data['pp'])
+    emit_line(data['note'] + "\n" )
+    emit_line('\n')
+
+def emit_line(l):
+    """Write `l` to the output file."""
+    fileout.write(l)
+
+def reoder(q):
+    """Group the entries of `q` by their 'pp' value; `q` ends up empty.
+
+    Entries are popped from the end of `q`, so each group lists its entries
+    in reverse input order.
+    """
+    out = {}
+    while q:
+        i = q.pop()
+        out.setdefault(i['pp'], []).append(i)
+    return out
+
+def open_file(p):
+    """Open `p` for reading as UTF-8, or exit when it does not exist."""
+    if not os.path.exists(p):
+        sys.exit('File %s does not exists... Aborting.' % p)
+    return codecs.open(p, 'rb', 'utf-8')
+
+def open_fileoutput(p):
+    """Open the existing file `p` for rewriting as UTF-8, or exit."""
+    if not os.path.exists(p):
+        sys.exit('File %s does not exists... Aborting.' % p)
+    return codecs.open(p, 'r+', 'utf-8')
+
+def backupfile(p):
+    """Copy `p` to `p`.bak and return the copy, opened for reading."""
+    if not os.path.exists(p):
+        sys.exit('File %s does not exists... Aborting.' % p)
+    bak = p + '.bak'
+    shutil.copy2(p, bak)
+    return codecs.open(bak, 'r', 'utf-8')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        sys.exit('No input file... Aborting.')
+    # the backup copy is read while the source file itself is rewritten
+    fileref = backupfile(sys.argv[1])
+    fileout = open_fileoutput(sys.argv[1])
+    fileout.seek(0)
+    if len(sys.argv) < 3:
+        fp2 = sys.stdin
+    else:
+        fp2 = open_file(sys.argv[2])
+
+    # fp2 should be the incoming (json) data to inject in the source file
+    data = json.load(fp2)
+    fp2.close()
+
+    if not data['QUOTES'] and not data['NOTES']:
+        print "Document up-to-date."
+        fileout.close()
+        fileref.close()
+        sys.exit(0)
+
+    quotes = reoder(data['QUOTES'])
+    notes = reoder(data['NOTES'])
+
+    m = StateMachine();
+    m.add_state(parse)
+    m.add_state(NOTES)
+    m.add_state(QUOTES)
+    m.add_state(process_quote)
+    m.add_state(process_note)
+    m.add_state(error, end_state=1)
+    m.add_state(eof, end_state=1)
+    m.set_start(parse)
+    m.run((fileref, ''))
+
+    fileout.close()
+    fileref.close()
diff --git a/parse/inject.sh b/parse/inject.sh
new file mode 100755
index 0000000..a9afdc5
--- /dev/null
+++ b/parse/inject.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Inject what compare.sh reports into the markdown source, then reindex.
+
+for i in *.mmd; do
+    f=$i
+done
+
+# without a match the glob stays literal ("*.mmd"), so test for a real file
+if [[ ! -f "$f" ]]; then
+    echo "No mmd file (markdown source file) in directory... Aborting." >&2
+    exit 1
+fi
+
+# inject new nnnotes in source file
+compare.sh | inject.py "$f"
+
+# update index
+make --quiet index
diff --git a/parse/parse-skim.py b/parse/parse-skim.py
old mode 100644
new mode 100755
index be7ba92..58db9c1
--- a/parse/parse-skim.py
+++ b/parse/parse-skim.py
@@ -24,7 +24,7 @@ def highlight(c):
     fp, l = c
     p = page(l)
     text = fp.readline()
-    output['QUOTES'].append({'pp' : p, 'quote' : text})
+    output['QUOTES'].append({'pp' : p, 'quote' : text.strip()})
     return parse(c)
 
 def anchored_note(c):
@@ -33,22 +33,22 @@ def anchored_note(c):
     text = fp.readline()
     fp.readline()
     note = fp.readline()
-    output['QUOTES'].append({'pp' : p, 'quote' : text})
-    output['NOTES'].append({'pp' : p, 'note' : note})
+    output['QUOTES'].append({'pp' : p, 'quote' : text.strip()})
+    output['NOTES'].append({'pp' : p, 'note' : note.strip()})
     return parse(c)
 
 def box(c):
     fp, l = c
     p = page(l)
     text = fp.readline()
-    output['QUOTES'].append({'pp' : p, 'quote' : text})
+    output['QUOTES'].append({'pp' : p, 'quote' : text.strip()})
     return parse(c)
 
 def text_note(c):
     fp, l = c
     p = page(l)
     text = fp.readline()
-    output['NOTES'].append({'pp' : p, 'note' : text})
+    output['NOTES'].append({'pp' : p, 'note' : text.strip()})
     return parse(c)
 
 ## helper fncts
diff --git a/parse/parse-skim.sh b/parse/parse-skim.sh
new file mode 100755
index 0000000..d773de7
--- /dev/null
+++ b/parse/parse-skim.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Export the notes of the pdf in this directory (skimnotes) and parse them.
+
+for i in *.pdf; do
+    f=$i
+done
+
+# without a match the glob stays literal ("*.pdf"), so test for a real file;
+# the message goes to stderr because compare.sh pipes stdout into compare.py
+if [[ ! -f "$f" ]]; then
+    echo "No pdf file in directory... Aborting." >&2
+    exit 1
+fi
+
+filename="${f%.*}"
+
+skimnotes get -format txt "$f"
+
+parse-skim.py < "$filename.txt"
+
+#rm $filename.txt
diff --git a/parse/parse.py b/parse/parse.py
index 322a8fd..227cf9f 100755
--- a/parse/parse.py
+++ b/parse/parse.py
@@ -25,20 +25,20 @@ def parse(c):
 
 def QUOTES(c):
     fp, l = c
-    sys.stdout.write('QUOTES\n')
     while 1:
         line = fp.readline()
         if not line: return eof, (fp, line)
-        elif string.find(string.upper(line), 'PAGE') >= 0: return segment, (fp, line, 'QUOTES', markups['QUOTES'])
+        elif line.strip().upper().startswith('PAGE'): return segment, (fp, line, 'QUOTES', markups['QUOTES'])
+        elif line.strip().startswith(u'##'): return section(line), (fp, line)
         else: continue
 
-def NOTES(c):
+def NOTES(c):
     fp, l = c
-    sys.stdout.write('NOTES\n')
     while 1:
         line = fp.readline()
         if not line: return eof, (fp, line)
-        elif string.find(string.upper(line), 'NOTE') >= 0: return segment, (fp, line, 'NOTES', markups['NOTES'])
+        elif line.strip().upper().startswith('NOTE'): return segment, (fp, line, 'NOTES', markups['NOTES'])
+        elif line[:2] == '##': return section(line), (fp, line)
         else: continue
 
 def segment(c):
@@ -57,7 +57,7 @@ def segment(c):
             # transition: EOF - record entry
             rec_segment(c, t, q, cc, (sect, x, tt, y, cnt))
             return eof, (fp, line)
-        elif string.find(string.upper(line), m) >= 0:
+        elif line.strip().upper().startswith(m):
             # transition: new segment - record entry
             rec_segment(c, t, q, cc, (sect, x, tt, y, cnt))
             return segment, (fp, line, sect, mk)
@@ -84,8 +84,9 @@ def section(line):
     if string.find(line, 'NOTES') >= 0: return NOTES
     elif string.find(line, 'QUOTES') >= 0: return QUOTES
     elif string.find(line, 'REFERENCE') >= 0: return parse
-    else: return error
+    else: return parse
 
+# todo - optimise this (i.e: id != only the last word)
 def ext_identifier(line):
     b = string.rsplit(line)
     return b[-1]
@@ -97,8 +98,11 @@ def ext_tags(line):
 
 def rec_segment(idf, tags, text, cnt, mk):
     if not text:
-        print 'hmm... no quote on pp.', c
+        #sys.stderr.write('hmm... no quote on pp.' + idf)
         return None
+    if text[0] == '>':
+        text = text[1:]
+    text = text.strip()
     section_i, idf_i, tags_i, text_i, cnt_i = mk
     entry = {idf_i : idf, text_i : text, tags_i : tags, cnt_i : cnt}
     output[section_i].append(entry)