new nnnotes and qqquotes identifiers

This commit is contained in:
gauthiier 2014-08-25 17:57:34 +02:00
parent c3962aa5ad
commit e88d0d1973
3 changed files with 75 additions and 35 deletions

View File

@ -7,6 +7,7 @@ fileout = ''
fileref = '' fileref = ''
notes = [] notes = []
quotes = [] quotes = []
notes_cnt = 0
def error(c): def error(c):
fp, l = c fp, l = c
@ -33,7 +34,7 @@ def QUOTES(c):
if not line: if not line:
emit_remaining_quotes() emit_remaining_quotes()
return eof, (fp, line) return eof, (fp, line)
elif string.find(string.upper(line), 'PAGE') >= 0: return process_quote, (fp, line) elif is_quote_identifier(line): return process_quote, (fp, line)
elif line[:2] == '##': elif line[:2] == '##':
emit_remaining_quotes() emit_remaining_quotes()
return section(line), (fp, line) return section(line), (fp, line)
@ -49,7 +50,7 @@ def NOTES(c):
emit_line('\n') emit_line('\n')
emit_remaining_notes() emit_remaining_notes()
return eof, (fp, line) return eof, (fp, line)
elif string.find(string.upper(line), 'NOTE') >= 0: return process_note, (fp, line) elif is_note_identifier(line): return process_note, (fp, line)
elif line[:2] == '##': elif line[:2] == '##':
emit_line('\n') emit_line('\n')
emit_remaining_notes() emit_remaining_notes()
@ -60,7 +61,7 @@ def NOTES(c):
def process_quote(c): def process_quote(c):
fp, l = c fp, l = c
ppnbr = int(string.rsplit(l)[-1]) ppnbr = int(extract_identifier(l))
for i in quotes.keys(): for i in quotes.keys():
if int(i) < ppnbr: if int(i) < ppnbr:
emit_quotes(quotes[i]) emit_quotes(quotes[i])
@ -68,7 +69,11 @@ def process_quote(c):
return QUOTES(c) return QUOTES(c)
def process_note(c): def process_note(c):
global notes_cnt
fp, l = c fp, l = c
cnt = int(extract_identifier(l))
if(cnt > notes_cnt):
notes_cnt = cnt
emit_line(l) emit_line(l)
return NOTES(c) return NOTES(c)
@ -89,6 +94,18 @@ def section(line):
elif string.find(line, 'REFERENCE') >= 0: return parse elif string.find(line, 'REFERENCE') >= 0: return parse
else: return parse else: return parse
def is_quote_identifier(line):
    """Return True if `line` is an HTML-comment marker mentioning PAGE.

    The line is stripped and upper-cased before testing, so surrounding
    whitespace and letter case are ignored. The line matches when it
    starts with "<!--" and the text PAGE occurs anywhere in it.
    """
    marker = line.strip().upper()
    return marker.startswith("<!--") and "PAGE" in marker
def is_note_identifier(line):
    """Return True if `line` is an HTML-comment marker mentioning NOTE.

    The line is stripped and upper-cased before testing, so surrounding
    whitespace and letter case are ignored. The line matches when it
    starts with "<!--" and the text NOTE occurs anywhere in it.
    """
    marker = line.strip().upper()
    return marker.startswith("<!--") and "NOTE" in marker
def extract_identifier(line):
    """Return the last whitespace-separated word of a comment marker line.

    The "<!--" and "-->" delimiters are removed first, so for a line such
    as "<!--page 12-->" the result is "12". An empty string is returned
    when nothing is left after removing the delimiters (the previous
    implementation raised IndexError in that case).
    """
    content = line.replace('<!--', '').replace('-->', '')
    # split() with no argument already discards leading/trailing whitespace.
    words = content.split()
    if not words:
        return ''
    return words[-1]
def emit_remaining_quotes(): def emit_remaining_quotes():
rest = [] rest = []
for i in quotes: for i in quotes:
@ -100,8 +117,8 @@ def emit_quotes(list):
emit_quote(list.pop()) emit_quote(list.pop())
def emit_quote(data): def emit_quote(data):
emit_line("page " + data['pp'] + "\n\n") emit_line("<!--page " + data['pp'] + "-->\n\n")
emit_line(">" + data['quote'] + "\n") emit_line(">\"" + data['quote'] + "\" pp." + data['pp'] + "\n")
emit_line('\n') emit_line('\n')
def emit_remaining_notes(): def emit_remaining_notes():
@ -112,9 +129,10 @@ def emit_remaining_notes():
emit_note(j) emit_note(j)
def emit_note(data): def emit_note(data):
emit_line("note: " + data['pp'] + "\n\n") global notes_cnt
emit_line(data['note'] + "\n" ) notes_cnt += 1
emit_line('\n') emit_line("<!--note " + str(notes_cnt) + "-->\n\n")
emit_line(str(notes_cnt) + ". " + data['note'] + "\n\n" )
def emit_line(l): def emit_line(l):
#l = l.encode('utf-8') #l = l.encode('utf-8')
@ -164,8 +182,6 @@ if __name__ == '__main__':
data = json.load(fp2) data = json.load(fp2)
fp2.close() fp2.close()
print data
if not data['QUOTES'] and not data['NOTES']: if not data['QUOTES'] and not data['NOTES']:
print "Document up-to-date." print "Document up-to-date."
fileout.close() fileout.close()
@ -174,6 +190,8 @@ if __name__ == '__main__':
quotes = reoder(data['QUOTES']) quotes = reoder(data['QUOTES'])
notes = reoder(data['NOTES']) notes = reoder(data['NOTES'])
notes_cnt = 0
m = StateMachine(); m = StateMachine();
m.add_state(parse) m.add_state(parse)
m.add_state(NOTES) m.add_state(NOTES)

View File

@ -15,4 +15,4 @@ skimnotes get -format txt $f
parse-skim.py < "$filename.txt" parse-skim.py < "$filename.txt"
#rm $filename.txt rm $filename.txt

View File

@ -3,7 +3,19 @@
from statemachine import StateMachine from statemachine import StateMachine
import sys, string, re, json import sys, string, re, json
markups = {'QUOTES' : ('PAGE', 'pp', 'tags', 'quote', 'fpc'), 'NOTES' : ('NOTE', '#', 'tags', 'note', 'fpc')} def is_quote_identifier(line):
l = line.strip().upper()
return l.startswith("<!--") and l.find("PAGE") >= 0
def is_note_identifier(line):
    """Return True if `line` is an HTML-comment marker mentioning NOTE.

    The line is stripped and upper-cased before testing, so surrounding
    whitespace and letter case are ignored. The line matches when it
    starts with "<!--" and the text NOTE occurs anywhere in it.
    """
    marker = line.strip().upper()
    return marker.startswith("<!--") and "NOTE" in marker
def is_tag_identifier(line):
    """Return True if `line` opens with '<' but is not a '<!' construct.

    Leading and trailing whitespace is ignored. Lines starting with "<!"
    (which includes "<!--" comment markers) are rejected, so only other
    '<'-prefixed lines count as tag lines.
    """
    stripped = line.strip()
    return stripped.startswith('<') and not stripped.startswith('<!')
markups = {'QUOTES' : (is_quote_identifier, 'pp', 'tags', 'quote', 'fpc'), 'NOTES' : (is_note_identifier, '#', 'tags', 'note', 'fpc')}
output = {'QUOTES' : [], 'NOTES' : []} output = {'QUOTES' : [], 'NOTES' : []}
@ -20,7 +32,7 @@ def parse(c):
while 1: while 1:
line = fp.readline() line = fp.readline()
if not line: return eof, (fp, line) if not line: return eof, (fp, line)
if line[:2] == '##': return section(line), (fp, line) if line.strip().startswith('##'): return section(line), (fp, line)
else: continue else: continue
def QUOTES(c): def QUOTES(c):
@ -28,8 +40,8 @@ def QUOTES(c):
while 1: while 1:
line = fp.readline() line = fp.readline()
if not line: return eof, (fp, line) if not line: return eof, (fp, line)
elif line.strip().upper().startswith('PAGE'): return segment, (fp, line, 'QUOTES', markups['QUOTES']) elif is_quote_identifier(line): return segment, (fp, line, 'QUOTES', markups['QUOTES'])
elif line.strip().startswith(u'##'): return section(line), (fp, line) elif line.strip().startswith('##'): return section(line), (fp, line)
else: continue else: continue
def NOTES(c): def NOTES(c):
@ -37,8 +49,8 @@ def NOTES(c):
while 1: while 1:
line = fp.readline() line = fp.readline()
if not line: return eof, (fp, line) if not line: return eof, (fp, line)
elif line.strip().upper().startswith('NOTE'): return segment, (fp, line, 'NOTES', markups['NOTES']) elif is_note_identifier(line): return segment, (fp, line, 'NOTES', markups['NOTES'])
elif line[:2] == '##': return section(line), (fp, line) elif line.strip().startswith('##'): return section(line), (fp, line)
else: continue else: continue
def segment(c): def segment(c):
@ -49,25 +61,28 @@ def segment(c):
q = '' q = ''
cc = '' cc = ''
# identifier # identifier
c = ext_identifier(l) c = extract_identifier(l)
while 1: while 1:
cursor = fp.tell() cursor = fp.tell()
line = fp.readline() line = fp.readline()
if not line: if not line:
# transition: EOF - record entry # transition: EOF - record entry
rec_segment(c, t, q, cc, (sect, x, tt, y, cnt)) record_segment(c, t, q, cc, (sect, x, tt, y, cnt))
return eof, (fp, line) return eof, (fp, line)
elif line.strip().upper().startswith(m):
elif m(line):
# transition: new segment - record entry # transition: new segment - record entry
rec_segment(c, t, q, cc, (sect, x, tt, y, cnt)) record_segment(c, t, q, cc, (sect, x, tt, y, cnt))
return segment, (fp, line, sect, mk) return segment, (fp, line, sect, mk)
elif line[:1] == '<': elif is_tag_identifier(line):
# tags # tags
t += ext_tags(line) t += extract_tags(line)
continue continue
elif line[:2] == '##': elif line[:2] == '##':
# transition: new section - record entry # transition: new section - record entry
rec_segment(c, t, q, cc, (sect, x, tt, y, cnt)) record_segment(c, t, q, cc, (sect, x, tt, y, cnt))
return section(line), (fp, line) return section(line), (fp, line)
elif line == '\n' : elif line == '\n' :
continue continue
@ -81,32 +96,39 @@ def segment(c):
## helper fncts ## helper fncts
def section(line): def section(line):
line = string.upper(line) line = string.upper(line)
if string.find(line, 'NOTES') >= 0: return NOTES if line.find('NOTES') >= 0: return NOTES
elif string.find(line, 'QUOTES') >= 0: return QUOTES elif line.find('QUOTES') >= 0: return QUOTES
elif string.find(line, 'REFERENCE') >= 0: return parse elif line.find('REFERENCE') >= 0: return parse
else: return parse else: return parse
# todo - optimise this (i.e: id != only the last word) # todo - optimise this (i.e: id != only the last word)
def ext_identifier(line): def extract_identifier(line):
b = string.rsplit(line) t = line.strip().replace('<!--', '').replace('-->', '')
return b[-1] return t.strip().rsplit()[-1]
def ext_tags(line): def extract_tags(line):
line = line.rstrip('\n').replace(' ','') line = line.rstrip('\n').replace(' ','')
t = re.split('<|>', line) t = re.split('<|>', line)
return [v for v in t if v] return [v for v in t if v]
def rec_segment(idf, tags, text, cnt, mk): def record_segment(idf, tags, text, cnt, mk):
if not text: if not text:
#sys.stderr.write('hmm... no quote on pp.' + idf) #sys.stderr.write('hmm... no quote on pp.' + idf)
return None return None
if text[0] == '>': text = escape_quote(text)
text = text[1:] text = escape_note(text)
text = text.strip()
section_i, idf_i, tags_i, text_i, cnt_i = mk section_i, idf_i, tags_i, text_i, cnt_i = mk
entry = {idf_i : idf, text_i : text, tags_i : tags, cnt_i : cnt} entry = {idf_i : idf, text_i : text, tags_i : tags, cnt_i : cnt}
output[section_i].append(entry) output[section_i].append(entry)
def escape_quote(line):
    """Strip quote decoration from a '>'-prefixed line.

    If the stripped line does not start with '>', `line` is returned
    unchanged. Otherwise the leading '>' is dropped, every double-quote
    character is removed, and every "pp.<digits>" page reference is
    removed. The result is not stripped again, so whitespace that stood
    next to a removed page reference is kept.
    """
    stripped = line.strip()
    if not stripped.startswith('>'):
        return line
    # NOTE(review): this removes ALL double quotes, including any inside the
    # quoted text itself, not just a wrapping pair - confirm this is intended.
    unquoted = re.sub(r'"+', '', stripped[1:])
    # The dot is escaped so only a literal "pp." prefix counts as a page
    # reference; unescaped it also swallowed text such as "app 42".
    return re.sub(r'pp\.[0-9]+', '', unquoted)
def escape_note(line):
    """Remove a leading "<digits>." numbering prefix and strip whitespace.

    Only a number immediately followed by a literal dot at the very start
    of `line` is removed. The dot is escaped in the pattern; unescaped it
    matched any character, so text that merely began with a number (for
    example "2014 was ...") lost that number.
    """
    return re.sub(r'^[0-9]+\.', '', line).strip()
if __name__ == '__main__': if __name__ == '__main__':
m = StateMachine(); m = StateMachine();