nnnotes/parse/parse.py

141 lines
3.5 KiB
Python
Raw Normal View History

2014-08-18 18:38:12 +02:00
#!/usr/bin/env python
from statemachine import StateMachine
import sys, string, re, json
2014-08-25 17:57:34 +02:00
def is_quote_identifier(line):
    """Return True when *line* is an HTML comment that mentions PAGE.

    The check ignores surrounding whitespace and letter case.
    """
    marker = line.strip().upper()
    if not marker.startswith("<!--"):
        return False
    return "PAGE" in marker
def is_note_identifier(line):
    """Return True when *line* is an HTML comment that mentions NOTE.

    The check ignores surrounding whitespace and letter case.
    """
    marker = line.strip().upper()
    if not marker.startswith("<!--"):
        return False
    return "NOTE" in marker
def is_tag_identifier(line):
    """Return True when *line* opens with '<' but is not a '<!' construct.

    Leading and trailing whitespace is ignored.
    """
    candidate = line.strip()
    if candidate.startswith('<!'):
        return False
    return candidate.startswith('<')
# Per-section markup description. Each tuple holds, in order: the predicate
# that recognises a segment identifier line, then the dictionary keys used
# for the identifier, the tags, the text and the file cursor of an entry.
markups = {
    'QUOTES': (is_quote_identifier, 'pp', 'tags', 'quote', 'fpc'),
    'NOTES': (is_note_identifier, '#', 'tags', 'note', 'fpc'),
}

# Entries collected while parsing, grouped by section name.
output = {'QUOTES': [], 'NOTES': []}
2014-08-20 19:14:40 +02:00
def error(c):
    """End state: report the offending line on stderr.

    *c* is a ``(file object, line)`` pair; only the line is used.
    """
    _, offending = c
    message = 'Unidentifiable line:\n' + offending
    sys.stderr.write(message)
2014-08-18 18:38:12 +02:00
2014-08-20 19:14:40 +02:00
def eof(c):
    """End state: dump the collected entries as JSON into ``.indx``.

    *c* is the state cargo and is not used. The file is written relative to
    the current working directory.
    """
    # json.dump emits text, so the file is opened in text mode; the context
    # manager guarantees the handle is flushed and closed.
    with open('.indx', 'w') as fpindx:
        json.dump(output, fpindx)
def parse(c):
    """Start state: skip lines until a '##' heading or the end of input.

    *c* is a ``(file object, line)`` pair; only the file object is used.
    Returns ``(next_state, cargo)``: ``eof`` once the input is exhausted,
    otherwise whatever state ``section()`` picks for the heading line.
    """
    fp, _ = c
    while True:
        line = fp.readline()
        if not line:
            return eof, (fp, line)
        if line.strip().startswith('##'):
            return section(line), (fp, line)
def QUOTES(c):
    """Quotes-section state: look for the next quote identifier line.

    *c* is a ``(file object, line)`` pair; only the file object is used.
    Returns ``(next_state, cargo)``:
      * ``eof`` when the input is exhausted,
      * ``segment`` when a quote identifier line is found,
      * the state chosen by ``section()`` when a '##' heading is found.
    Every other line is skipped.
    """
    fp, _ = c
    while True:
        line = fp.readline()
        if not line:
            return eof, (fp, line)
        if is_quote_identifier(line):
            return segment, (fp, line, 'QUOTES', markups['QUOTES'])
        if line.strip().startswith('##'):
            return section(line), (fp, line)
2014-08-25 12:54:27 +02:00
def NOTES(c):
    """Notes-section state: look for the next note identifier line.

    *c* is a ``(file object, line)`` pair; only the file object is used.
    Returns ``(next_state, cargo)``:
      * ``eof`` when the input is exhausted,
      * ``segment`` when a note identifier line is found,
      * the state chosen by ``section()`` when a '##' heading is found.
    Every other line is skipped.
    """
    fp, _ = c
    while True:
        line = fp.readline()
        if not line:
            return eof, (fp, line)
        if is_note_identifier(line):
            return segment, (fp, line, 'NOTES', markups['NOTES'])
        if line.strip().startswith('##'):
            return section(line), (fp, line)
def segment(c):
    """Segment state: collect the tags and text that follow an identifier.

    *c* is ``(fp, identifier_line, section_name, markup)`` where *markup* is
    one of the tuples stored in ``markups``. The segment ends, and is handed
    to ``record_segment()``, at the end of input, at the next identifier line
    of the same kind, or at a '##' heading.

    Returns ``(next_state, cargo)`` for the state machine.

    NOTE: ``fp.tell()`` is called before every read, so *fp* must support it.
    """
    fp, l, sect, mk = c
    is_identifier, idf_key, tags_key, text_key, cursor_key = mk
    record_keys = (sect, idf_key, tags_key, text_key, cursor_key)

    identifier = extract_identifier(l)
    tags = []
    text_lines = []
    # fp.tell() value taken just before the first text line of the segment.
    text_cursor = None

    def flush():
        # Hand the accumulated segment over; record_segment ignores empty text.
        record_segment(identifier, tags, ''.join(text_lines), text_cursor,
                       record_keys)

    while True:
        cursor = fp.tell()
        line = fp.readline()

        if not line:
            # transition: EOF - record entry
            flush()
            return eof, (fp, line)

        if is_identifier(line):
            # transition: new segment - record entry
            flush()
            return segment, (fp, line, sect, mk)

        if is_tag_identifier(line):
            tags += extract_tags(line)
            continue

        stripped = line.strip()
        if stripped.startswith('##'):
            # transition: new section - record entry.  Headings are detected
            # the same way as in the parse/QUOTES/NOTES states, so an
            # indented heading is not swallowed as text.
            flush()
            return section(line), (fp, line)

        if not stripped:
            # Blank line, including whitespace-only and '\r\n' lines.
            continue

        # text
        if text_cursor is None:
            text_cursor = cursor
        text_lines.append(line)
## helper functions
def section(line):
    """Pick the state function that handles the section named in *line*.

    The match is case-insensitive. 'NOTES' is tested before 'QUOTES'; a
    heading containing 'REFERENCE', or any unrecognised heading, falls back
    to the ``parse`` state.
    """
    # str.upper() works on Python 2 and 3, unlike string.upper().
    heading = line.upper()
    if 'NOTES' in heading:
        return NOTES
    if 'QUOTES' in heading:
        return QUOTES
    if 'REFERENCE' in heading:
        return parse
    return parse
2014-08-18 18:38:12 +02:00
2014-08-25 12:54:27 +02:00
# TODO: improve this - the identifier is not necessarily just the last word
2014-08-25 17:57:34 +02:00
def extract_identifier(line):
    """Return the last whitespace-separated word of an identifier comment.

    The '<!--' and '-->' delimiters are removed before the words are split.
    """
    inner = line.strip()
    for delimiter in ('<!--', '-->'):
        inner = inner.replace(delimiter, '')
    words = inner.split()
    return words[-1]
2014-08-18 18:38:12 +02:00
2014-08-25 17:57:34 +02:00
def extract_tags(line):
    """Return the tag names found in a line such as '<foo> <bar>'.

    Trailing newlines and all spaces are removed first; the remainder is
    split on '<' and '>' and empty pieces are discarded.
    """
    compact = line.rstrip('\n').replace(' ', '')
    pieces = re.split('<|>', compact)
    return list(filter(None, pieces))
2014-08-25 17:57:34 +02:00
def record_segment(idf, tags, text, cnt, mk):
    """Append one parsed segment to the module-level ``output``.

    *mk* is ``(section name, identifier key, tags key, text key, cursor
    key)``. Nothing is recorded when *text* is empty. The text is passed
    through ``escape_quote()`` and then ``escape_note()`` before storage.
    """
    if text:
        cleaned = escape_note(escape_quote(text))
        section_i, idf_i, tags_i, text_i, cnt_i = mk
        output[section_i].append({
            idf_i: idf,
            text_i: cleaned,
            tags_i: tags,
            cnt_i: cnt,
        })
    return None
2014-08-25 17:57:34 +02:00
def escape_quote(line):
    """Clean up block-quoted text (text that starts with '>').

    Text that does not start with '>' (after stripping) is returned
    unchanged. Otherwise the text is stripped, the leading '>' is dropped,
    every double-quote character is removed, and page references of the
    form 'pp.<digits>' are removed.
    """
    stripped = line.strip()
    if not stripped.startswith('>'):
        return line
    unquoted = re.sub(r'"+', '', stripped[1:])
    # The dot is escaped so only a literal 'pp.' prefix counts as a page
    # reference; an unescaped '.' would also eat text such as 'app 123'.
    return re.sub(r'pp\.[0-9]+', '', unquoted)
def escape_note(line):
    """Strip a leading list number such as '12.' and surrounding whitespace.

    Only a run of digits followed by a literal dot at the very start of the
    text is removed; text that merely begins with a number is left intact.
    """
    # The dot is escaped: an unescaped '.' would match any character and
    # turn '2014 was ...' into 'was ...'.
    return re.sub(r'^[0-9]+\.', '', line).strip()
2014-08-18 18:38:12 +02:00
if __name__ == '__main__':
    # Register every state with the state machine, mark error/eof as end
    # states, and run it on standard input starting from the parse state.
    machine = StateMachine()
    for state in (parse, NOTES, QUOTES, segment):
        machine.add_state(state)
    for terminal in (error, eof):
        machine.add_state(terminal, end_state=1)
    machine.set_start(parse)
    machine.run((sys.stdin, ''))