2014-08-18 18:38:12 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
|
|
from statemachine import StateMachine
|
|
|
|
|
import sys, string, re, json
|
|
|
|
|
|
|
|
|
|
# Per-section markup description. Each tuple holds, in order: the marker that
# opens a segment, then the keys under which an entry stores its identifier,
# its tags, its text and the file position of its text.
markups = {
    'QUOTES': ('PAGE', 'pp', 'tags', 'quote', 'fpc'),
    'NOTES': ('NOTE', '#', 'tags', 'note', 'fpc'),
}

# Recorded entries, one list per section; serialised as JSON by eof().
output = {name: [] for name in ('QUOTES', 'NOTES')}
|
|
|
|
|
|
|
|
|
|
|
2014-08-20 19:14:40 +02:00
|
|
|
def error(c):
    """End state: report the line carried in the cargo on stderr.

    c -- cargo tuple (fp, line); only the line is used.
    """
    _, unknown = c
    message = 'Unidentifiable line:\n' + unknown
    sys.stderr.write(message)
|
2014-08-18 18:38:12 +02:00
|
|
|
|
2014-08-20 19:14:40 +02:00
|
|
|
def eof(c):
    """End state: write every recorded entry to the '.indx' file as JSON.

    c -- cargo tuple handed over by the state machine; not used here.
    """
    # Text mode, because json.dump() emits str and a binary-mode file
    # rejects that on Python 3. The with-block guarantees the index is
    # flushed and closed even if serialisation fails.
    with open('.indx', 'w') as fpindx:
        json.dump(output, fpindx)
|
|
|
|
|
|
|
|
|
|
def parse(c):
    """Start state: skip lines until a '##' heading or the end of input.

    c -- cargo tuple (fp, line); only fp is read from.

    Returns (next_state, (fp, line)): eof once readline() yields nothing,
    otherwise the state section() selects for the heading line.
    """
    fp, _ = c
    while True:
        line = fp.readline()
        if not line:
            return eof, (fp, line)
        if line.startswith('##'):
            return section(line), (fp, line)
|
|
|
|
|
|
|
|
|
|
def QUOTES(c):
    """State for the QUOTES section: look for the first 'PAGE' marker.

    c -- cargo tuple (fp, line); only fp is read from.

    Returns (next_state, cargo):
      * eof, (fp, line) at end of input;
      * segment, (fp, line, 'QUOTES', markups['QUOTES']) for a line whose
        stripped, upper-cased text starts with 'PAGE';
      * section(line), (fp, line) for a line whose stripped text starts
        with '##'.
    Every other line is skipped.
    """
    fp, _ = c
    while True:
        line = fp.readline()
        if not line:
            return eof, (fp, line)
        stripped = line.strip()
        if stripped.upper().startswith('PAGE'):
            return segment, (fp, line, 'QUOTES', markups['QUOTES'])
        # Plain str literal on purpose: testing a byte string against a
        # unicode literal makes Python 2 decode the line as ASCII, which
        # raises UnicodeDecodeError on any non-ASCII character.
        if stripped.startswith('##'):
            return section(line), (fp, line)
|
|
|
|
|
|
2014-08-25 12:54:27 +02:00
|
|
|
def NOTES(c):
    """State for the NOTES section: look for the first 'NOTE' marker.

    c -- cargo tuple (fp, line); only fp is read from.

    Returns (next_state, cargo):
      * eof, (fp, line) at end of input;
      * segment, (fp, line, 'NOTES', markups['NOTES']) for a line whose
        stripped, upper-cased text starts with 'NOTE';
      * section(line), (fp, line) for a line starting with '##'.
    Every other line is skipped.
    """
    fp, _ = c
    while True:
        line = fp.readline()
        if not line:
            return eof, (fp, line)
        if line.strip().upper().startswith('NOTE'):
            return segment, (fp, line, 'NOTES', markups['NOTES'])
        if line.startswith('##'):
            return section(line), (fp, line)
|
|
|
|
|
|
|
|
|
|
def segment(c):
    """State for one segment (a quote or a note) inside a section.

    c -- cargo tuple (fp, header, sect, mk): the input file, the header
         line that opened the segment, the section name and that section's
         markups tuple (marker, identifier key, tags key, text key,
         position key).

    Lines are read until the segment ends; the segment is then passed to
    rec_segment() and the next state is returned together with its cargo:
      * end of input                          -> eof, (fp, line)
      * stripped, upper-cased line starting
        with the marker                       -> segment, (fp, line, sect, mk)
      * line starting with '##'               -> section(line), (fp, line)
    Lines starting with '<' contribute tags, bare newlines are skipped and
    every other line is appended to the segment text.
    """
    fp, header, sect, mk = c
    marker, idf_key, tags_key, text_key, pos_key = mk
    record_keys = (sect, idf_key, tags_key, text_key, pos_key)

    tags = []
    pieces = []
    start = ''  # stays '' until the first text line has been seen
    identifier = ext_identifier(header)

    def record():
        # Hand the segment collected so far over to rec_segment().
        rec_segment(identifier, tags, ''.join(pieces), start, record_keys)

    while True:
        # Offset taken before the read, i.e. where the coming line begins.
        offset = fp.tell()
        line = fp.readline()

        if not line:
            # transition: EOF
            record()
            return eof, (fp, line)

        if line.strip().upper().startswith(marker):
            # transition: next segment of the same section
            record()
            return segment, (fp, line, sect, mk)

        if line.startswith('<'):
            tags.extend(ext_tags(line))
            continue

        if line.startswith('##'):
            # transition: new section
            record()
            return section(line), (fp, line)

        if line == '\n':
            continue

        # Anything else is segment text; remember where it started.
        if not start:
            start = offset
        pieces.append(line)
|
|
|
|
|
|
|
|
|
|
## helper functions
|
|
|
|
|
def section(line):
    """Pick the state function for the section named by a heading line.

    line -- heading text; the match is case-insensitive.

    Returns NOTES when the heading contains 'NOTES', QUOTES when it
    contains 'QUOTES', and parse for everything else.
    """
    # str methods rather than string.upper()/string.find(): those
    # module-level functions no longer exist in Python 3.
    heading = line.upper()
    if 'NOTES' in heading:
        return NOTES
    if 'QUOTES' in heading:
        return QUOTES
    # 'REFERENCE' headings and unrecognised ones are both handled by parse.
    return parse
|
2014-08-18 18:38:12 +02:00
|
|
|
|
2014-08-25 12:54:27 +02:00
|
|
|
# TODO: refine this -- the identifier is not necessarily only the last word.
def ext_identifier(line):
    """Return the identifier of a segment header: its last word.

    line -- header line; words are separated by any whitespace, so
            'PAGE 12\\n' yields '12'.

    Returns '' when the line contains no word at all.
    """
    # str.split() rather than string.rsplit(): the module-level function
    # no longer exists in Python 3.
    words = line.split()
    return words[-1] if words else ''
|
|
|
|
|
|
|
|
|
|
def ext_tags(line):
    """Return the tag names found on a line of '<tag>' markers.

    line -- e.g. '<foo> <bar>\\n', which yields ['foo', 'bar'].

    All whitespace is removed before splitting on '<' and '>', so a tag
    written with inner spaces comes back without them, and a carriage
    return or tab never ends up as (or inside) a tag.
    """
    compact = ''.join(line.split())
    return [tag for tag in re.split('<|>', compact) if tag]
|
|
|
|
|
|
|
|
|
|
def rec_segment(idf, tags, text, cnt, mk):
    """Append one parsed segment to the module-level output.

    idf  -- segment identifier
    tags -- list of tags collected for the segment
    text -- raw segment text; nothing is recorded when it is empty
    cnt  -- value stored under the position key
    mk   -- (section name, identifier key, tags key, text key, position key)

    A single leading '>' is dropped from the text, which is then stripped
    of surrounding whitespace. Always returns None.
    """
    if text:
        body = text[1:] if text[0] == '>' else text
        body = body.strip()
        section_key, idf_key, tags_key, text_key, pos_key = mk
        output[section_key].append(
            {idf_key: idf, text_key: body, tags_key: tags, pos_key: cnt}
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Register the state functions, mark the two end states and run the
    # machine over standard input with an empty initial line.
    machine = StateMachine()
    for state in (parse, NOTES, QUOTES, segment):
        machine.add_state(state)
    for end in (error, eof):
        machine.add_state(end, end_state=1)
    machine.set_start(parse)
    machine.run((sys.stdin, ''))
|