#!/usr/bin/env python
"""Helpers and entry point for a state-machine driven notes/quotes parser.

When run as a script, a ``StateMachine`` is built from the state handlers
and fed ``sys.stdin``.  The helper functions below clean up individual
lines (tags, quotes, notes) and record the cleaned segments.
"""
import sys
import string
import re
import json


# ---------------------------------------------------------------------------
# NOTE(review): UNRECOVERABLE REGION -- restore from the original file.
#
# The text between the start of is_quote_identifier() and the tail of a later
# helper was lost before this file reached review (it looks as if everything
# between a '<' and a '>' was stripped -- TODO confirm).  The surviving
# residue is kept verbatim on the next comment line; nothing in this region
# has been reconstructed or guessed.
#
#   def is_quote_identifier(line): l = line.strip().upper() return l.startswith("', '') return t.strip().rsplit()[-1]
#
# The names parse, NOTES, QUOTES, segment, error, eof and output, which the
# code below relies on, are not defined anywhere in the surviving text and
# were presumably defined in this region.
# ---------------------------------------------------------------------------


def extract_tags(line):
    """Return the non-empty pieces of *line* split on '<' and '>'.

    Trailing newlines and every space character are removed before
    splitting, so ``'<a b><c>\\n'`` yields ``['ab', 'c']``.
    """
    line = line.rstrip('\n').replace(' ', '')
    return [piece for piece in re.split('<|>', line) if piece]


def record_segment(idf, tags, text, cnt, mk):
    """Clean *text* and append one entry to the module-level ``output``.

    *mk* is a 5-item sequence ``(section, idf_key, tags_key, text_key,
    cnt_key)``: ``section`` selects the list inside ``output`` that receives
    the entry, and the remaining four items are the dictionary keys under
    which *idf*, *tags*, the cleaned *text* and *cnt* are stored.

    Nothing is recorded when *text* is empty.  Always returns ``None``.
    """
    if not text:
        # sys.stderr.write('hmm... no quote on pp.' + idf)
        return None
    text = escape_note(escape_quote(text))
    section_i, idf_i, tags_i, text_i, cnt_i = mk
    entry = {idf_i: idf, text_i: text, tags_i: tags, cnt_i: cnt}
    # ``output`` is a module-level mapping that is not defined in the
    # surviving part of this file (see the NOTE above).
    output[section_i].append(entry)


def escape_quote(line):
    """Strip quote markup from a line that starts with '>'.

    Lines whose stripped form does not start with '>' are returned
    unchanged.  Otherwise the leading '>' is dropped, every double quote is
    removed, and every ``pp`` + one character + digits reference is removed.
    Surrounding whitespace left over after the removals is NOT stripped.
    """
    stripped = line.strip()
    if not stripped.startswith('>'):
        return line
    without_quotes = re.sub('\"*\"', '', stripped[1:])
    # NOTE(review): the '.' is unescaped, so it matches any single character,
    # not only a literal dot -- probably meant 'pp\.'; confirm before changing.
    return re.sub('pp.[0-9]+', '', without_quotes)


def escape_note(line):
    """Remove a leading run of digits plus one following character, then strip.

    For example ``'12. some note'`` becomes ``'some note'``.
    """
    # NOTE(review): the '.' is unescaped and matches any character, so a note
    # that merely starts with a number (e.g. '3 things') also loses it.
    return re.sub('^[0-9]+.', '', line).strip()


if __name__ == '__main__':
    # Imported here, at its only point of use, so the helper functions above
    # stay importable without the third-party ``statemachine`` module.
    from statemachine import StateMachine

    m = StateMachine()
    m.add_state(parse)
    m.add_state(NOTES)
    m.add_state(QUOTES)
    m.add_state(segment)
    m.add_state(error, end_state=1)
    m.add_state(eof, end_state=1)
    m.set_start(parse)
    # The cargo handed to the start state is (input stream, current line).
    m.run((sys.stdin, ''))