nnnotes/parse/parse.py

#!/usr/bin/env python

from statemachine import StateMachine
import sys, string, re, json

def is_quote_identifier(line):
	"""Tell whether `line` is the identifier that opens a quote segment.

	The line qualifies when, once stripped and upper-cased, it starts with
	the HTML comment opener ``<!--`` and contains ``PAGE`` anywhere.
	"""
	normalised = line.strip().upper()
	if not normalised.startswith("<!--"):
		return False
	return "PAGE" in normalised

def is_note_identifier(line):
	"""Tell whether `line` is the identifier that opens a note segment.

	The line qualifies when, once stripped and upper-cased, it starts with
	the HTML comment opener ``<!--`` and contains ``NOTE`` anywhere.
	"""
	normalised = line.strip().upper()
	if not normalised.startswith("<!--"):
		return False
	return "NOTE" in normalised

def is_tag_identifier(line):
	"""Tell whether `line` is a tag line.

	A tag line starts (after stripping whitespace) with ``<`` but not with
	``<!``, which keeps ``<!-- ... -->`` identifier comments out.
	"""
	stripped = line.strip()
	if stripped.startswith('<!'):
		# comment-style markers are identifiers, not tags
		return False
	return stripped.startswith('<')

# Per-section markup description, unpacked by segment() and record_segment() as
#   (identifier predicate, identifier key, tags key, text key, cursor key).
# The four strings become the keys of every entry appended to `output`.
markups = {
	'QUOTES' : (is_quote_identifier, 'pp', 'tags', 'quote', 'fpc'),
	'NOTES' : (is_note_identifier, '#', 'tags', 'note', 'fpc'),
}
# Recorded entries, grouped by section name.
output = {
	'QUOTES' : [],
	'NOTES' : [],
}

def error(c):
	"""Report on stderr the line that could not be identified.

	`c` is the ``(fp, line)`` cargo pair; only the line is used. It is
	written after an 'Unidentifiable line:' header.
	"""
	_, offending = c
	message = 'Unidentifiable line:\n' + offending
	sys.stderr.write(message)

def eof(c):
	"""Dump the collected `output` entries as JSON into the ``.indx`` file.

	`c` is the state cargo; it is accepted to fit the state signature but is
	not used. The file is created relative to the current working directory.
	"""
	# `with` closes (and therefore flushes) the index deterministically.
	# Text mode is required by json.dump on Python 3 (it writes str) and is
	# equivalent to the former 'wb' on Python 2: the dump contains no raw newlines.
	with open('.indx', 'w') as fpindx:
		json.dump(output, fpindx)

def parse(c):
	"""Skip lines until a ``##`` section header or the end of the input.

	`c` is the ``(fp, line)`` cargo; only `fp` is read from. Returns the next
	state with its cargo: ``eof`` once the stream is exhausted, otherwise
	the state ``section`` selects for the header line.
	"""
	stream, _ = c
	while True:
		current = stream.readline()
		if not current:
			# readline() gave nothing back: end of input
			return eof, (stream, current)
		if current.strip().startswith('##'):
			next_state = section(current)
			return next_state, (stream, current)

def QUOTES(c):
	"""Scan the quotes section for the next quote identifier.

	`c` is the ``(fp, line)`` cargo; only `fp` is read from. Returns the next
	state with its cargo: ``eof`` at end of input, ``segment`` when a quote
	identifier line is found, or the state ``section`` selects when a ``##``
	header is met. Any other line is skipped.
	"""
	stream, _ = c
	while True:
		current = stream.readline()
		if not current:
			return eof, (stream, current)
		if is_quote_identifier(current):
			return segment, (stream, current, 'QUOTES', markups['QUOTES'])
		if current.strip().startswith('##'):
			next_state = section(current)
			return next_state, (stream, current)

def NOTES(c):
	"""Scan the notes section for the next note identifier.

	`c` is the ``(fp, line)`` cargo; only `fp` is read from. Returns the next
	state with its cargo: ``eof`` at end of input, ``segment`` when a note
	identifier line is found, or the state ``section`` selects when a ``##``
	header is met. Any other line is skipped.
	"""
	stream, _ = c
	while True:
		current = stream.readline()
		if not current:
			return eof, (stream, current)
		if is_note_identifier(current):
			return segment, (stream, current, 'NOTES', markups['NOTES'])
		if current.strip().startswith('##'):
			next_state = section(current)
			return next_state, (stream, current)

def segment(c):
	"""Collect one quote/note segment and record it when the segment ends.

	`c` is ``(fp, l, sect, mk)``: the input stream, the identifier line that
	opened the segment, the section name ('QUOTES' or 'NOTES') and that
	section's markup tuple ``(identifier predicate, identifier key, tags key,
	text key, cursor key)``.

	Returns the next state with its cargo: ``eof`` at end of input,
	``segment`` again when another identifier of the same kind follows, or
	the state ``section`` selects when a ``##`` header is met. In all three
	cases the segment gathered so far is handed to ``record_segment`` first.
	"""
	fp, l, sect, mk = c
	m, x, tt, y, cnt = mk
	keys = (sect, x, tt, y, cnt)
	t = []   # tags gathered from tag lines
	q = ''   # accumulated text of the segment
	cc = ''  # stream offset of the first text line, '' until one is seen
	# identifier
	idf = extract_identifier(l)
	while 1:
		# remember where this line starts so the first text line can be located
		cursor = fp.tell()
		line = fp.readline()
		stripped = line.strip()

		if not line:
			# transition: EOF - record entry
			record_segment(idf, t, q, cc, keys)
			return eof, (fp, line)

		elif m(line):
			# transition: new segment - record entry
			record_segment(idf, t, q, cc, keys)
			return segment, (fp, line, sect, mk)
		elif is_tag_identifier(line):
			# tags
			t += extract_tags(line)
			continue
		elif stripped.startswith('##'):
			# transition: new section - record entry
			# (same header test as parse/QUOTES/NOTES, so indented headers count too)
			record_segment(idf, t, q, cc, keys)
			return section(line), (fp, line)
		elif not stripped:
			# blank line, including whitespace-only and '\r\n' lines
			continue
		else:
			# text
			if not cc:
				cc = cursor
			q += line
			continue

## helper functions
def section(line):
	"""Map a ``##`` header line to the state that handles that section.

	The match is case-insensitive and tested in this order: a header
	containing 'NOTES' selects the NOTES state, one containing 'QUOTES'
	selects the QUOTES state. Every other header (including 'REFERENCE'
	ones) falls back to the top-level ``parse`` state.
	"""
	# str.upper() instead of string.upper(): the module-level function is
	# deprecated in Python 2 and no longer exists in Python 3.
	header = line.upper()
	if 'NOTES' in header:
		return NOTES
	if 'QUOTES' in header:
		return QUOTES
	# 'REFERENCE' sections and unknown headers carry nothing to record
	return parse

# TODO: improve this (i.e. the identifier is not necessarily only the last word)
def extract_identifier(line):
	"""Return the last whitespace-separated word of an identifier comment.

	The ``<!--`` and ``-->`` markers are removed first. Raises IndexError
	when nothing but whitespace is left.
	"""
	content = line.strip()
	for marker in ('<!--', '-->'):
		content = content.replace(marker, '')
	words = content.split()
	return words[-1]

def extract_tags(line):
	"""Return the tag names found on a tag line, in order.

	Trailing newlines and all spaces are dropped, the remainder is split on
	``<`` and ``>``, and the empty pieces left by the split are discarded.
	"""
	compact = line.rstrip('\n').replace(' ','')
	tags = []
	for piece in re.split('<|>', compact):
		if piece:
			tags.append(piece)
	return tags

def record_segment(idf, tags, text, cnt, mk):
	"""Append one entry to `output`, unless the segment has no text.

	`mk` is ``(section name, identifier key, tags key, text key, cursor key)``;
	the four keys name the fields of the stored entry and the section name
	selects the list in `output`. The text goes through ``escape_quote`` and
	then ``escape_note`` before it is stored. Returns None.
	"""
	if text:
		cleaned = escape_note(escape_quote(text))
		section_i, idf_i, tags_i, text_i, cnt_i = mk
		entry = {}
		entry[idf_i] = idf
		entry[text_i] = cleaned
		entry[tags_i] = tags
		entry[cnt_i] = cnt
		output[section_i].append(entry)
	#else: sys.stderr.write('hmm... no quote on pp.' + idf)

def escape_quote(line):
	"""Clean up a ``>``-prefixed quote.

	Text that does not start with ``>`` (after stripping) is returned
	untouched. Otherwise the stripped text loses its leading ``>``, every
	run of double quotes, and every ``pp.<digits>`` page reference.
	"""
	stripped = line.strip()
	if not stripped.startswith('>'):
		return line
	unquoted = re.sub(r'"+', '', stripped[1:])
	# the dot is escaped so that only a literal "pp.12" is removed; unescaped
	# it matched any character and mangled text such as "app 12"
	return re.sub(r'pp\.[0-9]+', '', unquoted)

def escape_note(line):
	"""Drop a leading list number such as ``12.`` and surrounding whitespace.

	Only a run of digits at the very start of the text that is followed by a
	literal dot is removed.
	"""
	# the dot is escaped: unescaped it matched any character, so a note
	# starting with a plain number ("12 apples") lost that number
	return re.sub(r'^[0-9]+\.', '', line).strip()

if __name__ == '__main__':
	# Build the state machine: four working states, two end states,
	# then run it over stdin starting in `parse` with an empty current line.
	machine = StateMachine()
	for state in (parse, NOTES, QUOTES, segment):
		machine.add_state(state)
	for terminal in (error, eof):
		machine.add_state(terminal, end_state=1)
	machine.set_start(parse)
	machine.run((sys.stdin, ''))
haha! commit 2014-08-18 18:38:12 +02:00			`#!/usr/bin/env python`

			`from statemachine import StateMachine`
			`import sys, string, re, json`

new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`def is_quote_identifier(line):`
			`l = line.strip().upper()`
			`return l.startswith("<!--") and l.find("PAGE") >= 0`

			`def is_note_identifier(line):`
			`l = line.strip().upper()`
			`return l.startswith("<!--") and l.find("NOTE") >= 0`

			`def is_tag_identifier(line):`
			`l = line.strip()`
			`return l.startswith('<') and not l.startswith('<!')`

			`markups = {'QUOTES' : (is_quote_identifier, 'pp', 'tags', 'quote', 'fpc'), 'NOTES' : (is_note_identifier, '#', 'tags', 'note', 'fpc')}`
haha! commit 2014-08-18 18:38:12 +02:00			`output = {'QUOTES' : [], 'NOTES' : []}`

renaming functions inputs 2014-08-20 19:14:40 +02:00			`def error(c):`
			`fp, l = c`
			`sys.stderr.write('Unidentifiable line:\n'+ l)`
haha! commit 2014-08-18 18:38:12 +02:00
renaming functions inputs 2014-08-20 19:14:40 +02:00			`def eof(c):`
haha! commit 2014-08-18 18:38:12 +02:00			`fpindx = open('.indx','wb')`
			`json.dump(output, fpindx)`

			`def parse(c):`
			`fp, l = c`
			`while 1:`
			`line = fp.readline()`
			`if not line: return eof, (fp, line)`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`if line.strip().startswith('##'): return section(line), (fp, line)`
haha! commit 2014-08-18 18:38:12 +02:00			`else: continue`

			`def QUOTES(c):`
			`fp, l = c`
			`while 1:`
			`line = fp.readline()`
			`if not line: return eof, (fp, line)`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`elif is_quote_identifier(line): return segment, (fp, line, 'QUOTES', markups['QUOTES'])`
			`elif line.strip().startswith('##'): return section(line), (fp, line)`
haha! commit 2014-08-18 18:38:12 +02:00			`else: continue`

inject from skim 2014-08-25 12:54:27 +02:00			`def NOTES(c):`
haha! commit 2014-08-18 18:38:12 +02:00			`fp, l = c`
			`while 1:`
			`line = fp.readline()`
			`if not line: return eof, (fp, line)`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`elif is_note_identifier(line): return segment, (fp, line, 'NOTES', markups['NOTES'])`
			`elif line.strip().startswith('##'): return section(line), (fp, line)`
haha! commit 2014-08-18 18:38:12 +02:00			`else: continue`

			`def segment(c):`
			`fp, l, sect, mk = c`
			`m, x, tt, y, cnt = mk`
			`c = ''`
			`t = []`
			`q = ''`
			`cc = ''`
			`# identifier`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`c = extract_identifier(l)`
haha! commit 2014-08-18 18:38:12 +02:00			`while 1:`
			`cursor = fp.tell()`
			`line = fp.readline()`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00
haha! commit 2014-08-18 18:38:12 +02:00			`if not line:`
			`# transition: EOF - record entry`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`record_segment(c, t, q, cc, (sect, x, tt, y, cnt))`
haha! commit 2014-08-18 18:38:12 +02:00			`return eof, (fp, line)`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00
			`elif m(line):`
haha! commit 2014-08-18 18:38:12 +02:00			`# transition: new segment - record entry`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`record_segment(c, t, q, cc, (sect, x, tt, y, cnt))`
haha! commit 2014-08-18 18:38:12 +02:00			`return segment, (fp, line, sect, mk)`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`elif is_tag_identifier(line):`
haha! commit 2014-08-18 18:38:12 +02:00			`# tags`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`t += extract_tags(line)`
haha! commit 2014-08-18 18:38:12 +02:00			`continue`
			`elif line[:2] == '##':`
			`# transition: new section - record entry`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`record_segment(c, t, q, cc, (sect, x, tt, y, cnt))`
haha! commit 2014-08-18 18:38:12 +02:00			`return section(line), (fp, line)`
			`elif line == '\n' :`
			`continue`
			`else:`
			`# text`
			`if not cc:`
			`cc = cursor`
			`q += line`
			`continue`

			`## helper fncts`
			`def section(line):`
			`line = string.upper(line)`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`if line.find('NOTES') >= 0: return NOTES`
			`elif line.find('QUOTES') >= 0: return QUOTES`
			`elif line.find('REFERENCE') >= 0: return parse`
inject from skim 2014-08-25 12:54:27 +02:00			`else: return parse`
haha! commit 2014-08-18 18:38:12 +02:00
inject from skim 2014-08-25 12:54:27 +02:00			`# todo - optimise this (i.e: id != only the last word)`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`def extract_identifier(line):`
			`t = line.strip().replace('<!--', '').replace('-->', '')`
			`return t.strip().rsplit()[-1]`
haha! commit 2014-08-18 18:38:12 +02:00
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`def extract_tags(line):`
haha! commit 2014-08-18 18:38:12 +02:00			`line = line.rstrip('\n').replace(' ','')`
			`t = re.split('<\|>', line)`
			`return [v for v in t if v]`

new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`def record_segment(idf, tags, text, cnt, mk):`
haha! commit 2014-08-18 18:38:12 +02:00			`if not text:`
inject from skim 2014-08-25 12:54:27 +02:00			`#sys.stderr.write('hmm... no quote on pp.' + idf)`
haha! commit 2014-08-18 18:38:12 +02:00			`return None`
new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`text = escape_quote(text)`
			`text = escape_note(text)`
haha! commit 2014-08-18 18:38:12 +02:00			`section_i, idf_i, tags_i, text_i, cnt_i = mk`
			`entry = {idf_i : idf, text_i : text, tags_i : tags, cnt_i : cnt}`
			`output[section_i].append(entry)`

new nnnotes and qqquotes identifiers 2014-08-25 17:57:34 +02:00			`def escape_quote(line):`
			`if(not line.strip().startswith('>')):`
			`return line`
			`l = re.sub('\"*\"', '', line.strip()[1:])`
			`return re.sub('pp.[0-9]+', '', l)`

			`def escape_note(line):`
			`return re.sub('^[0-9]+.', '', line).strip()`
haha! commit 2014-08-18 18:38:12 +02:00
			`if __name__ == '__main__':`
			`m = StateMachine();`
			`m.add_state(parse)`
			`m.add_state(NOTES)`
			`m.add_state(QUOTES)`
			`m.add_state(segment)`
			`m.add_state(error, end_state=1)`
			`m.add_state(eof, end_state=1)`
			`m.set_start(parse)`
			`m.run((sys.stdin, ''))`