# NATURESPEAK-ML-UTTER/utterance/utils.py

import string, regex
from gensim.utils import tokenize
import nltk

def clean(text: str) -> str:
	"""Tidy the first line of ``text`` and capitalise it.

	If the first space-delimited token of the first line is empty (the line
	starts with a space) or consists solely of ASCII punctuation characters,
	that token is dropped together with the single space that follows it.
	The first line is then passed through ``str.capitalize``, which
	upper-cases its first character and lower-cases the rest. All following
	lines are returned untouched.

	Args:
		text: Possibly multi-line text whose lines are newline-separated.

	Returns:
		``text`` with its first line cleaned as described above.
	"""
	first_line, newline, remainder = text.partition('\n')
	lead, _, rest = first_line.partition(' ')
	lead = lead.strip()

	# Check every character on its own: ``lead in string.punctuation`` is a
	# substring test, so it accepted '' and ',-' but missed '...' or '?!'.
	# ``all`` over an empty token is True, which keeps the existing
	# behaviour of dropping the empty token produced by a leading space.
	if all(ch in string.punctuation for ch in lead):
		first_line = rest

	return first_line.capitalize() + newline + remainder

def format(text: str) -> str:
	"""Normalise line endings and curly single quotes in ``text``.

	The substitutions are applied in order, each as one ``str.replace`` pass:
	CRLF becomes LF, a pair of consecutive newlines becomes one newline, and
	the left/right curly single quotes become a plain apostrophe. Because the
	newline collapse is a single pass, a run of three newlines is reduced to
	two, not one.

	Args:
		text: The raw text to normalise.

	Returns:
		The normalised text.
	"""
	substitutions = (
		('\r\n', '\n'),
		('\n\n', '\n'),
		('‘', "'"),
		("’", "'"),
	)
	result = text
	for old, new in substitutions:
		result = result.replace(old, new)
	return result

def fragments(utterance: str):
	"""Break ``utterance`` into punctuation-terminated fragments.

	Each line of ``utterance`` is split around every character matched by
	``\\p{Punctuation}`` (the delimiters are kept). The pieces are glued back
	together until a piece that is a single ASCII punctuation character
	(a member of ``string.punctuation``) other than the apostrophe is seen;
	that character closes the current fragment. Whatever is left at the end
	of a line, possibly nothing, is emitted with a trailing newline, so a
	line that ends in punctuation contributes a bare newline fragment.

	Up to two trailing bare-newline fragments are removed from the result.

	Args:
		utterance: Text to split; may contain several lines.

	Returns:
		A list of fragment strings in their original order.
	"""
	splitter = regex.compile(r'(\p{Punctuation})')
	# Punctuation that must stay attached to the surrounding text.
	non_breaking = ["'"]

	pieces = []
	for line in utterance.splitlines():
		pending = ""
		for chunk in splitter.split(line):
			if not chunk:
				continue
			pending += chunk
			# Only a lone ASCII punctuation mark ends a fragment; longer
			# chunks and single non-punctuation characters just accumulate.
			ends_fragment = (
				len(chunk) == 1
				and chunk in string.punctuation
				and chunk not in non_breaking
			)
			if ends_fragment:
				pieces.append(pending)
				pending = ""
		pieces.append(pending + '\n')

	# Drop at most two trailing fragments that are nothing but a newline.
	for _ in range(2):
		if pieces and pieces[-1] == '\n':
			pieces.pop()

	return pieces


def tokenise(utterance: str):
	"""Return the tokens of ``utterance`` as a list.

	Delegates to gensim's ``tokenize`` with ``lower=True`` (gensim's
	lowercasing option) and materialises the resulting iterable.

	Args:
		utterance: The text to tokenise.

	Returns:
		A list of the tokens produced by gensim.
	"""
	token_stream = tokenize(utterance, lower=True)
	return [*token_stream]

def fix_sentence(s: str) -> str:
	"""Trim a dangling function word from the end of ``s``.

	The sentence is tokenised with ``nltk.word_tokenize`` and tagged with
	``nltk.pos_tag``; the tag names tested below are the ones that tagger
	emits (DT, WDT, IN, CC, NN).

	Args:
		s: The sentence to inspect.

	Returns:
		* ``None`` when ``s`` stripped of whitespace has at most one
		  character, when tagging yields no tokens, or when the only token
		  is tagged DT, WDT or IN, or is a one-character NN.
		* The tokens without the last one, joined by single spaces, when the
		  last token is tagged DT, WDT, IN or CC, or is a one-character NN.
		* ``s`` unchanged otherwise.

	NOTE(review): ``None`` is returned despite the ``-> str`` annotation.
	A sentence made of a single CC token is not rejected by the
	single-token checks, so it falls through to the trimming step and
	yields an empty string instead of ``None``.
	"""
	if len(s.strip()) <= 1:
		return None

	tagged = nltk.pos_tag(nltk.word_tokenize(s))
	if not tagged:
		return None

	last_word, last_tag = tagged[-1]
	one_char_noun = last_tag == 'NN' and len(last_word) == 1

	# A sentence consisting of just one such word carries no content.
	if len(tagged) == 1 and (last_tag in ('DT', 'WDT', 'IN') or one_char_noun):
		return None

	# Drop a trailing word that leaves the sentence hanging.
	if last_tag in ('DT', 'WDT', 'IN', 'CC') or one_char_noun:
		return " ".join(word for word, _ in tagged[:-1])

	return s

def fix_punctuation(s: str) -> str:
	"""Make sure ``s`` ends with sentence-final punctuation.

	Args:
		s: The fragment or sentence to fix.

	Returns:
		* ``""`` when ``s`` is empty or whitespace only, or is exactly ``","``.
		* ``s`` itself when it is any other single ASCII punctuation
		  character.
		* ``s`` with trailing whitespace removed and a final ``,`` or ``:``
		  replaced by ``.``.
		* ``s`` unchanged (trailing whitespace included) when its last
		  non-whitespace character is any other ASCII punctuation mark.
		* ``s`` with trailing whitespace removed and ``.`` appended
		  otherwise.

	NOTE(review): the single-character check looks at the unstripped
	string, so ``","`` gives ``""`` while ``", "`` gives ``"."``.
	"""
	if not s.strip():
		return ""

	if len(s) == 1 and s in string.punctuation:
		return "" if s == ',' else s

	trimmed = s.rstrip()
	last = trimmed[-1]

	if last not in string.punctuation:
		return trimmed + '.'
	if last in (',', ':'):
		return trimmed[:-1] + '.'
	return s