114 lines
2.1 KiB
Python
114 lines
2.1 KiB
Python
import string, regex
|
||
from gensim.utils import tokenize
|
||
import nltk
|
||
|
||
def clean(text: str) -> str:
    """Capitalise the first line of *text*, dropping a leading punctuation token.

    The first line is everything before the first newline.  Its first
    space-delimited token is removed when it is empty (the line starts with
    a space) or is made up solely of ``string.punctuation`` characters, for
    example ``,``, ``...`` or ``?!``.  What is left of the first line is
    passed through ``str.capitalize``, which upper-cases the first character
    and lower-cases the rest.  All later lines are returned unchanged.

    Args:
        text: Possibly multi-line text.

    Returns:
        *text* with its first line cleaned and capitalised.
    """
    first_line, newline, remainder = text.partition('\n')
    lead, _, tail = first_line.partition(' ')

    # Test character by character: `lead in string.punctuation` would be a
    # substring test and miss runs such as "..." or "--".  all() is also
    # true for an empty token, so a single leading space is still dropped.
    if all(ch in string.punctuation for ch in lead.strip()):
        first_line = tail

    return first_line.capitalize() + newline + remainder
|
||
|
||
def format(text: str) -> str:
    """Normalise line endings and curly single quotes in *text*.

    The substitutions run in order, each as a single ``str.replace`` pass:
    CRLF becomes LF, every non-overlapping pair of consecutive LFs becomes
    one LF (so a run of three LFs leaves two), and the left and right curly
    single quotes become a straight apostrophe.
    """
    substitutions = (
        ('\r\n', '\n'),
        ('\n\n', '\n'),
        ('‘', "'"),
        ("’", "'"),
    )

    result = text
    for old, new in substitutions:
        result = result.replace(old, new)
    return result
|
||
|
||
def fragments(utterance: str):
    """Break *utterance* into punctuation-terminated fragments.

    Each line of the input is split around characters matching the
    ``\\p{Punctuation}`` pattern.  Pieces are accumulated until a
    single-character piece found in ``string.punctuation`` (other than an
    apostrophe) is reached; that character closes the current fragment.
    Whatever is left at the end of a line, possibly nothing, is emitted
    with a trailing newline appended.  Finally, up to two trailing
    fragments that are exactly a newline are removed.

    Returns:
        A list of fragment strings in input order.
    """
    splitter = regex.compile(r'(\p{Punctuation})')
    # Punctuation that stays inside a fragment instead of ending it.
    keep_joined = ["'"]
    pieces = []

    for line in utterance.splitlines():
        buffer = ""
        for chunk in splitter.split(line):
            if not chunk:
                continue
            buffer += chunk
            closes_fragment = (
                len(chunk) == 1
                and chunk in string.punctuation
                and chunk not in keep_joined
            )
            if closes_fragment:
                pieces.append(buffer)
                buffer = ""
        # The remainder of the line always becomes a newline-terminated fragment.
        pieces.append(buffer + '\n')

    # Drop at most two trailing fragments that consist of a newline only.
    for _ in range(2):
        if pieces and pieces[-1] == '\n':
            pieces.pop()

    return pieces
|
||
|
||
|
||
def tokenise(utterance: str):
    """Return the tokens of *utterance* as a list.

    Delegates to gensim's ``tokenize`` with ``lower=True`` and materialises
    the resulting iterable.
    """
    token_stream = tokenize(utterance, lower=True)
    return list(token_stream)
|
||
|
||
def fix_sentence(s: str) -> str:
    """Trim a dangling final token from *s*, or reject the sentence.

    The sentence is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``.  (The tags are presumably Penn Treebank tags; confirm
    against the tagger model in use.)

    Returns:
        * ``None`` when the stripped input is at most one character long,
          when tagging yields nothing, or when the only token is tagged
          ``DT``/``WDT``/``IN`` or is a one-character ``NN``.
        * The tokens without the last one, joined by single spaces, when the
          last token is tagged ``DT``/``WDT``/``IN``/``CC`` or is a
          one-character ``NN``.
        * *s* unchanged otherwise.
    """
    if len(s.strip()) <= 1:
        return None

    tagged = nltk.pos_tag(nltk.word_tokenize(s))
    if not tagged:
        return None

    last_word, last_tag = tagged[-1]
    lone_letter_noun = last_tag in ['NN'] and len(last_word) == 1

    # A sentence consisting of just one such token is discarded entirely.
    if len(tagged) == 1 and (last_tag in ['DT', 'WDT', 'IN'] or lone_letter_noun):
        return None

    # Otherwise cut the dangling token and rebuild from the remaining tokens.
    if last_tag in ['DT', 'WDT', 'IN', 'CC'] or lone_letter_noun:
        return " ".join(word for word, _ in tagged[:-1])

    return s
|
||
|
||
def fix_punctuation(s: str) -> str:
    """Make *s* end with sentence punctuation.

    Rules, checked in order:

    * Blank or whitespace-only input gives ``""``.
    * A single ``string.punctuation`` character is returned as is, except
      a lone comma, which gives ``""``.
    * Text whose last non-whitespace character is not punctuation is
      right-stripped and given a trailing ``.``.
    * Text ending (ignoring trailing whitespace) in ``,`` or ``:`` is
      right-stripped and has that character replaced by ``.``.
    * Text ending in any other punctuation is returned untouched, trailing
      whitespace included.
    """
    if not s.strip():
        return ""

    if len(s) == 1 and s in string.punctuation:
        return "" if s == ',' else s

    trimmed = s.rstrip()
    final_char = trimmed[-1]

    if final_char not in string.punctuation:
        return trimmed + '.'
    if final_char in (',', ':'):
        return trimmed[:-1] + '.'
    return s
|
||
|
||
|
||
|
||
|