114 lines
2.1 KiB
Python
Raw Permalink Normal View History

2022-03-13 17:09:05 +01:00
import string, regex
2022-04-10 15:20:13 +02:00
from gensim.utils import tokenize
2022-04-11 13:09:01 +02:00
import nltk
2022-03-06 14:20:53 +01:00
def clean(text: str) -> str:
    """Tidy the first line of *text* and capitalise it.

    Only the first line (everything before the first newline) is modified.
    If its first space-separated token is empty (the line starts with a
    space) or consists solely of ASCII punctuation characters, that token
    and the space following it are dropped. The first line is then passed
    through ``str.capitalize``, which upper-cases its first character and
    lower-cases the rest. All following lines are returned unchanged.

    Args:
        text: The text to clean.

    Returns:
        The text with its first line cleaned and capitalised.
    """
    first_line, newline, remainder = text.partition('\n')
    head, _, tail = first_line.partition(' ')
    # Check each character individually: ``head in string.punctuation`` is a
    # substring test, so it accepted accidental runs such as "()" while
    # missing ordinary leading noise such as "..." or "!!".
    if all(ch in string.punctuation for ch in head.strip()):
        first_line = tail
    return first_line.capitalize() + newline + remainder
def format(text: str) -> str:
    """Normalise line endings, blank lines and typographic apostrophes.

    Steps, in order:

    1. CRLF line endings are converted to a single newline.
    2. Each pair of consecutive newlines is replaced by one newline. This is
       a single left-to-right pass, so a run of three newlines becomes two.
    3. Typographic single quotes are replaced by the ASCII apostrophe.

    Args:
        text: The text to normalise.

    Returns:
        The normalised text.
    """
    text = text.replace('\r\n', '\n').replace('\n\n', '\n')
    # The search strings of the two quote replacements are not visible in the
    # source (they read as empty literals), and replacing an empty string
    # would insert an apostrophe between every character. They are presumably
    # the left/right single quotation marks, so spell those out as escapes,
    # together with the C1 code points they turn into when Windows-1252 text
    # is decoded as Latin-1.
    for quote in ('\u2018', '\u2019', '\x91', '\x92'):
        text = text.replace(quote, "'")
    return text
2022-03-13 17:09:05 +01:00
def fragments(utterance: str):
    """Break *utterance* into fragments that end at punctuation marks.

    Each line of the utterance is split around every character matched by
    the Unicode ``Punctuation`` property. Pieces are accumulated until a
    single-character piece that is in ``string.punctuation`` (and is not an
    apostrophe) is reached; the accumulated text, including that character,
    then becomes one fragment. Whatever is left at the end of a line is
    emitted with a trailing newline appended. Up to two trailing fragments
    that consist of nothing but a newline are removed from the result.

    Args:
        utterance: The text to split; may span several lines.

    Returns:
        A list of fragment strings.
    """
    splitter = regex.compile(r'(\p{Punctuation})')
    # Apostrophes stay inside the running fragment instead of closing it.
    non_breaking = ["'"]
    result = []
    for line in utterance.splitlines():
        pending = ""
        for piece in splitter.split(line):
            if not piece:
                continue
            pending += piece
            closes_fragment = (
                len(piece) == 1
                and piece in string.punctuation
                and piece not in non_breaking
            )
            if closes_fragment:
                result.append(pending)
                pending = ""
        result.append(pending + '\n')
    # Drop at most two trailing newline-only fragments.
    for _ in range(2):
        if result and result[-1] == '\n':
            result.pop()
    return result
2022-04-11 13:09:01 +02:00
2022-04-10 15:20:13 +02:00
def tokenise(utterance: str):
    """Return the tokens of *utterance* as a list.

    Calls gensim's ``tokenize`` with ``lower=True`` and collects everything
    it produces into a list.
    """
    tokens = tokenize(utterance, lower=True)
    return [*tokens]
2022-04-11 13:09:01 +02:00
def fix_sentence(s: str) -> str:
    """Drop a dangling final word from *s*, or reject *s* altogether.

    The sentence is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. The tag names below are those compared against in the
    code (Penn Treebank style tags).

    Returns ``None`` (despite the ``str`` annotation) when:

    * the stripped input has at most one character,
    * tagging yields no tokens,
    * the only token is tagged ``DT``, ``WDT`` or ``IN``, or
    * the only token is a single character tagged ``NN``.

    Otherwise, if the last token is tagged ``DT``, ``WDT``, ``IN`` or ``CC``,
    or is a single character tagged ``NN``, the remaining tokens are joined
    with single spaces and returned. In every other case *s* is returned
    untouched.
    """
    if len(s.strip()) <= 1:
        return None

    tagged = nltk.pos_tag(nltk.word_tokenize(s))
    if not tagged:
        return None

    last_word = tagged[-1][0]
    last_tag = tagged[-1][1]
    lone_letter_noun = last_tag == 'NN' and len(last_word) == 1

    # A sentence consisting of just one such token is discarded entirely.
    if len(tagged) == 1 and (last_tag in ('DT', 'WDT', 'IN') or lone_letter_noun):
        return None

    # Otherwise only the dangling final token is removed.
    if last_tag in ('DT', 'WDT', 'IN', 'CC') or lone_letter_noun:
        return " ".join(entry[0] for entry in tagged[:-1])
    return s
def fix_punctuation(s: str) -> str:
    """Make *s* end with sentence-final punctuation.

    * Blank or whitespace-only input yields an empty string.
    * A lone ASCII punctuation character is returned as is, except for a
      lone comma, which yields an empty string.
    * If the last non-whitespace character is not ASCII punctuation, trailing
      whitespace is removed and a full stop is appended.
    * If it is a comma or a colon, trailing whitespace is removed and that
      character is replaced by a full stop.
    * Any other trailing punctuation leaves *s* unchanged, trailing
      whitespace included.
    """
    if not s.strip():
        return ""
    if len(s) == 1 and s in string.punctuation:
        return "" if s == ',' else s

    trimmed = s.rstrip()
    last = trimmed[-1]
    if last not in string.punctuation:
        return trimmed + '.'
    if last in (',', ':'):
        return trimmed[:-1] + '.'
    return s
2022-03-13 17:09:05 +01:00