2022-04-11 13:09:01 +02:00

114 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import string, regex
from gensim.utils import tokenize
import nltk
def clean(text: str) -> str:
    """Tidy up the first line of ``text`` and capitalise it.

    The first line is split on single spaces.  When its first token
    (stripped of whitespace) is found inside ``string.punctuation`` --
    this is a substring test, so an empty token produced by a leading
    space also qualifies -- that token is dropped.  The first line is
    then passed through ``str.capitalize``, which upper-cases the first
    character and lower-cases the rest of that line.  All following
    lines are returned untouched.

    :param text: possibly multi-line text, lines separated by ``\\n``.
    :return: the text with its first line cleaned and capitalised.
    """
    # str.split always yields at least one element, so both unpackings
    # are safe even for an empty string.
    first_line, *remaining = text.split('\n')
    head, *tail = first_line.split(' ')

    if head.strip() in string.punctuation:
        # Leading punctuation (or an empty leading token): discard it.
        first_line = ' '.join(tail)

    return '\n'.join([first_line.capitalize(), *remaining])
def format(text: str) -> str:
    """Normalise line endings and typographic apostrophes in ``text``.

    Steps, applied in order:

    1. Windows line endings (``\\r\\n``) become ``\\n``.
    2. Each non-overlapping pair of newlines is replaced by a single
       newline (one pass only, so a run of three newlines still leaves
       two behind).
    3. Typographic single quotes are replaced by the ASCII apostrophe.

    NOTE: this function shadows the ``format`` builtin; the name is kept
    because callers depend on it.

    :param text: raw input text.
    :return: the normalised text.
    """
    unified = text.replace('\r\n', '\n').replace('\n\n', '\n')
    # The quote characters are written as explicit escapes on purpose.
    # With an empty search string, str.replace('', "'") inserts an
    # apostrophe at every position of the text, so a quote literal that
    # is lost or invisible silently corrupts all input.
    # NOTE(review): U+2018 / U+2019 are assumed to be the characters the
    # original literals held -- confirm against the upstream file.
    return unified.replace('\u2018', "'").replace('\u2019', "'")
def fragments(utterance: str):
    """Split ``utterance`` into punctuation-delimited fragments.

    Every line of the utterance is split on Unicode punctuation (the
    capturing group keeps each punctuation mark in the split result).
    Pieces are accumulated into a running fragment; the fragment is
    closed -- punctuation included -- whenever a piece is a single
    character from ``string.punctuation`` other than the apostrophe.
    Punctuation matched by the pattern but absent from
    ``string.punctuation`` is simply kept inside the running fragment.

    Whatever is left at the end of a line is emitted with a trailing
    ``\\n``.  Finally, up to two trailing fragments consisting solely of
    ``\\n`` are dropped.

    :param utterance: text, possibly spanning several lines.
    :return: list of fragment strings.
    """
    splitter = regex.compile(r'(\p{Punctuation})')
    non_breaking = ["'"]  # punctuation that never closes a fragment
    pieces = []

    for line in utterance.splitlines():
        buffer = ""
        for part in splitter.split(line):
            if not part:
                # Empty strings appear between adjacent delimiters.
                continue
            buffer += part
            closes_fragment = (
                len(part) == 1
                and part in string.punctuation
                and part not in non_breaking
            )
            if closes_fragment:
                pieces.append(buffer)
                buffer = ""
        # Flush the remainder of the line, marking the line break.
        pieces.append(buffer + '\n')

    # Drop at most two trailing bare-newline fragments.
    for _ in range(2):
        if pieces and pieces[-1] == '\n':
            pieces = pieces[:-1]
    return pieces
def tokenise(utterance: str):
    """Return the tokens of ``utterance`` as a list.

    Delegates to ``gensim.utils.tokenize`` with ``lower=True`` and
    materialises its result into a list.

    :param utterance: text to tokenise.
    :return: list of tokens produced by gensim.
    """
    token_stream = tokenize(utterance, lower=True)
    return list(token_stream)
def fix_sentence(s: str) -> str:
    """Trim a dangling final word from ``s`` based on its POS tag.

    The sentence is tokenised and tagged with NLTK (the tag names used
    below are presumably Penn Treebank tags -- verify against the tagger
    in use).

    * ``None`` is returned (despite the ``str`` annotation) when the
      stripped input has at most one character, when tagging yields no
      tokens, or when the only token is tagged ``DT``/``WDT``/``IN`` or
      is a one-character ``NN``.
    * When the last token is tagged ``DT``/``WDT``/``IN``/``CC``, or is
      a one-character ``NN``, it is removed and the remaining tokens are
      joined with single spaces.  Note that this re-joins the NLTK
      tokens, so the original spacing is not preserved, and that a lone
      ``CC`` token yields an empty string rather than ``None``.
    * Otherwise ``s`` is returned unchanged.

    :param s: the sentence (or fragment) to repair.
    :return: the repaired sentence, or ``None`` if nothing usable remains.
    """
    if len(s.strip()) <= 1:
        return None

    tagged = nltk.pos_tag(nltk.word_tokenize(s))
    if not tagged:
        return None

    last_word, last_tag = tagged[-1]
    lone_letter_noun = last_tag in ['NN'] and len(last_word) == 1

    # A single token that cannot stand on its own is rejected outright.
    if len(tagged) == 1:
        if last_tag in ['DT', 'WDT', 'IN'] or lone_letter_noun:
            return None

    # A trailing function word or stray letter is cut off.
    if last_tag in ['DT', 'WDT', 'IN', 'CC'] or lone_letter_noun:
        return " ".join([word for word, _ in tagged[:-1]])

    return s
def fix_punctuation(s: str) -> str:
    """Make sure ``s`` ends with sentence-final punctuation.

    * Blank input gives ``""``.
    * A single punctuation character is returned as is, except a lone
      comma, which gives ``""``.
    * If the text (ignoring trailing whitespace) ends with ``,`` or
      ``:``, that character is replaced by ``.`` and the trailing
      whitespace is dropped.
    * If it ends with any other character from ``string.punctuation``,
      ``s`` is returned unchanged -- trailing whitespace included.
    * Otherwise a ``.`` is appended after stripping trailing whitespace.

    :param s: the text to terminate.
    :return: the text with its final punctuation fixed.
    """
    if not s.strip():
        return ""
    if len(s) == 1 and s in string.punctuation:
        return "" if s == ',' else s

    trimmed = s.rstrip()
    tail = trimmed[-1]
    if tail not in string.punctuation:
        return trimmed + '.'
    if tail in (',', ':'):
        return trimmed[:-1] + '.'
    # Already ends in acceptable punctuation: hand back the input as given.
    return s