Fix utterance endings (dangling final words, trailing punctuation), etc.

This commit is contained in:
NATURESPEAK
2022-04-11 13:09:01 +02:00
parent 9673a3d4f0
commit e2f841465c
5 changed files with 141 additions and 7 deletions
+56
View File
@@ -1,5 +1,6 @@
import string, regex
from gensim.utils import tokenize
import nltk
def clean(text: str) -> str:
@@ -42,16 +43,71 @@ def fragments(utterance: str):
if k in skip_punctuation:
continue
else:
# if cum[0] not in [' ', '\n']:
# cum = ' ' + cum
frags.append(cum)
cum = ""
cum += '\n'
frags.append(cum)
# get rid of newline (2x)
if len(frags) > 0:
if frags[-1] == '\n':
frags = frags[:-1]
if len(frags) > 0:
if frags[-1] == '\n':
frags = frags[:-1]
return frags
def tokenise(utterance: str):
    """Split *utterance* into a list of lower-cased tokens.

    Delegates to the ``tokenize`` helper imported from ``gensim.utils`` with
    ``lower=True`` and materialises its result as a list.
    """
    token_stream = tokenize(utterance, lower=True)
    return list(token_stream)
def fix_sentence(s: str) -> "str | None":
    """Drop a dangling final word from *s*, or reject *s* outright.

    The text is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tag (and length) of the last token decides
    the outcome.

    Args:
        s: The sentence or fragment to check.

    Returns:
        ``None`` when *s*, stripped of surrounding whitespace, is at most one
        character long, when tokenising yields nothing, or when the single
        token found is tagged DT, WDT or IN, or is a one-character NN.
        Otherwise, when the last token is tagged DT, WDT, IN or CC, or is a
        one-character NN, the remaining tokens joined with single spaces
        (the empty string if that token was the only one).
        Otherwise *s* itself, unchanged.
    """
    # The return annotation is quoted so the union syntax also parses on
    # Python versions older than 3.10.
    if len(s.strip()) <= 1:
        return None

    tagged = nltk.pos_tag(nltk.word_tokenize(s))
    if not tagged:
        return None

    last_word, last_tag = tagged[-1]
    short_noun = last_tag == 'NN' and len(last_word) == 1

    # A lone token of these kinds is not worth keeping at all.
    if len(tagged) == 1 and (last_tag in ('DT', 'WDT', 'IN') or short_noun):
        return None

    # Trim the dangling final token. The result is rebuilt from the tokens,
    # so its spacing can differ from the input text.
    if last_tag in ('DT', 'WDT', 'IN', 'CC') or short_noun:
        return " ".join(word for word, _ in tagged[:-1])

    return s
def fix_punctuation(s: str) -> str:
    """Normalise the closing punctuation of *s*.

    Blank input gives ``""``.  A lone punctuation character is returned
    as-is, except a lone comma, which gives ``""``.  Otherwise trailing
    whitespace is ignored when inspecting the last character: a final
    ``,`` or ``:`` is replaced by ``.``, any other final punctuation mark
    leaves *s* untouched (trailing whitespace included), and text with no
    final punctuation gets a ``.`` appended once the trailing whitespace
    has been dropped.
    """
    if not s.strip():
        return ""

    if len(s) == 1 and s in string.punctuation:
        return "" if s == ',' else s

    trimmed = s.rstrip()
    final_char = trimmed[-1]

    if final_char not in string.punctuation:
        return trimmed + '.'
    if final_char in (',', ':'):
        return trimmed[:-1] + '.'
    return s
+1 -1
View File
@@ -3,7 +3,7 @@ import utterance.utils
import gensim, regex, string, time
# Module-level tuning constants; what they control is not visible in this
# excerpt (presumably the Voice utterance memory -- TODO confirm).
UTTERANCE_MEMORY_LEN = 15
# NOTE(review): two consecutive assignments; the later one (0.85) takes effect.
UTTERANCE_MEMORY_MIN_DIST = 0.2
UTTERANCE_MEMORY_MIN_DIST = 0.85
class Voice: