fix endings etc.

2022-04-11 13:09:01 +02:00
parent 9673a3d4f0
commit e2f841465c
5 changed files with 141 additions and 7 deletions
@@ -1,5 +1,6 @@
 import string, regex
 from gensim.utils import tokenize
+import nltk

 def clean(text: str) -> str:

@@ -42,16 +43,71 @@ def fragments(utterance: str):
 				if k in skip_punctuation:
 					continue
 				else:
+					# if cum[0] not in [' ', '\n']:
+					# 	cum = ' ' + cum
 					frags.append(cum)
 					cum = ""
 		cum += '\n'
 		frags.append(cum)

+	# get rid of newline (2x)
+	if len(frags) > 0: 
+		if frags[-1] == '\n':
+			frags = frags[:-1]
+		if len(frags) > 0:
+			if frags[-1] == '\n':
+				frags = frags[:-1]
+
 	return frags

+
 def tokenise(utterance: str):
 	"""Return the tokens of *utterance* as a list, lower-cased by gensim's ``tokenize``."""
 	# ``tokenize`` is consumed via ``list`` so callers get a concrete list.
 	token_stream = tokenize(utterance, lower=True)
 	return list(token_stream)

def fix_sentence(s: str) -> "str | None":
	"""Trim a dangling final word from a sentence fragment.

	The fragment is tokenised and POS-tagged with NLTK. If the last token
	is a determiner (``DT``), wh-determiner (``WDT``), preposition
	(``IN``), coordinating conjunction (``CC``) or a one-character noun
	(``NN``), that token is dropped.

	Args:
		s: the sentence fragment to inspect.

	Returns:
		``s`` unchanged when its ending is acceptable; the remaining tokens
		joined by single spaces when the last token was dropped (original
		spacing is therefore not preserved); ``None`` when nothing usable
		is left (blank or single-character input, or the only token was
		dropped).
	"""
	# Blank or single-character input can never form a usable fragment.
	if len(s.strip()) <= 1:
		return None

	tags = nltk.pos_tag(nltk.word_tokenize(s))
	if not tags:
		return None

	last_word, last_tag = tags[-1]
	dangling = (
		last_tag in ('DT', 'WDT', 'IN', 'CC')
		or (last_tag == 'NN' and len(last_word) == 1)
	)
	if not dangling:
		return s

	kept = [word for word, _ in tags[:-1]]
	# A fragment made only of the dropped token is reported as None rather
	# than as an empty string, so every "nothing left" case looks the same.
	if not kept:
		return None
	return " ".join(kept)
+
def fix_punctuation(s: str) -> str:
	"""Make sure a fragment ends with sentence-final punctuation.

	Args:
		s: the text fragment to fix; may carry surrounding whitespace.

	Returns:
		* ``""`` for blank input or a lone comma (with or without
		  surrounding whitespace);
		* ``s`` unchanged when it is any other lone punctuation character,
		  or already ends in punctuation other than ``,`` / ``:``;
		* otherwise the right-stripped text with a trailing ``,`` or ``:``
		  replaced by ``.``, or with ``.`` appended.
	"""
	core = s.strip()
	if not core:
		return ""

	# Test the stripped text: a padded comma such as ",\n" must be dropped
	# like a bare "," instead of being turned into a stray ".".
	if len(core) == 1 and core in string.punctuation:
		return "" if core == ',' else s

	trimmed = s.rstrip()
	last = trimmed[-1]
	if last not in string.punctuation:
		return trimmed + '.'
	if last in (',', ':'):
		# A comma or colon cannot close a sentence; swap it for a period.
		return trimmed[:-1] + '.'
	return s
+



@@ -3,7 +3,7 @@ import utterance.utils
 import gensim, regex, string, time

 UTTERANCE_MEMORY_LEN = 15
-UTTERANCE_MEMORY_MIN_DIST = 0.2
+UTTERANCE_MEMORY_MIN_DIST = 0.85

 class Voice: