"""Text-normalisation helpers for conversational utterances: first-line
cleaning, newline/quote normalisation, punctuation-based fragmenting,
tokenisation, and POS-tag-based sentence/punctuation repair."""

import string
from typing import List, Optional

import nltk
import regex
from gensim.utils import tokenize

# Compiled once at import time instead of on every fragments() call.
# \p{Punctuation} matches Unicode punctuation (wider than string.punctuation).
_PUNCT_RE = regex.compile(r'(\p{Punctuation})')

# Punctuation characters that never terminate a fragment.
_SKIP_PUNCTUATION = frozenset("'")


def clean(text: str) -> str:
    """Capitalise the first line of *text*.

    If the first space-delimited token of the first line consists purely of
    punctuation (e.g. ``"- hello"``), that token is dropped before
    capitalising. Remaining lines are returned untouched.

    Fix: the original test ``tok.strip() in string.punctuation`` was a
    substring check, which is True for the empty string (so a leading space
    triggered the drop branch) and False for runs like ``"--"``. We now
    require a non-empty, all-punctuation token.
    """
    lines = text.split('\n')  # split() always yields at least one element
    first_tokens = lines[0].split(' ')
    lead = first_tokens[0].strip()
    if lead and all(ch in string.punctuation for ch in lead):
        lines[0] = ' '.join(first_tokens[1:]).capitalize()
    else:
        lines[0] = lines[0].capitalize()
    return '\n'.join(lines)


def format(text: str) -> str:
    """Normalise line endings, collapse blank lines, and straighten curly
    single quotes.

    NOTE: shadows the builtin ``format``; the name is kept so existing
    callers keep working.
    """
    text = text.replace('\r\n', '\n').replace('\n\n', '\n')
    return text.replace('‘', "'").replace('’', "'")


def fragments(utterance: str) -> List[str]:
    """Split *utterance* into punctuation-terminated fragments.

    Each line is cut after every single ASCII punctuation character except
    apostrophes (which stay inside their fragment). Whatever remains of a
    line is emitted with a trailing newline; up to two trailing
    newline-only fragments are then trimmed from the result.
    """
    frags: List[str] = []
    for line in utterance.splitlines():
        cum = ""
        for piece in _PUNCT_RE.split(line):
            if not piece:
                continue
            cum += piece
            # Flush only on a lone ASCII punctuation char; multi-char pieces
            # and non-ASCII punctuation are treated as ordinary text, as in
            # the original implementation.
            if (len(piece) == 1
                    and piece in string.punctuation
                    and piece not in _SKIP_PUNCTUATION):
                frags.append(cum)
                cum = ""
        frags.append(cum + '\n')
    # Trim at most two trailing newline-only fragments (deliberately not a
    # while-loop: the original removed at most two).
    for _ in range(2):
        if frags and frags[-1] == '\n':
            frags.pop()
    return frags


def tokenise(utterance: str) -> List[str]:
    """Lower-cased gensim tokenisation of *utterance*."""
    return list(tokenize(utterance, lower=True))


def fix_sentence(s: str) -> Optional[str]:
    """Drop or trim a sentence based on its POS tags.

    Returns ``None`` for degenerate inputs: at most one non-space character,
    no tokens, a lone determiner/wh-determiner/preposition, or a lone
    single-letter noun. A trailing determiner, preposition, conjunction, or
    single-letter noun is removed (rejoining the remaining tokens with
    spaces). Otherwise *s* is returned unchanged.

    Fix: the return annotation was ``str`` although ``None`` is returned on
    several paths; corrected to ``Optional[str]``.
    """
    if len(s.strip()) <= 1:
        return None
    tags = nltk.pos_tag(nltk.word_tokenize(s))
    if not tags:
        return None
    if len(tags) == 1:
        word, tag = tags[0]
        if tag in ('DT', 'WDT', 'IN'):
            return None
        if tag == 'NN' and len(word) == 1:
            return None
    last_word, last_tag = tags[-1]
    if last_tag in ('DT', 'WDT', 'IN', 'CC') or (
            last_tag == 'NN' and len(last_word) == 1):
        return " ".join(word for word, _ in tags[:-1])
    return s


def fix_punctuation(s: str) -> str:
    """Ensure *s* ends with sensible terminal punctuation.

    Whitespace-only input becomes ``""``. A lone comma becomes ``""``; any
    other lone punctuation character is returned as-is. A trailing ``,`` or
    ``:`` is replaced by ``.``; if the (right-stripped) string ends with no
    punctuation at all, a ``.`` is appended. Other trailing punctuation
    (e.g. ``!`` or ``?``) is left alone.
    """
    if not s.strip():
        return ""
    if len(s) == 1 and s in string.punctuation:
        return "" if s == ',' else s
    e = s.rstrip()
    if e[-1] in string.punctuation:
        if e[-1] in (',', ':'):
            s = e[:-1] + '.'
    else:
        # No terminal punctuation at all: close the sentence.
        s = e + '.'
    return s