examine + new speak
This commit is contained in:
parent
abf2f2f720
commit
11201664b8
29
config.json
Normal file
29
config.json
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
{
|
||||||
|
"voices": [
|
||||||
|
{
|
||||||
|
"name": "Ralph",
|
||||||
|
"model_dir": "data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000",
|
||||||
|
"tokeniser_file": "data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000/Emerson-Nature.txt_ns=5000.tokenizer.json",
|
||||||
|
"temperature": "0.9"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Jean",
|
||||||
|
"model_dir": "data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000",
|
||||||
|
"tokeniser_file": "data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000/Lafontaine-Fables[english].txt_ns=5000.tokenizer.json",
|
||||||
|
"temperature": "1.2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Blake",
|
||||||
|
"model_dir": "data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000",
|
||||||
|
"tokeniser_file": "data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000/Blake-Songs-of-Innocence-and-of-Experience.txt_ns=5000.tokenizer.json",
|
||||||
|
"temperature": "1.5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Friedrich",
|
||||||
|
"model_dir": "data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000",
|
||||||
|
"tokeniser_file": "data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_ns=5000.tokenizer.json",
|
||||||
|
"temperature": "1.5"
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
0
examine/__init__.py
Normal file
0
examine/__init__.py
Normal file
42
examine/doc2vec.py
Normal file
42
examine/doc2vec.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
import pathlib
|
||||||
|
import gensim
|
||||||
|
|
||||||
|
TEXTS_DIR = '../data/texts/'
|
||||||
|
EXCLUDE = ['Wellcome_TALKS-ABOUT-FLOWERS']
|
||||||
|
OUTPUT = '../data/models/doc2vec.model'
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    print("1. Building corpus")

    # Every sub-directory of TEXTS_DIR is one corpus, except those in EXCLUDE.
    dirs = [d for d in pathlib.Path(TEXTS_DIR).iterdir()
            if d.is_dir() and d.stem not in EXCLUDE]

    # One TaggedDocument per .txt file; tags are sequential integer ids
    # (enumerate replaces the original manual `i = 0 / i += 1` counter).
    txt_files = (f for d in dirs for f in d.glob('*.txt'))
    tagged_corpus = []
    for i, f in enumerate(txt_files):
        text = f.read_text()
        tokens = gensim.utils.simple_preprocess(text, max_len=25)
        tagged_corpus.append(
            gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[i]))

    print("2. Building vocabulary")
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(tagged_corpus)

    print("3. Training")
    model.train(tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    print("4. Saving")
    model.save(OUTPUT)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
18
examine/metric.py
Normal file
18
examine/metric.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
import gensim
|
||||||
|
import numpy
|
||||||
|
from numpy import dot
|
||||||
|
from numpy.linalg import norm
|
||||||
|
|
||||||
|
MODEL_INPUT = '../data/models/doc2vec.model'
|
||||||
|
|
||||||
|
def cos_dist(v0, v1):
    """Return the cosine of the angle between *v0* and *v1*.

    NOTE(review): despite the name, this is cosine *similarity*
    (1.0 = same direction, -1.0 = opposite), not a distance.  The name is
    part of the public API, so it is kept as-is.
    """
    a = numpy.asarray(v0)
    b = numpy.asarray(v1)
    return a.dot(b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))
|
||||||
|
|
||||||
|
class Metric:
    """Wraps a trained gensim Doc2Vec model to embed arbitrary text."""

    def __init__(self, model_input=MODEL_INPUT):
        # Load the persisted Doc2Vec model from disk.
        self.model = gensim.models.doc2vec.Doc2Vec.load(model_input)

    def vector(self, text: str):
        """Infer and return the Doc2Vec embedding for *text*.

        Tokenisation matches training: simple_preprocess with max_len=25.
        """
        words = gensim.utils.simple_preprocess(text, max_len=25)
        return self.model.infer_vector(words)
|
||||||
41
speak.py
Normal file
41
speak.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
import argparse, json, sys, time, random
|
||||||
|
import utterance.voice
|
||||||
|
import utterance.utils
|
||||||
|
|
||||||
|
def main() -> int:
    """Load voices from the config file and print random utterances.

    Returns 0 on success (used as the process exit code; the original
    declared ``-> int`` but implicitly returned ``None``).
    """
    p = argparse.ArgumentParser()
    p.add_argument("-c", "--config", type=str, default="config.json",
                   help="configuration file")  # typo "configuratin" fixed
    p.add_argument("-i", "--iterations", type=int, default=10,
                   help="number of iterations")
    args = p.parse_args()

    print(args)

    with open(args.config) as f:
        conf = json.load(f)

    # One Voice per config entry.  ``lenght`` [sic] matches the Voice API.
    voices = [
        utterance.voice.Voice(
            name=v["name"].upper(),
            model=v['model_dir'],
            tokenizer=v['tokeniser_file'],
            temp=float(v["temperature"]),
            lenght=32,
        )
        for v in conf['voices']
    ]

    current_voice = ""
    for _ in range(args.iterations):
        # random.choice replaces the manual randint/index pattern.
        v = random.choice(voices)
        # Print a header only when the speaking voice changes.
        if v.name != current_voice:
            print("==========")
            print(v.name + ":")
            current_voice = v.name
        t = v.utter_one()
        if t is not None:  # was ``t != None``
            t = utterance.utils.clean(t)
            t = utterance.utils.format(t)
            print(t)

        time.sleep(4)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: exit with main()'s return code.
if __name__ == '__main__':
    sys.exit(main())
|
||||||
117
speak_metric.py
Normal file
117
speak_metric.py
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
import argparse, json, sys, time, random
|
||||||
|
import utterance.voice
|
||||||
|
import utterance.utils
|
||||||
|
import examine.metric
|
||||||
|
|
||||||
|
def format_str(text: str) -> str:
    """Clean then newline-normalise *text* via the utterance.utils helpers."""
    return utterance.utils.format(utterance.utils.clean(text))
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Chain voices by semantic similarity.

    Starts from a random voice's utterance, then repeatedly samples other
    voices, scores their candidate utterances against the current one with
    a doc2vec cosine similarity, prints the best matches, and continues
    from the top match for as long as the user answers 'c' at the prompt.
    Returns 0 (process exit code).
    """
    p = argparse.ArgumentParser()
    p.add_argument("-c", "--config", type=str, default="config.json",
                   help="configuration file")  # typo "configuratin" fixed
    # NOTE(review): --iterations is currently unused; the loop is driven by
    # the interactive prompt.  Kept for CLI compatibility.
    p.add_argument("-i", "--iterations", type=int, default=10,
                   help="number of iterations")
    args = p.parse_args()

    print(args)

    with open(args.config) as f:
        conf = json.load(f)

    # One Voice per config entry.  ``lenght`` [sic] matches the Voice API.
    voices = [
        utterance.voice.Voice(
            name=v["name"].upper(),
            model=v['model_dir'],
            tokenizer=v['tokeniser_file'],
            temp=float(v["temperature"]),
            lenght=16,
        )
        for v in conf['voices']
    ]
    nbr_voices = len(voices)

    metric = examine.metric.Metric(model_input='data/models/doc2vec.model')

    # Bug fix: the original ``set(range(0, nbr_voices - 1))`` excluded the
    # last voice entirely.  Also, random.sample() on a set is a TypeError on
    # Python 3.11+, so keep the indices in a list.
    indices = list(range(nbr_voices))

    # Seed the chain with one utterance from a random voice.
    rindex = random.choice(indices)
    v = voices[rindex]
    uv = format_str(v.utter_one())
    v_vec = metric.vector(uv)

    state = 'c'
    while state == 'c':
        candidates = random.sample(indices, 3)

        # Score candidate utterances from the other voices against the
        # current utterance's embedding.
        results = []
        for c in candidates:
            if c == rindex:  # never chain a voice to itself
                continue
            vc = voices[c]
            for t in vc.utter_n(n=150):
                t = format_str(t)
                d = examine.metric.cos_dist(v_vec, metric.vector(t))
                results.append([d, t, c])

        # Highest cosine similarity first.
        results.sort(key=lambda r: r[0], reverse=True)

        print('----------------------------')
        print(v.name + ":")
        print(uv)
        print('----------------------------')

        for r in results[:2]:
            print('-->' + str(r[0]))
            print(r[1])
            print('+++++++++++++++++++++++++')

        # New round: continue the chain from the best-matching utterance.
        top = results[0]
        rindex = top[2]
        v = voices[rindex]
        uv = top[1]
        v_vec = metric.vector(uv)

        state = input("Continue? ")

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: exit with main()'s return code.
if __name__ == '__main__':
    sys.exit(main())
|
||||||
0
utterance/__init__.py
Normal file
0
utterance/__init__.py
Normal file
@ -1,29 +0,0 @@
|
|||||||
{
|
|
||||||
"voices": [
|
|
||||||
{
|
|
||||||
"name": "Ralph",
|
|
||||||
"model_dir": "../data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000",
|
|
||||||
"tokeniser_file": "../data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000/Emerson-Nature.txt_ns=5000.tokenizer.json",
|
|
||||||
"temperature": "0.9"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Jean",
|
|
||||||
"model_dir": "../data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000",
|
|
||||||
"tokeniser_file": "../data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000/Lafontaine-Fables[english].txt_ns=5000.tokenizer.json",
|
|
||||||
"temperature": "1.2"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Blake",
|
|
||||||
"model_dir": "../data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000",
|
|
||||||
"tokeniser_file": "../data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000/Blake-Songs-of-Innocence-and-of-Experience.txt_ns=5000.tokenizer.json",
|
|
||||||
"temperature": "1.5"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Friedrich",
|
|
||||||
"model_dir": "../data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000",
|
|
||||||
"tokeniser_file": "../data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_ns=5000.tokenizer.json",
|
|
||||||
"temperature": "1.5"
|
|
||||||
}
|
|
||||||
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@ -1,60 +0,0 @@
|
|||||||
import argparse, json, sys, time, random
|
|
||||||
import spacy
|
|
||||||
from aitextgen import aitextgen
|
|
||||||
import string
|
|
||||||
|
|
||||||
def clean(text: str) -> str:
    """Capitalise the first line of *text*.

    If the first space-separated token of the first line is a (stripped)
    punctuation mark, that token is dropped before capitalising.  Later
    lines are returned untouched.
    """
    lines = text.split('\n')
    first_tokens = lines[0].split(' ')
    # NOTE(review): this is a substring test — an empty stripped token is
    # "in" string.punctuation too, so a leading space also drops a token.
    if first_tokens and first_tokens[0].strip() in string.punctuation:
        lines[0] = ' '.join(first_tokens[1:]).capitalize()
    else:
        lines[0] = lines[0].capitalize()
    return '\n'.join(lines)
|
|
||||||
|
|
||||||
def format(text: str) -> str:
    """Normalise newlines: CRLF -> LF, then collapse doubled newlines once.

    NOTE(review): the name shadows the builtin ``format``; it is part of
    the public API and kept as-is.
    """
    unix_newlines = text.replace('\r\n', '\n')
    return unix_newlines.replace('\n\n', '\n')
|
|
||||||
|
|
||||||
def main() -> int:
    """Load voices from the config file and print random utterances.

    Returns 0 on success (used as the process exit code; the original
    declared ``-> int`` but implicitly returned ``None``).
    """
    p = argparse.ArgumentParser()
    p.add_argument("-c", "--config", type=str, default="config.json",
                   help="configuration file")  # typo "configuratin" fixed
    p.add_argument("-i", "--iterations", type=int, default=10,
                   help="number of iterations")
    args = p.parse_args()

    print(args)

    with open(args.config) as f:
        conf = json.load(f)

    # One generator dict per config entry.
    voices = []
    for v in conf['voices']:
        a = aitextgen(model_folder=v['model_dir'], tokenizer_file=v['tokeniser_file'])
        voices.append({"name": v["name"].upper(), "a": a, "temp": float(v["temperature"])})

    current_voice = ""
    for _ in range(args.iterations):
        # random.choice replaces the manual randint/index pattern.
        v = random.choice(voices)
        # Print a header only when the speaking voice changes.
        if v['name'] != current_voice:
            print("==========")
            print(v['name'] + ":")
            current_voice = v['name']
        # Bug fix: ``max_lenght`` was a typo — aitextgen.generate's
        # parameter is ``max_length``, so the length cap never applied.
        t = v['a'].generate(n=1, max_length=32, temperature=v['temp'],
                            return_as_list=True)[0]
        if t is not None:  # was ``t != None``
            t = clean(t)
            t = format(t)
            print(t)

        time.sleep(4)
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: exit with main()'s return code.
if __name__ == '__main__':
    sys.exit(main())
|
|
||||||
19
utterance/utils.py
Normal file
19
utterance/utils.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import string
|
||||||
|
|
||||||
|
def clean(text: str) -> str:
    """Capitalise the first line of *text*.

    If the first space-separated token of the first line is a (stripped)
    punctuation mark, that token is dropped before capitalising.  Later
    lines are returned untouched.
    """
    lines = text.split('\n')
    first_tokens = lines[0].split(' ')
    # NOTE(review): this is a substring test — an empty stripped token is
    # "in" string.punctuation too, so a leading space also drops a token.
    if first_tokens and first_tokens[0].strip() in string.punctuation:
        lines[0] = ' '.join(first_tokens[1:]).capitalize()
    else:
        lines[0] = lines[0].capitalize()
    return '\n'.join(lines)
|
||||||
|
|
||||||
|
def format(text: str) -> str:
    """Normalise newlines: CRLF -> LF, then collapse doubled newlines once.

    NOTE(review): the name shadows the builtin ``format``; it is part of
    the public API and kept as-is.
    """
    unix_newlines = text.replace('\r\n', '\n')
    return unix_newlines.replace('\n\n', '\n')
|
||||||
18
utterance/voice.py
Normal file
18
utterance/voice.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
from aitextgen import aitextgen
|
||||||
|
import utterance.utils
|
||||||
|
|
||||||
|
class Voice:
    """A named aitextgen generator with per-voice sampling defaults."""

    def __init__(self, name: str, model: str, tokenizer: str, temp: float, lenght: int):
        # ``lenght`` [sic] is kept: callers pass it by keyword.
        self.name = name
        self.temp = temp
        self.lenght = lenght
        self.v = aitextgen(model_folder=model, tokenizer_file=tokenizer)

    def utter_n(self, n: int, temp: float = None, lenght: int = None):
        """Generate *n* texts, falling back to the voice defaults.

        Bug fix: the original conditionals were inverted
        (``self.temp if temp != None else temp``) — an explicit argument
        was discarded and, with no argument, ``None`` was forwarded to
        generate(), so the per-voice settings never took effect.
        """
        t = temp if temp is not None else self.temp
        l = lenght if lenght is not None else self.lenght
        # Bug fix: aitextgen.generate's parameter is ``max_length``; the
        # original ``max_lenght`` typo meant the limit was never applied.
        return self.v.generate(n=n, max_length=l, temperature=t, return_as_list=True)

    def utter_one(self, temp: float = None, lenght: int = None) -> str:
        """Generate a single text (annotations fixed: temp is a float,
        lenght an int — they were swapped in the original)."""
        return self.utter_n(n=1, temp=temp, lenght=lenght)[0]
|
||||||
Loading…
x
Reference in New Issue
Block a user