diff --git a/config.json b/config.json new file mode 100644 index 0000000..b22ca73 --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "voices": [ + { + "name": "Ralph", + "model_dir": "data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000", + "tokeniser_file": "data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000/Emerson-Nature.txt_ns=5000.tokenizer.json", + "temperature": "0.9" + }, + { + "name": "Jean", + "model_dir": "data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000", + "tokeniser_file": "data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000/Lafontaine-Fables[english].txt_ns=5000.tokenizer.json", + "temperature": "1.2" + }, + { + "name": "Blake", + "model_dir": "data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000", + "tokeniser_file": "data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000/Blake-Songs-of-Innocence-and-of-Experience.txt_ns=5000.tokenizer.json", + "temperature": "1.5" + }, + { + "name": "Friedrich", + "model_dir": "data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000", + "tokeniser_file": "data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_ns=5000.tokenizer.json", + "temperature": "1.5" + } + + ] +} \ No newline at end of file diff --git a/examine/__init__.py b/examine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examine/doc2vec.py b/examine/doc2vec.py new file mode 100644 index 0000000..0713913 --- /dev/null +++ b/examine/doc2vec.py @@ -0,0 +1,42 @@ +import pathlib +import gensim + +TEXTS_DIR = '../data/texts/' +EXCLUDE = ['Wellcome_TALKS-ABOUT-FLOWERS'] +OUTPUT = '../data/models/doc2vec.model' + +if __name__ == '__main__': + + print("1. 
Building corpus") + + dirs = [d for d in pathlib.Path(TEXTS_DIR).iterdir() if d.is_dir() and d.stem not in EXCLUDE] + + tagged_corpus = [] + i = 0 + for d in dirs: + files = list(d.glob('*.txt')) + for f in files: + with f.open() as fp: + text = fp.read() + tokens = gensim.utils.simple_preprocess(text, max_len=25) + tag_doc = gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[i]) + tagged_corpus.append(tag_doc) + i += 1 + + print("2. Building vocabulary") + model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40) + + model.build_vocab(tagged_corpus) + + print("3. Training") + model.train(tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs) + + print("4. Saving") + model.save(OUTPUT) + + + + + + + diff --git a/examine/metric.py b/examine/metric.py new file mode 100644 index 0000000..6493bdc --- /dev/null +++ b/examine/metric.py @@ -0,0 +1,18 @@ +import gensim +import numpy +from numpy import dot +from numpy.linalg import norm + +MODEL_INPUT = '../data/models/doc2vec.model' + +def cos_dist(v0, v1): + return dot(v0, v1) / (norm(v0) * norm(v1)) + +class Metric: + + def __init__(self, model_input=MODEL_INPUT): + self.model = gensim.models.doc2vec.Doc2Vec.load(model_input) + + def vector(self, text: str): + tokens = gensim.utils.simple_preprocess(text, max_len=25) + return self.model.infer_vector(tokens) \ No newline at end of file diff --git a/speak.py b/speak.py new file mode 100644 index 0000000..b15a63f --- /dev/null +++ b/speak.py @@ -0,0 +1,41 @@ +import argparse, json, sys, time, random +import utterance.voice +import utterance.utils + +def main() -> int: + + p = argparse.ArgumentParser() + p.add_argument("-c", "--config", type=str, default="config.json", help="configuratin file") + p.add_argument("-i", "--iterations", type=int, default=10, help="number of iterations") + args = p.parse_args() + + print(args) + + with open(args.config) as f: + conf = json.load(f) + + voices = [] + for v in conf['voices']: + voice = 
utterance.voice.Voice(name=v["name"].upper(), model=v['model_dir'], tokenizer=v['tokeniser_file'], temp=float(v["temperature"]), lenght=32)
+        voices.append(voice)
+
+    nbr_voices = len(voices)
+    current_voice = ""
+    for i in range(args.iterations):
+        rindex = random.randint(0, nbr_voices - 1)
+        v = voices[rindex]
+        if v.name != current_voice:
+            print("==========")
+            print(v.name + ":")
+            current_voice = v.name
+        t = v.utter_one()
+        if t is not None:
+            t = utterance.utils.clean(t)
+            t = utterance.utils.format(t)
+            print(t)
+
+        time.sleep(4)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/speak_metric.py b/speak_metric.py
new file mode 100644
index 0000000..36a2215
--- /dev/null
+++ b/speak_metric.py
@@ -0,0 +1,117 @@
+import argparse, json, sys, time, random
+import utterance.voice
+import utterance.utils
+import examine.metric
+
+def format_str(text: str) -> str:
+    t = utterance.utils.clean(text)
+    return utterance.utils.format(t)
+
+
+def main() -> int:
+
+    p = argparse.ArgumentParser()
+    p.add_argument("-c", "--config", type=str, default="config.json", help="configuratin file")
+    p.add_argument("-i", "--iterations", type=int, default=10, help="number of iterations")
+    args = p.parse_args()
+
+    print(args)
+
+    with open(args.config) as f:
+        conf = json.load(f)
+
+    voices = []
+    for v in conf['voices']:
+        voice = utterance.voice.Voice(name=v["name"].upper(), model=v['model_dir'], tokenizer=v['tokeniser_file'], temp=float(v["temperature"]), lenght=16)
+        voices.append(voice)
+
+    nbr_voices = len(voices)
+
+    state = 'c'
+
+    metric = examine.metric.Metric(model_input='data/models/doc2vec.model')
+
+    s = list(range(nbr_voices))  # every voice index (range(0, n-1) dropped the last); a list, since random.sample rejects sets on Python >= 3.11
+
+    rindex = random.sample(s, 1)[0]
+
+    v = voices[rindex]
+    uv = v.utter_one()
+    uv = format_str(uv)
+
+    v_vec = metric.vector(uv)
+
+
+    while state == 'c':
+
+        candidates = random.sample(s, 3)
+
+        results = []
+        for c in candidates:
+            if c == rindex:
+                continue
+            vc = voices[c]
+            vc_texts = 
vc.utter_n(n=150) + for t in vc_texts: + t = format_str(t) + t_vec = metric.vector(t) + d = examine.metric.cos_dist(v_vec, t_vec) + results.append([d, t, c]) + + # vv = voices[rrindex] + # texts = vv.utter_n(n=150) + # # texts = v.utter_n(n=150) + # results = [] + # for t in texts: + # t = format_str(t) + # t_vec = metric.vector(uv) + # d = examine.metric.cos_dist(v_vec, t_vec) + # results.append((d, t)) + + results.sort(key=lambda t: t[0], reverse=True) + + print('----------------------------') + print(v.name + ":") + print(uv) + print('----------------------------') + + for r in results[:2]: + print('-->' + str(r[0])) + print(r[1]) + print('+++++++++++++++++++++++++') + + + # new round + + top = results[0] + rindex = top[2] + v = voices[rindex] + uv = top[1] + v_vec = metric.vector(top[1]) + + + state = input("Continue? ") + + + + + # nbr_voices = len(voices) + # current_voice = "" + # for i in range(args.iterations): + # rindex = random.randint(0, nbr_voices - 1) + # v = voices[rindex] + # if v.name != current_voice: + # print("==========") + # print(v.name + ":") + # current_voice = v.name + # t = v.utter_one() + # if t != None: + # t = utterance.utils.clean(t) + # t = utterance.utils.format(t) + # print(t) + + # time.sleep(4) + + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/tokenise.py b/train/tokenise.py similarity index 100% rename from tokenise.py rename to train/tokenise.py diff --git a/train.py b/train/train.py similarity index 100% rename from train.py rename to train/train.py diff --git a/utterance/__init__.py b/utterance/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utterance/config.json b/utterance/config.json deleted file mode 100644 index 422789a..0000000 --- a/utterance/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "voices": [ - { - "name": "Ralph", - "model_dir": "../data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000", - "tokeniser_file": 
"../data/tokens+models/Emerson-Nature.txt_bs=64_ns=8000_vs=5000/Emerson-Nature.txt_ns=5000.tokenizer.json", - "temperature": "0.9" - }, - { - "name": "Jean", - "model_dir": "../data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000", - "tokeniser_file": "../data/tokens+models/Lafontaine-Fables[english].txt_bs=64_ns=8000_vs=5000/Lafontaine-Fables[english].txt_ns=5000.tokenizer.json", - "temperature": "1.2" - }, - { - "name": "Blake", - "model_dir": "../data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000", - "tokeniser_file": "../data/tokens+models/Blake-Songs-of-Innocence-and-of-Experience.txt_bs=64_ns=8000_vs=5000/Blake-Songs-of-Innocence-and-of-Experience.txt_ns=5000.tokenizer.json", - "temperature": "1.5" - }, - { - "name": "Friedrich", - "model_dir": "../data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000", - "tokeniser_file": "../data/tokens+models/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_bs=64_ns=8000_vs=5000/Schelling-ON-THE-RELATION-OF-THE-PLASTIC-ARTS-TO-NATURE.txt_ns=5000.tokenizer.json", - "temperature": "1.5" - } - - ] -} \ No newline at end of file diff --git a/utterance/speak.py b/utterance/speak.py deleted file mode 100644 index 4607f9e..0000000 --- a/utterance/speak.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse, json, sys, time, random -import spacy -from aitextgen import aitextgen -import string - -def clean(text: str) -> str: - - s = text.split('\n') - - if(len(s) > 0): - tok_1 = s[0].split(' ') - if len(tok_1) > 0 and tok_1[0].strip() in string.punctuation: - s_1 = ' '.join(tok_1[1:]) - s[0] = s_1.capitalize() - else: - s[0] = s[0].capitalize() - - return '\n'.join(s) - -def format(text: str) -> str: - - return text.replace('\r\n', '\n').replace('\n\n', '\n') - -def main() -> int: - - p = argparse.ArgumentParser() - p.add_argument("-c", "--config", type=str, default="config.json", help="configuratin file") - 
-    p.add_argument("-i", "--iterations", type=int, default=10, help="number of iterations")
-    args = p.parse_args()
-
-    print(args)
-
-    with open(args.config) as f:
-        conf = json.load(f)
-
-    voices = []
-    for v in conf['voices']:
-        a = aitextgen(model_folder=v['model_dir'], tokenizer_file=v['tokeniser_file'])
-        voices.append({"name": v["name"].upper(), "a": a, "temp": float(v["temperature"])})
-
-    nbr_voices = len(voices)
-    current_voice = ""
-    for i in range(args.iterations):
-        rindex = random.randint(0, nbr_voices - 1)
-        v = voices[rindex]
-        if v['name'] != current_voice:
-            print("==========")
-            print(v['name'] + ":")
-            current_voice = v['name']
-        t = v['a'].generate(n=1, max_lenght=32, temperature=v['temp'], return_as_list=True)[0]
-        if t != None:
-            t = clean(t)
-            t = format(t)
-            print(t)
-
-        time.sleep(4)
-
-
-if __name__ == '__main__':
-    sys.exit(main())
\ No newline at end of file
diff --git a/utterance/utils.py b/utterance/utils.py
new file mode 100644
index 0000000..54f1bcd
--- /dev/null
+++ b/utterance/utils.py
@@ -0,0 +1,19 @@
+import string
+
+def clean(text: str) -> str:
+
+    s = text.split('\n')
+
+    if(len(s) > 0):
+        tok_1 = s[0].split(' ')
+        if len(tok_1) > 0 and tok_1[0].strip() in string.punctuation:
+            s_1 = ' '.join(tok_1[1:])
+            s[0] = s_1.capitalize()
+        else:
+            s[0] = s[0].capitalize()
+
+    return '\n'.join(s)
+
+def format(text: str) -> str:
+
+    return text.replace('\r\n', '\n').replace('\n\n', '\n')
diff --git a/utterance/voice.py b/utterance/voice.py
new file mode 100644
index 0000000..c734fce
--- /dev/null
+++ b/utterance/voice.py
@@ -0,0 +1,18 @@
+from aitextgen import aitextgen
+import utterance.utils
+
+class Voice:
+
+    def __init__(self, name: str, model: str, tokenizer: str, temp: float, lenght: int):
+        self.name = name
+        self.temp = temp
+        self.lenght = lenght
+        self.v = aitextgen(model_folder=model, tokenizer_file=tokenizer)
+
+    def utter_n(self, n: int, temp: float = None, lenght: int = None):
+        t = temp if temp is not None else self.temp  # use the override when given, else the voice default (ternary was inverted and always produced None)
+        l = lenght if lenght is not None else self.lenght  # same fix as above for the length fallback
+        return self.v.generate(n=n, max_length=l, temperature=t, return_as_list=True)  # aitextgen's kwarg is max_length; 'max_lenght' was silently ignored
+
+    def utter_one(self, temp: float = None, lenght: int = None) -> str:
+        return self.utter_n(n=1, temp=temp, lenght=lenght)[0]