From 537acb571e6783f79484abe06287b7bd94611388 Mon Sep 17 00:00:00 2001
From: gauthiier
Date: Sun, 30 Jan 2022 15:47:24 +0100
Subject: [PATCH] ahah

---
 tokenise.py           | 35 ++++++++++++++++++++++++++
 train.py              | 57 +++++++++++++++++++++++++++++++++++++++++++
 utterance/config.json | 19 +++++++++++++++
 utterance/speak.py    | 38 +++++++++++++++++++++++++++++
 4 files changed, 149 insertions(+)
 create mode 100644 tokenise.py
 create mode 100644 train.py
 create mode 100644 utterance/config.json
 create mode 100644 utterance/speak.py

diff --git a/tokenise.py b/tokenise.py
new file mode 100644
index 0000000..081d88e
--- /dev/null
+++ b/tokenise.py
@@ -0,0 +1,35 @@
+import argparse, os, sys
+
+
+def encode(filepath: str, blocksize: int, outputdir: str, verbose: bool = False) -> int:
+
+    from aitextgen.TokenDataset import TokenDataset
+    from aitextgen.tokenizers import train_tokenizer
+
+    fn = outputdir + os.path.basename(filepath)
+
+    # NOTE: vocab_size is fixed, since it is not yet exposed by train_tokenizer
+    # see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
+
+    train_tokenizer(filepath, prefix=fn)
+
+    tok_fn = fn + ".tokenizer.json"
+    fn_dest = fn + "_bs=" + str(blocksize) + ".tar.gz"
+
+    data = TokenDataset(file_path=filepath, tokenizer_file=tok_fn, block_size=blocksize)
+    data.save(cache_destination=fn_dest)
+
+    return 0
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
+    p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to the GPT-2 'max_length' config)")
+    p.add_argument("--outputdir", type=str, default="data/tokens/")
+    p.add_argument("-v", "--verbose", action="store_true")
+    args = p.parse_args()
+
+    return encode(args.text, args.blocksize, args.outputdir, args.verbose)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..e439a6c
--- /dev/null
+++ b/train.py
@@ -0,0 +1,57 @@
+import argparse, os, sys
+from aitextgen.TokenDataset import TokenDataset
+from aitextgen.utils import GPT2ConfigCPU
+from aitextgen.utils import build_gpt2_config
+from aitextgen import aitextgen
+
+
+# https://github.com/minimaxir/aitextgen/blob/master/aitextgen/utils.py
+# https://github.com/huggingface/transformers/blob/master/src/transformers/models/gpt2/configuration_gpt2.py
+
+def run_cpu(te: str, tok: str, dat: str, blocksize: int, num_steps: int = 10000) -> int:
+
+    config = GPT2ConfigCPU()
+
+    ai = aitextgen(tokenizer_file=tok, config=config)
+    data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
+
+    ai.train(data, output_dir=te, batch_size=16, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4)
+
+    return 0
+
+def run_gpu(te: str, tok: str, dat: str, blocksize: int, num_steps: int = 10000) -> int:
+
+    # NOTE: vocab_size is fixed, since it is not yet exposed by train_tokenizer
+
+    config = build_gpt2_config(vocab_size=1000, max_length=blocksize)
+
+    ai = aitextgen(tokenizer_file=tok, config=config)
+    data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
+
+    ai.train(data, output_dir=te, batch_size=16, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4, to_gpu=True)
+
+    return 0
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("text", type=str, help="text to create model from")
+    p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to the GPT-2 'max_length' config)")
+    p.add_argument("-s", "--numsteps", type=int, default=10000)
+    p.add_argument("--tokensdir", type=str, default="data/tokens/")
+    p.add_argument("--outputdir", type=str, default="data/models/")
+    p.add_argument("--gpu", action="store_true")
+
+    args = p.parse_args()
+
+    tok_file = f"{args.tokensdir}{args.text}.tokenizer.json"
+    dat_file = f"{args.tokensdir}{args.text}_bs={args.blocksize}.tar.gz"
+    output_dir = f"{args.outputdir}{args.text}_bs={args.blocksize}_ns={args.numsteps}"
+
+    if args.gpu:
+        return run_gpu(te=output_dir, tok=tok_file, dat=dat_file, blocksize=args.blocksize, num_steps=args.numsteps)
+    else:
+        return run_cpu(te=output_dir, tok=tok_file, dat=dat_file, blocksize=args.blocksize, num_steps=args.numsteps)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/utterance/config.json b/utterance/config.json
new file mode 100644
index 0000000..af3ee8e
--- /dev/null
+++ b/utterance/config.json
@@ -0,0 +1,19 @@
+{
+    "voices": [
+        {
+            "name": "Ralph",
+            "model_dir": "../data/models/Emerson-Nature.txt",
+            "tokeniser_file": "../data/tokens/Emerson-Nature.txt.tokenizer.json"
+        },
+        {
+            "name": "Jean",
+            "model_dir": "../data/models/Lafontaine-Fables[english].txt",
+            "tokeniser_file": "../data/tokens/Lafontaine-Fables[english].txt.tokenizer.json"
+        },
+        {
+            "name": "Blake",
+            "model_dir": "../data/models/Blake-Songs-of-Innocence-and-of-Experience.txt",
+            "tokeniser_file": "../data/tokens/Blake-Songs-of-Innocence-and-of-Experience.txt.tokenizer.json"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/utterance/speak.py b/utterance/speak.py
new file mode 100644
index 0000000..4af96e7
--- /dev/null
+++ b/utterance/speak.py
@@ -0,0 +1,38 @@
+import argparse, json, sys, time, random
+from aitextgen import aitextgen
+
+def main() -> int:
+
+    p = argparse.ArgumentParser()
+    p.add_argument("-c", "--config", type=str, default="config.json", help="configuration file")
+    p.add_argument("-i", "--iterations", type=int, default=10, help="number of iterations")
+    args = p.parse_args()
+
+    print(args)
+
+    with open(args.config) as f:
+        conf = json.load(f)
+
+    # load one aitextgen model per configured voice
+    voices = []
+    for v in conf['voices']:
+        a = aitextgen(model_folder=v['model_dir'], tokenizer_file=v['tokeniser_file'])
+        voices.append({"name": v["name"].upper(), "a": a})
+
+    nbr_voices = len(voices)
+    current_voice = ""
+    for _ in range(args.iterations):
+        rindex = random.randint(0, nbr_voices - 1)
+        v = voices[rindex]
+        if v['name'] != current_voice:
+            print("==========")
+            print(v['name'] + ":")
+            current_voice = v['name']
+        t = v['a'].generate_one().strip()
+        print(t)
+
+        time.sleep(1)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
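
Usage sketch, for reference: loading a single trained voice outside speak.py, assuming the default paths the two scripts above write to. The "Emerson-Nature.txt" name is taken from the sample config, and the _bs=64_ns=10000 suffix from the script defaults; both are assumptions, so substitute whatever was actually tokenised and trained.

    from aitextgen import aitextgen

    # Assumed default locations from tokenise.py / train.py:
    #   tokenizer: data/tokens/<text>.tokenizer.json
    #   model:     data/models/<text>_bs=<blocksize>_ns=<numsteps>
    ai = aitextgen(
        model_folder="data/models/Emerson-Nature.txt_bs=64_ns=10000",
        tokenizer_file="data/tokens/Emerson-Nature.txt.tokenizer.json",
    )

    # Same call speak.py makes once per iteration.
    print(ai.generate_one().strip())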