NATURESPEAK-ML-UTTER/tokenise+train.py

import argparse, os, sys
from pathlib import Path
# from aitextgen.TokenDataset import TokenDataset
# from aitextgen.tokenizers import train_tokenizer
# from aitextgen.utils import GPT2ConfigCPU
# from aitextgen.utils import build_gpt2_config
# from aitextgen import aitextgen
# import tokenise as tk
# import train as tr
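
# Trains a tokenizer (encode) and a small GPT-2 model (train) on a text file
# or a directory of text files, using aitextgen. Results are written to a
# directory named after the corpus and the chosen hyperparameters.
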
def suffix(bs: int, ns: int, vs: int) -> str:
    return f"_bs={bs}_ns={ns}_vs={vs}"

def train(filepath: str, ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
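    """Build a small GPT-2 config and train an aitextgen model on filepath,
    using the *.tokenizer.json previously written to ouputdir by encode().
    Returns a status string."""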
    # from aitextgen.TokenDataset import TokenDataset
    # from transformers import GPT2Config
    from aitextgen.utils import build_gpt2_config
    from aitextgen import aitextgen

    # Locate the tokenizer that encode() wrote into the output directory.
    files = [x for x in ouputdir.glob('*') if x.name.endswith(".tokenizer.json")]
    print(files)
    if len(files) == 1:
        tok = str(files[0])
    else:
        return "No valid tokenizer in " + str(ouputdir)

    # Small GPT-2: 8 layers, 8 heads, 256-dim embeddings, context length = blocksize.
    config = build_gpt2_config(vocab_size=vocabsize, max_length=blocksize, dropout=0.0, n_embd=256, n_layer=8, n_head=8)

    # config = GPT2Config(
    #     vocab_size=vocabsize,
    #     n_positions=blocksize,
    #     n_ctx=blocksize,
    #     resid_pdrop=0.0,
    #     embd_pdrop=0.0,
    #     attn_pdrop=0.0,
    #     summary_first_dropout=0.0,
    #     bos_token_id=0,
    #     eos_token_id=0
    # )

    print(config)
    ai = aitextgen(config=config, tokenizer_file=tok, to_gpu=gpu)

    # data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)

    # Train on the raw text file; generate samples and save a checkpoint every 1000 steps.
    ai.train(filepath, output_dir=str(ouputdir), line_by_line=False, from_cache=False, learning_rate=1e-3, batch_size=256, num_steps=num_steps, generate_every=1000, save_every=1000)

    return "Done!"

def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, linebyline: bool, verbose: bool = False) -> str:
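    """Train a tokenizer on the file (or directory of files) at filepath and
    save it under ouputdir. Returns a status string."""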
    f_path = Path(filepath)
    if f_path.is_dir():
        text = [x for x in f_path.glob('*') if x.is_file()]
    elif f_path.is_file():
        text = str(f_path)
    else:
        return "text input is not valid"

    # from aitextgen.TokenDataset import TokenDataset
    from aitextgen.tokenizers import train_tokenizer

    #NOTE: vocab_size is fixed since this is not yet in train_tokenizer
    #see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
    fn = ouputdir / (f_path.name + f"_ns={vocabsize}")
    if type(text) is str:
        train_tokenizer(text, vocab_size=vocabsize, prefix=str(fn))
    else:
        train_tokenizer(files=[str(x) for x in text], vocab_size=vocabsize, prefix=str(fn))

    # tok_fn = str(fn) + ".tokenizer.json"

    # fnn = ouputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
    # dataset_fn = str(fnn) + ".tar.gz"

    # print(tok_fn)
    # print(dataset_fn)
    # if type(text) is str:
    #     data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
    # else:
    #     texts = [x.read_text() for x in text]
    #     data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
    # data.save(cache_destination=dataset_fn)

    return "encode success"

def main() -> int:
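    """Parse CLI arguments, prepare the output directory, then run encode() and train()."""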
    p = argparse.ArgumentParser()
    p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
    p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to the GPT-2 'max_length' config)")
    p.add_argument("-s", "--numsteps", type=int, default=8000)
    p.add_argument("-v", "--vocabsize", type=int, default=5000)
    p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
    p.add_argument("--gpu", action="store_true")
    p.add_argument("--line_by_line", action="store_true")
    args = p.parse_args()

    text = Path(args.text)
    if not text.exists():
        return args.text + " doesn't exist"

    # One output directory per (corpus, blocksize, numsteps, vocabsize) combination.
    output_dir = Path(args.ouputdir) / (text.name + suffix(args.blocksize, args.numsteps, args.vocabsize))
    if output_dir.is_dir():
        exts = ['.json', '.gz', '.bin']
        files = [x for x in output_dir.glob('*') if x.suffix in exts]
        if len(files) == 4:
            print("Token + model already exists > " + output_dir.name)
            q = input("Continue? [y/n]")
            if q != 'y':
                return "Nothing to do..."
    else:
        output_dir.mkdir()

    encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize, ouputdir=output_dir, linebyline=args.line_by_line)
    train(filepath=args.text, ouputdir=output_dir, blocksize=args.blocksize, vocabsize=args.vocabsize, num_steps=args.numsteps, gpu=args.gpu)
    return 0

if __name__ == '__main__':
    sys.exit(main())
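
# Example invocation (hypothetical corpus path; adjust to your own data):
#   python tokenise+train.py data/my_corpus.txt -b 64 -s 8000 -v 5000 --gpu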