diff --git a/.gitignore b/.gitignore
index 8959fc6..8d9ba71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 #macos
 .DS_store
+venv
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/tokenise+train.py b/tokenise+train.py
new file mode 100644
index 0000000..4a0f75d
--- /dev/null
+++ b/tokenise+train.py
@@ -0,0 +1,115 @@
+import argparse, os, sys
+from pathlib import Path
+# from aitextgen.TokenDataset import TokenDataset
+# from aitextgen.tokenizers import train_tokenizer
+# from aitextgen.utils import GPT2ConfigCPU
+# from aitextgen.utils import build_gpt2_config
+# from aitextgen import aitextgen
+
+# import tokenise as tk
+# import train as tr
+
+def suffix(bs: int, ns: int, vs: int) -> str:
+    return f"_bs={bs}_ns={ns}_vs={vs}"
+
+# trains a small GPT-2 (config from build_gpt2_config) on the cached tokenizer + dataset found in ouputdir
+def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
+
+    from aitextgen.TokenDataset import TokenDataset
+    from aitextgen.utils import build_gpt2_config
+    from aitextgen import aitextgen
+
+    exts = ['.json', '.gz']
+    files = [x for x in ouputdir.glob('*') if x.suffix in exts]
+    if len(files) == 2:
+        if files[0].suffix == '.json':
+            tok = str(files[0])
+            dat = str(files[1])
+        else:
+            tok = str(files[1])
+            dat = str(files[0])
+    else:
+        return "expected one tokenizer (.json) and one dataset (.gz) in " + str(ouputdir)
+
+    config = build_gpt2_config(vocab_size=vocabsize, max_length=blocksize)
+
+    ai = aitextgen(tokenizer_file=tok, config=config)
+
+    data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
+
+    ai.train(data, output_dir=str(ouputdir), batch_size=16, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4, to_gpu=gpu)
+
+    return "Done!"
+
+
+# trains a tokenizer on the input (a file or a directory of files) and saves an encoded TokenDataset cache in ouputdir
+def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbose: bool = False) -> str:
+
+    f_path = Path(filepath)
+
+    if f_path.is_dir():
+        text = [x for x in f_path.glob('*') if x.is_file()]
+    elif f_path.is_file():
+        text = str(f_path)
+    else:
+        return "text input is not valid"
+
+    from aitextgen.TokenDataset import TokenDataset
+    from aitextgen.tokenizers import train_tokenizer
+
+    #NOTE: train_tokenizer accepts vocab_size directly,
+    #see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
+
+    fn = ouputdir / (f_path.name + f"_ns={vocabsize}")
+    if type(text) is str:
+        train_tokenizer(text, vocab_size=vocabsize, prefix=str(fn))
+    else:
+        train_tokenizer(files=[str(x) for x in text], vocab_size=vocabsize, prefix=str(fn))
+    tok_fn = str(fn) + ".tokenizer.json"
+
+    fnn = ouputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
+    dataset_fn = str(fnn) + ".tar.gz"
+
+    print(tok_fn)
+    print(dataset_fn)
+
+    if type(text) is str:
+        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize)
+    else:
+        texts = [x.read_text() for x in text]
+        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize)
+    data.save(cache_destination=dataset_fn)
+
+    return "encode success"
+
+
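+# Example invocation (a sketch; 'data/corpus.txt' is a hypothetical input path, the flags
+# correspond to the argparse options defined in main() below):
+#
+#   python tokenise+train.py data/corpus.txt -b 64 -s 10000 -v 1000
+#
+# With these values, the tokenizer, the dataset cache and the model checkpoints end up in
+# data/tokens+models/corpus.txt_bs=64_ns=10000_vs=1000/
+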
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
+    p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to the GPT-2 'max_length' config)")
+    p.add_argument("-s", "--numsteps", type=int, default=10000)
+    p.add_argument("-v", "--vocabsize", type=int, default=1000)
+    p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
+    p.add_argument("--gpu", action="store_true")
+
+    args = p.parse_args()
+
+    text = Path(args.text)
+    if not text.exists():
+        print(args.text + " doesn't exist")
+        return 1
+
+    output_dir = Path(args.ouputdir + text.name + suffix(args.blocksize, args.numsteps, args.vocabsize))
+
+    if output_dir.is_dir():
+        exts = ['.json', '.gz', '.bin']
+        files = [x for x in output_dir.glob('*') if x.suffix in exts]
+        if len(files) == 3:
+            print("Token + model already exists > " + output_dir.name)
+            q = input("Continue? [y/n]")
+            if q != 'y':
+                print("Nothing to do...")
+                return 0
+    else:
+        output_dir.mkdir(parents=True)
+
+    print(encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize, ouputdir=output_dir))
+    print(train(ouputdir=output_dir, blocksize=args.blocksize, vocabsize=args.vocabsize, num_steps=args.numsteps, gpu=args.gpu))
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/tokenise.py b/tokenise.py
index 081d88e..5c6f99e 100644
--- a/tokenise.py
+++ b/tokenise.py
@@ -16,7 +16,7 @@ def encode(filepath: str, blocksize: int, ouputdir: str, verbose: bool = False)
     tok_fn = fn + ".tokenizer.json"
     fn_dest = fn + "_bs=" + str(blocksize) + ".tar.gz"
 
-    data = TokenDataset(file_path=filepath, tokenizer_file=tok_fn, block_size=blocksize)
+    data = TokenDataset(file_path=filepath, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
     data.save(cache_destination=fn_dest)
 
     return 0