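"""Tokenise and encode a text file with aitextgen.

Trains a tokenizer on the input file, then encodes the file into a compressed
TokenDataset cache (<basename>_bs=<blocksize>.tar.gz) under the output
directory, alongside the serialized tokenizer (<basename>.tokenizer.json).
"""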
import argparse, os, sys
def encode(filepath: str, blocksize: int, outputdir: str, verbose: bool = False) -> int:
    from aitextgen.TokenDataset import TokenDataset
    from aitextgen.tokenizers import train_tokenizer

    # Output file prefix: the input file's basename placed under outputdir,
    # e.g. data/tokens/corpus.txt (os.path.join handles a missing trailing slash)
    fn = os.path.join(outputdir, os.path.basename(filepath))
    os.makedirs(outputdir, exist_ok=True)
    # NOTE: vocab_size is fixed since this is not yet in train_tokenizer
    # see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
    train_tokenizer(filepath, prefix=fn)
    tok_fn = fn + ".tokenizer.json"
    fn_dest = fn + "_bs=" + str(blocksize) + ".tar.gz"
    data = TokenDataset(file_path=filepath, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
    data.save(cache_destination=fn_dest)
    if verbose:
        print(f"tokenizer: {tok_fn}\ndataset cache: {fn_dest}")
    return 0

def main() -> int:
    p = argparse.ArgumentParser()
    p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
    p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to GPT-2 'max_length' config)")
    p.add_argument("--outputdir", type=str, default="data/tokens/")
    p.add_argument("-v", "--verbose", action="store_true")
    args = p.parse_args()
    return encode(args.text, args.blocksize, args.outputdir, args.verbose)

if __name__ == '__main__':
    sys.exit(main())
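# Hypothetical invocation (the script filename "encode.py" and input file
# "corpus.txt" are assumed, not given in the source). With the defaults this
# writes data/tokens/corpus.txt.tokenizer.json and
# data/tokens/corpus.txt_bs=64.tar.gz:
#
#   python encode.py corpus.txt --blocksize 64
#
# Sketch of reloading the cache later, following aitextgen's documented
# from_cache pattern (path assumed from the example above):
#
#   from aitextgen.TokenDataset import TokenDataset
#   data = TokenDataset("data/tokens/corpus.txt_bs=64.tar.gz", from_cache=True)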