import argparse
import os
import sys


def encode(filepath: str, blocksize: int, outputdir: str, verbose: bool = False) -> int:
    from aitextgen.TokenDataset import TokenDataset
    from aitextgen.tokenizers import train_tokenizer

    # Ensure the output directory exists before the tokenizer/dataset files are written.
    os.makedirs(outputdir, exist_ok=True)
    fn = os.path.join(outputdir, os.path.basename(filepath))

    # NOTE: vocab_size is fixed since this is not yet configurable in train_tokenizer,
    # see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
    train_tokenizer(filepath, prefix=fn)
    tok_fn = fn + ".tokenizer.json"

    fn_dest = fn + "_bs=" + str(blocksize) + ".tar.gz"
    data = TokenDataset(file_path=filepath, tokenizer_file=tok_fn, block_size=blocksize)
    data.save(cache_destination=fn_dest)
    if verbose:
        print(f"tokenizer written to {tok_fn}")
        print(f"encoded dataset cached to {fn_dest}")
    return 0


def main() -> int:
    p = argparse.ArgumentParser()
    p.add_argument("text", type=str,
                   help="text file path to be tokenised and encoded")
    p.add_argument("-b", "--blocksize", type=int,
                   choices=[32, 64, 128, 256, 1024], default=64,
                   help="block size, default=64 (corresponds to GPT-2 'max_length' config)")
    p.add_argument("--outputdir", type=str, default="data/tokens/")
    p.add_argument("-v", "--verbose", action="store_true")
    args = p.parse_args()
    return encode(args.text, args.blocksize, args.outputdir, args.verbose)


if __name__ == '__main__':
    sys.exit(main())
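# Example invocation (a sketch; "encode.py" and the corpus path are hypothetical
# names, not taken from this file):
#
#   python encode.py data/corpus.txt -b 128 --outputdir data/tokens/
#
# This writes data/tokens/corpus.txt.tokenizer.json and the encoded dataset cache
# data/tokens/corpus.txt_bs=128.tar.gz. If the installed aitextgen version supports
# it, the cache can later be reloaded with TokenDataset(fn_dest, from_cache=True)
# instead of re-encoding the corpus.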