import argparse
import sys
from pathlib import Path

# from aitextgen.TokenDataset import TokenDataset
# from aitextgen.tokenizers import train_tokenizer
# from aitextgen.utils import GPT2ConfigCPU
# from aitextgen.utils import build_gpt2_config
# from aitextgen import aitextgen

# import tokenise as tk
# import train as tr


def suffix(bs: int, ns: int, vs: int) -> str:
    """Build the directory-name suffix from block size, number of steps and vocab size."""
    return f"_bs={bs}_ns={ns}_vs={vs}"


def train(filepath: str, outputdir: Path, blocksize: int, vocabsize: int,
          num_steps: int, gpu: bool = False) -> str:
    """Train a small GPT-2 model on `filepath`, using the tokenizer found in `outputdir`."""
    # from aitextgen.TokenDataset import TokenDataset
    # from transformers import GPT2Config
    from aitextgen.utils import build_gpt2_config
    from aitextgen import aitextgen

    files = [x for x in outputdir.glob('*') if x.name.endswith(".tokenizer.json")]
    print(files)
    if len(files) == 1:
        tok = str(files[0])
    else:
        return "No valid tokenizer in " + str(outputdir)

    config = build_gpt2_config(vocab_size=vocabsize, max_length=blocksize, dropout=0.0,
                               n_embd=256, n_layer=8, n_head=8)
    # config = GPT2Config(
    #     vocab_size=vocabsize,
    #     n_positions=blocksize,
    #     n_ctx=blocksize,
    #     resid_pdrop=0.0,
    #     embd_pdrop=0.0,
    #     attn_pdrop=0.0,
    #     summary_first_dropout=0.0,
    #     bos_token_id=0,
    #     eos_token_id=0
    # )
    print(config)

    ai = aitextgen(config=config, tokenizer_file=tok, to_gpu=gpu)

    # data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)

    ai.train(filepath, output_dir=str(outputdir), line_by_line=False, from_cache=False,
             learning_rate=1e-3, batch_size=256, num_steps=num_steps,
             generate_every=1000, save_every=1000)

    return "Done!"


def encode(filepath: str, blocksize: int, vocabsize: int, outputdir: Path,
           linebyline: bool, verbose: bool = False) -> str:
    """Train a tokenizer on `filepath` (a single file or a directory of files) and save it to `outputdir`."""
    f_path = Path(filepath)
    if f_path.is_dir():
        text = [x for x in f_path.glob('*') if x.is_file()]
    elif f_path.is_file():
        text = str(f_path)
    else:
        return "text input is not valid"

    # from aitextgen.TokenDataset import TokenDataset
    from aitextgen.tokenizers import train_tokenizer

    # NOTE: for the arguments supported by train_tokenizer, see
    # https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
    fn = outputdir / (f_path.name + f"_ns={vocabsize}")
    if isinstance(text, str):
        train_tokenizer(text, vocab_size=vocabsize, prefix=str(fn))
    else:
        train_tokenizer(files=[str(x) for x in text], vocab_size=vocabsize, prefix=str(fn))

    # tok_fn = str(fn) + ".tokenizer.json"
    # fnn = outputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
    # dataset_fn = str(fnn) + ".tar.gz"
    # print(tok_fn)
    # print(dataset_fn)
    # if type(text) is str:
    #     data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
    # else:
    #     texts = [x.read_text() for x in text]
    #     data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
    # data.save(cache_destination=dataset_fn)

    return "encode success"
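

# Illustrative sketch (not called by main()): once train() has written its files into the
# output directory, the checkpoint could be reloaded for sampling roughly like this. It
# assumes the aitextgen model_folder/tokenizer_file loading API and that the directory
# layout matches what ai.train() produced above; the helper name and its defaults are
# placeholders chosen here for illustration.
def generate_sample(outputdir: Path, prompt: str = "", n: int = 1, gpu: bool = False) -> None:
    from aitextgen import aitextgen

    # Reuse the same tokenizer lookup as train().
    tokenizers = [x for x in outputdir.glob('*') if x.name.endswith(".tokenizer.json")]
    if len(tokenizers) != 1:
        print("No valid tokenizer in " + str(outputdir))
        return
    # model_folder is expected to contain pytorch_model.bin and config.json,
    # as saved by ai.train(output_dir=...) above.
    ai = aitextgen(model_folder=str(outputdir), tokenizer_file=str(tokenizers[0]), to_gpu=gpu)
    ai.generate(n=n, prompt=prompt, max_length=256, temperature=1.0)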


def main():
    p = argparse.ArgumentParser()
    p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
    p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64,
                   help="block size, default=64 (corresponds to the GPT-2 'max_length' config)")
    p.add_argument("-s", "--numsteps", type=int, default=8000)
    p.add_argument("-v", "--vocabsize", type=int, default=5000)
    p.add_argument("--outputdir", type=str, default="data/tokens+models/")
    p.add_argument("--gpu", action="store_true")
    p.add_argument("--line_by_line", action="store_true")
    args = p.parse_args()

    text = Path(args.text)
    if not text.exists():
        return args.text + " doesn't exist"

    output_dir = Path(args.outputdir) / (text.name + suffix(args.blocksize, args.numsteps, args.vocabsize))
    if output_dir.is_dir():
        exts = ['.json', '.gz', '.bin']
        files = [x for x in output_dir.glob('*') if x.suffix in exts]
        # Heuristic: a completed run leaves four tokenizer/model files behind.
        if len(files) == 4:
            print("Token + model already exists > " + output_dir.name)
            q = input("Continue? [y/n] ")
            if q != 'y':
                return "Nothing to do..."
    else:
        output_dir.mkdir(parents=True)

    print(encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize,
                 outputdir=output_dir, linebyline=args.line_by_line))
    print(train(filepath=args.text, outputdir=output_dir, blocksize=args.blocksize,
                vocabsize=args.vocabsize, num_steps=args.numsteps, gpu=args.gpu))
    return 0


if __name__ == '__main__':
    # main() returns 0 on success or an error message string; sys.exit prints a
    # string argument to stderr and exits with status 1.
    sys.exit(main())
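
# Example invocation (illustrative; the script name and input path are placeholders):
#   python prepare_and_train.py data/corpus.txt -b 64 -s 8000 -v 5000 --outputdir data/tokens+models/ --gpu
# This trains a tokenizer on data/corpus.txt, then trains a small GPT-2 model,
# writing both into data/tokens+models/corpus.txt_bs=64_ns=8000_vs=5000/.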