tok+train
parent ca68d5712c
commit 04efe2cc4f
.gitignore (vendored): 1 addition
@@ -1,5 +1,6 @@
 #macos
 .DS_store
+venv
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
tokenise+train.py (new file): 115 additions
@@ -0,0 +1,115 @@
+import argparse, os, sys
+from pathlib import Path
+# from aitextgen.TokenDataset import TokenDataset
+# from aitextgen.tokenizers import train_tokenizer
+# from aitextgen.utils import GPT2ConfigCPU
+# from aitextgen.utils import build_gpt2_config
+# from aitextgen import aitextgen
+
+# import tokenise as tk
+# import train as tr
+
+
+def suffix(bs: int, ns: int, vs: int) -> str:
+    return f"_bs={bs}_ns={ns}_vs={vs}"
+
+
+def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
+
+    from aitextgen.TokenDataset import TokenDataset
+    from aitextgen.utils import build_gpt2_config
+    from aitextgen import aitextgen
+
+    # pick up the tokenizer (.json) and the cached dataset (.gz) written by encode()
+    exts = ['.json', '.gz']
+    files = [x for x in ouputdir.glob('*') if x.suffix in exts]
+    if len(files) == 2:
+        if files[0].suffix == '.json':
+            tok = str(files[0])
+            dat = str(files[1])
+        else:
+            tok = str(files[1])
+            dat = str(files[0])
+
+    config = build_gpt2_config(vocab_size=vocabsize, max_length=blocksize)
+
+    ai = aitextgen(tokenizer_file=tok, config=config)
+
+    data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
+
+    ai.train(data, output_dir=str(ouputdir), batch_size=16, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4, to_gpu=gpu)
+
+    return "Done!"
+
+
+def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbose: bool = False) -> str:
+
+    f_path = Path(filepath)
+
+    # accept either a single text file or a directory of text files
+    if f_path.is_dir():
+        text = [x for x in f_path.glob('*') if x.is_file()]
+    elif f_path.is_file():
+        text = str(f_path)
+    else:
+        return "text input is not valid"
+
+    from aitextgen.TokenDataset import TokenDataset
+    from aitextgen.tokenizers import train_tokenizer
+
+    # NOTE: vocab_size is fixed since this is not yet in train_tokenizer
+    # see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
+
+    fn = ouputdir / (f_path.name + f"_ns={vocabsize}")
+    if type(text) is str:
+        train_tokenizer(text, vocab_size=vocabsize, prefix=str(fn))
+    else:
+        train_tokenizer(files=[str(x) for x in text], vocab_size=vocabsize, prefix=str(fn))
+    tok_fn = str(fn) + ".tokenizer.json"
+
+    fnn = ouputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
+    dataset_fn = str(fnn) + ".tar.gz"
+
+    print(tok_fn)
+    print(dataset_fn)
+
+    if type(text) is str:
+        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize)
+    else:
+        texts = [x.read_text() for x in text]
+        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize)
+    data.save(cache_destination=dataset_fn)
+
+    return "encode success"
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
+    p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to the GPT-2 'max_length' config)")
+    p.add_argument("-s", "--numsteps", type=int, default=10000)
+    p.add_argument("-v", "--vocabsize", type=int, default=1000)
+    p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
+    p.add_argument("--gpu", action="store_true")
+
+    args = p.parse_args()
+
+    text = Path(args.text)
+    if not text.exists():
+        return args.text + " doesn't exist"
+
+    output_dir = Path(args.ouputdir + text.name + suffix(args.blocksize, args.numsteps, args.vocabsize))
+
+    # skip straight to the prompt if a tokenizer (.json), dataset (.gz) and checkpoint (.bin) already exist
+    if output_dir.is_dir():
+        exts = ['.json', '.gz', '.bin']
+        files = [x for x in output_dir.glob('*') if x.suffix in exts]
+        if len(files) == 3:
+            print("Token + model already exists > " + output_dir.name)
+            q = input("Continue? [y/n]")
+            if q != 'y':
+                return "Nothing to do..."
+    else:
+        output_dir.mkdir()
+
+    encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize, ouputdir=output_dir)
+    train(ouputdir=output_dir, blocksize=args.blocksize, vocabsize=args.vocabsize, num_steps=args.numsteps, gpu=args.gpu)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
@@ -16,7 +16,7 @@ def encode(filepath: str, blocksize: int, ouputdir: str, verbose: bool = False)
     tok_fn = fn + ".tokenizer.json"
     fn_dest = fn + "_bs=" + str(blocksize) + ".tar.gz"
 
-    data = TokenDataset(file_path=filepath, tokenizer_file=tok_fn, block_size=blocksize)
+    data = TokenDataset(file_path=filepath, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
     data.save(cache_destination=fn_dest)
 
     return 0
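For reference, a minimal sketch of driving the two new helpers directly from Python instead of through the CLI. The corpus path and hyperparameters below are illustrative assumptions, not values from this commit; importlib is used only because the '+' in the filename tokenise+train.py rules out a plain import.

# Illustrative only: load tokenise+train.py and run encode() + train() by hand.
# "data/corpus.txt" and the settings below are made-up examples.
import importlib.util
from pathlib import Path

spec = importlib.util.spec_from_file_location("tok_train", "tokenise+train.py")
tok_train = importlib.util.module_from_spec(spec)
spec.loader.exec_module(tok_train)  # defines the functions; main() stays behind the __main__ guard

corpus = "data/corpus.txt"  # hypothetical input file
out = Path("data/tokens+models/corpus" + tok_train.suffix(64, 10000, 1000))
out.mkdir(parents=True, exist_ok=True)

print(tok_train.encode(filepath=corpus, blocksize=64, vocabsize=1000, ouputdir=out))
print(tok_train.train(ouputdir=out, blocksize=64, vocabsize=1000, num_steps=10000, gpu=False))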