From 9b0a4b5720771f2865fe462e6dc03f3db40c27a9 Mon Sep 17 00:00:00 2001
From: gauthiier
Date: Thu, 10 Feb 2022 13:34:20 +0100
Subject: [PATCH] revert demo

---
 tokenise+train.py | 67 ++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/tokenise+train.py b/tokenise+train.py
index a299724..8b2defb 100644
--- a/tokenise+train.py
+++ b/tokenise+train.py
@@ -12,11 +12,11 @@ from pathlib import Path
 def suffix(bs: int, ns: int, vs: int) -> str:
     return f"_bs={bs}_ns={ns}_vs={vs}"
 
-def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
+def train(filepath: str, ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
 
-    from aitextgen.TokenDataset import TokenDataset
-    from transformers import GPT2Config
-    # from aitextgen.utils import build_gpt2_config
+    # from aitextgen.TokenDataset import TokenDataset
+    # from transformers import GPT2Config
+    from aitextgen.utils import build_gpt2_config
     from aitextgen import aitextgen
 
     exts = ['.json', '.gz']
@@ -32,27 +32,27 @@ def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: b
     tok = str(files[1])
     dat = str(files[0])
 
-    # config = build_gpt2_config(vocab_size=vocabsize, max_lenght=blocksize)
+    config = build_gpt2_config(vocab_size=vocabsize, max_lenght=blocksize, dropout=0.0, n_embd=256, n_layer=8, n_head=8)
 
-    config = GPT2Config(
-        vocab_size=vocabsize,
-        n_positions=blocksize,
-        n_ctx=blocksize,
-        resid_pdrop=0.0,
-        embd_pdrop=0.0,
-        attn_pdrop=0.0,
-        summary_first_dropout=0.0,
-        bos_token_id=0,
-        eos_token_id=0
-    )
+    # config = GPT2Config(
+    #     vocab_size=vocabsize,
+    #     n_positions=blocksize,
+    #     n_ctx=blocksize,
+    #     resid_pdrop=0.0,
+    #     embd_pdrop=0.0,
+    #     attn_pdrop=0.0,
+    #     summary_first_dropout=0.0,
+    #     bos_token_id=0,
+    #     eos_token_id=0
+    # )
 
     print(config)
 
-    ai = aitextgen(tokenizer_file=tok, config=config)
+    ai = aitextgen(config=config, tokenizer_file=tok, to_gpu=gpu)
 
-    data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
+    # data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
 
-    ai.train(data, output_dir=str(ouputdir), batch_size=16, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4, to_gpu=gpu)
+    ai.train(filepath, output_dir=str(ouputdir), line_by_line=False, from_cache=False, learning_rate=1e-3, batch_size=256, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4)
 
     return "Done!"
 
@@ -68,7 +68,7 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, lineby
     else:
         return "text input is not valid"
 
-    from aitextgen.TokenDataset import TokenDataset
+    # from aitextgen.TokenDataset import TokenDataset
     from aitextgen.tokenizers import train_tokenizer
 
     #NOTE: vocab_size is fixed since this is not yet in train_tokenizer
@@ -79,20 +79,21 @@
         train_tokenizer(text, vocab_size=vocabsize, prefix=str(fn))
     else:
         train_tokenizer(files=[str(x) for x in text], vocab_size=vocabsize, prefix=str(fn))
-    tok_fn = str(fn) + ".tokenizer.json"
 
-    fnn = ouputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
-    dataset_fn = str(fnn) + ".tar.gz"
+    # tok_fn = str(fn) + ".tokenizer.json"
 
-    print(tok_fn)
-    print(dataset_fn)
+    # fnn = ouputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
+    # dataset_fn = str(fnn) + ".tar.gz"
 
-    if type(text) is str:
-        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
-    else:
-        texts = [x.read_text() for x in text]
-        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
-    data.save(cache_destination=dataset_fn)
+    # print(tok_fn)
+    # print(dataset_fn)
+
+    # if type(text) is str:
+    #     data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
+    # else:
+    #     texts = [x.read_text() for x in text]
+    #     data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
+    # data.save(cache_destination=dataset_fn)
 
     return "encode success"
 
@@ -101,8 +102,8 @@ def main() -> int:
     p = argparse.ArgumentParser()
     p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
     p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to GPT-2 'max_lenght' config)")
-    p.add_argument("-s", "--numsteps", type=int, default=10000)
-    p.add_argument("-v", "--vocabsize", type=int, default=1000)
+    p.add_argument("-s", "--numsteps", type=int, default=8000)
+    p.add_argument("-v", "--vocabsize", type=int, default=5000)
     p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
     p.add_argument("--gpu", action="store_true")
     p.add_argument("--line_by_line", action="store_true")
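
Below is a minimal, self-contained sketch of the training path this patch reverts to, assembled only from the aitextgen calls visible in the diff above. The tokenizer and corpus file names are illustrative assumptions (tokenise+train.py derives them from its command-line arguments and the generated tokenizer files), and the hyperparameter values mirror the script's defaults after this patch.

    from aitextgen.utils import build_gpt2_config
    from aitextgen import aitextgen

    # Assumed paths for illustration only; the real script resolves these
    # from its "text" argument and the *.tokenizer.json file it produces.
    tokenizer_file = "data/tokens+models/example.tokenizer.json"
    corpus_file = "corpus.txt"

    # Same small-GPT-2 config call as the restored train()
    # ("max_lenght" is spelled as it appears in the patch).
    config = build_gpt2_config(vocab_size=5000, max_lenght=64, dropout=0.0,
                               n_embd=256, n_layer=8, n_head=8)

    ai = aitextgen(config=config, tokenizer_file=tokenizer_file, to_gpu=False)

    # Train straight from the raw text file rather than a cached TokenDataset,
    # matching the ai.train(...) call reinstated by this patch.
    ai.train(corpus_file, output_dir="data/tokens+models/", line_by_line=False,
             from_cache=False, learning_rate=1e-3, batch_size=256,
             num_steps=8000, generate_every=1000, save_every=1000,
             num_workers=4)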