revert demo

gauthiier 2022-02-10 13:34:20 +01:00
parent e0dada02f0
commit 9b0a4b5720


@@ -12,11 +12,11 @@ from pathlib import Path
 def suffix(bs: int, ns: int, vs: int) -> str:
     return f"_bs={bs}_ns={ns}_vs={vs}"
-def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
-    from aitextgen.TokenDataset import TokenDataset
-    from transformers import GPT2Config
-    # from aitextgen.utils import build_gpt2_config
+def train(filepath: str, ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
+    # from aitextgen.TokenDataset import TokenDataset
+    # from transformers import GPT2Config
+    from aitextgen.utils import build_gpt2_config
     from aitextgen import aitextgen
     exts = ['.json', '.gz']
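
Note: the restored train() again takes the raw text file path as its first argument instead of relying only on a pre-encoded dataset under ouputdir. A hypothetical call matching the restored signature (all paths and values are placeholders, mirroring the argparse defaults further below):

    from pathlib import Path

    # Hypothetical invocation of the restored train(); arguments are placeholders.
    train(
        "data/corpus.txt",            # filepath: raw text handed straight to ai.train() below
        Path("data/tokens+models/"),  # ouputdir (spelled as in the script)
        blocksize=64,
        vocabsize=5000,
        num_steps=8000,
        gpu=True,
    )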
@@ -32,27 +32,27 @@ def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: b
     tok = str(files[1])
     dat = str(files[0])
-    # config = build_gpt2_config(vocab_size=vocabsize, max_lenght=blocksize)
-    config = GPT2Config(
-        vocab_size=vocabsize,
-        n_positions=blocksize,
-        n_ctx=blocksize,
-        resid_pdrop=0.0,
-        embd_pdrop=0.0,
-        attn_pdrop=0.0,
-        summary_first_dropout=0.0,
-        bos_token_id=0,
-        eos_token_id=0
-    )
+    config = build_gpt2_config(vocab_size=vocabsize, max_lenght=blocksize, dropout=0.0, n_embd=256, n_layer=8, n_head=8)
+    # config = GPT2Config(
+    #     vocab_size=vocabsize,
+    #     n_positions=blocksize,
+    #     n_ctx=blocksize,
+    #     resid_pdrop=0.0,
+    #     embd_pdrop=0.0,
+    #     attn_pdrop=0.0,
+    #     summary_first_dropout=0.0,
+    #     bos_token_id=0,
+    #     eos_token_id=0
+    # )
     print(config)
-    ai = aitextgen(tokenizer_file=tok, config=config)
-    data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
-    ai.train(data, output_dir=str(ouputdir), batch_size=16, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4, to_gpu=gpu)
+    ai = aitextgen(config=config, tokenizer_file=tok, to_gpu=gpu)
+    # data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
+    ai.train(filepath, output_dir=str(ouputdir), line_by_line=False, from_cache=False, learning_rate=1e-3, batch_size=256, num_steps=num_steps, generate_every=1000, save_every=1000, num_workers=4)
     return "Done!"
@@ -68,7 +68,7 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, lineby
     else:
         return "text input is not valid"
-    from aitextgen.TokenDataset import TokenDataset
+    # from aitextgen.TokenDataset import TokenDataset
     from aitextgen.tokenizers import train_tokenizer
     #NOTE: vocab_size is fixed since this is not yet in train_tokenizer
@@ -79,20 +79,21 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, lineby
         train_tokenizer(text, vocab_size=vocabsize, prefix=str(fn))
     else:
         train_tokenizer(files=[str(x) for x in text], vocab_size=vocabsize, prefix=str(fn))
-    tok_fn = str(fn) + ".tokenizer.json"
-    fnn = ouputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
-    dataset_fn = str(fnn) + ".tar.gz"
-    print(tok_fn)
-    print(dataset_fn)
-    if type(text) is str:
-        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
-    else:
-        texts = [x.read_text() for x in text]
-        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
-    data.save(cache_destination=dataset_fn)
+    # tok_fn = str(fn) + ".tokenizer.json"
+    # fnn = ouputdir / (f_path.name + f"_bs={blocksize}_ns={vocabsize}")
+    # dataset_fn = str(fnn) + ".tar.gz"
+    # print(tok_fn)
+    # print(dataset_fn)
+    # if type(text) is str:
+    #     data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
+    # else:
+    #     texts = [x.read_text() for x in text]
+    #     data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
+    # data.save(cache_destination=dataset_fn)
     return "encode success"
@@ -101,8 +102,8 @@ def main() -> int:
     p = argparse.ArgumentParser()
     p.add_argument("text", type=str, help="text file path to be tokenised and encoded")
     p.add_argument("-b", "--blocksize", type=int, choices=[32, 64, 128, 256, 1024], default=64, help="block size, default=64 (corresponds to GPT-2 'max_lenght' config)")
-    p.add_argument("-s", "--numsteps", type=int, default=10000)
-    p.add_argument("-v", "--vocabsize", type=int, default=1000)
+    p.add_argument("-s", "--numsteps", type=int, default=8000)
+    p.add_argument("-v", "--vocabsize", type=int, default=5000)
    p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
    p.add_argument("--gpu", action="store_true")
    p.add_argument("--line_by_line", action="store_true")