From aff1c17d2ef1b47c34e183414f23c4b807ce89ee Mon Sep 17 00:00:00 2001 From: gauthiier Date: Wed, 9 Feb 2022 18:52:17 +0100 Subject: [PATCH] GPT2Config --- tokenise+train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tokenise+train.py b/tokenise+train.py index cf59b64..d83208f 100644 --- a/tokenise+train.py +++ b/tokenise+train.py @@ -15,7 +15,8 @@ def suffix(bs: int, ns: int, vs: int) -> str: def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str: from aitextgen.TokenDataset import TokenDataset - from aitextgen.utils import build_gpt2_config + from transformers import GPT2Config + # from aitextgen.utils import build_gpt2_config from aitextgen import aitextgen exts = ['.json', '.gz'] @@ -65,8 +66,7 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbos return "text input is not valid" from aitextgen.TokenDataset import TokenDataset - from aitextgen.tokenizers import train_tokenizer - from transformers import GPT2Config + from aitextgen.tokenizers import train_tokenizer #NOTE: vocab_size is fixed since this is not yet in train_tokenizer #see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py