diff --git a/tokenise+train.py b/tokenise+train.py
index cf59b64..d83208f 100644
--- a/tokenise+train.py
+++ b/tokenise+train.py
@@ -15,7 +15,8 @@ def suffix(bs: int, ns: int, vs: int) -> str:
 
 def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
     from aitextgen.TokenDataset import TokenDataset
-    from aitextgen.utils import build_gpt2_config
+    from transformers import GPT2Config
+    # from aitextgen.utils import build_gpt2_config
     from aitextgen import aitextgen
 
     exts = ['.json', '.gz']
@@ -65,8 +66,7 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbos
         return "text input is not valid"
 
     from aitextgen.TokenDataset import TokenDataset
-    from aitextgen.tokenizers import train_tokenizer
-    from transformers import GPT2Config
+    from aitextgen.tokenizers import train_tokenizer
     #NOTE: vocab_size is fixed since this is not yet in train_tokenizer
     #see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py
 