GPT2Config

This commit is contained in:
gauthiier 2022-02-09 18:52:17 +01:00
parent 240e9c4535
commit aff1c17d2e

View File

@@ -15,7 +15,8 @@ def suffix(bs: int, ns: int, vs: int) -> str:
def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str: def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: bool = False) -> str:
from aitextgen.TokenDataset import TokenDataset from aitextgen.TokenDataset import TokenDataset
from aitextgen.utils import build_gpt2_config from transformers import GPT2Config
# from aitextgen.utils import build_gpt2_config
from aitextgen import aitextgen from aitextgen import aitextgen
exts = ['.json', '.gz'] exts = ['.json', '.gz']
@@ -66,7 +67,6 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbos
from aitextgen.TokenDataset import TokenDataset from aitextgen.TokenDataset import TokenDataset
from aitextgen.tokenizers import train_tokenizer from aitextgen.tokenizers import train_tokenizer
from transformers import GPT2Config
#NOTE: vocab_size is fixed since this is not yet in train_tokenizer #NOTE: vocab_size is fixed since this is not yet in train_tokenizer
#see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py #see https://github.com/minimaxir/aitextgen/blob/master/aitextgen/tokenizers.py