diff --git a/tokenise+train.py b/tokenise+train.py
index 7d8f950..51ec87e 100644
--- a/tokenise+train.py
+++ b/tokenise+train.py
@@ -30,6 +30,8 @@ def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: b
 
     config = build_gpt2_config(vocab_size=vocabsize, max_lenght=blocksize)
 
+    print(config)
+
     ai = aitextgen(tokenizer_file=tok, config=config)
 
     data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True)
@@ -70,10 +72,10 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbos
     print(dataset_fn)
 
     if type(text) is str:
-        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize)
+        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
     else:
         texts = [x.read_text() for x in text]
-        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize)
+        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
 
     data.save(cache_destination=dataset_fn)
     return "encode success"