From 58dd03ba7304b39a8d9e21a2442c190178eaf04a Mon Sep 17 00:00:00 2001 From: gauthiier Date: Wed, 9 Feb 2022 18:42:32 +0100 Subject: [PATCH] line_by_line --- tokenise+train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tokenise+train.py b/tokenise+train.py index 7d8f950..51ec87e 100644 --- a/tokenise+train.py +++ b/tokenise+train.py @@ -30,6 +30,8 @@ def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: b config = build_gpt2_config(vocab_size=vocabsize, max_lenght=blocksize) + print(config) + ai = aitextgen(tokenizer_file=tok, config=config) data = TokenDataset(dat, tokenizer_file=tok, block_size=blocksize, from_cache=True) @@ -70,10 +72,10 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbos print(dataset_fn) if type(text) is str: - data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize) + data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True) else: texts = [x.read_text() for x in text] - data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize) + data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True) data.save(cache_destination=dataset_fn) return "encode success"