revert demo

2022-02-10 13:46:30 +01:00 · 2022-02-10 13:46:30 +01:00 · 0f5f06359c
commit 0f5f06359c
parent d62e500c17
1 changed file with 5 additions and 9 deletions
--- a/tokenise+train.py
+++ b/tokenise+train.py
@@ -19,18 +19,14 @@ def train(filepath: str, ouputdir: Path, blocksize: int, vocabsize: int, num_ste
 	from aitextgen.utils import build_gpt2_config
 	from aitextgen import aitextgen
-	exts = ['.json', '.gz']
+	files = [x for x in ouputdir.glob('*') if x.name.endswith(".tokenizer.json")]
 	files = [x for x in ouputdir.glob('*') if x.suffix in exts and x.name != "config.json"]
 	print(files)
-	if len(files) == 2:
+	if len(files) == 1:
 		if files[0].suffix == '.json':
 		tok = str(files[0])
 			dat = str(files[1])
 	else:
-			tok = str(files[1])
+		return "No valid tokenizer in " + str(ouputdir)
 			dat = str(files[0])
 	config = build_gpt2_config(vocab_size=vocabsize, max_lenght=blocksize, dropout=0.0, n_embd=256, n_layer=8, n_head=8)