diff --git a/tokenise+train.py b/tokenise+train.py
index 5c99e73..a299724 100644
--- a/tokenise+train.py
+++ b/tokenise+train.py
@@ -57,7 +57,7 @@ def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: b
     return "Done!"


-def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbose: bool = False) -> str:
+def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, linebyline: bool, verbose: bool = False) -> str:

     f_path = Path(filepath)

@@ -88,10 +88,10 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbos
         print(dataset_fn)

     if type(text) is str:
-        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
+        data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
     else:
         texts = [x.read_text() for x in text]
-        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
+        data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)

     data.save(cache_destination=dataset_fn)
     return "encode success"

@@ -105,6 +105,7 @@ def main() -> int:
     p.add_argument("-v", "--vocabsize", type=int, default=1000)
     p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
     p.add_argument("--gpu", action="store_true")
+    p.add_argument("--line_by_line", action="store_true")

     args = p.parse_args()

@@ -125,7 +126,7 @@ def main() -> int:
     else:
         output_dir.mkdir()

-    encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize, ouputdir=output_dir)
+    encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize, ouputdir=output_dir, linebyline=args.line_by_line)

     train(ouputdir=output_dir, blocksize=args.blocksize, vocabsize=args.vocabsize, num_steps=args.numsteps, gpu=args.gpu)
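
For context (not part of the patch itself): in aitextgen's TokenDataset, line_by_line=True tokenizes each line of the input file as an independent sample, whereas line_by_line=False treats the whole file as one continuous token stream; the patch exposes that choice via a CLI flag instead of hardcoding True. A minimal sketch mirroring the call sites above, with hypothetical file paths:

    # Sketch only: mirrors the TokenDataset call sites in the patch.
    # "corpus.txt" and "aitextgen.tokenizer.json" are placeholder paths.
    from aitextgen.TokenDataset import TokenDataset

    # One sample per line (suited to short, independent texts):
    per_line = TokenDataset(file_path="corpus.txt",
                            tokenizer_file="aitextgen.tokenizer.json",
                            block_size=64, line_by_line=True)

    # Whole file as a single token stream (suited to long-form prose):
    continuous = TokenDataset(file_path="corpus.txt",
                              tokenizer_file="aitextgen.tokenizer.json",
                              block_size=64, line_by_line=False)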