Make line-by-line dataset encoding optional (new --line_by_line flag; previously always on)
This commit is contained in:
parent
b8d6a33bf8
commit
e0dada02f0
@ -57,7 +57,7 @@ def train(ouputdir: Path, blocksize: int, vocabsize: int, num_steps: int, gpu: b
|
|||||||
return "Done!"
|
return "Done!"
|
||||||
|
|
||||||
|
|
||||||
def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbose: bool = False) -> str:
|
def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, linebyline: bool, verbose: bool = False) -> str:
|
||||||
|
|
||||||
f_path = Path(filepath)
|
f_path = Path(filepath)
|
||||||
|
|
||||||
@ -88,10 +88,10 @@ def encode(filepath: str, blocksize: int, vocabsize: int, ouputdir: Path, verbos
|
|||||||
print(dataset_fn)
|
print(dataset_fn)
|
||||||
|
|
||||||
if type(text) is str:
|
if type(text) is str:
|
||||||
data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
|
data = TokenDataset(file_path=text, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
|
||||||
else:
|
else:
|
||||||
texts = [x.read_text() for x in text]
|
texts = [x.read_text() for x in text]
|
||||||
data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=True)
|
data = TokenDataset(texts=texts, tokenizer_file=tok_fn, block_size=blocksize, line_by_line=linebyline)
|
||||||
data.save(cache_destination=dataset_fn)
|
data.save(cache_destination=dataset_fn)
|
||||||
|
|
||||||
return "encode success"
|
return "encode success"
|
||||||
@ -105,6 +105,7 @@ def main() -> int:
|
|||||||
p.add_argument("-v", "--vocabsize", type=int, default=1000)
|
p.add_argument("-v", "--vocabsize", type=int, default=1000)
|
||||||
p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
|
p.add_argument("--ouputdir", type=str, default="data/tokens+models/")
|
||||||
p.add_argument("--gpu", action="store_true")
|
p.add_argument("--gpu", action="store_true")
|
||||||
|
p.add_argument("--line_by_line", action="store_true")
|
||||||
|
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
@ -125,7 +126,7 @@ def main() -> int:
|
|||||||
else:
|
else:
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
|
|
||||||
encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize, ouputdir=output_dir)
|
encode(filepath=args.text, blocksize=args.blocksize, vocabsize=args.vocabsize, ouputdir=output_dir, linebyline=args.line_by_line)
|
||||||
train(ouputdir=output_dir, blocksize=args.blocksize, vocabsize=args.vocabsize, num_steps=args.numsteps, gpu=args.gpu)
|
train(ouputdir=output_dir, blocksize=args.blocksize, vocabsize=args.vocabsize, num_steps=args.numsteps, gpu=args.gpu)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user