2022-03-06 14:20:53 +01:00

43 lines
920 B
Python

import pathlib
import gensim
# Directory whose immediate sub-directories contain the *.txt files to train on.
TEXTS_DIR = '../data/texts/'
# Sub-directories of TEXTS_DIR (compared by their stem) that are left out of the corpus.
EXCLUDE = ['Wellcome_TALKS-ABOUT-FLOWERS']
# File the trained Doc2Vec model is saved to.
OUTPUT = '../data/models/doc2vec.model'


def _collect_text_files(texts_dir, exclude):
    """Return the ``*.txt`` files found in the sub-directories of *texts_dir*.

    Args:
        texts_dir: Path (str or ``pathlib.Path``) of the directory to scan.
        exclude: Collection of directory stems to skip.

    Returns:
        A list of ``pathlib.Path`` objects, ordered by directory and then by
        file name.

    Raises:
        OSError: If *texts_dir* does not exist or cannot be listed.
    """
    root = pathlib.Path(texts_dir)
    # iterdir()/glob() yield entries in arbitrary, filesystem-dependent order.
    # Documents are tagged by their position in this list, so sort to make the
    # tag -> file mapping reproducible. Anything that maps tags back to files
    # must enumerate them in this same sorted order.
    folders = sorted(
        d for d in root.iterdir() if d.is_dir() and d.stem not in exclude
    )
    return [f for d in folders for f in sorted(d.glob('*.txt'))]


def _build_corpus(text_files):
    """Tokenise each file into a ``TaggedDocument`` tagged with its list index.

    Args:
        text_files: Iterable of ``pathlib.Path`` objects pointing at text files.

    Returns:
        A list of ``gensim.models.doc2vec.TaggedDocument``; the i-th document
        carries the single tag ``i``.
    """
    corpus = []
    for tag, path in enumerate(text_files):
        # Read as UTF-8 explicitly instead of relying on the platform default.
        text = path.read_text(encoding='utf-8')
        tokens = gensim.utils.simple_preprocess(text, max_len=25)
        corpus.append(
            gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[tag])
        )
    return corpus


def main():
    """Build the corpus from TEXTS_DIR, train a Doc2Vec model, save it to OUTPUT."""
    print("1. Building corpus")
    tagged_corpus = _build_corpus(_collect_text_files(TEXTS_DIR, EXCLUDE))
    if not tagged_corpus:
        # Stop here with a clear message rather than failing inside gensim.
        raise SystemExit(
            "No .txt files found under {!r}; nothing to train on".format(TEXTS_DIR)
        )

    print("2. Building vocabulary")
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(tagged_corpus)

    print("3. Training")
    model.train(tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    print("4. Saving")
    # Make sure the destination directory exists so the trained model is not
    # lost to a missing folder at the very last step.
    pathlib.Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)
    model.save(OUTPUT)


if __name__ == '__main__':
    main()