"""Train a gensim Doc2Vec model on a directory tree of plain-text files.

Every ``*.txt`` file found directly inside each sub-directory of
``TEXTS_DIR`` (except the directories named in ``EXCLUDE``) becomes one
tagged document.  The trained model is written to ``OUTPUT``.
"""

import pathlib

import gensim

TEXTS_DIR = '../data/texts/'
EXCLUDE = ['Wellcome_TALKS-ABOUT-FLOWERS']
OUTPUT = '../data/models/doc2vec.model'


def build_corpus(texts_dir=TEXTS_DIR, exclude=EXCLUDE):
    """Read the text files and return them as a list of TaggedDocument.

    Args:
        texts_dir: Directory whose immediate sub-directories hold the
            ``*.txt`` documents.
        exclude: Sub-directory stems to skip.

    Returns:
        A list of ``gensim.models.doc2vec.TaggedDocument``; each document is
        tagged with a single integer equal to its position in the list.
    """
    # NOTE(review): tags follow the order in which iterdir()/glob() yield
    # entries, which the OS does not guarantee. Anything that maps tags back
    # to files must enumerate them the same way -- confirm against consumers
    # before introducing sorting here.
    directories = [
        d for d in pathlib.Path(texts_dir).iterdir()
        if d.is_dir() and d.stem not in exclude
    ]

    tagged_corpus = []
    for directory in directories:
        print(" " + str(directory))
        for text_file in directory.glob('*.txt'):
            # Decode explicitly so the result does not depend on the platform
            # locale; undecodable bytes are replaced rather than aborting the
            # whole run on a single bad file.
            text = text_file.read_text(encoding='utf-8', errors='replace')
            tokens = gensim.utils.simple_preprocess(text, max_len=25)
            # The next free index doubles as the document's tag.
            tag = len(tagged_corpus)
            tagged_corpus.append(
                gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[tag])
            )
    return tagged_corpus


def main():
    """Build the corpus, train the Doc2Vec model and save it to ``OUTPUT``."""
    print("1. Building corpus")
    tagged_corpus = build_corpus()
    if not tagged_corpus:
        # Fail with a clear message instead of an error from inside gensim.
        raise SystemExit("No .txt documents found under " + TEXTS_DIR)

    # Create the destination directory up front so that a missing path is
    # not discovered only after training has finished.
    pathlib.Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)

    print("2. Building vocabulary")
    model = gensim.models.doc2vec.Doc2Vec(
        vector_size=50, min_count=2, epochs=40
    )
    model.build_vocab(tagged_corpus)

    print("3. Training")
    model.train(
        tagged_corpus,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    print("4. Saving")
    model.save(OUTPUT)


if __name__ == '__main__':
    main()