"""Build a tagged corpus from text files, train a gensim Doc2Vec model, and save it."""
import pathlib
|
|
import gensim
|
|
|
|
# Directory whose immediate subdirectories hold the ``*.txt`` source texts.
# Relative paths are resolved against the current working directory.
TEXTS_DIR = '../data/texts/'

# Subdirectories to skip when building the corpus; entries are compared
# against each directory's ``Path.stem``.
EXCLUDE = ['Wellcome_TALKS-ABOUT-FLOWERS']

# File the trained Doc2Vec model is written to.
OUTPUT = '../data/models/doc2vec.model'
|
|
|
|
def collect_text_files(texts_dir, exclude):
    """Return the ``.txt`` files of every non-excluded subdirectory.

    Both the subdirectories and the files inside each one are sorted, so the
    result (and therefore the integer tags assigned to documents later) does
    not depend on the order in which the operating system lists entries.

    Args:
        texts_dir: Directory whose immediate subdirectories are scanned.
        exclude: Iterable of subdirectory stems (``Path.stem``) to skip.

    Returns:
        A list of ``(directory, files)`` tuples, where ``directory`` is a
        ``pathlib.Path`` and ``files`` is the sorted list of ``*.txt`` paths
        directly inside it. Entries of ``texts_dir`` that are not directories
        are ignored.
    """
    excluded = set(exclude)
    return [
        (directory, sorted(directory.glob('*.txt')))
        for directory in sorted(pathlib.Path(texts_dir).iterdir())
        if directory.is_dir() and directory.stem not in excluded
    ]


if __name__ == '__main__':

    print("1. Building corpus")

    tagged_corpus = []
    for directory, files in collect_text_files(TEXTS_DIR, EXCLUDE):
        print(" " + str(directory))
        for path in files:
            # Explicit encoding so the result does not depend on the platform
            # default. NOTE(review): assumes the corpus is UTF-8 -- confirm.
            with path.open(encoding='utf-8') as fp:
                text = fp.read()

            tokens = gensim.utils.simple_preprocess(text, max_len=25)

            # Each document is tagged with its running index: the number of
            # documents already collected, i.e. 0, 1, 2, ... across all dirs.
            tag_doc = gensim.models.doc2vec.TaggedDocument(
                words=tokens, tags=[len(tagged_corpus)])
            tagged_corpus.append(tag_doc)

    if not tagged_corpus:
        # Fail with a clear message instead of an error from inside gensim.
        raise SystemExit("No .txt files found under " + TEXTS_DIR)

    print("2. Building vocabulary")
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(tagged_corpus)

    print("3. Training")
    model.train(tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    print("4. Saving")
    # Make sure the target folder exists so a finished training run is not
    # lost to a missing directory.
    pathlib.Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)
    model.save(OUTPUT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|