examine + new speak

This commit is contained in:
Gauthier
2022-03-06 14:20:53 +01:00
parent abf2f2f720
commit 11201664b8
13 changed files with 284 additions and 89 deletions
View File
+42
View File
@@ -0,0 +1,42 @@
import pathlib
import gensim
# Root directory scanned for sub-directories of ``*.txt`` files.
TEXTS_DIR = '../data/texts/'
# Sub-directory stems that are left out of the training corpus.
EXCLUDE = ['Wellcome_TALKS-ABOUT-FLOWERS']
# Destination file of the trained Doc2Vec model.
OUTPUT = '../data/models/doc2vec.model'


def _build_corpus(texts_dir, exclude):
    """Return one TaggedDocument per ``*.txt`` file found under ``texts_dir``.

    Only immediate sub-directories of ``texts_dir`` whose stem is not listed
    in ``exclude`` are scanned. Each file is tokenised with
    ``gensim.utils.simple_preprocess(..., max_len=25)`` and tagged with a
    consecutive integer (0, 1, 2, ...) in discovery order.
    """
    tagged_document = gensim.models.doc2vec.TaggedDocument
    directories = [
        d for d in pathlib.Path(texts_dir).iterdir()
        if d.is_dir() and d.stem not in exclude
    ]
    tagged_corpus = []
    for directory in directories:
        for text_file in directory.glob('*.txt'):
            # NOTE(review): files are decoded with the platform default
            # encoding -- confirm this matches the corpus encoding.
            with text_file.open() as fp:
                text = fp.read()
            tokens = gensim.utils.simple_preprocess(text, max_len=25)
            # The next free tag is simply the number of documents so far.
            tag = len(tagged_corpus)
            tagged_corpus.append(tagged_document(words=tokens, tags=[tag]))
    return tagged_corpus


def _main():
    """Build the corpus, train a Doc2Vec model on it and save it to OUTPUT."""
    print("1. Building corpus")
    tagged_corpus = _build_corpus(TEXTS_DIR, EXCLUDE)
    if not tagged_corpus:
        # Stop early with a readable message instead of failing later
        # inside the vocabulary/training steps.
        raise SystemExit(
            "No .txt documents found under {!r}; nothing to train on.".format(TEXTS_DIR)
        )

    # Create the output directory up front so a missing directory cannot
    # make the final save fail after the whole training run.
    pathlib.Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)

    print("2. Building vocabulary")
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(tagged_corpus)

    print("3. Training")
    model.train(tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    print("4. Saving")
    model.save(OUTPUT)


if __name__ == '__main__':
    _main()
+18
View File
@@ -0,0 +1,18 @@
import gensim
import numpy
from numpy import dot
from numpy.linalg import norm
# Default path of the saved Doc2Vec model that Metric loads.
MODEL_INPUT = '../data/models/doc2vec.model'
def cos_dist(v0, v1):
    """Return the cosine of the angle between ``v0`` and ``v1``.

    Despite the name, the value is a cosine *similarity*
    (``dot(v0, v1) / (|v0| * |v1|)``), not a distance: it is close to 1 for
    vectors pointing the same way and close to -1 for opposite vectors.

    If either vector has zero norm the ratio is undefined; 0.0 is returned
    in that case instead of NaN (and the accompanying NumPy warning).
    """
    denominator = norm(v0) * norm(v1)
    if denominator == 0:
        # Avoid 0/0 -> NaN, which would poison later comparisons.
        return 0.0
    return dot(v0, v1) / denominator
class Metric:
    """Embed raw text with a previously trained Doc2Vec model."""

    def __init__(self, model_input=MODEL_INPUT):
        """Load the Doc2Vec model stored at ``model_input``."""
        doc2vec_cls = gensim.models.doc2vec.Doc2Vec
        self.model = doc2vec_cls.load(model_input)

    def vector(self, text: str):
        """Return the vector the loaded model infers for ``text``.

        The text is tokenised with ``gensim.utils.simple_preprocess`` using
        ``max_len=25`` and the tokens are passed to ``infer_vector``.
        """
        words = gensim.utils.simple_preprocess(text, max_len=25)
        embedding = self.model.infer_vector(words)
        return embedding