examine + new speak
This commit is contained in:
@@ -0,0 +1,42 @@
|
||||
import pathlib
|
||||
import gensim
|
||||
|
||||
# Root directory holding one sub-directory of ``*.txt`` files per collection.
TEXTS_DIR = '../data/texts/'
# Sub-directories (matched against ``Path.stem``) left out of the corpus.
EXCLUDE = ['Wellcome_TALKS-ABOUT-FLOWERS']
# Destination path of the trained Doc2Vec model.
OUTPUT = '../data/models/doc2vec.model'


if __name__ == '__main__':
    # Train a Doc2Vec model on every ``*.txt`` file found one level below
    # TEXTS_DIR and save it to OUTPUT.

    print("1. Building corpus")

    # ``iterdir`` and ``glob`` yield entries in arbitrary order; sorting keeps
    # the integer tag assigned to each document reproducible between runs.
    dirs = sorted(
        d for d in pathlib.Path(TEXTS_DIR).iterdir()
        if d.is_dir() and d.stem not in EXCLUDE
    )
    text_files = [f for d in dirs for f in sorted(d.glob('*.txt'))]

    tagged_corpus = []
    for i, f in enumerate(text_files):
        # Explicit encoding so decoding does not depend on the platform locale.
        with f.open(encoding='utf-8') as fp:
            text = fp.read()
        tokens = gensim.utils.simple_preprocess(text, max_len=25)
        # Each document is tagged with its position in the sorted file list.
        tagged_corpus.append(
            gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[i])
        )

    # Stop with a clear message instead of failing later inside gensim.
    if not tagged_corpus:
        raise SystemExit(f"No .txt files found under {TEXTS_DIR}")

    # Create the output directory up front so a missing directory is noticed
    # before the training run rather than after it.
    pathlib.Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)

    print("2. Building vocabulary")
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(tagged_corpus)

    print("3. Training")
    model.train(tagged_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    print("4. Saving")
    model.save(OUTPUT)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
import gensim
|
||||
import numpy
|
||||
from numpy import dot
|
||||
from numpy.linalg import norm
|
||||
|
||||
MODEL_INPUT = '../data/models/doc2vec.model'
|
||||
|
||||
def cos_dist(v0, v1):
    """Return the cosine of the angle between vectors ``v0`` and ``v1``.

    Despite the name, the value is the cosine *similarity*: the dot product
    divided by the product of the two Euclidean norms. No special handling is
    done for zero-length vectors.
    """
    numerator = dot(v0, v1)
    denominator = norm(v0) * norm(v1)
    return numerator / denominator
|
||||
|
||||
class Metric:
    """Wrapper around a saved Doc2Vec model that turns raw text into vectors."""

    def __init__(self, model_input=MODEL_INPUT):
        """Load the Doc2Vec model stored at ``model_input``."""
        doc2vec_cls = gensim.models.doc2vec.Doc2Vec
        self.model = doc2vec_cls.load(model_input)

    def vector(self, text: str):
        """Return the vector the loaded model infers for ``text``.

        The text is tokenised with ``gensim.utils.simple_preprocess`` using
        ``max_len=25`` and the tokens are passed to ``infer_vector``.
        """
        words = gensim.utils.simple_preprocess(text, max_len=25)
        inferred = self.model.infer_vector(words)
        return inferred
|
||||
Reference in New Issue
Block a user