Skip to content

Commit 333fd4d

Browse files
committed
Merge branch 'release-1.0.1'
2 parents adc447d + 63cf941 commit 333fd4d

File tree

3 files changed

+22
-4
lines changed

3 files changed

+22
-4
lines changed

gensim/models/word2vec.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1271,7 +1271,7 @@ def load(cls, *args, **kwargs):
12711271
# update older models
12721272
if hasattr(model, 'table'):
12731273
delattr(model, 'table') # discard in favor of cum_table
1274-
if model.negative and hasattr(model, 'index2word'):
1274+
if model.negative and hasattr(model.wv, 'index2word'):
12751275
model.make_cum_table() # rebuild cum_table from vocabulary
12761276
if not hasattr(model, 'corpus_count'):
12771277
model.corpus_count = None

gensim/test/test_doc2vec.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def test_similarity_unseen_docs(self):
142142
model.build_vocab(corpus)
143143
self.assertTrue(model.docvecs.similarity_unseen_docs(model, rome_str, rome_str) > model.docvecs.similarity_unseen_docs(model, rome_str, car_str))
144144

145-
def model_sanity(self, model):
145+
def model_sanity(self, model, keep_training=True):
146146
"""Any non-trivial model on DocsLeeCorpus can pass these sanity checks"""
147147
fire1 = 0 # doc 0 sydney fires
148148
fire2 = 8 # doc 8 sydney fires
@@ -179,6 +179,12 @@ def model_sanity(self, model):
179179
# fire docs should be closer than fire-tennis
180180
self.assertTrue(model.docvecs.similarity(fire1, fire2) > model.docvecs.similarity(fire1, tennis1))
181181

182+
# keep training after save
183+
if keep_training:
184+
model.save(testfile())
185+
loaded = doc2vec.Doc2Vec.load(testfile())
186+
loaded.train(sentences)
187+
182188
def test_training(self):
183189
"""Test doc2vec training."""
184190
corpus = DocsLeeCorpus()
@@ -316,10 +322,10 @@ def test_delete_temporary_training_data(self):
316322
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
317323
self.assertTrue(model.docvecs and hasattr(model.docvecs, 'doctag_syn0'))
318324
self.assertTrue(hasattr(model, 'syn1'))
319-
self.model_sanity(model)
325+
self.model_sanity(model, keep_training=False)
320326
model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=1, alpha=0.05, min_count=2, iter=20)
321327
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
322-
self.model_sanity(model)
328+
self.model_sanity(model, keep_training=False)
323329
self.assertTrue(hasattr(model, 'syn1neg'))
324330

325331
@log_capture()

gensim/test/test_word2vec.py

+12
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,18 @@ def testOnlineLearning(self):
9595
self.assertEqual(len(model_hs.wv.vocab), 14)
9696
self.assertEqual(len(model_neg.wv.vocab), 14)
9797

98+
def testOnlineLearningAfterSave(self):
    """Test that a negative-sampling model reloaded from disk can still
    add new words to its vocabulary and continue training."""
    model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    # Round-trip through save/load before updating: the point of this test
    # is that a *loaded* model supports vocabulary updates and training.
    model_neg.save(testfile())
    model_neg = word2vec.Word2Vec.load(testfile())
    # assertEqual, not assertTrue: assertTrue(x, 12) would treat 12 as the
    # failure message and pass for any non-empty vocabulary.
    self.assertEqual(len(model_neg.wv.vocab), 12)
    model_neg.build_vocab(new_sentences, update=True)
    model_neg.train(new_sentences)
    # The update is expected to grow the vocabulary from 12 to 14 entries.
    self.assertEqual(len(model_neg.wv.vocab), 14)
108+
109+
98110
def onlineSanity(self, model):
99111
terro, others = [], []
100112
for l in list_corpus:

0 commit comments

Comments
 (0)