Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KeyedVectors refactor for word2vec #833

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
55a4fc9
updated refactor
Aug 18, 2016
e916f7e
commit missed file
Aug 18, 2016
e5416ed
docstring added
Aug 18, 2016
e64766b
more refactoring
Aug 19, 2016
c34cf37
add missing docstring
Aug 19, 2016
c9b31f9
fix docstring format
Aug 19, 2016
a0329af
clearer docstring
droudy Aug 19, 2016
0c0e2fa
minor typo in word2vec wmdistance
jayantj Sep 2, 2016
cdefeb0
pyemd error in keyedvecs
jayantj Sep 8, 2016
1aec5a2
relative import of keyedvecs from word2vec fails
jayantj Sep 8, 2016
e7368a3
bug in init_sims in word2vec
jayantj Sep 8, 2016
fe283c2
property descriptors for syn0, syn0norm, index2word, vocab - fixes bu…
jayantj Sep 8, 2016
9b36bc4
tests for loading older word2vec models
jayantj Sep 9, 2016
dfe1893
backwards compatibility for loading older models
jayantj Sep 9, 2016
4a03f20
test for syn0norm not saved to file
jayantj Sep 9, 2016
09b6ebe
syn0norm not saved to file for KeyedVectors
jayantj Sep 9, 2016
7df4138
tests and fix for accuracy
jayantj Sep 9, 2016
4c54d9b
minor bug in finalized vocab check
jayantj Sep 9, 2016
a28f9f1
warnings for direct syn0/syn0norm access
jayantj Sep 9, 2016
bf1182e
fixes use of most_similar in accuracy
jayantj Sep 10, 2016
5a6b97b
changes logging level to ERROR in word2vec tests
jayantj Sep 10, 2016
cfb2e1c
renames kv to wv in word2vec
jayantj Sep 12, 2016
b002765
minor bugs with checking existence of syn0
jayantj Sep 12, 2016
27c0a14
replaces syn0 and syn0norm with wv.syn0 and wv.syn0norm in tests and …
jayantj Sep 12, 2016
81f8cbb
adds changelog
jayantj Sep 12, 2016
7f98c8d
Merge branch 'develop' into keyedvecs
jayantj Oct 16, 2016
1b282ab
updates tests for loading word2vec models for different python versions
jayantj Oct 16, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changes
=======

0.13.4, TBD
* Vectors for word2vec and doc2vec extracted into `KeyedVectors`; save/load and similarity calculations can now be run independently of the model
- Maintains backwards compatibility; direct access to `w2v_model.syn0` and `w2v_model.syn0norm` raises a warning

0.13.3, 2016-09-26

* Add online learning feature to word2vec. (@isohyt [#900](https://github.com/RaRe-Technologies/gensim/pull/900))
Expand All @@ -15,7 +19,6 @@ Changes
* Fixed issue #851, In summarizer.py, RuntimeError is raised if single sentence input is provided to avoid ZeroDivisionError. (@metalaman, #887)
* Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. ([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839))


0.13.2, 2016-08-19

* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771))
Expand Down
16 changes: 8 additions & 8 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,16 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N

"""
if word_vectors is None:
word_vectors = model.syn0
word_vectors = model.wv.syn0
if word_locks is None:
word_locks = model.syn0_lockf
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
model.vocab[w].sample_int > model.random.rand() * 2**32]
word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2**32]

for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code
Expand Down Expand Up @@ -185,21 +185,21 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,

"""
if word_vectors is None:
word_vectors = model.syn0
word_vectors = model.wv.syn0
if word_locks is None:
word_locks = model.syn0_lockf
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
model.vocab[w].sample_int > model.random.rand() * 2**32]
word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
doctag_len = len(doctag_indexes)
if doctag_len != model.dm_tag_count:
return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?)

null_word = model.vocab['\0']
null_word = model.wv.vocab['\0']
pre_pad_count = model.window
post_pad_count = model.window
padded_document_indexes = (
Expand All @@ -214,7 +214,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
+ padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words
)
word_context_len = len(word_context_indexes)
predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]]
# numpy advanced-indexing copies; concatenate, flatten to 1d
l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
Expand Down
6 changes: 3 additions & 3 deletions gensim/models/doc2vec_inner.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,

# default vectors, locks from syn0/doctag_syn0
if word_vectors is None:
word_vectors = model.syn0
word_vectors = model.wv.syn0
_word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
Expand Down Expand Up @@ -405,7 +405,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N

# default vectors, locks from syn0/doctag_syn0
if word_vectors is None:
word_vectors = model.syn0
word_vectors = model.wv.syn0
_word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
Expand Down Expand Up @@ -567,7 +567,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,

# default vectors, locks from syn0/doctag_syn0
if word_vectors is None:
word_vectors = model.syn0
word_vectors = model.wv.syn0
_word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
Expand Down
Loading