Add inference for new unseen author for gensim.models.AuthorTopicModel (#1766)
@@ -919,15 +919,85 @@ def get_document_topics(self, word_id, minimum_probability=None):
            'Use the "get_author_topics" method.'
        )

    def get_new_author_topics(self, corpus, minimum_probability=None):
        """Infers topics for a new author.
Review comment: please use numpy-style docstrings: http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html and https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt

        Infers a topic distribution for a new author over the passed corpus of docs,
        assuming that all documents are from this single new author.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        minimum_probability : float, optional
            Ignore topics with probability below this value; if None, 1e-8 is used.

        Returns
        -------
        list of (int, float)
            Topic distribution for the given `corpus`.

        """
        def rho():
Review comment: @Stamenov @olavurmortensen Please document the rationale behind this definition of rho.
Review comment: In a different place you explain rho as follows:
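            # Hedged rationale (an assumption, responding to the question above): rho
            # appears to play the role of the online variational Bayes step size
            # rho_t = (offset + t) ** (-decay) from Hoffman et al. (2010), evaluated
            # as if exactly one pass over one chunk had been made, i.e. t = 1 + 1.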

            return pow(self.offset + 1 + 1, -self.decay)

        # Wrap in function to avoid code duplication.
        def rollback_new_author_changes():
            self.state.gamma = self.state.gamma[0:-1]

            del self.author2doc[new_author_name]
            a_id = self.author2id[new_author_name]
            del self.id2author[a_id]
            del self.author2id[new_author_name]

            for new_doc_id in corpus_doc_idx:
                del self.doc2author[new_doc_id]

        try:
            len_input_corpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            len_input_corpus = sum(1 for _ in corpus)
        if len_input_corpus == 0:
            raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus")

        new_author_name = "placeholder_name"
Review comment: Just for completeness, check that this author isn't already in the dictionary.
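        # A possible guard for the point raised above (hypothetical, not part of this diff):
        # if new_author_name in self.author2id:
        #     raise ValueError("author '%s' already exists in the model" % new_author_name)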

        corpus_doc_idx = list(range(self.total_docs, self.total_docs + len_input_corpus))
Review comment: Add a comment explaining this line. For example: "indexes representing the documents in the input corpus".

        # Add the new placeholder author to author2id/id2author dictionaries.
        num_new_authors = 1
        author_id = 0
Review comment: I think this looks confusing. Just say

        self.author2id[new_author_name] = author_id + self.num_authors
        self.id2author[author_id + self.num_authors] = new_author_name

        # Add the new author to author2doc and the new docs to doc2author.
        self.author2doc[new_author_name] = corpus_doc_idx
        for new_doc_id in corpus_doc_idx:
            self.doc2author[new_doc_id] = [new_author_name]
Review comment: Because of how you set
Review reply: fixed

        gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
        self.state.gamma = np.vstack([self.state.gamma, gamma_new])
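        # Note: the new author's gamma row is appended as the last row of
        # self.state.gamma, which is why rollback_new_author_changes() can undo it
        # by slicing off the final row with gamma[0:-1].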

        # Should not record the sstats, as we are going to delete the new author once
        # the topics are computed.
        try:
Review comment: Does this try block throw an exception if there is a problem?
Review reply: Yes, it does, for example when the corpus is invalid (a list of something else), or if the inference is somehow interrupted, or maybe something else. We must make sure to roll back the temporary changes in any case. It makes sense to show the exception to the user, but leave the model state untouched.

            gammat, _ = self.inference(
                corpus, self.author2doc, self.doc2author, rho(),
                collect_sstats=False, chunk_doc_idx=corpus_doc_idx
            )
            new_author_topics = self.get_author_topics(new_author_name, minimum_probability)
        finally:
            rollback_new_author_changes()
        return new_author_topics
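A minimal usage sketch of the new method, assuming an already trained AuthorTopicModel `model` and a list of BoW documents `new_docs` built with the same dictionary (both names are placeholders):

    # Infer a topic distribution for an author the model has never seen.
    new_author_dist = model.get_new_author_topics(corpus=new_docs)
    for topic_id, prob in new_author_dist:
        print(topic_id, prob)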
    def get_author_topics(self, author_name, minimum_probability=None):
        """
        Return topic distribution the given author, as a list of
        Return topic distribution for the given author.

        Input as a list of
        (topic_id, topic_probability) 2-tuples.
        Ignore topics with very low probability (below `minimum_probability`).

        Obtaining topic probabilities of each word, as in LDA (via `per_word_topics`),
        is not supported.

        """

        author_id = self.author2id[author_name]
@@ -26,7 +26,7 @@
from gensim.test import basetmtests
from gensim.test.utils import (datapath,
    get_tmpfile, common_texts, common_dictionary as dictionary, common_corpus as corpus)

from gensim.matutils import jensen_shannon
# TODO:
# Test that computing the bound on new unseen documents works as expected (this is somewhat different
# in the author-topic model than in LDA).
@@ -450,6 +450,45 @@ def testTermTopics(self):
            self.assertTrue(isinstance(topic_no, int))
            self.assertTrue(isinstance(probability, float))

    def testNewAuthorTopics(self):
Review comment: CC: @olavurmortensen, this test looks like "sanity-check" only, can you propose something more significant?
Review reply: Besides the sanity check, I tried another test case, where I would infer the topics for a new author, using the exact same documents from the corpus that an author is already assigned to. We can expect that the topic distributions should be very similar, as they are, but not close enough that we can use np.allclose(.., ..). Example:
    In [2]: model.get_new_author_topics(corpus=corpus[1:3])
    In [3]: model.get_author_topics(author_name="sally")
Of course something like this would work. @menshikh-iv do you approve of this test?
Review reply: Sounds reasonable, only one thing - maybe better to calculate JS-divergence instead of
Review reply: https://github.com/RaRe-Technologies/gensim/blob/a54e3369bf832bc632d20f5e1d401267e6bc4882/gensim/matutils.py#L921 - the implementation is already available in gensim.
Review reply: How would you propose to set the similarity threshold? Just empirically for this one test case?
Review reply: yes, I don't know a better solution here.
        model = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
            passes=100, random_state=np.random.seed(0)
        )
        author2doc_newauthor = {}
        author2doc_newauthor["test"] = [0, 1]
        model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor)

        # temp save model state vars before get_new_author_topics is called
        state_gamma_len = len(model.state.gamma)
        author2doc_len = len(model.author2doc)
        author2id_len = len(model.author2id)
        id2author_len = len(model.id2author)
        doc2author_len = len(model.doc2author)

        new_author_topics = model.get_new_author_topics(corpus=corpus[0:2])

        # sanity check
        for k, v in new_author_topics:
            self.assertTrue(isinstance(k, int))
            self.assertTrue(isinstance(v, float))

        # make sure topics are similar enough
        similarity = 1 / (1 + jensen_shannon(model["test"], new_author_topics))
        self.assertTrue(similarity >= 0.9)

        # produce an error to test if rollback occurs
        with self.assertRaises(TypeError):
            model.get_new_author_topics(corpus=corpus[0])

        # assure rollback was successful and the model state is as before
        self.assertEqual(state_gamma_len, len(model.state.gamma))
        self.assertEqual(author2doc_len, len(model.author2doc))
        self.assertEqual(author2id_len, len(model.author2id))
        self.assertEqual(id2author_len, len(model.id2author))
        self.assertEqual(doc2author_len, len(model.doc2author))
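For readers unfamiliar with the similarity score used in the test above, here is a self-contained sketch of the same 1 / (1 + Jensen-Shannon divergence) transform, computed by hand on toy dense topic distributions; it is an illustration only, the test itself relies on gensim's matutils.jensen_shannon:

    import numpy as np

    def js_divergence(p, q):
        # Jensen-Shannon divergence of two discrete distributions (natural log base).
        p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
        m = 0.5 * (p + q)
        kl = lambda a, b: float(np.sum(np.where(a > 0, a * np.log(a / b), 0.0)))
        return 0.5 * kl(p, m) + 0.5 * kl(q, m)

    similarity = 1.0 / (1.0 + js_divergence([0.9, 0.1], [0.8, 0.2]))
    print(similarity)  # close to 1.0 for near-identical topic distributions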

    def testPasses(self):
        # long message includes the original error message with a custom one
        self.longMessage = True
Review comment: What's the reason to use minimum_probability=None here (instead of 1e-8)?
Review reply: @menshikh-iv It is the same in get_document_topics in LDA (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ldamodel.py#L987), and get_term_topics as well.
Review reply: so, I should leave it like this, right?
Review reply: @Stamenov yes, leave as is
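For context on this thread, a hedged sketch of the None handling the new docstring describes ("if None, 1e-8 is used"); the helper name below is hypothetical and the pattern is assumed to mirror how the LDA getters treat their minimum_probability argument:

    def _resolve_minimum_probability(minimum_probability):
        # None falls back to a tiny floor so that zero-probability topics are still dropped.
        if minimum_probability is None:
            minimum_probability = 1e-8
        return max(minimum_probability, 1e-8)

    print(_resolve_minimum_probability(None))  # 1e-08
    print(_resolve_minimum_probability(0.05))  # 0.05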