From f68dfe97b1ccf9d68d155730a5b28a0ba8e37bf5 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Wed, 6 Dec 2017 21:34:37 +0100 Subject: [PATCH 01/23] Add function get_new_author_topics() to infer topics distribution for new unseen author. --- gensim/models/atmodel.py | 93 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 9c691b8c25..2671772be8 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -34,6 +34,7 @@ # and do_estep methods. import logging +import traceback import numpy as np # for arrays, array broadcasting etc. from copy import deepcopy from shutil import copyfile @@ -882,6 +883,98 @@ def get_document_topics(self, word_id, minimum_probability=None): raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. Use the "get_author_topics" method.') + def get_new_author_topics(self, corpus, minimum_probability=None): + """ + Infers a topic distribution for a new author over the passed corpus of docs, + assuming that all documents are from this single new author. + + Args: + corpus (list): Bag-of-words representation of the documents to get topics for. + minimum_probability (float): Ignore topics with probability below this value + (None by default). If set to None, a value of 1e-8 is used to prevent 0s. + Returns: + topic distribution for the given list of documents `corpus` in bow format, + as a list of `(topic_id, topic_probability)` 2-tuples. + """ + + #use the training hyperparameters from the model initialization + passes = self.passes + + #TODO: how should this function look like for get_new_author_topics? + def rho(): + return pow(self.offset + 1 + 1, -self.decay) + + #wrap in fuction to avoid code duplication + def rollback_new_author_chages(): + self.state.gamma = self.state.gamma[0:-1] + for doc in corpus: + self.corpus.remove(doc) + + del self.author2doc[new_author_name] + a_id = self.author2id[new_author_name] + del self.id2author[a_id] + del self.author2id[new_author_name] + + for new_doc_id in corpus_doc_idx: + del self.doc2author[new_doc_id] + + self.total_docs -= len_input_corpus + self.num_authors -= num_new_authors + + try: + len_input_corpus = len(corpus) + except TypeError: + logger.warning("input corpus stream has no len(); counting documents") + len_input_corpus = sum(1 for _ in corpus) + if len_input_corpus == 0: + logger.warning("AuthorTopicModel.get_new_author_topics() called with an empty corpus") + return + if not len_input_corpus < self.chunksize: + logger.warning("AuthorTopicModel.get_new_author_topics() called with to many documents. Use update().") + return + + new_author_name = "placeholder_name" + + # Add new documents in corpus to self.corpus. + self.extend_corpus(corpus) + + corpus_doc_idx = list(range(self.total_docs, len_input_corpus+self.total_docs)) + #increment number of total docs + self.total_docs += len_input_corpus + + # Add the new placeholder author to author2id/id2author dictionaries. + num_new_authors = 1 + author_id = 0 + self.author2id[new_author_name] = author_id + self.num_authors + self.id2author[author_id + self.num_authors] = new_author_name + + # Increment the number of total authors seen. + self.num_authors += num_new_authors + + #add new author in author2doc and doc into doc2author + self.author2doc[new_author_name] = corpus_doc_idx + for new_doc_id in corpus_doc_idx: + self.doc2author[new_doc_id] = [new_author_name] + + gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics)) + self.state.gamma = np.vstack([self.state.gamma, gamma_new]) + + # should not record the sstats, as we are goint to delete the new author after calculated + try: + gammat, _ = self.inference( + corpus, self.author2doc, self.doc2author, rho(), + collect_sstats=False, chunk_doc_idx=corpus_doc_idx + ) + except Exception as e: + #something went wrong! Rollback temporary changes in object and log + rollback_new_author_chages() + logging.error(traceback.format_exc()) + return + + new_author_topics = self.get_author_topics(new_author_name, minimum_probability) + rollback_new_author_chages() + return new_author_topics + def get_author_topics(self, author_name, minimum_probability=None): """ Return topic distribution the given author, as a list of From 2843d1bcdbdb5978bfb38a78cde8b63fe6a4fd2c Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Thu, 7 Dec 2017 11:23:45 +0100 Subject: [PATCH 02/23] Fixes for pep8 compliance. Concrete exception handling. --- gensim/models/atmodel.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 2671772be8..661c1b13b5 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -34,7 +34,6 @@ # and do_estep methods. import logging -import traceback import numpy as np # for arrays, array broadcasting etc. from copy import deepcopy from shutil import copyfile @@ -884,7 +883,8 @@ def get_document_topics(self, word_id, minimum_probability=None): raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. Use the "get_author_topics" method.') def get_new_author_topics(self, corpus, minimum_probability=None): - """ + """Inference topics for new author. + Infers a topic distribution for a new author over the passed corpus of docs, assuming that all documents are from this single new author. @@ -897,14 +897,14 @@ def get_new_author_topics(self, corpus, minimum_probability=None): as a list of `(topic_id, topic_probability)` 2-tuples. """ - #use the training hyperparameters from the model initialization + # Use the training hyperparameters from the model initialization. passes = self.passes - #TODO: how should this function look like for get_new_author_topics? + # TODO: how should this function look like for get_new_author_topics? def rho(): return pow(self.offset + 1 + 1, -self.decay) - #wrap in fuction to avoid code duplication + # Wrap in fuction to avoid code duplication. def rollback_new_author_chages(): self.state.gamma = self.state.gamma[0:-1] for doc in corpus: @@ -939,7 +939,7 @@ def rollback_new_author_chages(): self.extend_corpus(corpus) corpus_doc_idx = list(range(self.total_docs, len_input_corpus+self.total_docs)) - #increment number of total docs + # Increment number of total docs. self.total_docs += len_input_corpus # Add the new placeholder author to author2id/id2author dictionaries. @@ -951,7 +951,7 @@ def rollback_new_author_chages(): # Increment the number of total authors seen. self.num_authors += num_new_authors - #add new author in author2doc and doc into doc2author + # Add new author in author2doc and doc into doc2author. self.author2doc[new_author_name] = corpus_doc_idx for new_doc_id in corpus_doc_idx: self.doc2author[new_doc_id] = [new_author_name] @@ -959,16 +959,16 @@ def rollback_new_author_chages(): gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics)) self.state.gamma = np.vstack([self.state.gamma, gamma_new]) - # should not record the sstats, as we are goint to delete the new author after calculated + # Should not record the sstats, as we are goint to delete the new author after calculated. try: gammat, _ = self.inference( corpus, self.author2doc, self.doc2author, rho(), collect_sstats=False, chunk_doc_idx=corpus_doc_idx ) - except Exception as e: - #something went wrong! Rollback temporary changes in object and log + except ValueError as e: + # Something went wrong! Rollback temporary changes in object and log rollback_new_author_chages() - logging.error(traceback.format_exc()) + logging.exception(e) return new_author_topics = self.get_author_topics(new_author_name, minimum_probability) From 92ef7595cca357babb7132e7011b2e8db83e1808 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Thu, 7 Dec 2017 11:32:10 +0100 Subject: [PATCH 03/23] small docstring fix --- gensim/models/atmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 661c1b13b5..35410d6819 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -883,7 +883,7 @@ def get_document_topics(self, word_id, minimum_probability=None): raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. Use the "get_author_topics" method.') def get_new_author_topics(self, corpus, minimum_probability=None): - """Inference topics for new author. + """Infers topics for new author. Infers a topic distribution for a new author over the passed corpus of docs, assuming that all documents are from this single new author. From ee51f7198348ab9227214b3bc404a1fe0a59b1c9 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Thu, 28 Dec 2017 12:49:32 +0200 Subject: [PATCH 04/23] dont extend self.corpus --- gensim/models/atmodel.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 35410d6819..df569a6191 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -907,8 +907,6 @@ def rho(): # Wrap in fuction to avoid code duplication. def rollback_new_author_chages(): self.state.gamma = self.state.gamma[0:-1] - for doc in corpus: - self.corpus.remove(doc) del self.author2doc[new_author_name] a_id = self.author2id[new_author_name] @@ -918,7 +916,6 @@ def rollback_new_author_chages(): for new_doc_id in corpus_doc_idx: del self.doc2author[new_doc_id] - self.total_docs -= len_input_corpus self.num_authors -= num_new_authors try: @@ -935,12 +932,8 @@ def rollback_new_author_chages(): new_author_name = "placeholder_name" - # Add new documents in corpus to self.corpus. - self.extend_corpus(corpus) - corpus_doc_idx = list(range(self.total_docs, len_input_corpus+self.total_docs)) - # Increment number of total docs. - self.total_docs += len_input_corpus + corpus_doc_idx = list(range(0, len_input_corpus)) # Add the new placeholder author to author2id/id2author dictionaries. num_new_authors = 1 @@ -948,9 +941,6 @@ def rollback_new_author_chages(): self.author2id[new_author_name] = author_id + self.num_authors self.id2author[author_id + self.num_authors] = new_author_name - # Increment the number of total authors seen. - self.num_authors += num_new_authors - # Add new author in author2doc and doc into doc2author. self.author2doc[new_author_name] = corpus_doc_idx for new_doc_id in corpus_doc_idx: From 2de1b348f27c6e8a7133913f2fed29998892d8fd Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Tue, 2 Jan 2018 12:00:55 +0200 Subject: [PATCH 05/23] pep8 fixes --- gensim/models/atmodel.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index df569a6191..ff6d2ce543 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -897,9 +897,6 @@ def get_new_author_topics(self, corpus, minimum_probability=None): as a list of `(topic_id, topic_probability)` 2-tuples. """ - # Use the training hyperparameters from the model initialization. - passes = self.passes - # TODO: how should this function look like for get_new_author_topics? def rho(): return pow(self.offset + 1 + 1, -self.decay) @@ -931,8 +928,6 @@ def rollback_new_author_chages(): return new_author_name = "placeholder_name" - - corpus_doc_idx = list(range(0, len_input_corpus)) # Add the new placeholder author to author2id/id2author dictionaries. From cbe904909c41a008b137adf4e8e082eab4903b64 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Sat, 10 Feb 2018 15:57:13 +0100 Subject: [PATCH 06/23] remove chunksize limitation and small fix --- gensim/models/atmodel.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index ff6d2ce543..8f6ea6562f 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -921,12 +921,8 @@ def rollback_new_author_chages(): logger.warning("input corpus stream has no len(); counting documents") len_input_corpus = sum(1 for _ in corpus) if len_input_corpus == 0: - logger.warning("AuthorTopicModel.get_new_author_topics() called with an empty corpus") - return - if not len_input_corpus < self.chunksize: - logger.warning("AuthorTopicModel.get_new_author_topics() called with to many documents. Use update().") - return - + raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus") + new_author_name = "placeholder_name" corpus_doc_idx = list(range(0, len_input_corpus)) From a94c0b1a892d012c6bf3429713839e87a4b2d781 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Sat, 10 Feb 2018 17:43:34 +0100 Subject: [PATCH 07/23] pep8 fix --- gensim/models/atmodel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 8f6ea6562f..217d073bde 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -922,7 +922,7 @@ def rollback_new_author_chages(): len_input_corpus = sum(1 for _ in corpus) if len_input_corpus == 0: raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus") - + new_author_name = "placeholder_name" corpus_doc_idx = list(range(0, len_input_corpus)) @@ -964,9 +964,8 @@ def get_author_topics(self, author_name, minimum_probability=None): Obtaining topic probabilities of each word, as in LDA (via `per_word_topics`), is not supported. - """ - + author_id = self.author2id[author_name] if minimum_probability is None: From 783a5857bf9141e6c53cff836267b820521687e8 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Sun, 11 Feb 2018 21:04:18 +0100 Subject: [PATCH 08/23] try pep8 fix --- gensim/models/atmodel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 217d073bde..bf91142b1c 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -958,14 +958,15 @@ def rollback_new_author_chages(): def get_author_topics(self, author_name, minimum_probability=None): """ - Return topic distribution the given author, as a list of + Return topic distribution the given author. + + Input as as a list of (topic_id, topic_probability) 2-tuples. Ignore topics with very low probability (below `minimum_probability`). - Obtaining topic probabilities of each word, as in LDA (via `per_word_topics`), is not supported. """ - + author_id = self.author2id[author_name] if minimum_probability is None: From a85563b750be1edbe95e03a89e527949e1cc278a Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 12 Feb 2018 16:49:57 +0500 Subject: [PATCH 09/23] convert docstring to numpy-style --- gensim/models/atmodel.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 57e0e8792b..ea45f24c2c 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -925,13 +925,18 @@ def get_new_author_topics(self, corpus, minimum_probability=None): Infers a topic distribution for a new author over the passed corpus of docs, assuming that all documents are from this single new author. - Args: - corpus (list): Bag-of-words representation of the documents to get topics for. - minimum_probability (float): Ignore topics with probability below this value - (None by default). If set to None, a value of 1e-8 is used to prevent 0s. - Returns: - topic distribution for the given list of documents `corpus` in bow format, - as a list of `(topic_id, topic_probability)` 2-tuples. + Parameters + ---------- + corpus : iterable of iterable of (int, int) + Corpus in BoW format. + minimum_probability : float, optional + Ignore topics with probability below this value, if None - 1e-8 is used. + + Returns + ------- + list of (int, float) + Topic distribution for the given `corpus`. + """ # TODO: how should this function look like for get_new_author_topics? From 5eb84ff9c9bf190df18e524a5f00b68a797cf472 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 12 Feb 2018 16:51:58 +0500 Subject: [PATCH 10/23] fix PEP8 --- gensim/models/atmodel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index ea45f24c2c..94c376df9e 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -936,9 +936,8 @@ def get_new_author_topics(self, corpus, minimum_probability=None): ------- list of (int, float) Topic distribution for the given `corpus`. - - """ + """ # TODO: how should this function look like for get_new_author_topics? def rho(): return pow(self.offset + 1 + 1, -self.decay) From 7df70654a23257b499b380bd9279aeb721785dea Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Thu, 22 Feb 2018 19:06:33 +0100 Subject: [PATCH 11/23] fix major bug --- gensim/models/atmodel.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 94c376df9e..a8f0752cff 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -954,8 +954,6 @@ def rollback_new_author_chages(): for new_doc_id in corpus_doc_idx: del self.doc2author[new_doc_id] - self.num_authors -= num_new_authors - try: len_input_corpus = len(corpus) except TypeError: @@ -965,7 +963,7 @@ def rollback_new_author_chages(): raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus") new_author_name = "placeholder_name" - corpus_doc_idx = list(range(0, len_input_corpus)) + corpus_doc_idx = list(range(self.total_docs, self.total_docs+len_input_corpus)) # Add the new placeholder author to author2id/id2author dictionaries. num_new_authors = 1 From f818bf253e2d57eafffcb2c4667cbfe9960779f2 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Thu, 22 Feb 2018 19:53:14 +0100 Subject: [PATCH 12/23] fix pep8 --- gensim/models/atmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index a8f0752cff..be69477697 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -963,7 +963,7 @@ def rollback_new_author_chages(): raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus") new_author_name = "placeholder_name" - corpus_doc_idx = list(range(self.total_docs, self.total_docs+len_input_corpus)) + corpus_doc_idx = list(range(self.total_docs, self.total_docs + len_input_corpus)) # Add the new placeholder author to author2id/id2author dictionaries. num_new_authors = 1 From 8c5765fa5f475f8f82acea86f6e3b97a9c247d06 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Fri, 23 Feb 2018 13:04:27 +0100 Subject: [PATCH 13/23] fix merge --- gensim/models/atmodel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index be69477697..fb4a475e19 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -940,7 +940,11 @@ def get_new_author_topics(self, corpus, minimum_probability=None): """ # TODO: how should this function look like for get_new_author_topics? def rho(): +<<<<<<< Updated upstream return pow(self.offset + 1 + 1, -self.decay) +======= + return pow(self.offset + 1, -self.decay) +>>>>>>> Stashed changes # Wrap in fuction to avoid code duplication. def rollback_new_author_chages(): From 5ad9a0b7b42e31ed0c08bfa113d8758939f8063c Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Fri, 23 Feb 2018 13:26:18 +0100 Subject: [PATCH 14/23] add tests for get_new_author_topics --- gensim/test/test_atmodel.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index cd1293e6fa..89b5a0f438 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -450,6 +450,31 @@ def testTermTopics(self): self.assertTrue(isinstance(topic_no, int)) self.assertTrue(isinstance(probability, float)) + def testNewAuthorTopics(self): + model = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, + passes=100, random_state=np.random.seed(0) + ) + #temp save model state vars before get_new_author_topics is called + state_gamma_len = len(model.state.gamma) + author2doc_len = len(model.author2doc) + author2id_len = len(model.author2id) + id2author_len = len(model.id2author) + doc2author_len = len(model.doc2author) + + new_author_topics = model.get_new_author_topics(corpus=corpus[0:2]) + + for k, v in new_author_topics: + self.assertTrue(isinstance(k, int)) + self.assertTrue(isinstance(v, float)) + + #assure rollback was successful and the model state is as before + self.assertEqual(state_gamma_len, len(model.state.gamma)) + self.assertEqual(author2doc_len, len(model.author2doc)) + self.assertEqual(author2id_len, len(model.author2id)) + self.assertEqual(id2author_len, len(model.id2author)) + self.assertEqual(doc2author_len, len(model.doc2author)) + def testPasses(self): # long message includes the original error message with a custom one self.longMessage = True From 8e4899ce69d93e2a62342b8721d00201b405ee15 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Fri, 23 Feb 2018 13:28:53 +0100 Subject: [PATCH 15/23] fix stash changes --- gensim/models/atmodel.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index fb4a475e19..be69477697 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -940,11 +940,7 @@ def get_new_author_topics(self, corpus, minimum_probability=None): """ # TODO: how should this function look like for get_new_author_topics? def rho(): -<<<<<<< Updated upstream return pow(self.offset + 1 + 1, -self.decay) -======= - return pow(self.offset + 1, -self.decay) ->>>>>>> Stashed changes # Wrap in fuction to avoid code duplication. def rollback_new_author_chages(): From 10ba9c84b195d19258ab33348df6caba4fffb6f4 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Fri, 23 Feb 2018 13:57:53 +0100 Subject: [PATCH 16/23] fix test pep8 --- gensim/test/test_atmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 89b5a0f438..f3844fa603 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -455,7 +455,7 @@ def testNewAuthorTopics(self): corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0) ) - #temp save model state vars before get_new_author_topics is called + # temp save model state vars before get_new_author_topics is called state_gamma_len = len(model.state.gamma) author2doc_len = len(model.author2doc) author2id_len = len(model.author2id) @@ -468,7 +468,7 @@ def testNewAuthorTopics(self): self.assertTrue(isinstance(k, int)) self.assertTrue(isinstance(v, float)) - #assure rollback was successful and the model state is as before + # assure rollback was successful and the model state is as before self.assertEqual(state_gamma_len, len(model.state.gamma)) self.assertEqual(author2doc_len, len(model.author2doc)) self.assertEqual(author2id_len, len(model.author2id)) From 0a3eedb267fb766d323349932d593298419ce9af Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Mon, 26 Feb 2018 23:53:19 +0100 Subject: [PATCH 17/23] fix exception catching and rollback --- gensim/models/atmodel.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index be69477697..f5c8b3a353 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -938,7 +938,6 @@ def get_new_author_topics(self, corpus, minimum_probability=None): Topic distribution for the given `corpus`. """ - # TODO: how should this function look like for get_new_author_topics? def rho(): return pow(self.offset + 1 + 1, -self.decay) @@ -985,14 +984,9 @@ def rollback_new_author_chages(): corpus, self.author2doc, self.doc2author, rho(), collect_sstats=False, chunk_doc_idx=corpus_doc_idx ) - except ValueError as e: - # Something went wrong! Rollback temporary changes in object and log + new_author_topics = self.get_author_topics(new_author_name, minimum_probability) + finally: rollback_new_author_chages() - logging.exception(e) - return - - new_author_topics = self.get_author_topics(new_author_name, minimum_probability) - rollback_new_author_chages() return new_author_topics def get_author_topics(self, author_name, minimum_probability=None): From f5875969390ce5ff4c9f95fa61a30a0bea106549 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Mon, 26 Feb 2018 23:55:35 +0100 Subject: [PATCH 18/23] add test for topic similarity --- gensim/test/test_atmodel.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index f3844fa603..48f08cdc9f 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -26,7 +26,7 @@ from gensim.test import basetmtests from gensim.test.utils import (datapath, get_tmpfile, common_texts, common_dictionary as dictionary, common_corpus as corpus) - +from gensim.matutils import jensen_shannon # TODO: # Test that computing the bound on new unseen documents works as expected (this is somewhat different # in the author-topic model than in LDA). @@ -451,10 +451,15 @@ def testTermTopics(self): self.assertTrue(isinstance(probability, float)) def testNewAuthorTopics(self): + model = self.class_( corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0) ) + author2doc_newauthor = {} + author2doc_newauthor["test"] = [0,1] + model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor) + # temp save model state vars before get_new_author_topics is called state_gamma_len = len(model.state.gamma) author2doc_len = len(model.author2doc) @@ -464,10 +469,19 @@ def testNewAuthorTopics(self): new_author_topics = model.get_new_author_topics(corpus=corpus[0:2]) + # sanity check for k, v in new_author_topics: self.assertTrue(isinstance(k, int)) self.assertTrue(isinstance(v, float)) + # make sure topics are similar enough + similarity = 1 / (1 + jensen_shannon(model["test"], new_author_topics)) + self.assertTrue(similarity >= 0.9) + + # produce an error to test if rollback occurs + with self.assertRaises(TypeError): + new_author_topics_error = model.get_new_author_topics(corpus=corpus[0]) + # assure rollback was successful and the model state is as before self.assertEqual(state_gamma_len, len(model.state.gamma)) self.assertEqual(author2doc_len, len(model.author2doc)) @@ -475,6 +489,7 @@ def testNewAuthorTopics(self): self.assertEqual(id2author_len, len(model.id2author)) self.assertEqual(doc2author_len, len(model.doc2author)) + def testPasses(self): # long message includes the original error message with a custom one self.longMessage = True From 1bc929c46f46d9d3faaebf0e1d21553ace5d4ae9 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Tue, 27 Feb 2018 10:20:40 +0100 Subject: [PATCH 19/23] fix pep8 --- gensim/test/test_atmodel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 48f08cdc9f..00d4d2aafa 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -457,7 +457,7 @@ def testNewAuthorTopics(self): passes=100, random_state=np.random.seed(0) ) author2doc_newauthor = {} - author2doc_newauthor["test"] = [0,1] + author2doc_newauthor["test"] = [0, 1] model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor) # temp save model state vars before get_new_author_topics is called @@ -480,7 +480,7 @@ def testNewAuthorTopics(self): # produce an error to test if rollback occurs with self.assertRaises(TypeError): - new_author_topics_error = model.get_new_author_topics(corpus=corpus[0]) + model.get_new_author_topics(corpus=corpus[0]) # assure rollback was successful and the model state is as before self.assertEqual(state_gamma_len, len(model.state.gamma)) @@ -489,7 +489,6 @@ def testNewAuthorTopics(self): self.assertEqual(id2author_len, len(model.id2author)) self.assertEqual(doc2author_len, len(model.doc2author)) - def testPasses(self): # long message includes the original error message with a custom one self.longMessage = True From 6085b901d097bc52ffbfb010f43049875d8e13df Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Wed, 21 Mar 2018 17:43:22 +0100 Subject: [PATCH 20/23] some last cosmetic changes --- gensim/models/atmodel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index f5c8b3a353..7cc4170c4d 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -941,7 +941,6 @@ def get_new_author_topics(self, corpus, minimum_probability=None): def rho(): return pow(self.offset + 1 + 1, -self.decay) - # Wrap in fuction to avoid code duplication. def rollback_new_author_chages(): self.state.gamma = self.state.gamma[0:-1] @@ -962,11 +961,14 @@ def rollback_new_author_chages(): raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus") new_author_name = "placeholder_name" + # indexes representing the documents in the input corpus corpus_doc_idx = list(range(self.total_docs, self.total_docs + len_input_corpus)) # Add the new placeholder author to author2id/id2author dictionaries. num_new_authors = 1 - author_id = 0 + author_id = self.num_authors + if new_author_name in self.author2id: + raise ValueError("self.author2id already has 'placeholder_name' author") self.author2id[new_author_name] = author_id + self.num_authors self.id2author[author_id + self.num_authors] = new_author_name From 34f5222e0b6fb5fb85bae655a30ef8a700c167ff Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Wed, 21 Mar 2018 18:18:53 +0100 Subject: [PATCH 21/23] author id fix --- gensim/models/atmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 7cc4170c4d..017bf6efd0 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -969,8 +969,8 @@ def rollback_new_author_chages(): author_id = self.num_authors if new_author_name in self.author2id: raise ValueError("self.author2id already has 'placeholder_name' author") - self.author2id[new_author_name] = author_id + self.num_authors - self.id2author[author_id + self.num_authors] = new_author_name + self.author2id[new_author_name] = author_id + self.id2author[author_id] = new_author_name # Add new author in author2doc and doc into doc2author. self.author2doc[new_author_name] = corpus_doc_idx From 6d7141d0650534977210b9957e8b2d1294022997 Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Sun, 25 Mar 2018 18:40:47 +0200 Subject: [PATCH 22/23] add tutorial for authorship prediction --- .../atmodel_prediction_tutorial.ipynb | 530 ++++++++++++++++++ 1 file changed, 530 insertions(+) create mode 100644 docs/notebooks/atmodel_prediction_tutorial.ipynb diff --git a/docs/notebooks/atmodel_prediction_tutorial.ipynb b/docs/notebooks/atmodel_prediction_tutorial.ipynb new file mode 100644 index 0000000000..9a7e3bff33 --- /dev/null +++ b/docs/notebooks/atmodel_prediction_tutorial.ipynb @@ -0,0 +1,530 @@ + + + + + + + Jupyter Notebook + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+
+ +
+
+ + + +
+ + + +
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + \ No newline at end of file From c5258455ad678f48c6cecf13f36cd33327a4310a Mon Sep 17 00:00:00 2001 From: Martin Stamenov Date: Mon, 26 Mar 2018 12:35:49 +0200 Subject: [PATCH 23/23] add ipynb --- .../atmodel_prediction_tutorial.ipynb | 2368 +++++++++++++---- 1 file changed, 1841 insertions(+), 527 deletions(-) diff --git a/docs/notebooks/atmodel_prediction_tutorial.ipynb b/docs/notebooks/atmodel_prediction_tutorial.ipynb index 9a7e3bff33..bb70d1056b 100644 --- a/docs/notebooks/atmodel_prediction_tutorial.ipynb +++ b/docs/notebooks/atmodel_prediction_tutorial.ipynb @@ -1,530 +1,1844 @@ - - - - - - - Jupyter Notebook - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - -
-
-
- -
-
- - - -
- - - -
-
-
-
-
-
- - - - - - - - - - - - - - - - - - \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 2 +}