Skip to content

Commit

Permalink
Correctly process empty documents in AuthorTopicModel (#2133)
Browse files Browse the repository at this point in the history
* test for #1589

* bugfix #1589

* ignore unused assigned variable

* PR review

* Update test_atmodel.py
  • Loading branch information
probinso authored and menshikh-iv committed Aug 2, 2018
1 parent a6c4ea4 commit 61728a0
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 6 deletions.
11 changes: 6 additions & 5 deletions gensim/models/atmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,10 +461,11 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc])
ids = np.array(ids, dtype=np.integer)
cts = np.array([cnt for _, cnt in doc], dtype=np.integer)

# Get all authors in current document, and convert the author names to integer IDs.
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
authors_d = np.array([self.author2id[a] for a in self.doc2author[doc_no]], dtype=np.integer)

gammad = self.state.gamma[authors_d, :] # gamma of document d before update.
tilde_gamma = gammad.copy() # gamma that will be updated.
Expand Down Expand Up @@ -972,9 +973,9 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None,
else:
doc_no = d
# Get all authors in current document, and convert the author names to integer IDs.
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
ids = np.array([id for id, _ in doc]) # Word IDs in doc.
cts = np.array([cnt for _, cnt in doc]) # Word counts.
authors_d = np.array([self.author2id[a] for a in self.doc2author[doc_no]], dtype=np.integer)
ids = np.array([id for id, _ in doc], dtype=np.integer) # Word IDs in doc.
cts = np.array([cnt for _, cnt in doc], dtype=np.integer) # Word counts.

if d % self.chunksize == 0:
logger.debug("bound: at document #%i in chunk", d)
Expand Down
11 changes: 10 additions & 1 deletion gensim/test/test_atmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
# increases the bound.
# Test that models are compatible across versions, as done in LdaModel.


# Assign some authors randomly to the documents above.
author2doc = {
'john': [0, 1, 2, 3, 4, 5, 6],
Expand Down Expand Up @@ -110,6 +109,16 @@ def testBasic(self):
jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
self.assertTrue(all(jill_topics > 0))

def testEmptyDocument(self):
    """Smoke test: building the model on a corpus with an extra one-token document must not raise.

    The appended document holds a single token that occurs nowhere else in the
    corpus, and the dictionary is pruned with ``no_below=2`` before the
    bag-of-words corpus is built.
    NOTE(review): presumably the pruning removes that token so the last
    document's bag-of-words is empty -- confirm against ``filter_extremes``.
    """
    texts = common_texts + [['only_occurs_once_in_corpus_and_alone_in_doc']]

    vocab = Dictionary(texts)
    vocab.filter_extremes(no_below=2)
    bows = [vocab.doc2bow(tokens) for tokens in texts]

    # Work on a copy so the module-level author mapping stays untouched,
    # then attribute the appended (last) document to a new author.
    authors = dict(author2doc)
    last_doc_no = len(texts) - 1
    authors['joaquin'] = [last_doc_no]

    # No assertion: the test passes as long as construction does not raise.
    self.class_(bows, author2doc=authors, id2word=vocab, num_topics=2)

def testAuthor2docMissing(self):
# Check that the results are the same if author2doc is constructed automatically from doc2author.
model = self.class_(
Expand Down

0 comments on commit 61728a0

Please sign in to comment.