Skip to content

Commit

Permalink
Replace custom epsilons with numpy equivalent in LdaModel (#2308)
Browse files: browse the repository at this point in the history
* Fix #2115: Replace custom epsilons with automatic numpy equivalent

* fix typo
  • Loading branch information
horpto authored and menshikh-iv committed Jan 9, 2019
1 parent 9af9416 commit 1b07f81
Showing 1 changed file with 5 additions and 17 deletions.
22 changes: 5 additions & 17 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,6 @@

logger = logging.getLogger(__name__)

# Epsilon (very small) values used by each expected data type instead of 0, to avoid Arithmetic Errors.
DTYPE_TO_EPS = {
np.float16: 1e-5,
np.float32: 1e-35,
np.float64: 1e-100,
}


def update_dir_prior(prior, N, logphat, rho):
"""Update a given prior using Newton's method, described in
Expand Down Expand Up @@ -426,12 +419,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
Data-type to use during calculations inside model. All inputs are also converted.
"""
if dtype not in DTYPE_TO_EPS:
raise ValueError(
"Incorrect 'dtype', please choose one of {}".format(
", ".join("numpy.{}".format(tp.__name__) for tp in sorted(DTYPE_TO_EPS))))

self.dtype = dtype
self.dtype = np.finfo(dtype).dtype

# store user-supplied parameters
self.id2word = id2word
Expand Down Expand Up @@ -668,6 +656,7 @@ def inference(self, chunk, collect_sstats=False):
# Lee&Seung trick which speeds things up by an order of magnitude, compared
# to Blei's original LDA-C code, cool!).
integer_types = six.integer_types + (np.integer,)
epsilon = np.finfo(self.dtype).eps
for d, doc in enumerate(chunk):
if len(doc) > 0 and not isinstance(doc[0][0], integer_types):
# make sure the term IDs are ints, otherwise np will get upset
Expand All @@ -683,8 +672,7 @@ def inference(self, chunk, collect_sstats=False):
# The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
# phinorm is the normalizer.
# TODO treat zeros explicitly, instead of adding epsilon?
eps = DTYPE_TO_EPS[self.dtype]
phinorm = np.dot(expElogthetad, expElogbetad) + eps
phinorm = np.dot(expElogthetad, expElogbetad) + epsilon

# Iterate between gamma and phi until convergence
for _ in range(self.iterations):
Expand All @@ -695,7 +683,7 @@ def inference(self, chunk, collect_sstats=False):
gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
Elogthetad = dirichlet_expectation(gammad)
expElogthetad = np.exp(Elogthetad)
phinorm = np.dot(expElogthetad, expElogbetad) + eps
phinorm = np.dot(expElogthetad, expElogbetad) + epsilon
# If gamma hasn't changed much, we're done.
meanchange = mean_absolute_difference(gammad, lastgamma)
if meanchange < self.gamma_threshold:
Expand Down Expand Up @@ -1289,7 +1277,7 @@ def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=N
minimum_probability : float
Topics with an assigned probability lower than this threshold will be discarded.
minimum_phi_value : float
f `per_word_topics` is True, this represents a lower bound on the term probabilities that are included.
If `per_word_topics` is True, this represents a lower bound on the term probabilities that are included.
If set to None, a value of 1e-8 is used to prevent 0s.
per_word_topics : bool
If True, this function will also return two extra lists as explained in the "Returns" section.
Expand Down

0 comments on commit 1b07f81

Please sign in to comment.