WIP: migrating tutorials to 4.0
piskvorky committed Sep 30, 2020
1 parent 0c0f358 commit 502b654
Showing 17 changed files with 435 additions and 609 deletions.
37 changes: 16 additions & 21 deletions docs/src/gallery/howtos/run_compare_lda.py
@@ -2,7 +2,7 @@
How to Compare LDA Models
=========================
-Demonstrates how you can compare a topic model with itself or other models.
+Demonstrates how you can visualize and compare trained topic models.
"""

@@ -16,7 +16,6 @@
# ---------------------------------------------------------------------
#


from string import punctuation
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
@@ -32,7 +31,7 @@
translate_tab = {ord(p): u" " for p in punctuation}

def text2tokens(raw_text):
"""Convert a raw text to a list of stemmed tokens."""
"""Split the raw_text string into a list of stemmed tokens."""
    clean_text = raw_text.lower().translate(translate_tab)
    tokens = [token.strip() for token in tokenizer.tokenize(clean_text)]
    tokens = [token for token in tokens if token not in eng_stopwords]
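For readers skimming the diff: the stemming step and the ``eng_stopwords``/``tokenizer`` setup are elided from this hunk. Below is a self-contained sketch of the whole pipeline, with the elided pieces filled in as assumptions (the tokenizer pattern, the stopword source, and the final stemming line are guesses consistent with the imports shown above):

from string import punctuation
from nltk import RegexpTokenizer
from nltk.corpus import stopwords  # assumed source of eng_stopwords; needs nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

tokenizer = RegexpTokenizer(r'\w+')  # assumed pattern
eng_stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()
translate_tab = {ord(p): u" " for p in punctuation}

def text2tokens(raw_text):
    """Split the raw_text string into a list of stemmed tokens."""
    clean_text = raw_text.lower().translate(translate_tab)
    tokens = [token.strip() for token in tokenizer.tokenize(clean_text)]
    tokens = [token for token in tokens if token not in eng_stopwords]
    return [stemmer.stem(token) for token in tokens]  # assumed final step

print(text2tokens("The strikers went on strike."))  # e.g. ['striker', 'went', 'strike']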
@@ -59,12 +58,12 @@ def text2tokens(raw_text):

lda_fst = LdaMulticore(
    corpus=d2b_dataset, num_topics=num_topics, id2word=dictionary,
-    workers=4, eval_every=None, passes=10, batch=True
+    workers=4, eval_every=None, passes=10, batch=True,
)

lda_snd = LdaMulticore(
    corpus=d2b_dataset, num_topics=num_topics, id2word=dictionary,
-    workers=4, eval_every=None, passes=20, batch=True
+    workers=4, eval_every=None, passes=20, batch=True,
)
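With both models trained, the comparison itself is a one-liner. A minimal sketch using gensim's documented ``diff`` method (the keyword values here are illustrative, not necessarily the tutorial's exact choices):

# Pairwise distances between the topics of the two models, plus per-cell
# word annotations (intersection / symmetric difference of top words).
mdiff, annotation = lda_fst.diff(lda_snd, distance='jaccard', num_words=50)
print(mdiff.shape)  # (num_topics, num_topics)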

###############################################################################
@@ -77,7 +76,6 @@ def text2tokens(raw_text):
# If you're viewing the static version of the page, you'll get a similar matplotlib heatmap, but it won't be interactive.
#


def plot_difference_plotly(mdiff, title="", annotation=None):
"""Plot the difference between models.
@@ -162,7 +160,8 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):
#
# * :raw-html-m2r:`<span style="color:blue">almost blue cell</span>` - strongly correlated topics.
#
-# In an ideal world, we would like to see different topics decorrelated between themselves. In this case, our matrix would look like this:
+# In an ideal world, we would like to see different topics decorrelated between themselves.
+# In this case, our matrix would look like this:
#


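The cell that renders this ideal matrix is elided from the hunk above. A hedged reconstruction of what it plausibly does: plot a matrix that is maximally distant everywhere except on the diagonal (``num_topics`` and ``plot_difference`` are assumed to be defined in the elided setup cells):

import numpy as np

# Maximal distance (red) everywhere except the diagonal: fully decorrelated topics.
plot_difference(np.ones((num_topics, num_topics)) - np.identity(num_topics),
                title="Ideal case: decorrelated topics")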
@@ -182,12 +181,8 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):
#
# Short description (interactive annotations only):
#
#
#
# * ``+++ make, world, well`` - words from the intersection of topics = present in both topics;
#
#
#
# * ``--- money, day, still`` - words from the symmetric difference of topics = present in one topic but not the other.
#

@@ -197,25 +192,24 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):

###############################################################################
#
-# If you compare a model with itself, you want to see as many red elements as possible (except diagonal). With this picture, you can look at the not very red elements and understand which topics in the model are very similar and why (you can read annotation if you move your pointer to cell).
+# If you compare a model with itself, you want to see as many red elements as
+# possible (except on the diagonal). With this picture, you can look at the
+# "not very red" elements and understand which topics in the model are very
+# similar and why (read the annotation by moving your pointer over a cell).
#
#
#
#
-# Jaccard is stable and robust distance function, but this function not enough sensitive for some purposes. Let's try to use Hellinger distance now.
+# Jaccard is a stable and robust distance function, but sometimes not sensitive
+# enough. Let's try to use the Hellinger distance instead.
#


mdiff, annotation = lda_fst.diff(lda_fst, distance='hellinger', num_words=50)
plot_difference(mdiff, title="Topic difference (one model)[hellinger distance]", annotation=annotation)

###############################################################################
#
-# You see that everything has become worse, but remember that everything depends on the task.
#
#
#
-# You need to choose the function with which your personal point of view about topics similarity and your task (from my experience, Jaccard is fine).
+# Choose a distance function that matches your task: decide what kind of "similarity"
+# is relevant to you. From my (Ivan's) experience, Jaccard is fine.
#


@@ -239,5 +233,6 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):

###############################################################################
#
-# Looking at this matrix, you can find similar and different topics (and relevant tokens which describe the intersection and difference).
+# Looking at this matrix, you can find similar and different topics between the two models.
+# The plot also includes relevant tokens describing the topics' intersection and difference.
#
4 changes: 1 addition & 3 deletions docs/src/gallery/howtos/run_doc.py
@@ -2,11 +2,9 @@
How to Author Gensim Documentation
==================================
-Some tips of how to author documentation for ``gensim``.
+How to author documentation for Gensim.
"""

-import sys

###############################################################################
# Background
# ----------
17 changes: 8 additions & 9 deletions docs/src/gallery/howtos/run_doc2vec_imdb.py
@@ -1,6 +1,6 @@
r"""
-How to Apply Doc2Vec to Reproduce the 'Paragraph Vector' paper
-==============================================================
+How to reproduce the doc2vec 'Paragraph Vector' paper
+=====================================================
Shows how to reproduce results of the "Distributed Representations of Sentences and Documents" paper by Le and Mikolov using Gensim.
@@ -100,8 +100,6 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
    return fname

# Download the file to local storage first.
-# We can't read it on the fly because of
-# https://github.com/RaRe-Technologies/smart_open/issues/331
with smart_open.open(url, "rb", ignore_ext=True) as fin:
    with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
        while True:
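            # (Loop body elided in the diff. A hedged reconstruction of the
            # buffered copy; assumes `import io` appears in the elided header.)
            buf = fin.read(io.DEFAULT_BUFFER_SIZE)
            if not buf:
                break
            fout.write(buf)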
@@ -139,18 +137,19 @@ def extract_documents():
alldocs = list(extract_documents())

###############################################################################
-# Here's what a single document looks like
+# Here's what a single document looks like.
print(alldocs[27])

###############################################################################
-# Extract our documents and split into training/test sets
+# Extract our documents and split into training/test sets.
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))

###############################################################################
# Set-up Doc2Vec Training & Evaluation Models
# -------------------------------------------
#
# We approximate the experiment of Le & Mikolov `"Distributed Representations
# of Sentences and Documents"
# <http://cs.stanford.edu/~quocle/paragraph_vector.pdf>`_ with guidance from
@@ -255,11 +254,11 @@ def error_rate_for_model(test_model, train_set, test_set):
"""Report error rate on test_doc sentiments, using supplied model and train_docs"""

train_targets = [doc.sentiment for doc in train_set]
train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
train_regressors = [test_model.dv[doc.tags[0]] for doc in train_set]
train_regressors = sm.add_constant(train_regressors)
predictor = logistic_predictor_from_data(train_targets, train_regressors)

test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_set]
test_regressors = [test_model.dv[doc.tags[0]] for doc in test_set]
test_regressors = sm.add_constant(test_regressors)

# Predict & evaluate
@@ -347,7 +346,7 @@ def error_rate_for_model(test_model, train_set, test_set):
###############################################################################
# Are inferred vectors close to the precalculated ones?
# -----------------------------------------------------
-doc_id = np.random.randint(simple_models[0].docvecs.count)  # Pick random doc; re-run cell for more examples
+doc_id = np.random.randint(len(simple_models[0].dv))  # Pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
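    # (Continuation elided in the diff. A hedged sketch: compare the freshly
    # inferred vector against the vectors learned during training.)
    print('%s:\n %s' % (model, model.dv.most_similar([inferred_docvec], topn=3)))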
38 changes: 21 additions & 17 deletions docs/src/gallery/howtos/run_downloader_api.py
@@ -2,21 +2,23 @@
How to download pre-trained models and corpora
==============================================
-Demonstrates simple and quick access to common corpora, models, and other data.
+Demonstrates simple and quick access to common corpora and pretrained models.
"""

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

###############################################################################
-# One of Gensim's features is simple and easy access to some common data.
-# The `gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_ project stores a variety of corpora, models and other data.
+# One of Gensim's features is simple and easy access to common data.
+# The `gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_ project stores a
+# variety of corpora and pretrained models.
# Gensim has a :py:mod:`gensim.downloader` module for programmatically accessing this data.
-# The module leverages a local cache that ensures data is downloaded at most once.
+# This module leverages a local cache (in the user's home folder, by default) that
+# ensures data is downloaded at most once.
#
# This tutorial:
#
-# * Retrieves the text8 corpus, unless it is already on your local machine
+# * Downloads the text8 corpus, unless it is already on your local machine
# * Trains a Word2Vec model from the corpus (see :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` for a detailed tutorial)
# * Leverages the model to calculate word similarity
# * Demonstrates using the API to load other models and corpora
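The import cell that follows in the original file is elided from this hunk; presumably it is just the downloader module itself:

import gensim.downloader as api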
@@ -27,12 +29,13 @@

###############################################################################
#
-# Now, lets download the text8 corpus and load it to memory (automatically)
+# Now, let's download the text8 corpus and load it as a Python object
+# that supports streamed access.
#
corpus = api.load('text8')

###############################################################################
-# In this case, corpus is an iterable.
+# In this case, our corpus is an iterable.
# If you look under the covers, it has the following definition:

import inspect
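# (Rest of this cell is elided in the diff; presumably it prints the source of
# the corpus wrapper class, along these lines:)
print(inspect.getsource(corpus.__class__))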
@@ -46,23 +49,24 @@

###############################################################################
#
-# As the corpus has been downloaded and loaded, let's create a word2vec model of our corpus.
+# Now that the corpus has been downloaded and loaded, let's use it to train a word2vec model.
#

from gensim.models.word2vec import Word2Vec
model = Word2Vec(corpus)

###############################################################################
#
-# Now that we have our word2vec model, let's find words that are similar to 'tree'
+# Now that we have our word2vec model, let's find words that are similar to 'tree'.
#


-print(model.most_similar('tree'))
+print(model.wv.most_similar('tree'))

###############################################################################
#
-# You can use the API to download many corpora and models. You can get the list of all the models and corpora that are provided, by using the code below:
+# You can use the API to download several different corpora and pretrained models.
+# Here's how to list all resources available in gensim-data:
#


@@ -71,7 +75,7 @@
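# (The line fetching the catalogue is elided in the diff; presumably the
# documented api.info() call, which returns a dict describing all resources.)
info = api.info()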
print(json.dumps(info, indent=4))

###############################################################################
-# There are two types of data: corpora and models.
+# There are two types of data resources: corpora and models.
print(info.keys())

###############################################################################
@@ -98,7 +102,7 @@

###############################################################################
#
-# If you want to get detailed information about the model/corpus, use:
+# If you want to get detailed information about a model/corpus, use:
#


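# (Code cell elided in the diff; a hedged sketch using the documented
# api.info(name) form.)
print(api.info('text8'))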
@@ -107,7 +111,8 @@

###############################################################################
#
-# Sometimes, you do not want to load the model to memory. You would just want to get the path to the model. For that, use :
+# Sometimes, you do not want to load a model into memory. Instead, you can request
+# just the filesystem path to the model. For that, use:
#


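# (Code cell elided in the diff; a hedged sketch using the documented
# return_path flag of gensim.downloader.load.)
print(api.load('text8', return_path=True))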
@@ -124,7 +129,6 @@

###############################################################################
#
-# In corpora, the corpus is never loaded to memory, all corpuses wrapped to special class ``Dataset`` and provide ``__iter__`` method
+# For corpora, the data is never fully loaded into memory: all corpora are iterables wrapped in
+# a special ``Dataset`` class that provides an ``__iter__`` method.
#
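A hedged sketch of what this streamed access looks like in practice (the corpus name is illustrative):

corpus = api.load('text8')
for i, doc in enumerate(corpus):  # streams from disk; never materializes the full corpus
    print(len(doc))  # each item is a list of string tokens
    if i >= 2:
        break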



