WIP: migrating tutorials to 4.0
piskvorky committed Sep 30, 2020
1 parent 0c0f358 commit 502b654
Showing 17 changed files with 435 additions and 609 deletions.
37 changes: 16 additions & 21 deletions docs/src/gallery/howtos/run_compare_lda.py
@@ -2,7 +2,7 @@
How to Compare LDA Models
=========================
-Demonstrates how you can compare a topic model with itself or other models.
+Demonstrates how you can visualize and compare trained topic models.
"""

@@ -16,7 +16,6 @@
# ---------------------------------------------------------------------
#


from string import punctuation
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
@@ -32,7 +31,7 @@
translate_tab = {ord(p): u" " for p in punctuation}

def text2tokens(raw_text):
"""Convert a raw text to a list of stemmed tokens."""
"""Split the raw_text string into a list of stemmed tokens."""
    clean_text = raw_text.lower().translate(translate_tab)
    tokens = [token.strip() for token in tokenizer.tokenize(clean_text)]
    tokens = [token for token in tokens if token not in eng_stopwords]
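For readers skimming the diff: the stemming step and the ``eng_stopwords``/``tokenizer`` setup are elided from this hunk. Below is a self-contained sketch of the whole pipeline, with the elided pieces filled in as assumptions (the tokenizer pattern, the stopword source, and the final stemming line are guesses consistent with the imports shown above):

from string import punctuation
from nltk import RegexpTokenizer
from nltk.corpus import stopwords  # assumed source of eng_stopwords; needs nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

tokenizer = RegexpTokenizer(r'\w+')  # assumed pattern
eng_stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()
translate_tab = {ord(p): u" " for p in punctuation}

def text2tokens(raw_text):
    """Split the raw_text string into a list of stemmed tokens."""
    clean_text = raw_text.lower().translate(translate_tab)
    tokens = [token.strip() for token in tokenizer.tokenize(clean_text)]
    tokens = [token for token in tokens if token not in eng_stopwords]
    return [stemmer.stem(token) for token in tokens]  # assumed final step

print(text2tokens("The strikers went on strike."))  # e.g. ['striker', 'went', 'strike']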
@@ -59,12 +58,12 @@ def text2tokens(raw_text):

lda_fst = LdaMulticore(
    corpus=d2b_dataset, num_topics=num_topics, id2word=dictionary,
-    workers=4, eval_every=None, passes=10, batch=True
+    workers=4, eval_every=None, passes=10, batch=True,
)

lda_snd = LdaMulticore(
    corpus=d2b_dataset, num_topics=num_topics, id2word=dictionary,
-    workers=4, eval_every=None, passes=20, batch=True
+    workers=4, eval_every=None, passes=20, batch=True,
)
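With both models trained, the comparison itself is a one-liner. A minimal sketch using gensim's documented ``diff`` method (the keyword values here are illustrative, not necessarily the tutorial's exact choices):

# Pairwise distances between the topics of the two models, plus per-cell
# word annotations (intersection / symmetric difference of top words).
mdiff, annotation = lda_fst.diff(lda_snd, distance='jaccard', num_words=50)
print(mdiff.shape)  # (num_topics, num_topics)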

###############################################################################
@@ -77,7 +76,6 @@ def text2tokens(raw_text):
# If you're viewing the static version of the page, you'll get a similar matplotlib heatmap, but it won't be interactive.
#


def plot_difference_plotly(mdiff, title="", annotation=None):
"""Plot the difference between models.
@@ -162,7 +160,8 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):
#
# * :raw-html-m2r:`<span style="color:blue">almost blue cell</span>` - strongly correlated topics.
#
-# In an ideal world, we would like to see different topics decorrelated between themselves. In this case, our matrix would look like this:
+# In an ideal world, we would like to see different topics decorrelated between themselves.
+# In this case, our matrix would look like this:
#


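The cell that renders this ideal matrix is elided from the hunk above. A hedged reconstruction of what it plausibly does: plot a matrix that is maximally distant everywhere except on the diagonal (``num_topics`` and ``plot_difference`` are assumed to be defined in the elided setup cells):

import numpy as np

# Maximal distance (red) everywhere except the diagonal: fully decorrelated topics.
plot_difference(np.ones((num_topics, num_topics)) - np.identity(num_topics),
                title="Ideal case: decorrelated topics")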
@@ -182,12 +181,8 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):
#
# Short description (interactive annotations only):
#
#
#
# * ``+++ make, world, well`` - words from the intersection of topics = present in both topics;
#
#
#
# * ``--- money, day, still`` - words from the symmetric difference of topics = present in one topic but not the other.
#

@@ -197,25 +192,24 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):

###############################################################################
#
-# If you compare a model with itself, you want to see as many red elements as possible (except diagonal). With this picture, you can look at the not very red elements and understand which topics in the model are very similar and why (you can read annotation if you move your pointer to cell).
+# If you compare a model with itself, you want to see as many red elements as
+# possible (except on the diagonal). With this picture, you can look at the
+# "not very red" elements and understand which topics in the model are very
+# similar and why (read the annotation by moving your pointer over a cell).
#
#
#
#
-# Jaccard is stable and robust distance function, but this function not enough sensitive for some purposes. Let's try to use Hellinger distance now.
+# Jaccard is a stable and robust distance function, but sometimes not sensitive
+# enough. Let's try to use the Hellinger distance instead.
#


mdiff, annotation = lda_fst.diff(lda_fst, distance='hellinger', num_words=50)
plot_difference(mdiff, title="Topic difference (one model)[hellinger distance]", annotation=annotation)

###############################################################################
#
-# You see that everything has become worse, but remember that everything depends on the task.
#
#
#
-# You need to choose the function with which your personal point of view about topics similarity and your task (from my experience, Jaccard is fine).
+# Choose a distance function that matches your task: decide what kind of "similarity"
+# is relevant to you. From my (Ivan's) experience, Jaccard is fine.
#


@@ -239,5 +233,6 @@ def plot_difference_matplotlib(mdiff, title="", annotation=None):

###############################################################################
#
-# Looking at this matrix, you can find similar and different topics (and relevant tokens which describe the intersection and difference).
+# Looking at this matrix, you can find similar and different topics between the two models.
+# The plot also includes relevant tokens describing the topics' intersection and difference.
#
4 changes: 1 addition & 3 deletions docs/src/gallery/howtos/run_doc.py
@@ -2,11 +2,9 @@
How to Author Gensim Documentation
==================================
-Some tips of how to author documentation for ``gensim``.
+How to author documentation for Gensim.
"""

-import sys

###############################################################################
# Background
# ----------
17 changes: 8 additions & 9 deletions docs/src/gallery/howtos/run_doc2vec_imdb.py
@@ -1,6 +1,6 @@
r"""
-How to Apply Doc2Vec to Reproduce the 'Paragraph Vector' paper
-==============================================================
+How to reproduce the doc2vec 'Paragraph Vector' paper
+=====================================================
Shows how to reproduce results of the "Distributed Representations of Sentences and Documents" paper by Le and Mikolov using Gensim.
@@ -100,8 +100,6 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
    return fname

# Download the file to local storage first.
-# We can't read it on the fly because of
-# https://github.com/RaRe-Technologies/smart_open/issues/331
with smart_open.open(url, "rb", ignore_ext=True) as fin:
    with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
        while True:
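            # (Loop body elided in the diff. A hedged reconstruction of the
            # buffered copy; assumes `import io` appears in the elided header.)
            buf = fin.read(io.DEFAULT_BUFFER_SIZE)
            if not buf:
                break
            fout.write(buf)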
@@ -139,18 +137,19 @@ def extract_documents():
alldocs = list(extract_documents())

###############################################################################
-# Here's what a single document looks like
+# Here's what a single document looks like.
print(alldocs[27])

###############################################################################
-# Extract our documents and split into training/test sets
+# Extract our documents and split into training/test sets.
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))

###############################################################################
# Set-up Doc2Vec Training & Evaluation Models
# -------------------------------------------
#
# We approximate the experiment of Le & Mikolov `"Distributed Representations
# of Sentences and Documents"
# <http://cs.stanford.edu/~quocle/paragraph_vector.pdf>`_ with guidance from
@@ -255,11 +254,11 @@ def error_rate_for_model(test_model, train_set, test_set):
"""Report error rate on test_doc sentiments, using supplied model and train_docs"""

train_targets = [doc.sentiment for doc in train_set]
train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
train_regressors = [test_model.dv[doc.tags[0]] for doc in train_set]
train_regressors = sm.add_constant(train_regressors)
predictor = logistic_predictor_from_data(train_targets, train_regressors)

test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_set]
test_regressors = [test_model.dv[doc.tags[0]] for doc in test_set]
test_regressors = sm.add_constant(test_regressors)

# Predict & evaluate
@@ -347,7 +346,7 @@ def error_rate_for_model(test_model, train_set, test_set):
###############################################################################
# Are inferred vectors close to the precalculated ones?
# -----------------------------------------------------
-doc_id = np.random.randint(simple_models[0].docvecs.count)  # Pick random doc; re-run cell for more examples
+doc_id = np.random.randint(len(simple_models[0].dv))  # Pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
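    # (Continuation elided in the diff. A hedged sketch: compare the freshly
    # inferred vector against the vectors learned during training.)
    print('%s:\n %s' % (model, model.dv.most_similar([inferred_docvec], topn=3)))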
38 changes: 21 additions & 17 deletions docs/src/gallery/howtos/run_downloader_api.py
@@ -2,21 +2,23 @@
How to download pre-trained models and corpora
==============================================
-Demonstrates simple and quick access to common corpora, models, and other data.
+Demonstrates simple and quick access to common corpora and pretrained models.
"""

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

###############################################################################
-# One of Gensim's features is simple and easy access to some common data.
-# The `gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_ project stores a variety of corpora, models and other data.
+# One of Gensim's features is simple and easy access to common data.
+# The `gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_ project stores a
+# variety of corpora and pretrained models.
# Gensim has a :py:mod:`gensim.downloader` module for programmatically accessing this data.
-# The module leverages a local cache that ensures data is downloaded at most once.
+# This module leverages a local cache (in the user's home folder, by default) that
+# ensures data is downloaded at most once.
#
# This tutorial:
#
-# * Retrieves the text8 corpus, unless it is already on your local machine
+# * Downloads the text8 corpus, unless it is already on your local machine
# * Trains a Word2Vec model from the corpus (see :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` for a detailed tutorial)
# * Leverages the model to calculate word similarity
# * Demonstrates using the API to load other models and corpora
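The import cell that follows in the original file is elided from this hunk; presumably it is just the downloader module itself:

import gensim.downloader as api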
@@ -27,12 +29,13 @@

###############################################################################
#
-# Now, lets download the text8 corpus and load it to memory (automatically)
+# Now, let's download the text8 corpus and load it as a Python object
+# that supports streamed access.
#
corpus = api.load('text8')

###############################################################################
-# In this case, corpus is an iterable.
+# In this case, our corpus is an iterable.
# If you look under the covers, it has the following definition:

import inspect
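# (Rest of this cell is elided in the diff; presumably it prints the source of
# the corpus wrapper class, along these lines:)
print(inspect.getsource(corpus.__class__))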
@@ -46,23 +49,24 @@

###############################################################################
#
-# As the corpus has been downloaded and loaded, let's create a word2vec model of our corpus.
+# Now that the corpus has been downloaded and loaded, let's use it to train a word2vec model.
#

from gensim.models.word2vec import Word2Vec
model = Word2Vec(corpus)

###############################################################################
#
-# Now that we have our word2vec model, let's find words that are similar to 'tree'
+# Now that we have our word2vec model, let's find words that are similar to 'tree'.
#


-print(model.most_similar('tree'))
+print(model.wv.most_similar('tree'))

###############################################################################
#
-# You can use the API to download many corpora and models. You can get the list of all the models and corpora that are provided, by using the code below:
+# You can use the API to download several different corpora and pretrained models.
+# Here's how to list all resources available in gensim-data:
#


@@ -71,7 +75,7 @@
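# (The line fetching the catalogue is elided in the diff; presumably the
# documented api.info() call, which returns a dict describing all resources.)
info = api.info()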
print(json.dumps(info, indent=4))

###############################################################################
-# There are two types of data: corpora and models.
+# There are two types of data resources: corpora and models.
print(info.keys())

###############################################################################
@@ -98,7 +102,7 @@

###############################################################################
#
-# If you want to get detailed information about the model/corpus, use:
+# If you want to get detailed information about a model/corpus, use:
#


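# (Code cell elided in the diff; a hedged sketch using the documented
# api.info(name) form.)
print(api.info('text8'))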
@@ -107,7 +111,8 @@

###############################################################################
#
-# Sometimes, you do not want to load the model to memory. You would just want to get the path to the model. For that, use :
+# Sometimes, you do not want to load a model into memory. Instead, you can request
+# just the filesystem path to the model. For that, use:
#


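# (Code cell elided in the diff; a hedged sketch using the documented
# return_path flag of gensim.downloader.load.)
print(api.load('text8', return_path=True))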
@@ -124,7 +129,6 @@

###############################################################################
#
-# In corpora, the corpus is never loaded to memory, all corpuses wrapped to special class ``Dataset`` and provide ``__iter__`` method
+# For corpora, the data is never fully loaded into memory: all corpora are iterables wrapped in
+# a special ``Dataset`` class that provides an ``__iter__`` method.
#
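A hedged sketch of what this streamed access looks like in practice (the corpus name is illustrative):

corpus = api.load('text8')
for i, doc in enumerate(corpus):  # streams from disk; never materializes the full corpus
    print(len(doc))  # each item is a list of string tokens
    if i >= 2:
        break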



