From dd349ffcf6d0f354767b1b37adb00582a4f036e8 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 22 Nov 2021 14:17:25 -0500
Subject: [PATCH 01/10] Add option to use scheduled sampling in copynet

---
 .../generation/models/copynet_seq2seq.py | 36 ++++++++++++++++---
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index eaf7fb4d3..f8b18e5eb 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -56,6 +56,14 @@ class CopyNetSeq2Seq(Model):
         This is used to during inference to select the tokens of the decoded output sequence.
     target_embedding_dim : `int`, optional (default = `30`)
         The size of the embeddings for the target vocabulary.
+    scheduled_sampling_ratio : `float`, optional (default = `0.`)
+        At each timestep during training, we sample a random number between 0 and 1, and if it is
+        not less than this value, we use the ground truth labels for the whole batch. Else, we use
+        the predictions from the previous time step for the whole batch. If this value is 0.0
+        (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
+        using target side ground truth labels. See the following paper for more information:
+        [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al.,
+        2015](https://arxiv.org/abs/1506.03099).
     copy_token : `str`, optional (default = `'@COPY@'`)
         The token used to indicate that a target token was copied from the source.
         If this token is not already in your target vocabulary, it will be added.
@@ -83,6 +91,7 @@ def __init__(
         label_smoothing: float = None,
         beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
         target_embedding_dim: int = 30,
+        scheduled_sampling_ratio: float = 0.0,
         copy_token: str = "@COPY@",
         target_namespace: str = "target_tokens",
         tensor_based_metric: Metric = None,
@@ -92,6 +101,7 @@ def __init__(
     ) -> None:
         super().__init__(vocab)
         self._target_namespace = target_namespace
+        self._scheduled_sampling_ratio = scheduled_sampling_ratio
         self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
         self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
         self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace)
@@ -515,13 +525,29 @@ def _forward_loss(
             (batch_size, self._target_vocab_size), fill_value=1.0, dtype=torch.bool
         )
 
+        # Initialize target predictions with the start index.
+        # shape: (batch_size,)
+        last_predictions = source_mask.new_full(
+            (batch_size,), fill_value=self._start_index, dtype=torch.long
+        )
+
         step_log_likelihoods = []
         for timestep in range(num_decoding_steps):
-            # shape: (batch_size,)
-            input_choices = target_tokens["tokens"]["tokens"][:, timestep]
-            # Get mask tensor indicating which instances were copied.
-            # shape: (batch_size,)
-            copied = ((input_choices == self._oov_index) & (target_to_source.sum(-1) > 0)).long()
+            if self.training and torch.rand(1).item() < self._scheduled_sampling_ratio:
+                # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio
+                # during training.
+                # shape: (batch_size,)
+                input_choices = last_predictions
+                # Get mask tensor indicating which instances were copied.
+                # shape: (batch_size,)
+                copied = (input_choices >= self._target_vocab_size).long()
+            else:
+                # shape: (batch_size,)
+                input_choices = target_tokens["tokens"]["tokens"][:, timestep]
+                # shape: (batch_size,)
+                copied = (
+                    (input_choices == self._oov_index) & (target_to_source.sum(-1) > 0)
+                ).long()
             # shape: (batch_size,)
             input_choices = input_choices * (1 - copied) + copy_input_choices * copied
             # shape: (batch_size, source_sequence_length)

From 2a3a70246556e0d3c7d91a2e94e9a874f0cf027d Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Fri, 10 Dec 2021 10:44:44 -0500
Subject: [PATCH 02/10] Don't call torch.rand if sampling_ratio is None or 0.0

---
 .../generation/models/copynet_seq2seq.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index f8b18e5eb..d12a0a3f7 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -1,6 +1,6 @@
 import logging
 import warnings
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union, Optional
 
 import numpy
 import torch
@@ -56,11 +56,11 @@ class CopyNetSeq2Seq(Model):
         This is used to during inference to select the tokens of the decoded output sequence.
     target_embedding_dim : `int`, optional (default = `30`)
         The size of the embeddings for the target vocabulary.
-    scheduled_sampling_ratio : `float`, optional (default = `0.`)
+    scheduled_sampling_ratio : `float`, optional (default = `None`)
         At each timestep during training, we sample a random number between 0 and 1, and if it is
         not less than this value, we use the ground truth labels for the whole batch. Else, we use
-        the predictions from the previous time step for the whole batch. If this value is 0.0
-        (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
+        the predictions from the previous time step for the whole batch. If this value is 0.0 or
+        None (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
         using target side ground truth labels. See the following paper for more information:
         [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al.,
         2015](https://arxiv.org/abs/1506.03099).
@@ -91,7 +91,7 @@ def __init__(
         label_smoothing: float = None,
         beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
         target_embedding_dim: int = 30,
-        scheduled_sampling_ratio: float = 0.0,
+        scheduled_sampling_ratio: Optional[float] = None,
         copy_token: str = "@COPY@",
         target_namespace: str = "target_tokens",
         tensor_based_metric: Metric = None,
@@ -533,7 +533,11 @@ def _forward_loss(
 
         step_log_likelihoods = []
         for timestep in range(num_decoding_steps):
-            if self.training and torch.rand(1).item() < self._scheduled_sampling_ratio:
+            if (
+                self.training
+                and self._scheduled_sampling_ratio
+                and torch.rand(1).item() < self._scheduled_sampling_ratio
+            ):
                 # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio
                 # during training.
                # shape: (batch_size,)

From a15356fc4e3b2a6c1d91e90d795a5b8cd110ac02 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Fri, 10 Dec 2021 10:48:39 -0500
Subject: [PATCH 03/10] Add test for copynet with scheduled_sampling_ratio

---
 tests/generation/models/copynet_test.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/generation/models/copynet_test.py b/tests/generation/models/copynet_test.py
index c857c2ac7..1e2712474 100644
--- a/tests/generation/models/copynet_test.py
+++ b/tests/generation/models/copynet_test.py
@@ -44,6 +44,13 @@ def test_model_can_train_with_amp(self):
             overrides="{'trainer.use_amp':true,'trainer.cuda_device':0}",
         )
 
+    def test_model_can_train_with_scheduled_sampling_ratio(self):
+        train_model_from_file(
+            self.param_file,
+            self.TEST_DIR,
+            overrides="{'model.scheduled_sampling_ratio':0.5}",
+        )
+
     def test_vocab(self):
         vocab = self.model.vocab
         assert vocab.get_vocab_size(self.model._target_namespace) == 8

From 61792c2650370405ae33b69ab7e2ebf4d4f436a6 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Fri, 10 Dec 2021 11:03:54 -0500
Subject: [PATCH 04/10] Avoid call to torch.rand if sampling_ratio is falsey

---
 allennlp_models/generation/models/copynet_seq2seq.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index bf5704d28..e5702aa18 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -1,6 +1,6 @@
 import logging
 import warnings
-from typing import Any, Dict, List, Tuple, Union, Optional
+from typing import Any, Dict, List, Tuple, Union
 
 import numpy
 import torch
@@ -56,11 +56,11 @@ class CopyNetSeq2Seq(Model):
         This is used to during inference to select the tokens of the decoded output sequence.
     target_embedding_dim : `int`, optional (default = `30`)
         The size of the embeddings for the target vocabulary.
-    scheduled_sampling_ratio : `float`, optional (default = `None`)
+    scheduled_sampling_ratio : `float`, optional (default = `0.`)
        At each timestep during training, we sample a random number between 0 and 1, and if it is
        not less than this value, we use the ground truth labels for the whole batch. Else, we use
-        the predictions from the previous time step for the whole batch. If this value is 0.0 or
-        None (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
+        the predictions from the previous time step for the whole batch. If this value is 0.0
+        (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
        using target side ground truth labels. See the following paper for more information:
        [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al.,
        2015](https://arxiv.org/abs/1506.03099).
@@ -91,7 +91,7 @@ def __init__(
         label_smoothing: float = None,
         beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
         target_embedding_dim: int = 30,
-        scheduled_sampling_ratio: Optional[float] = None,
+        scheduled_sampling_ratio: float = 0.0,
         copy_token: str = "@COPY@",
         target_namespace: str = "target_tokens",
         tensor_based_metric: Metric = None,

From acd7f296712a2a1d3a2310afecfa787c933d5ed7 Mon Sep 17 00:00:00 2001
From: John Giorgi
Date: Mon, 13 Dec 2021 12:25:03 -0500
Subject: [PATCH 05/10] Update allennlp_models/generation/models/copynet_seq2seq.py

Co-authored-by: Pete
---
 allennlp_models/generation/models/copynet_seq2seq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index e5702aa18..bd2cfacaf 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -534,7 +534,7 @@ def _forward_loss(
         for timestep in range(num_decoding_steps):
             if (
                 self.training
-                and self._scheduled_sampling_ratio
+                and self._scheduled_sampling_ratio > 0.0
                 and torch.rand(1).item() < self._scheduled_sampling_ratio
             ):
                 # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio

From 4ce4c7badc58fa391313fbbddd3928c430f89441 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:40:14 -0500
Subject: [PATCH 06/10] Correctly compute last_predictions

---
 allennlp_models/generation/models/copynet_seq2seq.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index bd2cfacaf..380523c7f 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -401,7 +401,7 @@ def _get_ll_contrib(
         target_tokens: torch.Tensor,
         target_to_source: torch.Tensor,
         source_mask: torch.BoolTensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Get the log-likelihood contribution from a single timestep.
 
@@ -484,7 +484,7 @@ def _get_ll_contrib(
         # shape: (batch_size,)
         step_log_likelihood = util.logsumexp(combined_gen_and_copy)
 
-        return step_log_likelihood, selective_weights
+        return step_log_likelihood, selective_weights, log_probs
 
     def _forward_loss(
         self,
@@ -568,7 +568,7 @@ def _forward_loss(
             copy_scores = self._get_copy_scores(state)
             # shape: (batch_size,)
             step_target_tokens = target_tokens["tokens"]["tokens"][:, timestep + 1]
-            step_log_likelihood, selective_weights = self._get_ll_contrib(
+            step_log_likelihood, selective_weights, log_probs = self._get_ll_contrib(
                 generation_scores,
                 generation_scores_mask,
                 copy_scores,
@@ -577,6 +577,8 @@ def _forward_loss(
                 source_mask,
             )
             step_log_likelihoods.append(step_log_likelihood.unsqueeze(1))
+            # shape (predicted_classes): (batch_size,)
+            _, last_predictions = torch.max(log_probs, 1)
 
         # Gather step log-likelihoods.
        # shape: (batch_size, num_decoding_steps = target_sequence_length - 1)

From c4d5444b5aedf8d5652124a1ee4b7c8ad7cbd3d7 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:43:32 -0500
Subject: [PATCH 07/10] Update copynet get_ll_contrib test

---
 tests/generation/models/copynet_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/generation/models/copynet_test.py b/tests/generation/models/copynet_test.py
index 1e2712474..86b66b3ca 100644
--- a/tests/generation/models/copynet_test.py
+++ b/tests/generation/models/copynet_test.py
@@ -140,7 +140,7 @@ def test_get_ll_contrib(self):
         generation_scores_mask = generation_scores.new_full(
             generation_scores.size(), True, dtype=torch.bool
         )
-        ll_actual, selective_weights_actual = self.model._get_ll_contrib(
+        ll_actual, selective_weights_actual, _ = self.model._get_ll_contrib(
             generation_scores,
             generation_scores_mask,
             copy_scores,

From 0cc0197f55e9b4b0161f877ee72676c38df5a1d4 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:48:48 -0500
Subject: [PATCH 08/10] Avoid call to torch.rand if scheduled_sampling is 0

---
 allennlp_models/generation/models/simple_seq2seq.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/allennlp_models/generation/models/simple_seq2seq.py b/allennlp_models/generation/models/simple_seq2seq.py
index cc73d4cfc..9458ad8ff 100644
--- a/allennlp_models/generation/models/simple_seq2seq.py
+++ b/allennlp_models/generation/models/simple_seq2seq.py
@@ -367,7 +367,11 @@ def _forward_loop(
         step_logits: List[torch.Tensor] = []
         step_predictions: List[torch.Tensor] = []
         for timestep in range(num_decoding_steps):
-            if self.training and torch.rand(1).item() < self._scheduled_sampling_ratio:
+            if (
+                self.training
+                and self._scheduled_sampling_ratio > 0.0
+                and torch.rand(1).item() < self._scheduled_sampling_ratio
+            ):
                 # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio
                 # during training.
                 # shape: (batch_size,)

From 39ef089035f1a2ab035ad526ec43c87c34eb477c Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:48:58 -0500
Subject: [PATCH 09/10] Add test for scheduled sampling

---
 tests/generation/models/simple_seq2seq_test.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/generation/models/simple_seq2seq_test.py b/tests/generation/models/simple_seq2seq_test.py
index 7b48ff863..28aa8922b 100644
--- a/tests/generation/models/simple_seq2seq_test.py
+++ b/tests/generation/models/simple_seq2seq_test.py
@@ -46,6 +46,13 @@ def test_model_can_train_with_amp(self):
             overrides="{'trainer.use_amp':true,'trainer.cuda_device':0}",
         )
 
+    def test_model_can_train_with_scheduled_sampling_ratio(self):
+        train_model_from_file(
+            self.param_file,
+            self.TEST_DIR,
+            overrides="{'model.scheduled_sampling_ratio':0.5}",
+        )
+
     def test_bidirectional_model_can_train_save_and_load(self):
         param_overrides = json.dumps({"model.encoder.bidirectional": True})
         self.ensure_model_can_train_save_and_load(

From 36d08f74672d1e49a6ac797733e271d2778abb7f Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:52:29 -0500
Subject: [PATCH 10/10] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 22f351856..3d48d7dc1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,6 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added a configuration to train on the PIQA dataset with AllenNLP Tango.
 - Added a transformer classification model.
 - Added a configuration to train on the IMDB dataset with AllenNLP Tango.
+- Added `scheduled_sampling_ratio` argument to `CopyNetSeq2Seq` to use scheduled sampling during training.
 
 ### Fixed
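Note for reviewers: taken together, these patches reduce to one per-timestep decision inside `CopyNetSeq2Seq._forward_loss` (mirroring the one already in `SimpleSeq2Seq._forward_loop`). The sketch below is illustrative only and is not part of the patches; the helper name `choose_decoder_input` is hypothetical. The real code additionally recomputes the `copied` mask when it feeds predictions back in, since prediction ids at or above the target vocabulary size denote tokens copied from the source.

    import torch

    def choose_decoder_input(
        gold_tokens: torch.Tensor,       # shape: (batch_size,), gold target ids for this timestep
        last_predictions: torch.Tensor,  # shape: (batch_size,), argmax of the previous step's log-probs
        scheduled_sampling_ratio: float,
        training: bool,
    ) -> torch.Tensor:
        # Checking `> 0.0` first skips the torch.rand call entirely when the feature
        # is disabled (the default), which is what patches 02/04/05/08 ensure.
        if (
            training
            and scheduled_sampling_ratio > 0.0
            and torch.rand(1).item() < scheduled_sampling_ratio
        ):
            # Scheduled sampling: feed the whole batch its own previous predictions.
            return last_predictions
        # Otherwise use teacher forcing with the gold target tokens.
        return gold_tokens

As in the new tests, the feature is switched on from a training config override, e.g. passing `overrides="{'model.scheduled_sampling_ratio':0.5}"` to `train_model_from_file`; leaving the ratio at its default of 0.0 keeps plain teacher forcing.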