From dd349ffcf6d0f354767b1b37adb00582a4f036e8 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 22 Nov 2021 14:17:25 -0500
Subject: [PATCH 01/10] Add option to use scheduled sampling in copynet

---
 .../generation/models/copynet_seq2seq.py | 36 ++++++++++++++++---
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index eaf7fb4d3..f8b18e5eb 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -56,6 +56,14 @@ class CopyNetSeq2Seq(Model):
         This is used to during inference to select the tokens of the decoded output sequence.
     target_embedding_dim : `int`, optional (default = `30`)
         The size of the embeddings for the target vocabulary.
+    scheduled_sampling_ratio : `float`, optional (default = `0.`)
+        At each timestep during training, we sample a random number between 0 and 1, and if it is
+        not less than this value, we use the ground truth labels for the whole batch. Else, we use
+        the predictions from the previous time step for the whole batch. If this value is 0.0
+        (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
+        using target side ground truth labels. See the following paper for more information:
+        [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al.,
+        2015](https://arxiv.org/abs/1506.03099).
     copy_token : `str`, optional (default = `'@COPY@'`)
         The token used to indicate that a target token was copied from the source.
         If this token is not already in your target vocabulary, it will be added.
@@ -83,6 +91,7 @@ def __init__(
         label_smoothing: float = None,
         beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
         target_embedding_dim: int = 30,
+        scheduled_sampling_ratio: float = 0.0,
         copy_token: str = "@COPY@",
         target_namespace: str = "target_tokens",
         tensor_based_metric: Metric = None,
@@ -92,6 +101,7 @@ def __init__(
     ) -> None:
         super().__init__(vocab)
         self._target_namespace = target_namespace
+        self._scheduled_sampling_ratio = scheduled_sampling_ratio
         self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
         self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
         self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace)
@@ -515,13 +525,29 @@ def _forward_loss(
             (batch_size, self._target_vocab_size), fill_value=1.0, dtype=torch.bool
         )
 
+        # Initialize target predictions with the start index.
+        # shape: (batch_size,)
+        last_predictions = source_mask.new_full(
+            (batch_size,), fill_value=self._start_index, dtype=torch.long
+        )
+
         step_log_likelihoods = []
         for timestep in range(num_decoding_steps):
-            # shape: (batch_size,)
-            input_choices = target_tokens["tokens"]["tokens"][:, timestep]
-            # Get mask tensor indicating which instances were copied.
-            # shape: (batch_size,)
-            copied = ((input_choices == self._oov_index) & (target_to_source.sum(-1) > 0)).long()
+            if self.training and torch.rand(1).item() < self._scheduled_sampling_ratio:
+                # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio
+                # during training.
+                # shape: (batch_size,)
+                input_choices = last_predictions
+                # Get mask tensor indicating which instances were copied.
+                # shape: (batch_size,)
+                copied = (input_choices >= self._target_vocab_size).long()
+            else:
+                # shape: (batch_size,)
+                input_choices = target_tokens["tokens"]["tokens"][:, timestep]
+                # shape: (batch_size,)
+                copied = (
+                    (input_choices == self._oov_index) & (target_to_source.sum(-1) > 0)
+                ).long()
             # shape: (batch_size,)
             input_choices = input_choices * (1 - copied) + copy_input_choices * copied
             # shape: (batch_size, source_sequence_length)

From 2a3a70246556e0d3c7d91a2e94e9a874f0cf027d Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Fri, 10 Dec 2021 10:44:44 -0500
Subject: [PATCH 02/10] Don't call torch.rand if sampling_ratio is None or 0.0

---
 .../generation/models/copynet_seq2seq.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index f8b18e5eb..d12a0a3f7 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -1,6 +1,6 @@
 import logging
 import warnings
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union, Optional
 
 import numpy
 import torch
@@ -56,11 +56,11 @@ class CopyNetSeq2Seq(Model):
         This is used to during inference to select the tokens of the decoded output sequence.
     target_embedding_dim : `int`, optional (default = `30`)
         The size of the embeddings for the target vocabulary.
-    scheduled_sampling_ratio : `float`, optional (default = `0.`)
+    scheduled_sampling_ratio : `float`, optional (default = `None`)
         At each timestep during training, we sample a random number between 0 and 1, and if it is
         not less than this value, we use the ground truth labels for the whole batch. Else, we use
-        the predictions from the previous time step for the whole batch. If this value is 0.0
-        (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
+        the predictions from the previous time step for the whole batch. If this value is 0.0 or
+        None (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
         using target side ground truth labels. See the following paper for more information:
         [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al.,
         2015](https://arxiv.org/abs/1506.03099).
@@ -91,7 +91,7 @@ def __init__(
         label_smoothing: float = None,
         beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
         target_embedding_dim: int = 30,
-        scheduled_sampling_ratio: float = 0.0,
+        scheduled_sampling_ratio: Optional[float] = None,
         copy_token: str = "@COPY@",
         target_namespace: str = "target_tokens",
         tensor_based_metric: Metric = None,
@@ -533,7 +533,11 @@ def _forward_loss(
 
         step_log_likelihoods = []
         for timestep in range(num_decoding_steps):
-            if self.training and torch.rand(1).item() < self._scheduled_sampling_ratio:
+            if (
+                self.training
+                and self._scheduled_sampling_ratio
+                and torch.rand(1).item() < self._scheduled_sampling_ratio
+            ):
                 # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio
                 # during training.
                # shape: (batch_size,)

From a15356fc4e3b2a6c1d91e90d795a5b8cd110ac02 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Fri, 10 Dec 2021 10:48:39 -0500
Subject: [PATCH 03/10] Add test for copynet with scheduled_sampling_ratio

---
 tests/generation/models/copynet_test.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/generation/models/copynet_test.py b/tests/generation/models/copynet_test.py
index c857c2ac7..1e2712474 100644
--- a/tests/generation/models/copynet_test.py
+++ b/tests/generation/models/copynet_test.py
@@ -44,6 +44,13 @@ def test_model_can_train_with_amp(self):
             overrides="{'trainer.use_amp':true,'trainer.cuda_device':0}",
         )
 
+    def test_model_can_train_with_scheduled_sampling_ratio(self):
+        train_model_from_file(
+            self.param_file,
+            self.TEST_DIR,
+            overrides="{'model.scheduled_sampling_ratio':0.5}",
+        )
+
     def test_vocab(self):
         vocab = self.model.vocab
         assert vocab.get_vocab_size(self.model._target_namespace) == 8

From 61792c2650370405ae33b69ab7e2ebf4d4f436a6 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Fri, 10 Dec 2021 11:03:54 -0500
Subject: [PATCH 04/10] Avoid call to torch.rand if sampling_ratio is falsey

---
 allennlp_models/generation/models/copynet_seq2seq.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index bf5704d28..e5702aa18 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -1,6 +1,6 @@
 import logging
 import warnings
-from typing import Any, Dict, List, Tuple, Union, Optional
+from typing import Any, Dict, List, Tuple, Union
 
 import numpy
 import torch
@@ -56,11 +56,11 @@ class CopyNetSeq2Seq(Model):
         This is used to during inference to select the tokens of the decoded output sequence.
     target_embedding_dim : `int`, optional (default = `30`)
         The size of the embeddings for the target vocabulary.
-    scheduled_sampling_ratio : `float`, optional (default = `None`)
+    scheduled_sampling_ratio : `float`, optional (default = `0.`)
        At each timestep during training, we sample a random number between 0 and 1, and if it is
        not less than this value, we use the ground truth labels for the whole batch. Else, we use
-        the predictions from the previous time step for the whole batch. If this value is 0.0 or
-        None (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
+        the predictions from the previous time step for the whole batch. If this value is 0.0
+        (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not
        using target side ground truth labels. See the following paper for more information:
        [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al.,
        2015](https://arxiv.org/abs/1506.03099).
@@ -91,7 +91,7 @@ def __init__(
         label_smoothing: float = None,
         beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
         target_embedding_dim: int = 30,
-        scheduled_sampling_ratio: Optional[float] = None,
+        scheduled_sampling_ratio: float = 0.0,
         copy_token: str = "@COPY@",
         target_namespace: str = "target_tokens",
         tensor_based_metric: Metric = None,

From acd7f296712a2a1d3a2310afecfa787c933d5ed7 Mon Sep 17 00:00:00 2001
From: John Giorgi
Date: Mon, 13 Dec 2021 12:25:03 -0500
Subject: [PATCH 05/10] Update allennlp_models/generation/models/copynet_seq2seq.py

Co-authored-by: Pete
---
 allennlp_models/generation/models/copynet_seq2seq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index e5702aa18..bd2cfacaf 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -534,7 +534,7 @@ def _forward_loss(
         for timestep in range(num_decoding_steps):
             if (
                 self.training
-                and self._scheduled_sampling_ratio
+                and self._scheduled_sampling_ratio > 0.0
                 and torch.rand(1).item() < self._scheduled_sampling_ratio
             ):
                 # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio

From 4ce4c7badc58fa391313fbbddd3928c430f89441 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:40:14 -0500
Subject: [PATCH 06/10] Correctly compute last_predictions

---
 allennlp_models/generation/models/copynet_seq2seq.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/allennlp_models/generation/models/copynet_seq2seq.py b/allennlp_models/generation/models/copynet_seq2seq.py
index bd2cfacaf..380523c7f 100644
--- a/allennlp_models/generation/models/copynet_seq2seq.py
+++ b/allennlp_models/generation/models/copynet_seq2seq.py
@@ -401,7 +401,7 @@ def _get_ll_contrib(
         target_tokens: torch.Tensor,
         target_to_source: torch.Tensor,
         source_mask: torch.BoolTensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Get the log-likelihood contribution from a single timestep.
 
@@ -484,7 +484,7 @@ def _get_ll_contrib(
         # shape: (batch_size,)
         step_log_likelihood = util.logsumexp(combined_gen_and_copy)
 
-        return step_log_likelihood, selective_weights
+        return step_log_likelihood, selective_weights, log_probs
 
     def _forward_loss(
         self,
@@ -568,7 +568,7 @@ def _forward_loss(
             copy_scores = self._get_copy_scores(state)
             # shape: (batch_size,)
             step_target_tokens = target_tokens["tokens"]["tokens"][:, timestep + 1]
-            step_log_likelihood, selective_weights = self._get_ll_contrib(
+            step_log_likelihood, selective_weights, log_probs = self._get_ll_contrib(
                 generation_scores,
                 generation_scores_mask,
                 copy_scores,
@@ -577,6 +577,8 @@ def _forward_loss(
                 source_mask,
             )
             step_log_likelihoods.append(step_log_likelihood.unsqueeze(1))
+            # shape (predicted_classes): (batch_size,)
+            _, last_predictions = torch.max(log_probs, 1)
 
         # Gather step log-likelihoods.
        # shape: (batch_size, num_decoding_steps = target_sequence_length - 1)

From c4d5444b5aedf8d5652124a1ee4b7c8ad7cbd3d7 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:43:32 -0500
Subject: [PATCH 07/10] Update copynet get_ll_contrib test

---
 tests/generation/models/copynet_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/generation/models/copynet_test.py b/tests/generation/models/copynet_test.py
index 1e2712474..86b66b3ca 100644
--- a/tests/generation/models/copynet_test.py
+++ b/tests/generation/models/copynet_test.py
@@ -140,7 +140,7 @@ def test_get_ll_contrib(self):
         generation_scores_mask = generation_scores.new_full(
             generation_scores.size(), True, dtype=torch.bool
         )
-        ll_actual, selective_weights_actual = self.model._get_ll_contrib(
+        ll_actual, selective_weights_actual, _ = self.model._get_ll_contrib(
             generation_scores,
             generation_scores_mask,
             copy_scores,

From 0cc0197f55e9b4b0161f877ee72676c38df5a1d4 Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:48:48 -0500
Subject: [PATCH 08/10] Avoid call to torch.rand if scheduled_sampling is 0

---
 allennlp_models/generation/models/simple_seq2seq.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/allennlp_models/generation/models/simple_seq2seq.py b/allennlp_models/generation/models/simple_seq2seq.py
index cc73d4cfc..9458ad8ff 100644
--- a/allennlp_models/generation/models/simple_seq2seq.py
+++ b/allennlp_models/generation/models/simple_seq2seq.py
@@ -367,7 +367,11 @@ def _forward_loop(
         step_logits: List[torch.Tensor] = []
         step_predictions: List[torch.Tensor] = []
         for timestep in range(num_decoding_steps):
-            if self.training and torch.rand(1).item() < self._scheduled_sampling_ratio:
+            if (
+                self.training
+                and self._scheduled_sampling_ratio > 0.0
+                and torch.rand(1).item() < self._scheduled_sampling_ratio
+            ):
                 # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio
                 # during training.
                 # shape: (batch_size,)

From 39ef089035f1a2ab035ad526ec43c87c34eb477c Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:48:58 -0500
Subject: [PATCH 09/10] Add test for scheduled sampling

---
 tests/generation/models/simple_seq2seq_test.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/generation/models/simple_seq2seq_test.py b/tests/generation/models/simple_seq2seq_test.py
index 7b48ff863..28aa8922b 100644
--- a/tests/generation/models/simple_seq2seq_test.py
+++ b/tests/generation/models/simple_seq2seq_test.py
@@ -46,6 +46,13 @@ def test_model_can_train_with_amp(self):
             overrides="{'trainer.use_amp':true,'trainer.cuda_device':0}",
         )
 
+    def test_model_can_train_with_scheduled_sampling_ratio(self):
+        train_model_from_file(
+            self.param_file,
+            self.TEST_DIR,
+            overrides="{'model.scheduled_sampling_ratio':0.5}",
+        )
+
     def test_bidirectional_model_can_train_save_and_load(self):
         param_overrides = json.dumps({"model.encoder.bidirectional": True})
         self.ensure_model_can_train_save_and_load(

From 36d08f74672d1e49a6ac797733e271d2778abb7f Mon Sep 17 00:00:00 2001
From: johngiorgi
Date: Mon, 13 Dec 2021 12:52:29 -0500
Subject: [PATCH 10/10] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 22f351856..3d48d7dc1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,6 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added a configuration to train on the PIQA dataset with AllenNLP Tango.
 - Added a transformer classification model.
 - Added a configuration to train on the IMDB dataset with AllenNLP Tango.
+- Added `scheduled_sampling_ratio` argument to `CopyNetSeq2Seq` to use scheduled sampling during training.
 
 ### Fixed
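Note for reviewers: taken together, these patches reduce to one per-timestep decision inside `CopyNetSeq2Seq._forward_loss` (mirroring the one already in `SimpleSeq2Seq._forward_loop`). The sketch below is illustrative only and is not part of the patches; the helper name `choose_decoder_input` is hypothetical. The real code additionally recomputes the `copied` mask when it feeds predictions back in, since prediction ids at or above the target vocabulary size denote tokens copied from the source.

    import torch

    def choose_decoder_input(
        gold_tokens: torch.Tensor,       # shape: (batch_size,), gold target ids for this timestep
        last_predictions: torch.Tensor,  # shape: (batch_size,), argmax of the previous step's log-probs
        scheduled_sampling_ratio: float,
        training: bool,
    ) -> torch.Tensor:
        # Checking `> 0.0` first skips the torch.rand call entirely when the feature
        # is disabled (the default), which is what patches 02/04/05/08 ensure.
        if (
            training
            and scheduled_sampling_ratio > 0.0
            and torch.rand(1).item() < scheduled_sampling_ratio
        ):
            # Scheduled sampling: feed the whole batch its own previous predictions.
            return last_predictions
        # Otherwise use teacher forcing with the gold target tokens.
        return gold_tokens

As in the new tests, the feature is switched on from a training config override, e.g. passing `overrides="{'model.scheduled_sampling_ratio':0.5}"` to `train_model_from_file`; leaving the ratio at its default of 0.0 keeps plain teacher forcing.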