diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index dcdccad23a54c1..c7e626f1a7c284 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -113,10 +113,10 @@ class GenerationConfig(PushToHubMixin):
             heuristic is applied and the generation stops when is it very unlikely to find better candidates;
             `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical
             beam search algorithm).
-        max_time(`float`, *optional*):
+        max_time (`float`, *optional*):
             The maximum amount of time you allow the computation to run for in seconds. generation will still finish
             the current pass after allocated time has been passed.
-        stop_strings(`str or List[str]`, *optional*):
+        stop_strings (`str or List[str]`, *optional*):
             A string or a list of strings that should terminate generation if the model outputs them.
 
         > Parameters that control the generation strategy used
@@ -181,10 +181,10 @@ class GenerationConfig(PushToHubMixin):
             `length_penalty` < 0.0 encourages shorter sequences.
         no_repeat_ngram_size (`int`, *optional*, defaults to 0):
             If set to int > 0, all ngrams of that size can only occur once.
-        bad_words_ids(`List[List[int]]`, *optional*):
+        bad_words_ids (`List[List[int]]`, *optional*):
             List of list of token ids that are not allowed to be generated. Check
             [`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
-        force_words_ids(`List[List[int]]` or `List[List[List[int]]]`, *optional*):
+        force_words_ids (`List[List[int]]` or `List[List[List[int]]]`, *optional*):
             List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of
             words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this
             triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one
@@ -200,7 +200,7 @@ class GenerationConfig(PushToHubMixin):
             The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
             multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
             language token.
-        forced_eos_token_id (`Union[int, List[int]]`, *optional*, defaults to `model.config.forced_eos_token_id`):
+        forced_eos_token_id (`int` or `List[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
             The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
             list to set multiple *end-of-sequence* tokens.
         remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`):
@@ -210,7 +210,7 @@ class GenerationConfig(PushToHubMixin):
             This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
             generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where
             penalty starts and `decay_factor` represents the factor of exponential decay
-        suppress_tokens  (`List[int]`, *optional*):
+        suppress_tokens (`List[int]`, *optional*):
             A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their
             log probs to `-inf` so that they are not sampled.
         begin_suppress_tokens (`List[int]`, *optional*):
@@ -234,7 +234,7 @@ class GenerationConfig(PushToHubMixin):
         low_memory (`bool`, *optional*):
             Switch to sequential beam search and sequential topk for contrastive search to reduce peak memory.
             Used with beam search and contrastive search.
-        watermarking_config (Union[`WatermarkingConfig`, `dict`], *optional*):
+        watermarking_config (`WatermarkingConfig` or `dict`, *optional*):
             Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green"
             tokens. If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally. See [this
             paper](https://arxiv.org/abs/2306.04634) for more details. Accepts the following keys:
@@ -249,12 +249,12 @@ class GenerationConfig(PushToHubMixin):
                 - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper)
                 - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper)
                     The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash".
-            - context_width(`int`):
+            - context_width (`int`):
                 The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust.
 
         > Parameters that define the output variables of generate
 
-        num_return_sequences(`int`, *optional*, defaults to 1):
+        num_return_sequences (`int`, *optional*, defaults to 1):
             The number of independently computed returned sequences for each element in the batch.
         output_attentions (`bool`, *optional*, defaults to `False`):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
@@ -284,7 +284,7 @@ class GenerationConfig(PushToHubMixin):
         encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
             If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
             `decoder_input_ids`.
-        decoder_start_token_id (`Union[int, List[int]]`, *optional*):
+        decoder_start_token_id (`int` or `List[int]`, *optional*):
             If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a
             list of length `batch_size`. Indicating a list enables different start ids for each element in the batch
             (e.g. multilingual models with different target languages in one batch)
@@ -323,7 +323,7 @@ class GenerationConfig(PushToHubMixin):
         cache_implementation (`str`, *optional*, default to `None`):
             Cache class that should be used when generating.
-        cache_config (`Union[CacheConfig, dict]`, *optional*, default to `None`):
+        cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
             Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
             it will be converted to its repsective `CacheConfig` internally. Otherwise can be passed as a `CacheConfig`
             class matching the indicated `cache_implementation`.
 
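
---

Beyond the patch itself, a quick sketch of the stopping controls whose docstrings are touched above (`max_time`, `stop_strings`). The `gpt2` checkpoint and the stop strings are only illustrative; note that `stop_strings` needs the tokenizer passed to `generate` so the strings can be matched against decoded text:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

generation_config = GenerationConfig(
    max_time=5.0,                  # time budget in seconds; the current pass still finishes
    stop_strings=["\n\n", "END"],  # `str` or `List[str]`, matching the fixed type annotation
    max_new_tokens=64,
    pad_token_id=tokenizer.eos_token_id,
)

inputs = tokenizer("The three laws of robotics are", return_tensors="pt")
# `stop_strings` requires the tokenizer at call time to match surface strings
outputs = model.generate(**inputs, generation_config=generation_config, tokenizer=tokenizer)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```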
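`bad_words_ids` and `force_words_ids` both take token ids rather than strings, and the triple-nested form of `force_words_ids` triggers the disjunctive constraint linked in the docstring. A sketch with `t5-small` (checkpoint and words are illustrative; constrained decoding is only available with beam search):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

def to_ids(words):
    # add_special_tokens=False so only the words' own token ids are collected
    return tokenizer(words, add_special_tokens=False).input_ids

bad_words_ids = to_ids(["ugly"])              # List[List[int]]: banned sequences
force_words_ids = [to_ids(["big", "large"])]  # List[List[List[int]]]: disjunctive, either word satisfies it

inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    bad_words_ids=bad_words_ids,
    force_words_ids=force_words_ids,
    num_beams=4,  # constrained decoding requires beam search
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```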
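The `forced_bos_token_id` docstring points at mBART; a sketch of that use, assuming the `facebook/mbart-large-50-many-to-many-mmt` checkpoint and its `lang_code_to_id` mapping:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt", src_lang="en_XX"
)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

inputs = tokenizer("The house is wonderful.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    # force the target-language token as the first generated token
    # after `decoder_start_token_id`
    forced_bos_token_id=tokenizer.lang_code_to_id["de_DE"],
    max_new_tokens=40,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```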
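`suppress_tokens`, `begin_suppress_tokens`, and `exponential_decay_length_penalty` are plain config fields, so they can be sketched without a model; the token ids below are placeholders, not real vocabulary entries:

```python
from transformers import GenerationConfig

generation_config = GenerationConfig(
    suppress_tokens=[42, 1337],  # log probs set to -inf at every step (placeholder ids)
    begin_suppress_tokens=[7],   # only suppressed at the start of generation
    # per the docstring, `(start_index, decay_factor)`: after 24 generated
    # tokens, the *end-of-sequence* score grows exponentially with factor 1.2,
    # nudging generation to wrap up
    exponential_decay_length_penalty=(24, 1.2),
)
```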
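For `watermarking_config`, either a `WatermarkingConfig` or an equivalent `dict` is accepted, per the fixed type annotation. A sketch using the keys listed in the docstring (checkpoint, bias strength, and context width are arbitrary):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, WatermarkingConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

watermarking_config = WatermarkingConfig(
    bias=2.0,                   # strength of the "green" token bias
    seeding_scheme="selfhash",  # Algorithm 3 from the paper; "lefthash" is the default
    context_width=1,            # previous tokens used in seeding
)
generation_config = GenerationConfig(
    watermarking_config=watermarking_config,  # a plain dict with the same keys also works
    do_sample=True,
    max_new_tokens=32,
    pad_token_id=tokenizer.eos_token_id,
)

inputs = tokenizer("Watermarked continuation:", return_tensors="pt")
outputs = model.generate(**inputs, generation_config=generation_config)
```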
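Similarly, `cache_config` accepts either a `CacheConfig` subclass or a `dict` that is converted internally to the class matching `cache_implementation`. A sketch assuming the quantized cache (needs the optional `quanto` dependency; the keys shown are the quantized-cache ones and vary by implementation):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("A very long story:", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=32,
    cache_implementation="quantized",  # selects the quantized KV cache class
    # dict form, converted to the matching QuantizedCacheConfig internally
    cache_config={"backend": "quanto", "nbits": 4},
)
```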