From 1fbd6993b8ac0c5dee1b26656648ee8360c06aa3 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Thu, 19 Oct 2023 13:23:00 +0200
Subject: [PATCH 1/5] Resolve incorrect ValueError in RoPE config for Falcon

---
 src/transformers/models/falcon/configuration_falcon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index fce21b146cf97f..6c85bfb18d0157 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -173,7 +173,7 @@ def _rope_scaling_validation(self):
         if self.rope_scaling is None:
             return
 
-        if self.rotary:
+        if self.alibi:
             raise ValueError("`rope_scaling` is not supported when `alibi` is `True`.")
 
         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:

From b9e4f0a826792ba7bd4ed20d810a0311ac76890f Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Thu, 19 Oct 2023 13:23:21 +0200
Subject: [PATCH 2/5] Add broken codeblock tag in Falcon Config

---
 src/transformers/models/falcon/configuration_falcon.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index 6c85bfb18d0157..3dadcd26262585 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -92,7 +92,7 @@ class FalconConfig(PretrainedConfig):
 
     Example:
 
-    ```pytho
+    ```python
     >>> from transformers import FalconModel, FalconConfig
 
     >>> # Initializing a small (2-layer) Falcon configuration
@@ -188,4 +188,4 @@ def _rope_scaling_validation(self):
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
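The first two patches are easiest to read from the user's side: `FalconConfig.__init__` runs `_rope_scaling_validation()`, and the corrected condition rejects `rope_scaling` only when ALiBi position encoding is enabled (the old check tested `self.rotary`, which Falcon's configuration derives as the opposite of `alibi`, so the error fired in exactly the supported case). A minimal sketch of the resulting behaviour, assuming a `transformers` release that contains this fix; the configuration values below are illustrative, not taken from the patch:

```python
from transformers import FalconConfig

# Accepted: the default Falcon setup uses rotary embeddings (alibi=False),
# so a well-formed `rope_scaling` dict passes validation.
config = FalconConfig(rope_scaling={"type": "dynamic", "factor": 2.0})

# Rejected: with ALiBi there are no rotary embeddings to scale, so the
# corrected check now raises the intended error.
try:
    FalconConfig(alibi=True, rope_scaling={"type": "dynamic", "factor": 2.0})
except ValueError as err:
    print(err)  # `rope_scaling` is not supported when `alibi` is `True`.
```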
From 69214ce56aeeb450c6deb49c3365048b99225436 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Thu, 19 Oct 2023 13:24:23 +0200
Subject: [PATCH 3/5] Fix typo: an float -> a float

---
 .../models/deprecated/open_llama/configuration_open_llama.py | 4 ++--
 src/transformers/models/falcon/configuration_falcon.py | 2 +-
 src/transformers/models/fuyu/configuration_fuyu.py | 4 ++--
 src/transformers/models/gpt_neox/configuration_gpt_neox.py | 4 ++--
 src/transformers/models/llama/configuration_llama.py | 4 ++--
 src/transformers/models/persimmon/configuration_persimmon.py | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py
index 93e1394ab6d9d6..069e03335f8529 100644
--- a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py
+++ b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py
@@ -69,7 +69,7 @@ class OpenLlamaConfig(PretrainedConfig):
             Whether to tie weight embeddings
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -164,4 +164,4 @@ def _rope_scaling_validation(self):
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index 3dadcd26262585..1c8f11654a526f 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -79,7 +79,7 @@ class FalconConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py
index b031fff45a7e33..2284adc9a4f96a 100644
--- a/src/transformers/models/fuyu/configuration_fuyu.py
+++ b/src/transformers/models/fuyu/configuration_fuyu.py
@@ -72,7 +72,7 @@ class FuyuConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalFuyu/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -208,4 +208,4 @@ def _rope_scaling_validation(self):
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
index 896bda51317713..d6863268d1c372 100644
--- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
@@ -80,7 +80,7 @@ class GPTNeoXConfig(PretrainedConfig):
             speedup at large scales (e.g. 20B).
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -173,4 +173,4 @@ def _rope_scaling_validation(self):
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py
index f3da8ab4cdc242..cd51d2dbe08833 100644
--- a/src/transformers/models/llama/configuration_llama.py
+++ b/src/transformers/models/llama/configuration_llama.py
@@ -87,7 +87,7 @@ class LlamaConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -184,4 +184,4 @@ def _rope_scaling_validation(self):
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py
index 8606e4febffe80..a423f1780d85f4 100644
--- a/src/transformers/models/persimmon/configuration_persimmon.py
+++ b/src/transformers/models/persimmon/configuration_persimmon.py
@@ -65,7 +65,7 @@ class PersimmonConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
@@ -160,4 +160,4 @@ def _rope_scaling_validation(self):
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
             )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
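Beyond the one-character typo fix, these hunks all document the same contract for `rope_scaling`, which the shared `_rope_scaling_validation` helper enforces at configuration time. A short illustrative sketch of what passes and what is rejected under that contract, shown with `LlamaConfig` and made-up values, and assuming the `transformers` versions this series targets (later releases extended the accepted keys and strategies):

```python
from transformers import LlamaConfig

# Valid: exactly two keys, a known strategy and a float factor strictly greater than 1.
LlamaConfig(rope_scaling={"type": "linear", "factor": 4.0})

# Each of these violates the documented format and raises a ValueError:
for bad in (
    {"type": "linear"},                  # missing "factor"
    {"type": "cubic", "factor": 2.0},    # unknown strategy
    {"type": "dynamic", "factor": 2},    # factor must be a float, not an int
    {"type": "dynamic", "factor": 1.0},  # factor must be > 1
):
    try:
        LlamaConfig(rope_scaling=bad)
    except ValueError as err:
        print(err)
```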
From 4ee278a6a5c904feee02df7eaaddb91d11839d00 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Thu, 19 Oct 2023 13:25:34 +0200
Subject: [PATCH 4/5] Implement copy functionality for Fuyu and Persimmon for RoPE scaling validation

---
 src/transformers/models/fuyu/configuration_fuyu.py | 1 +
 src/transformers/models/persimmon/configuration_persimmon.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py
index 2284adc9a4f96a..5416baad215eb3 100644
--- a/src/transformers/models/fuyu/configuration_fuyu.py
+++ b/src/transformers/models/fuyu/configuration_fuyu.py
@@ -189,6 +189,7 @@ def __init__(
             **kwargs,
         )
 
+    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
     def _rope_scaling_validation(self):
         """
         Validate the `rope_scaling` configuration.
diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py
index a423f1780d85f4..e2589f46c6b442 100644
--- a/src/transformers/models/persimmon/configuration_persimmon.py
+++ b/src/transformers/models/persimmon/configuration_persimmon.py
@@ -141,6 +141,7 @@ def __init__(
             **kwargs,
        )
 
+    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
     def _rope_scaling_validation(self):
         """
         Validate the `rope_scaling` configuration.
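The `# Copied from ...` marker declares that a method is a verbatim copy of another one, so the repository's consistency tooling (e.g. `make fix-copies`) can keep Fuyu's and Persimmon's validators identical to the Llama original. For reference, here is a condensed standalone sketch of the logic those copies share, written as a plain function with an illustrative name rather than as it appears in the library:

```python
from typing import Optional


def validate_rope_scaling(rope_scaling: Optional[dict]) -> None:
    """Reject `rope_scaling` values that do not match the documented format."""
    if rope_scaling is None:
        return
    if not isinstance(rope_scaling, dict) or len(rope_scaling) != 2:
        raise ValueError(
            f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {rope_scaling}"
        )
    rope_scaling_type = rope_scaling.get("type")
    rope_scaling_factor = rope_scaling.get("factor")
    if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
        raise ValueError(f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}")
    if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
        raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")


validate_rope_scaling({"type": "dynamic", "factor": 2.0})  # passes silently
```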
From ba1bb6ee8bc752148bd226a00200750de2e3fe0f Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Thu, 19 Oct 2023 13:39:33 +0200
Subject: [PATCH 5/5] Make style

---
 .../models/deprecated/open_llama/configuration_open_llama.py | 4 ++--
 src/transformers/models/falcon/configuration_falcon.py | 4 ++--
 src/transformers/models/fuyu/configuration_fuyu.py | 4 ++--
 src/transformers/models/gpt_neox/configuration_gpt_neox.py | 4 ++--
 src/transformers/models/llama/configuration_llama.py | 4 ++--
 src/transformers/models/persimmon/configuration_persimmon.py | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py
index 069e03335f8529..1d5756cd38a399 100644
--- a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py
+++ b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py
@@ -69,8 +69,8 @@ class OpenLlamaConfig(PretrainedConfig):
             Whether to tie weight embeddings
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index 1c8f11654a526f..251dd7f2d571c9 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -79,8 +79,8 @@ class FalconConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py
index 5416baad215eb3..d3c447469c7175 100644
--- a/src/transformers/models/fuyu/configuration_fuyu.py
+++ b/src/transformers/models/fuyu/configuration_fuyu.py
@@ -72,8 +72,8 @@ class FuyuConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalFuyu/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
index d6863268d1c372..e263a8a9fee66c 100644
--- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
@@ -80,8 +80,8 @@ class GPTNeoXConfig(PretrainedConfig):
             speedup at large scales (e.g. 20B).
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py
index cd51d2dbe08833..f37e52ca9ca539 100644
--- a/src/transformers/models/llama/configuration_llama.py
+++ b/src/transformers/models/llama/configuration_llama.py
@@ -87,8 +87,8 @@ class LlamaConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py
index e2589f46c6b442..c48070a9da0cbb 100644
--- a/src/transformers/models/persimmon/configuration_persimmon.py
+++ b/src/transformers/models/persimmon/configuration_persimmon.py
@@ -65,8 +65,8 @@ class PersimmonConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalPersimmon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This