From 7df08c4ac10ad8d95c8123da366e02af94fe8e4c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 16:22:58 +1000 Subject: [PATCH 001/153] Update llama.py --- unsloth/models/llama.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index f7fd5f13..77d0a6ab 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1490,19 +1490,19 @@ def get_peft_model( final_modules = [] for module in target_modules: if module == "lm_head": - logger.warning_once( - "Unsloth: `lm_head` should be placed in `modules_to_save` and not `target_modules`. "\ - "Luckily, we shall do it for you!" - ) + # logger.warning_once( + # "Unsloth: `lm_head` should be placed in `modules_to_save` and not `target_modules`. "\ + # "Luckily, we shall do it for you!" + # ) train_lm_head = True if modules_to_save is None: modules_to_save = ["lm_head"] else: modules_to_save.append("lm_head") elif module == "embed_tokens": - logger.warning_once( - "Unsloth: `embed_tokens` should be placed in `modules_to_save` and not `target_modules`. "\ - "Luckily, we shall do it for you!" - ) + # logger.warning_once( + # "Unsloth: `embed_tokens` should be placed in `modules_to_save` and not `target_modules`. "\ + # "Luckily, we shall do it for you!" + # ) train_embed_tokens = True if modules_to_save is None: modules_to_save = ["embed_tokens"] else: modules_to_save.append("embed_tokens") From ba5b6ce37a528464305f1f08af05d50a2f5f9188 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:08:31 +1000 Subject: [PATCH 002/153] offload --- unsloth/models/_utils.py | 46 ++++++++++++++++++++++++++++++++++++++++ unsloth/models/llama.py | 21 +++++++++++++++--- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a53de42c..4d82d067 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -72,6 +72,9 @@ "patch_tokenizer", "get_statistics", "Unsloth_Offloaded_Gradient_Checkpointer", + "offload_to_disk", + "offload_input_embeddings", + "offload_output_embeddings", ] @@ -421,3 +424,46 @@ def backward(ctx, dY): "Luckily, your training run will still work in the meantime!" 
) pass + + +# Offloading to disk for modules (lm_head, embed_tokens) +import os +import pickle + +def offload_to_disk(W, model, name, temporary_location : str = "_unsloth_temporary_saved_buffers"): + file_location = os.path.join(temporary_location, model.config._name_or_path) + if not os.path.exists(file_location): + os.makedirs(file_location) + pass + + filename = os.path.join(file_location, f"{name}.pt") + W = W.weight if hasattr(W, "weight") else W + torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,) + offloaded_W = torch.load(filename, map_location = "cpu", mmap = True) + offloaded_W._offloaded_file_location = filename + return offloaded_W +pass + + +def offload_input_embeddings(model, temporary_location : str = "_unsloth_temporary_saved_buffers"): + offloaded_W = offload_to_disk(model.get_input_embeddings(), model, "input_embeddings", temporary_location) + new_input_embeddings = torch.nn.Embedding.from_pretrained(offloaded_W) + new_input_embeddings._offloaded_file_location = offloaded_W._offloaded_file_location + model.set_input_embeddings(new_input_embeddings) + return +pass + + +def offload_output_embeddings(model, temporary_location : str = "_unsloth_temporary_saved_buffers"): + offloaded_W = offload_to_disk(model.get_output_embeddings(), model, "output_embeddings", temporary_location) + + new_output_embeddings = torch.nn.Linear(1, 1, bias = None) + del new_output_embeddings.weight + new_output_embeddings.weight = offloaded_W + new_output_embeddings.in_features = offloaded_W.shape[1] + new_output_embeddings.out_features = offloaded_W.shape[0] + + new_output_embeddings._offloaded_file_location = offloaded_W._offloaded_file_location + model.set_output_embeddings(new_output_embeddings) + return +pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 77d0a6ab..4f754c6a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1396,6 +1396,7 @@ def get_peft_model( modules_to_save = None, init_lora_weights = True, loftq_config = {}, + temporary_location = "_unsloth_temporary_saved_buffers", **kwargs, ): transformers_set_seed(random_state) @@ -1579,7 +1580,19 @@ def get_peft_model( _saved_temp_tokenizer = model._saved_temp_tokenizer lora_config = LoraConfig(**arguments) - model = _get_peft_model(model, lora_config) + + # First offload lm_head and embed_tokens to disk + original_device = model.get_input_embeddings.device + if use_gradient_checkpointing == "unsloth": + if train_embed_tokens: + print("Unsloth: Offloading input_embeddings to disk to save VRAM") + offload_input_embeddings(model, temporary_location) + pass + if train_lm_head: + print("Unsloth: Offloading output_embeddings to disk to save VRAM") + offload_output_embeddings(model, temporary_location) + pass + pass model._saved_temp_tokenizer = _saved_temp_tokenizer @@ -1589,14 +1602,16 @@ def get_peft_model( if train_embed_tokens: print("Unsloth: Casting embed_tokens to float32") assert(hasattr(model.model.model.embed_tokens, "modules_to_save")) - model.model.model.embed_tokens.modules_to_save.default.to(torch.float32) + model.model.model.embed_tokens.modules_to_save.default\ + .to(torch.float32, device = original_device, non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) pass if train_lm_head: print("Unsloth: Casting lm_head to float32") assert(hasattr(model.model.lm_head, "modules_to_save")) - model.model.lm_head.modules_to_save.default.to(torch.float32) + model.model.lm_head.modules_to_save.default\ + 
.to(torch.float32, device = original_device, non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) pass From a07057e6b8ea66aaa98b7f839933532e960c6c5c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:09:21 +1000 Subject: [PATCH 003/153] Update llama.py --- unsloth/models/llama.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4f754c6a..31f73fb1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -13,6 +13,7 @@ # limitations under the License. import torch +import gc from typing import Optional, Tuple, List, Union from torch.nn.functional import scaled_dot_product_attention from transformers.models.llama.modeling_llama import ( @@ -1370,7 +1371,6 @@ def post_patch(model): pass # Clear deleted GPU items - import gc for _ in range(3): gc.collect() torch.cuda.empty_cache() @@ -1592,6 +1592,12 @@ def get_peft_model( print("Unsloth: Offloading output_embeddings to disk to save VRAM") offload_output_embeddings(model, temporary_location) pass + + # Remove old items to save VRAM + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass pass model._saved_temp_tokenizer = _saved_temp_tokenizer From 4be9063a46a987d2c6a7c0f3a3852fa499711206 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:10:36 +1000 Subject: [PATCH 004/153] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 31f73fb1..5276ec9c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1582,7 +1582,7 @@ def get_peft_model( lora_config = LoraConfig(**arguments) # First offload lm_head and embed_tokens to disk - original_device = model.get_input_embeddings.device + original_device = model.get_input_embeddings().weight.device if use_gradient_checkpointing == "unsloth": if train_embed_tokens: print("Unsloth: Offloading input_embeddings to disk to save VRAM") From 3dc3d3ff7109ccc4a9db943477c8bc29571d2499 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:13:02 +1000 Subject: [PATCH 005/153] Update llama.py --- unsloth/models/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 5276ec9c..133138d4 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1599,6 +1599,8 @@ def get_peft_model( torch.cuda.empty_cache() pass pass + + model = _get_peft_model(model, lora_config) model._saved_temp_tokenizer = _saved_temp_tokenizer From f1cc1e8e4c3fe6f30ac2eae4f2ba4226ea791fcd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:14:32 +1000 Subject: [PATCH 006/153] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 133138d4..0fc4257e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1599,7 +1599,7 @@ def get_peft_model( torch.cuda.empty_cache() pass pass - + model = _get_peft_model(model, lora_config) model._saved_temp_tokenizer = _saved_temp_tokenizer @@ -1611,7 +1611,7 @@ def get_peft_model( print("Unsloth: Casting embed_tokens to float32") assert(hasattr(model.model.model.embed_tokens, "modules_to_save")) model.model.model.embed_tokens.modules_to_save.default\ - .to(torch.float32, device = original_device, non_blocking = True) + .to(device = original_device, dtype = torch.float32, 
non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) pass @@ -1619,7 +1619,7 @@ def get_peft_model( print("Unsloth: Casting lm_head to float32") assert(hasattr(model.model.lm_head, "modules_to_save")) model.model.lm_head.modules_to_save.default\ - .to(torch.float32, device = original_device, non_blocking = True) + .to(device = original_device, dtype = torch.float32, non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) pass From 5cb531a3ddca5b37714495050725cb5cec39b742 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 19:03:23 +1000 Subject: [PATCH 007/153] Update llama.py --- unsloth/models/llama.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0fc4257e..eff35eef 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1588,6 +1588,13 @@ def get_peft_model( print("Unsloth: Offloading input_embeddings to disk to save VRAM") offload_input_embeddings(model, temporary_location) pass + + # Remove old items to save VRAM + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass + if train_lm_head: print("Unsloth: Offloading output_embeddings to disk to save VRAM") offload_output_embeddings(model, temporary_location) From 6bd8e600d72aeccb1108c83d50df07471ad0d400 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 19:04:01 +1000 Subject: [PATCH 008/153] Update llama.py --- unsloth/models/llama.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index eff35eef..cad4c6a0 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1582,7 +1582,9 @@ def get_peft_model( lora_config = LoraConfig(**arguments) # First offload lm_head and embed_tokens to disk - original_device = model.get_input_embeddings().weight.device + input_embeddings_device = model. 
get_input_embeddings().weight.device + output_embeddings_device = model.get_output_embeddings().weight.device + if use_gradient_checkpointing == "unsloth": if train_embed_tokens: print("Unsloth: Offloading input_embeddings to disk to save VRAM") @@ -1594,7 +1596,7 @@ def get_peft_model( gc.collect() torch.cuda.empty_cache() pass - + if train_lm_head: print("Unsloth: Offloading output_embeddings to disk to save VRAM") offload_output_embeddings(model, temporary_location) @@ -1618,7 +1620,7 @@ def get_peft_model( print("Unsloth: Casting embed_tokens to float32") assert(hasattr(model.model.model.embed_tokens, "modules_to_save")) model.model.model.embed_tokens.modules_to_save.default\ - .to(device = original_device, dtype = torch.float32, non_blocking = True) + .to(device = input_embeddings_device, dtype = torch.float32, non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) pass @@ -1626,7 +1628,7 @@ def get_peft_model( print("Unsloth: Casting lm_head to float32") assert(hasattr(model.model.lm_head, "modules_to_save")) model.model.lm_head.modules_to_save.default\ - .to(device = original_device, dtype = torch.float32, non_blocking = True) + .to(device = output_embeddings_device, dtype = torch.float32, non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) pass From d1d57ff99079d0ada0fde31cb67c637dd7ac27cc Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 20 May 2024 02:08:26 +1000 Subject: [PATCH 009/153] Update llama.py --- unsloth/models/llama.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index cad4c6a0..175a62ec 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1194,7 +1194,11 @@ def from_pretrained( f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning_once(debug_info)""" + logger.warning_once(debug_info) + import gc + for _ in range(3): + gc.collect() + torch.cuda.empty_cache()""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -1644,6 +1648,12 @@ def get_peft_model( internal_model._saved_temp_tokenizer.padding_side = "right" pass + # Clear deleted GPU items + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass + return model pass From 7470f672bd596373a931d9a5d13c8b31eb57141b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 21 May 2024 03:17:13 +1000 Subject: [PATCH 010/153] continued pretraining trainer --- unsloth/__init__.py | 1 + unsloth/trainer.py | 94 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 unsloth/trainer.py diff --git a/unsloth/__init__.py b/unsloth/__init__.py index d4ca45d7..2dcf1e6a 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -114,3 +114,4 @@ from .save import * from .chat_templates import * from .tokenizer_utils import * +from .trainer import * diff --git a/unsloth/trainer.py b/unsloth/trainer.py new file mode 100644 index 00000000..226eb4ed --- /dev/null +++ b/unsloth/trainer.py @@ -0,0 +1,94 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 
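The `offload_to_disk` helpers added in the patches above come down to writing a weight out with `torch.save` and reading it back memory-mapped with `torch.load(..., mmap = True)`, so the tensor is backed by a file on disk instead of GPU VRAM. A minimal sketch of that idea on a toy tensor (the file name and shapes here are made up for illustration, and it assumes a PyTorch build that accepts `mmap = True`, which the patch itself relies on):

```python
import os
import torch

# Toy weight standing in for embed_tokens / lm_head.
W = torch.randn(10, 4)

temporary_location = "_unsloth_temporary_saved_buffers"
os.makedirs(temporary_location, exist_ok = True)
filename = os.path.join(temporary_location, "toy_weight.pt")

# Save once, then reload memory-mapped: the data stays in the file and is paged in on access.
torch.save(W, filename)
offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)

print(offloaded_W.shape)     # same values as W, but backed by the on-disk file
print(offloaded_W[0, :2])
```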
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from dataclasses import dataclass, field +from typing import Optional +from transformers import TrainingArguments +from trl import SFTTrainer + +__all__ = [ + "UnslothTrainingArguments", + "UnslothTrainer", +] + + +@dataclass +class UnslothTrainingArguments(TrainingArguments): + embedding_learning_rate : Optional[float] = field( + default = None, + metadata = {"help" : "Different learning rates for embeddings and lm_head."} + ) +pass + + +def _create_unsloth_optimizer( + model, + optimizer_cls, + optimizer_kwargs, + embedding_lr = 5e-5, +): + lr = optimizer_kwargs["lr"] + weight_decay = optimizer_kwargs.get("weight_decay", 0.0) + + param_groups = \ + { + "non_embeddings" : {}, + "embeddings" : {}, + } + + for name, param in model.named_parameters(): + if not param.requires_grad: continue + if "modules_to_save.default" in name: + print(f"Unsloth: Setting lr = {embedding_lr} instead of {lr} for {name}.") + param_groups["embeddings"] [name] = param + else: + param_groups["non_embeddings"][name] = param + pass + pass + + optimizer_grouped_parameters = [ + { + "params" : list(param_groups["non_embeddings"].values()), + "weight_decay" : weight_decay, + "lr" : lr, + }, + { + "params" : list(param_groups["embeddings"].values()), + "weight_decay" : weight_decay, + "lr" : embedding_lr, + }, + ] + optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + return optimizer +pass + + +class UnslothTrainer(SFTTrainer): + def create_optimizer(self): + embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) + if embedding_learning_rate is None: return super().create_optimizer() + + if self.optimizer is None: + optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) + self.optimizer = _create_unsloth_optimizer( + self.model, + optimizer_cls, + optimizer_kwargs, + embedding_learning_rate, + ) + pass + return self.optimizer + pass +pass From da9c1a602c3a1b29e08dab013ccf43ee5ad64fe9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 21 May 2024 03:51:37 +1000 Subject: [PATCH 011/153] Update trainer.py --- unsloth/trainer.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 226eb4ed..150124b9 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
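To make `_create_unsloth_optimizer` above concrete: it builds two optimizer parameter groups so that `embed_tokens` and `lm_head` (stored by PEFT under `modules_to_save.default`) train with a smaller learning rate than everything else. A self-contained sketch with toy modules (the real code keys on the PEFT parameter names, while this sketch just matches on `embed_tokens`):

```python
import torch

# Toy model: one embedding-like module and one ordinary layer.
model = torch.nn.ModuleDict({
    "embed_tokens" : torch.nn.Embedding(100, 16),
    "proj"         : torch.nn.Linear(16, 16),
})

lr, embedding_lr, weight_decay = 5e-4, 5e-5, 0.0
embeddings, others = [], []
for name, param in model.named_parameters():
    if not param.requires_grad: continue
    (embeddings if "embed_tokens" in name else others).append(param)

optimizer = torch.optim.AdamW([
    {"params" : others,     "lr" : lr,           "weight_decay" : weight_decay},
    {"params" : embeddings, "lr" : embedding_lr, "weight_decay" : weight_decay},
])
print([group["lr"] for group in optimizer.param_groups])   # [0.0005, 5e-05]
```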
- from dataclasses import dataclass, field from typing import Optional from transformers import TrainingArguments @@ -26,10 +25,11 @@ @dataclass class UnslothTrainingArguments(TrainingArguments): - embedding_learning_rate : Optional[float] = field( - default = None, - metadata = {"help" : "Different learning rates for embeddings and lm_head."} - ) + pass + # embedding_learning_rate : Optional[float] = field( + # default = None, + # metadata = {"help" : "Different learning rates for embeddings and lm_head."} + # ) pass @@ -76,19 +76,20 @@ def _create_unsloth_optimizer( class UnslothTrainer(SFTTrainer): - def create_optimizer(self): - embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) - if embedding_learning_rate is None: return super().create_optimizer() - - if self.optimizer is None: - optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) - self.optimizer = _create_unsloth_optimizer( - self.model, - optimizer_cls, - optimizer_kwargs, - embedding_learning_rate, - ) - pass - return self.optimizer pass + # def create_optimizer(self): + # embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) + # if embedding_learning_rate is None: return super().create_optimizer() + + # if self.optimizer is None: + # optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) + # self.optimizer = _create_unsloth_optimizer( + # self.model, + # optimizer_cls, + # optimizer_kwargs, + # embedding_learning_rate, + # ) + # pass + # return self.optimizer + # pass pass From 2c68f5635a325b3847aa585d57083050b5dfbe6b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 21 May 2024 03:58:21 +1000 Subject: [PATCH 012/153] Update trainer.py --- unsloth/trainer.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 150124b9..63a4398a 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -25,11 +25,10 @@ @dataclass class UnslothTrainingArguments(TrainingArguments): - pass - # embedding_learning_rate : Optional[float] = field( - # default = None, - # metadata = {"help" : "Different learning rates for embeddings and lm_head."} - # ) + embedding_learning_rate : Optional[float] = field( + default = None, + metadata = {"help" : "Different learning rates for embeddings and lm_head."} + ) pass @@ -76,20 +75,19 @@ def _create_unsloth_optimizer( class UnslothTrainer(SFTTrainer): - pass - # def create_optimizer(self): - # embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) - # if embedding_learning_rate is None: return super().create_optimizer() + def create_optimizer(self): + embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) + if embedding_learning_rate is None: return super().create_optimizer() - # if self.optimizer is None: - # optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) - # self.optimizer = _create_unsloth_optimizer( - # self.model, - # optimizer_cls, - # optimizer_kwargs, - # embedding_learning_rate, - # ) - # pass - # return self.optimizer - # pass + if self.optimizer is None: + optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) + self.optimizer = _create_unsloth_optimizer( + self.model, + optimizer_cls, + optimizer_kwargs, + embedding_learning_rate, + ) + pass + return self.optimizer + pass pass From 217bf9d9eed9b6706b67917fd27b32923d50594f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen 
Date: Tue, 21 May 2024 04:27:15 +1000 Subject: [PATCH 013/153] Update trainer.py --- unsloth/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 63a4398a..85e47aa0 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -50,7 +50,7 @@ def _create_unsloth_optimizer( for name, param in model.named_parameters(): if not param.requires_grad: continue if "modules_to_save.default" in name: - print(f"Unsloth: Setting lr = {embedding_lr} instead of {lr} for {name}.") + print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {name}.") param_groups["embeddings"] [name] = param else: param_groups["non_embeddings"][name] = param From 6e85384ab2e2ad0aaa3b4c5090f87d6cb0d83256 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 00:21:09 +1000 Subject: [PATCH 014/153] Update trainer.py --- unsloth/trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 85e47aa0..b234a98d 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -49,8 +49,10 @@ def _create_unsloth_optimizer( for name, param in model.named_parameters(): if not param.requires_grad: continue - if "modules_to_save.default" in name: - print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {name}.") + if name.endswith("modules_to_save.default.weight"): + partial_name = name[:-len(".modules_to_save.default.weight")] + partial_name = partial_name[partial_name.rfind(".")+1:] + print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {partial_name}.") param_groups["embeddings"] [name] = param else: param_groups["non_embeddings"][name] = param From 77f9c516050e1997cbf16c2d9db0ee886b1ff222 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:35:36 +1000 Subject: [PATCH 015/153] is_bfloat16_supported --- unsloth/models/_utils.py | 10 ++++++++++ unsloth/models/llama.py | 2 +- unsloth/models/mistral.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4d82d067..2c1eb4d5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -35,7 +35,10 @@ # Get Flash Attention v2 if Ampere (RTX 30xx, A100) major_version, minor_version = torch.cuda.get_device_capability() +SUPPORTS_BFLOAT16 = False + if major_version >= 8: + SUPPORTS_BFLOAT16 = True try: from flash_attn import flash_attn_func # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl" @@ -75,6 +78,7 @@ "offload_to_disk", "offload_input_embeddings", "offload_output_embeddings", + "is_bfloat16_supported", ] @@ -467,3 +471,9 @@ def offload_output_embeddings(model, temporary_location : str = "_unsloth_tempor model.set_output_embeddings(new_output_embeddings) return pass + + +# Fixes a weird Torch 2.3 bug which says T4s have bfloat16 +def is_bfloat16_supported(): + return SUPPORTS_BFLOAT16 +pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 175a62ec..2d6021a3 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1047,7 +1047,7 @@ def from_pretrained( token = os.environ["HUGGINGFACE_TOKEN"] if model_patcher is None: model_patcher = FastLlamaModel - SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 
4594919b..365d60a3 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -314,7 +314,7 @@ def from_pretrained( logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") pass - SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) From c0e1d27b7e56b2778c295e1e6b0152ff52d18a44 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:41:01 +1000 Subject: [PATCH 016/153] Update __init__.py --- unsloth/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py index ff7129e0..e67a9e5f 100644 --- a/unsloth/models/__init__.py +++ b/unsloth/models/__init__.py @@ -17,3 +17,4 @@ from .mistral import FastMistralModel from .qwen2 import FastQwen2Model from .dpo import PatchDPOTrainer +from ._utils import is_bfloat16_supported From 2b23b9357aba25ab2f3a49d899045547d7dde1d7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:44:04 +1000 Subject: [PATCH 017/153] Update README.md --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ca5b6533..4d934eb0 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,8 @@ python -m bitsandbytes - We're in 🤗Hugging Face's official docs! Check out the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth)! ```python -from unsloth import FastLanguageModel +from unsloth import FastLanguageModel +from unsloth import is_bfloat16_supported import torch from trl import SFTTrainer from transformers import TrainingArguments @@ -238,8 +239,8 @@ trainer = SFTTrainer( gradient_accumulation_steps = 4, warmup_steps = 10, max_steps = 60, - fp16 = not torch.cuda.is_bf16_supported(), - bf16 = torch.cuda.is_bf16_supported(), + fp16 = not is_bfloat16_supported(), + bf16 = is_bfloat16_supported(), logging_steps = 1, output_dir = "outputs", optim = "adamw_8bit", @@ -263,6 +264,7 @@ We're in 🤗Hugging Face's official docs! 
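The `is_bfloat16_supported()` helper that these README snippets switch to (added in `_utils.py` and `models/__init__.py` above) reduces to a one-time device-capability check: bfloat16 needs compute capability 8.0 or newer (Ampere, e.g. RTX 30xx / A100), which T4s at 7.5 lack. A standalone sketch, assuming a CUDA device is visible:

```python
import torch

# Compute capability >= 8.0 means the GPU supports bfloat16; T4s (7.5) do not.
major_version, minor_version = torch.cuda.get_device_capability()
SUPPORTS_BFLOAT16 = major_version >= 8

def is_bfloat16_supported() -> bool:
    return SUPPORTS_BFLOAT16

print("bfloat16 supported:", is_bfloat16_supported())
```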
We're on the [SFT docs](https://huggi ```python from unsloth import FastLanguageModel, PatchDPOTrainer +from unsloth import is_bfloat16_supported PatchDPOTrainer() import torch from transformers import TrainingArguments @@ -298,8 +300,8 @@ dpo_trainer = DPOTrainer( gradient_accumulation_steps = 8, warmup_ratio = 0.1, num_train_epochs = 3, - fp16 = not torch.cuda.is_bf16_supported(), - bf16 = torch.cuda.is_bf16_supported(), + fp16 = not is_bfloat16_supported(), + bf16 = is_bfloat16_supported(), logging_steps = 1, optim = "adamw_8bit", seed = 42, From 902e23af08e63790bc4b1801f3366c76e88b4d83 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:45:32 +1000 Subject: [PATCH 018/153] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2d6021a3..1d6a282a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1757,7 +1757,7 @@ def patch_peft_model( n_mlp += 1 else: logger.warning_once( - "Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters\n"\ + "Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters\n"\ "are not enabled or a bias term (like in Qwen) is used." ) pass @@ -1780,7 +1780,7 @@ def patch_peft_model( n_qkv += 1 else: logger.warning_once( - "Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters\n"\ + "Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters\n"\ "are not enabled or a bias term (like in Qwen) is used." ) pass @@ -1795,7 +1795,7 @@ def patch_peft_model( n_o += 1 else: logger.warning_once( - "Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters\n"\ + "Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters\n"\ "are not enabled or a bias term (like in Qwen) is used." ) pass From 3193cac8813d38cb9f7c57cb02ad7c09fb8b5b51 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 20:35:24 +1000 Subject: [PATCH 019/153] is_bfloat16_supported --- unsloth/__init__.py | 6 ++++++ unsloth/trainer.py | 1 + 2 files changed, 7 insertions(+) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 2dcf1e6a..05755d43 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -38,6 +38,12 @@ try: import torch + + # Fix up is_bf16_supported https://github.com/unslothai/unsloth/issues/504 + major_version, minor_version = torch.cuda.get_device_capability() + SUPPORTS_BFLOAT16 = (major_version >= 8) + def is_bf16_supported(): return SUPPORTS_BFLOAT16 + torch.cuda.is_bf16_supported = is_bf16_supported except: raise ImportError("Pytorch is not installed. Go to https://pytorch.org/.\n"\ "We have some installation instructions on our Github page.") diff --git a/unsloth/trainer.py b/unsloth/trainer.py index b234a98d..c8e00be2 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -16,6 +16,7 @@ from typing import Optional from transformers import TrainingArguments from trl import SFTTrainer +from . 
import is_bfloat16_supported __all__ = [ "UnslothTrainingArguments", From dfeaf4bf116226cdcae339135d90168c7e45f582 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 20:37:48 +1000 Subject: [PATCH 020/153] Update __init__.py --- unsloth/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 05755d43..c8f4ca10 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -38,15 +38,16 @@ try: import torch - - # Fix up is_bf16_supported https://github.com/unslothai/unsloth/issues/504 - major_version, minor_version = torch.cuda.get_device_capability() - SUPPORTS_BFLOAT16 = (major_version >= 8) - def is_bf16_supported(): return SUPPORTS_BFLOAT16 - torch.cuda.is_bf16_supported = is_bf16_supported except: raise ImportError("Pytorch is not installed. Go to https://pytorch.org/.\n"\ "We have some installation instructions on our Github page.") +pass + +# Fix up is_bf16_supported https://github.com/unslothai/unsloth/issues/504 +major_version, minor_version = torch.cuda.get_device_capability() +SUPPORTS_BFLOAT16 = (major_version >= 8) +def is_bf16_supported(): return SUPPORTS_BFLOAT16 +torch.cuda.is_bf16_supported = is_bf16_supported # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 From 1e84090231fa2157bb2695b91044e398c2fa9b6d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 23 May 2024 04:12:45 +1000 Subject: [PATCH 021/153] Mistral v3 --- unsloth/__init__.py | 1 + unsloth/models/mapper.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index c8f4ca10..d85eca00 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -17,6 +17,7 @@ # Currently only supports 1 GPU, or else seg faults will occur. if "CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" devices = os.environ["CUDA_VISIBLE_DEVICES"] # Check if there are multiple cuda devices set in env if not devices.isdigit(): diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index b4fbe573..29896ef2 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -144,6 +144,14 @@ "unsloth/Phi-3-mini-4k-instruct", "microsoft/Phi-3-mini-4k-instruct", ), + "unsloth/mistral-7b-v0.3-bnb-4bit" : ( + "unsloth/mistral-7b-v0.3", + "mistralai/Mistral-7B-v0.3", + ), + "unsloth/mistral-7b-instruct-v0.3-bnb-4bit" : ( + "unsloth/mistral-7b-instruct-v0.3", + "mistralai/Mistral-7B-Instruct-v0.3", + ), } INT_TO_FLOAT_MAPPER = {} From 57ad8e784645d3b5f437d5edfa8486d8998a9829 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 24 May 2024 03:03:56 +1000 Subject: [PATCH 022/153] Phi 3 medium --- unsloth/models/_utils.py | 13 +++++++++---- unsloth/models/mapper.py | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2c1eb4d5..0217c7b5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -159,7 +159,7 @@ def patch_tokenizer(model, tokenizer): Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! 
Fixes https://github.com/unslothai/unsloth/issues/5 """ - possible_reserved_tokens = ("<|reserved", "<|placeholder",) + possible_reserved_tokens = ("<|reserved", "<|placeholder", "[control") if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -176,14 +176,19 @@ def patch_tokenizer(model, tokenizer): if bad_pad_token: # Find a better pad token - added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] + aadded_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] possible_pad_token = None + n_possible_pad_tokens = 0 for added_token in added_tokens[::-1]: if added_token.startswith(possible_reserved_tokens): - possible_pad_token = added_token - break + if possible_pad_token is None: possible_pad_token = added_token + n_possible_pad_tokens += 1 + # We must see at least 3 of the reserved tokens + if n_possible_pad_tokens >= 3: break pass pass + if n_possible_pad_tokens < 3: possible_pad_token = None + if possible_pad_token is None: # Try unk_token possible_pad_token = tokenizer.unk_token diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 29896ef2..777f310c 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -152,6 +152,10 @@ "unsloth/mistral-7b-instruct-v0.3", "mistralai/Mistral-7B-Instruct-v0.3", ), + "unsloth/Phi-3-medium-4k-instruct-bnb-4bit" : ( + "unsloth/Phi-3-medium-4k-instruct", + "microsoft/Phi-3-medium-4k-instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 2b994b2ef0c4fe8cbb3c8fada9c1f1fc6c6bc46a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 24 May 2024 03:49:50 +1000 Subject: [PATCH 023/153] Update chat_templates.py --- unsloth/chat_templates.py | 40 ++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 3af4c4e9..e6b981ad 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -52,7 +52,7 @@ "{{ '>>> Assistant: ' }}"\ "{% endif %}" unsloth_eos_token = "eos_token" -CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token,) +CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False,) # Zephyr has no BOS! @@ -70,7 +70,7 @@ "{{ '<|assistant|>\n' }}"\ "{% endif %}" zephyr_eos_token = "eos_token" -CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token,) +CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False,) # ChatML has no BOS and not EOS! Rather <|im_start|> and <|im_end|> acts as BOS / EOS. @@ -88,7 +88,7 @@ "{{ '<|im_start|>assistant\n' }}"\ "{% endif %}" chatml_eos_token = "<|im_end|>" -CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token,) +CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True,) # Mistral Instruct doesn't allow system prompts, so we append it to the user message. @@ -115,7 +115,7 @@ "{% endif %}"\ "{% endfor %}" mistral_eos_token = "eos_token" -CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token,) +CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False,) # Adds BOS to every convo! And weird <> system messages. 
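The third element added to every `CHAT_TEMPLATES` entry above records whether the template's stop word (for example `<|im_end|>`) should be mapped onto the tokenizer's EOS token. A hedged usage sketch of `get_chat_template` as it stands in this patch (the tokenizer load below is only illustrative, since Unsloth normally returns the tokenizer from `FastLanguageModel.from_pretrained`, and argument defaults may differ in later versions):

```python
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template

tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-bnb-4bit")

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",   # looked up as (template, stop_word, map_eos_token)
    mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"},
)

messages = [
    {"role" : "user",      "content" : "Hello!"},
    {"role" : "assistant", "content" : "Hi there, how can I help?"},
]
text = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
print(text)
```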
@@ -141,7 +141,7 @@ "{% endif %}"\ "{% endfor %}" llama_eos_token = "eos_token" -CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token,) +CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False,) # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template @@ -167,7 +167,7 @@ "{{ 'ASSISTANT:' }}"\ "{% endif %}" vicuna_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token,) +CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False,) # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template @@ -193,7 +193,7 @@ "{{ '### Assistant:' }}"\ "{% endif %}" vicuna_old_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token,) +CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False,) # https://github.com/tatsu-lab/stanford_alpaca Changed for multi-turn convos @@ -219,7 +219,7 @@ "{{ '### Response:\n' }}"\ "{% endif %}" alpaca_eos_token = "eos_token" -CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token,) +CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False,) # https://huggingface.co/google/gemma-7b-it @@ -240,7 +240,7 @@ "{{ 'model\n' }}"\ "{% endif %}" gemma_eos_token = "" -CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token,) +CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True,) # Gemma with ChatML instead @@ -250,7 +250,7 @@ {"" : "<|im_start|>", "" : "<|im_end|>"}, "<|im_end|>", ) -CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token,) +CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True,) # Llama-3 @@ -264,27 +264,33 @@ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ "{% endif %}" llama3_template_eos_token = "eos_token" -CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token,) +CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False,) # Phi-3 phi3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ - "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% if message['role'] == 'user' %}"\ + "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% else %}"\ + "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% endif %}"\ "{% endfor %}"\ "{% if add_generation_prompt %}"\ "{{ '<|assistant|>\n' }}"\ "{% endif %}" phi3_template_eos_token = "<|end|>" -CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token,) +CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False,) def get_chat_template( tokenizer, chat_template = "chatml", mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, - map_eos_token = True, + map_eos_token = False, ): assert(type(map_eos_token) is bool) old_tokenizer = tokenizer @@ -319,7 +325,11 @@ def get_chat_template( elif type(chat_template) is str: - chat_template, stop_word = CHAT_TEMPLATES[chat_template] + chat_template, stop_word, yes_map_eos_token = CHAT_TEMPLATES[chat_template] + + # Check mapping to eos_token + if not map_eos_token and yes_map_eos_token: map_eos_token = True + if not yes_map_eos_token and map_eos_token: map_eos_token = False if type(stop_word) in (list, tuple,): token_mapping, stop_word = stop_word From ff8171fc1bb3fa23d1855bed71442bff2ea38b1f Mon Sep 17 00:00:00 2001 
From: Daniel Han-Chen Date: Fri, 24 May 2024 04:07:02 +1000 Subject: [PATCH 024/153] Update chat_templates.py --- unsloth/chat_templates.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index e6b981ad..3decdf7f 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -258,7 +258,13 @@ llama3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ - "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% if message['role'] == 'user' %}"\ + "{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% else %}"\ + "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% endif %}"\ "{% endfor %}"\ "{% if add_generation_prompt %}"\ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ @@ -290,7 +296,7 @@ def get_chat_template( tokenizer, chat_template = "chatml", mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, - map_eos_token = False, + map_eos_token = True, ): assert(type(map_eos_token) is bool) old_tokenizer = tokenizer From 5ca8b58b63585caac050cf0f84414c0dc7ec7281 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 24 May 2024 04:50:12 +1000 Subject: [PATCH 025/153] Phi-3 --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index ab06abe9..1d335101 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ -### Finetune Llama 3, Mistral & Gemma 2-5x faster with 80% less memory! +### Finetune Llama 3, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory! ![](https://i.ibb.co/sJ7RhGG/image-41.png) @@ -24,24 +24,24 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and |-----------|---------|--------|----------| | **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral v3 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) | 2.2x faster | 73% less | -| **Mistral v1 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Phi-3 (medium)** | [▶️ Start for free](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) | 2x faster | 50% less | +| **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | | **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **ORPO** | [▶️ Start for free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 2x faster | 50% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- Benchmarking compared to FA2 + Hugging Face combined. -- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- Also [Llama-3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing). [Mistral 7b v1 ChatML](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). [Mistral 7b v3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing). -- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. +- **Kaggle Notebooks** for [Llama 3 8B](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7B](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7B](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral 7B v3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) +- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text + ## 🦥 Unsloth.ai News -- 📣 NEW! Mistral v3 Base and Instruct now supported! 2x faster, 70% less VRAM notebooks for the [base model](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [instruct with ShareGPT](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) +- 📣 NEW! 
[Phi-3 medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) and [Phi-3 mini](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) support is here! +- 📣 NEW! [Mistral v3 Base](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [Mistral v3 Instruct](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) support is here! - 📣 NEW! Qwen1.5-7B, Qwen1.5-14B, Qwen1.5-32B, Qwen1.5-72B now work, courtesy of Firefly's PR [#428](https://github.com/unslothai/unsloth/pull/428) - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! -- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -195,15 +195,15 @@ dataset = load_dataset("json", data_files = {"train" : url}, split = "train") # 4bit pre quantized models we support for 4x faster downloading + no OOMs. fourbit_models = [ + "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster! + "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", + "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster! + "unsloth/llama-3-8b-Instruct-bnb-4bit", + "unsloth/llama-3-70b-bnb-4bit", + "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster! + "unsloth/Phi-3-medium-4k-instruct", "unsloth/mistral-7b-bnb-4bit", - "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", - "unsloth/llama-2-7b-bnb-4bit", - "unsloth/gemma-7b-bnb-4bit", - "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b - "unsloth/gemma-2b-bnb-4bit", - "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b - "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 - "unsloth/Phi-3-mini-4k-instruct-bnb-4bit", + "unsloth/gemma-7b-bnb-4bit", # Gemma 2.2x faster! 
] # More models at https://huggingface.co/unsloth model, tokenizer = FastLanguageModel.from_pretrained( From a1328f619d92b7ec391f40979129b0d1e78d714e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 24 May 2024 20:08:05 +1000 Subject: [PATCH 026/153] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index 033f6eb1..304d3bee 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -94,7 +94,7 @@ def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencep temp_tokenizer = model._saved_temp_tokenizer sentencepiece_model = False - file_location = f"{temporary_location}/{temp_tokenizer.name_or_path}" + file_location = os.path.join(temporary_location, temp_tokenizer.name_or_path) if not os.path.exists(file_location): os.makedirs(file_location) pass From fb296737878e747b804a43258ecef0eb0b0e6ef0 Mon Sep 17 00:00:00 2001 From: Michael Han <107991372+shimmyshimmer@users.noreply.github.com> Date: Sat, 25 May 2024 23:56:36 +1000 Subject: [PATCH 027/153] Update README.md Mistral v3 to Mistral v0.3 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d335101..3537b8de 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| | **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral v3 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) | 2.2x faster | 73% less | +| **Mistral v0.3 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) | 2.2x faster | 73% less | | **Phi-3 (medium)** | [▶️ Start for free](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) | 2x faster | 50% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | | **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | @@ -38,7 +38,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ## 🦥 Unsloth.ai News - 📣 NEW! [Phi-3 medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) and [Phi-3 mini](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) support is here! -- 📣 NEW! [Mistral v3 Base](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [Mistral v3 Instruct](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) support is here! +- 📣 NEW! [Mistral v0.3 Base](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [Mistral v0.3 Instruct](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) support is here! - 📣 NEW! Qwen1.5-7B, Qwen1.5-14B, Qwen1.5-32B, Qwen1.5-72B now work, courtesy of Firefly's PR [#428](https://github.com/unslothai/unsloth/pull/428) - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! 
Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! From fa85556f638ab7eadb7e483936740f08e2b5a42d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:02:30 +1000 Subject: [PATCH 028/153] Untrained tokens --- unsloth/models/loader.py | 16 ++--- unsloth/tokenizer_utils.py | 133 +++++++++++++++++++++++++++++-------- 2 files changed, 112 insertions(+), 37 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 86a0f5d7..b2f0e4ef 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -45,18 +45,18 @@ def _get_model_name(model_name, load_in_4bit = True): elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: new_model_name = INT_TO_FLOAT_MAPPER[model_name] - logger.warning_once( - f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ - f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." - ) + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ + # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." + # ) model_name = new_model_name elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: new_model_name = FLOAT_TO_INT_MAPPER[model_name] - logger.warning_once( - f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ - f"We shall load `{new_model_name}` for 4x faster loading." - ) + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ + # f"We shall load `{new_model_name}` for 4x faster loading." + # ) model_name = new_model_name pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a0349166..1240fc8e 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -554,7 +554,7 @@ def check_tokenizer( @torch.inference_mode -def fix_untrained_tokens(model, eps = 1e-16): +def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): """ Llama-3 for eg has untrained vectors in the base model. These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> @@ -568,30 +568,104 @@ def fix_untrained_tokens(model, eps = 1e-16): where_untrained = torch.where(indicator_untrained)[0] n_untrained = where_untrained.shape[0] n_trained = embedding_matrix.shape[0] - n_untrained - if n_untrained != 0: - print( - f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ - "We shall set them to the mean of the other trained tokens." 
+ + # Get set and actual tokens + where_untrained = where_untrained.tolist() + if len(where_untrained) == 0: return + + where_untrained_set = frozenset(where_untrained) + actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) + + # Check if tokenizer and training datasets have bad tokens + if_bad_first = False + if_bad_second = False + # Check tokenizer's chat template for any untrained tokens + if hasattr(tokenizer, "chat_template"): + chat_template = tokenizer.chat_template + if_bad_first = any(x in chat_template for x in actual_bad_tokens) + pass + + # Check the first 250, last 250 input_ids + size_dataset = len(train_dataset) + size = min(size_dataset, 250) + for j in range(size): + input_ids = train_dataset[j] + if "input_ids" in input_ids: + input_ids = input_ids["input_ids"] + if_bad = any(item in where_untrained_set for item in input_ids) + if if_bad: + if_bad_second = True + break + pass + pass + pass + + # Check last 250 + if not if_bad_second: + left = max(size_dataset-250, 0) + for j in range(left, size_dataset): + input_ids = train_dataset[j] + if "input_ids" in input_ids: + input_ids = input_ids["input_ids"] + if_bad = any(item in where_untrained_set for item in input_ids) + if if_bad: + if_bad_second = True + break + pass + pass + pass + pass + + # Check if bad tokens exists! + if not if_bad_first and not if_bad_second: return + + # Check if lm_head / embed_token are trainable! + bad_not_trainable = False + if not embedding_matrix.requires_grad: bad_not_trainable = True + if not lm_head_matrix .requires_grad: bad_not_trainable = True + + if bad_not_trainable: + raise ValueError( + 'Unsloth: Untrained tokens found, but embed_tokens & lm_head not trainable, causing NaNs. '\ + 'Restart then add `embed_tokens` & `lm_head` to '\ + '`FastLanguageModel.get_peft_model(target_modules = [..., "embed_tokens", "lm_head",])`', ) pass - # First set untrained to all 0s - sometimes it's not! 
1e-23 for bfloat16 - embedding_matrix[where_untrained] = 0 - lm_head_matrix [where_untrained] = 0 + # Get sum of all items + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) - # Find sum - sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) - sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + # Remove bad tokens + sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0) + sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0) # Find correct average by dividing by sum of trained tokens - mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) - mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + mean_embedding = (sum_embedding / n_trained) + mean_lm_head = (sum_lm_head / n_trained) + + # Scale by the smallest correct item to make distribution correct + smallest_items = torch.amin(embedding_matrix, axis = 1).abs() + smallest_items[where_untrained] = torch.inf + smallest_item = smallest_items.min().abs() + mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.1 + mean_embedding = mean_embedding.to(embedding_matrix.dtype) + + # Do for lm_head + smallest_items = torch.amin(lm_head_matrix, axis = 1).abs() + smallest_items[where_untrained] = torch.inf + smallest_item = smallest_items.min().abs() + mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.1 + mean_lm_head = mean_lm_head.to(lm_head_matrix.dtype) # Set them to the mean + logger.warning( + "Unsloth: Setting embed_tokens & lm_head untrained tokens to "\ + "mean(trained) to counteract NaNs during training." + ) embedding_matrix[where_untrained] = mean_embedding lm_head_matrix [where_untrained] = mean_lm_head - - return mean_embedding, mean_lm_head + return pass @@ -610,24 +684,24 @@ def mean_of_trained_tokens(model, eps = 1e-16): where_untrained = torch.where(indicator_untrained)[0] n_untrained = where_untrained.shape[0] n_trained = embedding_matrix.shape[0] - n_untrained - if n_untrained != 0: - print( - f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ - "We shall set them to the mean of the other trained tokens." - ) - pass + # if n_untrained != 0: + # print( + # f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + # "We shall set them to the mean of the other trained tokens." + # ) + # pass - # First set untrained to all 0s - sometimes it's not! 
1e-23 for bfloat16 - embedding_matrix[where_untrained] = 0 - lm_head_matrix [where_untrained] = 0 + # Get sum of all items + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) - # Find sum - sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) - sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + # Remove bad tokens + sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0) + sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0) # Find correct average by dividing by sum of trained tokens - mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) - mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + mean_embedding = (sum_embedding / n_trained) + mean_lm_head = (sum_lm_head / n_trained) return mean_embedding, mean_lm_head pass @@ -734,6 +808,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ + "print(self.model)\n"\ "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ From c511aca47ff849a522589af384cb1eaf45f27e09 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:08:42 +1000 Subject: [PATCH 029/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 1240fc8e..7dfa9d02 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -808,7 +808,6 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(self.model)\n"\ "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ @@ -824,6 +823,25 @@ def patch_sft_trainer_tokenizer(): exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass + + # Patch _prepare_dataset + replacer = "if dataset is None:" + function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer._prepare_dataset")) + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + + check_text = \ + "\n"\ + "print(dir(self))\n\n" + + check_text = check_text.split("\n") + check_text = "\n".join(" "*where + x for x in check_text) + + function = function.replace(replacer, check_text + replacer) + exec(function, globals()) + + exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass patch_sft_trainer_tokenizer() From 35e7355f7584dc00257c0a9697f0fb1ee79215e3 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:10:48 +1000 Subject: [PATCH 030/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7dfa9d02..376ae97f 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -825,8 +825,8 @@ def patch_sft_trainer_tokenizer(): pass # Patch _prepare_dataset - replacer = "if dataset is None:" - function = 
getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer._prepare_dataset")) + function_name, replacer = "_prepare_dataset", "if dataset is None:" + function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") function = function.split("\n") function = "\n".join(x[where:] for x in function) From cc0bf44e78b1bcc0322a029df9acea999a247c07 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:17:05 +1000 Subject: [PATCH 031/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 376ae97f..3ba60e06 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -824,8 +824,8 @@ def patch_sft_trainer_tokenizer(): exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass - # Patch _prepare_dataset - function_name, replacer = "_prepare_dataset", "if dataset is None:" + # Patch __init__ with fix_untrained_tokens + function_name, replacer = "__init__", "if self.args.max_steps > 0 and packing:" function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") function = function.split("\n") @@ -833,7 +833,8 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(dir(self))\n\n" + "print('Fixing!')\n" + "fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 674ba66d0556e71f0905b5709a488851910d1472 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:21:16 +1000 Subject: [PATCH 032/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 3ba60e06..2208d886 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -825,7 +825,7 @@ def patch_sft_trainer_tokenizer(): pass # Patch __init__ with fix_untrained_tokens - function_name, replacer = "__init__", "if self.args.max_steps > 0 and packing:" + function_name, replacer = "train", "if resume_from_checkpoint is False:" function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") function = function.split("\n") From 9823f52bd2aad44728d575738b32724c94298281 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:22:07 +1000 Subject: [PATCH 033/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 2208d886..ec7cc6cb 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ "print('Fixing!')\n" - "fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16)\n\n" + "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From c0c761b19423ed964c4b01a59b5c4baf3d652de7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:23:10 +1000 Subject: [PATCH 034/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index ec7cc6cb..bcdd2c65 
100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -792,6 +792,7 @@ def add_new_tokens( from inspect import getsource import trl.trainer.sft_trainer from trl.trainer.sft_trainer import * +from transformers.trainer import * def patch_sft_trainer_tokenizer(): """ From e2850c07a4f77dce31ae688c97621adcd776744c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:26:15 +1000 Subject: [PATCH 035/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index bcdd2c65..b1aad831 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,6 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print('Fixing!')\n" "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 8e12780357988263d02326dc949c3f1464982c8d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:26:49 +1000 Subject: [PATCH 036/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index b1aad831..f5a5cb9f 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,6 +834,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ + "print(self.train_dataset)\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 6f1855e50ff8053235cc3c901388bfd6c752dd8e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:29:21 +1000 Subject: [PATCH 037/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f5a5cb9f..5225e251 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -580,8 +580,8 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): if_bad_first = False if_bad_second = False # Check tokenizer's chat template for any untrained tokens - if hasattr(tokenizer, "chat_template"): - chat_template = tokenizer.chat_template + chat_template = getattr(tokenizer, "chat_template", None): + if chat_template is not None: if_bad_first = any(x in chat_template for x in actual_bad_tokens) pass From d27b173423c1d734b9e97e50cff96302ac3bbcfe Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:30:15 +1000 Subject: [PATCH 038/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 5225e251..f39508ca 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -580,7 +580,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): if_bad_first = False if_bad_second = False # Check tokenizer's chat template for any untrained tokens - chat_template = getattr(tokenizer, "chat_template", None): + chat_template = getattr(tokenizer, "chat_template", None) if chat_template is not None: if_bad_first = any(x in chat_template for x in actual_bad_tokens) pass From 7bf7399e179769bd947614a7a294a10633ac9b94 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:43:11 +1000 Subject: [PATCH 039/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f39508ca..0f799859 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -553,7 +553,7 @@ def check_tokenizer( pass -@torch.inference_mode +@torch.no_grad def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): """ Llama-3 for eg has untrained vectors in the base model. From 31ecef989a290cc53fe23fcc0d4626418eb606f8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:46:38 +1000 Subject: [PATCH 040/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0f799859..d74bedb4 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -553,15 +553,15 @@ def check_tokenizer( pass -@torch.no_grad +@torch.inference_mode def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): """ Llama-3 for eg has untrained vectors in the base model. These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> We reset them to the mean of the rest of the tokens """ - embedding_matrix = model.get_input_embeddings ().weight.data - lm_head_matrix = model.get_output_embeddings().weight.data + embedding_matrix = model.get_input_embeddings ().weight + lm_head_matrix = model.get_output_embeddings().weight # Get untrained tokens indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps @@ -676,8 +676,8 @@ def mean_of_trained_tokens(model, eps = 1e-16): These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> We reset them to the mean of the rest of the tokens """ - embedding_matrix = model.get_input_embeddings ().weight.data.clone() - lm_head_matrix = model.get_output_embeddings().weight.data.clone() + embedding_matrix = model.get_input_embeddings ().weight.clone() + lm_head_matrix = model.get_output_embeddings().weight.clone() # Get untrained tokens indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps From b67d93f7391d014b90263aee3c9461a01103a1d9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:47:17 +1000 Subject: [PATCH 041/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index d74bedb4..712c9db7 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -750,8 +750,8 @@ def add_new_tokens( # If we use interpolation, we interpolate between the mean embeddings and # the Word2Vec sum of the other vectors - embedding_matrix = model.get_input_embeddings ().weight.data - lm_head_matrix = model.get_output_embeddings().weight.data + embedding_matrix = model.get_input_embeddings ().weight + lm_head_matrix = model.get_output_embeddings().weight if method == "interpolation": print( From e874ccdbb39cfc63284fd3b953a61b77bd8da758 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:47:54 +1000 Subject: [PATCH 042/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 712c9db7..df2809e9 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,6 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(self.train_dataset)\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" 
check_text = check_text.split("\n") From d7b54ffeee9fbd5be61c6a656737ac5fcd0ddaf7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 17:55:19 +1000 Subject: [PATCH 043/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index df2809e9..6a8dc954 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -648,14 +648,14 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): smallest_items = torch.amin(embedding_matrix, axis = 1).abs() smallest_items[where_untrained] = torch.inf smallest_item = smallest_items.min().abs() - mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.1 + mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.01 mean_embedding = mean_embedding.to(embedding_matrix.dtype) # Do for lm_head smallest_items = torch.amin(lm_head_matrix, axis = 1).abs() smallest_items[where_untrained] = torch.inf smallest_item = smallest_items.min().abs() - mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.1 + mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.01 mean_lm_head = mean_lm_head.to(lm_head_matrix.dtype) # Set them to the mean From 5a4a512db9afad17c3db1bd347199f5054a5168e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 19:59:06 +1000 Subject: [PATCH 044/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 48 ++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 6a8dc954..3e84f9ce 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -20,6 +20,10 @@ from transformers.models.llama.modeling_llama import logger from peft import PeftModelForCausalLM import torch +import itertools +import collections +import numpy as np +import gc __all__ = [ "load_correct_tokenizer", @@ -274,12 +278,10 @@ def fix_sentencepiece_gguf(saved_location): user defined tokens. 
Inspiration from https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py """ - import numpy as np from copy import deepcopy from transformers.utils import sentencepiece_model_pb2 import json from enum import IntEnum - import os class SentencePieceTokenTypes(IntEnum): NORMAL = 1 @@ -632,6 +634,15 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): ) pass + # Count all the possible bad tokens + final_counts = np.zeros(len(tokenizer), dtype = np.int64) + def mapping(examples): + input_ids = examples["input_ids"] + counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype = np.int32) + np.add.at(final_counts, counter, 1) + pass + train_dataset.map(mapping, batched = True, desc = "Counting untrained tokens") + # Get sum of all items sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) @@ -644,27 +655,28 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): mean_embedding = (sum_embedding / n_trained) mean_lm_head = (sum_lm_head / n_trained) - # Scale by the smallest correct item to make distribution correct - smallest_items = torch.amin(embedding_matrix, axis = 1).abs() - smallest_items[where_untrained] = torch.inf - smallest_item = smallest_items.min().abs() - mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.01 - mean_embedding = mean_embedding.to(embedding_matrix.dtype) - - # Do for lm_head - smallest_items = torch.amin(lm_head_matrix, axis = 1).abs() - smallest_items[where_untrained] = torch.inf - smallest_item = smallest_items.min().abs() - mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.01 - mean_lm_head = mean_lm_head.to(lm_head_matrix.dtype) + # Scale each to be equal to 1/max_frequency. Also set some to 0 if none seen + scaling = final_counts[where_untrained] / max(final_counts.max(), 1) + scaling = torch.tensor(scaling, device = mean_embedding.device).unsqueeze(1) + mean_embedding = mean_embedding.repeat((n_untrained, 1,)) * scaling + mean_lm_head = mean_lm_head .repeat((n_untrained, 1,)) * scaling + where_null = scaling.ravel() == 0 + mean_embedding[where_null] = 0 + mean_lm_head [where_null] = 0 # Set them to the mean logger.warning( "Unsloth: Setting embed_tokens & lm_head untrained tokens to "\ "mean(trained) to counteract NaNs during training." 
) - embedding_matrix[where_untrained] = mean_embedding - lm_head_matrix [where_untrained] = mean_lm_head + embedding_matrix[where_untrained] = mean_embedding.to(embedding_matrix.dtype) + lm_head_matrix [where_untrained] = mean_lm_head .to(lm_head_matrix .dtype) + + # Clean up + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass return pass @@ -825,7 +837,7 @@ def patch_sft_trainer_tokenizer(): exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass - # Patch __init__ with fix_untrained_tokens + # Patch train with fix_untrained_tokens function_name, replacer = "train", "if resume_from_checkpoint is False:" function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") From 82c040e3d634681206bd5e9a996e1d226c0c0958 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:14:09 +1000 Subject: [PATCH 045/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 3e84f9ce..4669f8bd 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,6 +846,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ + "print(dir(self))\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 8e227b2391958b2007144e145086c6f82c608e10 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:21:18 +1000 Subject: [PATCH 046/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 4669f8bd..0ca2967c 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,7 +846,12 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(dir(self))\n"\ + "if Trainer._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ + " raise RuntimeError(\n"\ + " 'You must not edit specific areas of Unsloth's codebase since you'll make it slower.\n'"\ + " 'Please revert your changes back otherwise you might get CUDA segfaults.\n'"\ + " )\n"\ + "pass\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 250d386060e50c2a80840b09bb20e33aa3a81620 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:24:11 +1000 Subject: [PATCH 047/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0ca2967c..7521e1f7 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -848,8 +848,7 @@ def patch_sft_trainer_tokenizer(): "\n"\ "if Trainer._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ " raise RuntimeError(\n"\ - " 'You must not edit specific areas of Unsloth's codebase since you'll make it slower.\n'"\ - " 'Please revert your changes back otherwise you might get CUDA segfaults.\n'"\ + " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ " )\n"\ "pass\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" From e6db3bae1f211d70fc0ffb6665308d149d08cf1d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:30:51 +1000 Subject: [PATCH 048/153] Update llama.py --- 
unsloth/models/llama.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 1d6a282a..31455630 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1166,7 +1166,7 @@ def from_pretrained( except: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) @@ -1194,7 +1194,7 @@ def from_pretrained( f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning_once(debug_info) + logger.warning(debug_info) import gc for _ in range(3): gc.collect() @@ -1209,7 +1209,7 @@ def from_pretrained( if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) @@ -1238,9 +1238,10 @@ def from_pretrained( n_total_devices = total_batches // ga // bsz if n_total_devices > 2: logger.warning_once( - "Please consider a commercial license - Unsloth was designed for the GPU Poor.\\n" - "The OSS currently works on 4 GPUs - we're a 2 person team, so please help fund\\n" - "our development costs by supporting us through Ko-fi or buying a license! Thanks!", + "Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) divisor = n_total_devices / 2 bsz = self._train_batch_size = max(int(bsz / divisor), 1) @@ -1267,7 +1268,7 @@ def from_pretrained( if "n_total_devices >" not in inner_training_loop: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) @@ -1703,7 +1704,7 @@ def patch_peft_model( if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) From e673fa26f38fd2c987401a04aadf7181c183e1a4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:05:26 +1000 Subject: [PATCH 049/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7521e1f7..d33ea621 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,7 +846,8 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "if Trainer._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ + "print(self.args)\n"\ + "if self._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ " raise RuntimeError(\n"\ " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ " )\n"\ From 222b8355bc1fe7b37be555fd590763be1972329f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:23:21 +1000 Subject: [PATCH 050/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index d33ea621..f9a175f9 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -852,7 +852,17 @@ def patch_sft_trainer_tokenizer(): " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ " )\n"\ "pass\n"\ - "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" + "n_devices = torch.cuda.device_count()\n"\ + "more_than = 0\n"\ + "for j in range(n_devices):\n"\ + " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ + " more_than += (vram > 4)\n"\ + "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')"\ + "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n"\ + "for _ in range(3):\n"\ + " gc.collect()\n"\ + " torch.cuda.empty_cache()\n"\ + "pass\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 6404aa563cbb29f8e02a2357cc3052272c7dfeb6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:23:35 +1000 Subject: [PATCH 051/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f9a175f9..903c2e85 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -857,7 +857,7 @@ def patch_sft_trainer_tokenizer(): "for j in range(n_devices):\n"\ " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ - "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')"\ + "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')\n"\ 
"fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n"\ "for _ in range(3):\n"\ " gc.collect()\n"\ From cfea7b2d237ad8da6ec8453acccaa1781fea9f55 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:25:24 +1000 Subject: [PATCH 052/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 903c2e85..8ed7142d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,10 +846,9 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(self.args)\n"\ - "if self._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ + "if self._inner_training_loop.__name__ != '_fast_inner_training_loop':\n"\ " raise RuntimeError(\n"\ - " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ + " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'\n"\ " )\n"\ "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ From 083e5ba5179bd0d3346879f74310787c5e57543d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 03:03:04 +1000 Subject: [PATCH 053/153] Update save.py --- unsloth/save.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index 304d3bee..8840dedc 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1390,6 +1390,20 @@ def unsloth_save_pretrained_gguf( model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) + + # Check if BOS added already, then warn + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + chat_template = getattr(tokenizer, "chat_template", None) + if chat_template is not None and tokenizer.bos_token in chat_template: + logger.warning( + "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." + ) + pass + pass + + # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) @@ -1513,6 +1527,20 @@ def unsloth_push_to_hub_gguf( model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) + + # Check if BOS added already, then warn + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + chat_template = getattr(tokenizer, "chat_template", None) + if chat_template is not None and tokenizer.bos_token in chat_template: + logger.warning( + "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." 
+ ) + pass + pass + + # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) From 6f2565cfe4d4f36c61e3a31ca757d5bce587533b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 03:23:40 +1000 Subject: [PATCH 054/153] Update save.py --- unsloth/save.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 8840dedc..74b06d3e 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1394,7 +1394,8 @@ def unsloth_save_pretrained_gguf( # Check if BOS added already, then warn if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and tokenizer.bos_token in chat_template: + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): logger.warning( "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ @@ -1531,7 +1532,8 @@ def unsloth_push_to_hub_gguf( # Check if BOS added already, then warn if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and tokenizer.bos_token in chat_template: + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): logger.warning( "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ From c19b04ecb4d27445747d2f76648271bf11b45af0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 03:39:45 +1000 Subject: [PATCH 055/153] Update save.py --- unsloth/save.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 74b06d3e..7af62809 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1392,14 +1392,15 @@ def unsloth_save_pretrained_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Check if BOS added already, then warn + print_bos_token_message = False if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) if chat_template is not None and \ (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + print_bos_token_message = True logger.warning( - "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." 
) pass pass @@ -1420,6 +1421,13 @@ def unsloth_save_pretrained_gguf( new_save_directory.lstrip('/.') print(f"Saved GGUF to https://huggingface.co/{link}") pass + + if print_bos_token_message: + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." + ) + pass pass @@ -1530,14 +1538,15 @@ def unsloth_push_to_hub_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Check if BOS added already, then warn + print_bos_token_message = False if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) if chat_template is not None and \ (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + print_bos_token_message = True logger.warning( - "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." ) pass pass @@ -1555,7 +1564,15 @@ def unsloth_push_to_hub_gguf( link = f"{username}/{new_save_directory.lstrip('/.')}" \ if username not in new_save_directory else \ new_save_directory.lstrip('/.') + print(f"Saved GGUF to https://huggingface.co/{link}") + + if print_bos_token_message: + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." 
+ ) + pass pass From 64b12a2d2ec243f803f43dcdda9e1317ad2fd514 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 20:23:02 +1000 Subject: [PATCH 056/153] checkpoint --- unsloth/models/_utils.py | 2 +- unsloth/models/llama.py | 2 +- unsloth/tokenizer_utils.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a2d4d50c..22fb5114 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -381,7 +381,7 @@ class Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): def forward(ctx, forward_function, hidden_states, *args): saved_hidden_states = hidden_states.to("cpu", non_blocking = True) with torch.no_grad(): - (output,) = forward_function(hidden_states, *args) + output = forward_function(hidden_states, *args) ctx.save_for_backward(saved_hidden_states) ctx.forward_function = forward_function ctx.args = args diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 31455630..9aeb55e4 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -657,7 +657,7 @@ def LlamaModel_fast_forward( past_key_values, output_attentions, use_cache, - ) + )[0] elif gradient_checkpointing: def create_custom_forward(module): diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 8ed7142d..03f3e341 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -857,11 +857,12 @@ def patch_sft_trainer_tokenizer(): " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')\n"\ - "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n"\ "for _ in range(3):\n"\ " gc.collect()\n"\ " torch.cuda.empty_cache()\n"\ - "pass\n\n" + "pass\n"\ + "\n"\ + "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 196faeca19336479ee3a10449a5538d5b1978d88 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 04:37:59 +1000 Subject: [PATCH 057/153] Update _utils.py --- unsloth/models/_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 22fb5114..b7333f00 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -79,6 +79,7 @@ "offload_input_embeddings", "offload_output_embeddings", "is_bfloat16_supported", + "unsloth_offloaded_gradient_checkpoint", ] @@ -402,6 +403,12 @@ def backward(ctx, dY): pass +@torch._disable_dynamo +def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, **kwargs): + return Unsloth_Offloaded_Gradient_Checkpointer.apply(function, *args) +pass + + """ Remove warnings about missing kwargs """ From 235be40450a4a766a5de66be15f77c09059b2081 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 14:18:27 +1000 Subject: [PATCH 058/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 03f3e341..f0ea73be 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -577,6 +577,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) + 
print(actual_bad_tokens) # Check if tokenizer and training datasets have bad tokens if_bad_first = False From cf9090acf80ada275286e90b11f1d351c6684bee Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 14:22:15 +1000 Subject: [PATCH 059/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f0ea73be..d7f86457 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -577,7 +577,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) - print(actual_bad_tokens) + print(where_untrained) # Check if tokenizer and training datasets have bad tokens if_bad_first = False From 1fb11107a92d6912556730736ec52c84689e9781 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 14:28:17 +1000 Subject: [PATCH 060/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index d7f86457..6e4d6910 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -577,8 +577,9 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) - print(where_untrained) - + # Remove None items in actual_bad_tokens + actual_bad_tokens = [x for x in actual_bad_tokens if x is not None] + # Check if tokenizer and training datasets have bad tokens if_bad_first = False if_bad_second = False From d1bd60cb90d2c795e767fee71ca8266a666136f3 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 20:06:13 +1000 Subject: [PATCH 061/153] Update llama.py --- unsloth/models/llama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9aeb55e4..2390146f 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1336,18 +1336,18 @@ def post_patch(model): layers = model.model.layers # Torch.compile fails on embedding matrix?? - # Workaround randomnly fixes it for torch versions < 2.2 - model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight) + # Workaround randomnly fixes it for torch versions < 2. + model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) model.config.update({"unsloth_version" : __version__}) # We also do this for the lm_head lm_head = torch.nn.Linear(1, 1, bias = None) del lm_head.weight - lm_head.weight = model.lm_head.weight + lm_head.weight = model.get_output_embeddings().weight lm_head.in_features = lm_head.weight.shape[1] lm_head.out_features = lm_head.weight.shape[0] model.lm_head = lm_head - + # Also patch all dtypes - BnB seems to not allocate the correct type? # BnB default dtype seems to be float16! 
correct_dtype = lm_head.weight.dtype From 732ead0e4053bd7b36bfcf75f0fca76e1c0884e4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:29:36 +1000 Subject: [PATCH 062/153] accelerate --- unsloth/models/_utils.py | 14 ++++++++++++++ unsloth/models/llama.py | 2 ++ 2 files changed, 16 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index b7333f00..de6b864c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -442,6 +442,20 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, pass +# Fix up Accelerate +import accelerate.accelerator +prepare = inspect.getsource(accelerate.accelerator.Accelerator.prepare) +prepare = prepare.split("\n") +spaces = prepare[0].find("def") +prepare = "\n".join(x[spaces:] for x in prepare) +replace = "for obj in args:" +s = " "*spaces +prepare = prepare.replace(replace, f'self.distributed_type = DistributedType.MULTI_CPU\n{s}{replace}', 1) +prepare = prepare.replace("prepare", "_fast_prepare") +exec(prepare, globals()) +accelerate.accelerator.Accelerator.prepare = _fast_prepare + + # Offloading to disk for modules (lm_head, embed_tokens) import os import pickle diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2390146f..7dec8624 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1277,6 +1277,7 @@ def from_pretrained( "is_sagemaker_mp_enabled()", "False", ) + exec(inner_training_loop, globals()) Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length @@ -1316,6 +1317,7 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + Trainer._inner_training_loop = _fast_inner_training_loop # Save tokenizer for inference purposes tokenizer.padding_side = "left" # Force inference From 359ae5c134c8cfb037d8735d64d3cfa8e3866369 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:43:36 +1000 Subject: [PATCH 063/153] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index de6b864c..6ba0fd33 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -444,6 +444,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, # Fix up Accelerate import accelerate.accelerator +from accelerate.utils.dataclasses import DistributedType prepare = inspect.getsource(accelerate.accelerator.Accelerator.prepare) prepare = prepare.split("\n") spaces = prepare[0].find("def") From 8dcfad3ad7eb8fcfd097320fb1b29384ff70bd98 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:46:50 +1000 Subject: [PATCH 064/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 6ba0fd33..d9d75bbd 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'self.distributed_type = DistributedType.MULTI_CPU\n{s}{replace}', 1) +prepare = prepare.replace(replace, f'print(self.distributed_type)\n{s}{replace}', 1) prepare = prepare.replace("prepare", "_fast_prepare") exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = _fast_prepare From 2bafc57756d74ce4b771f3e6b71f7afcf8d54f25 Mon Sep 17 00:00:00 2001 
From: Daniel Han-Chen Date: Thu, 30 May 2024 22:49:59 +1000 Subject: [PATCH 065/153] Update _utils.py --- unsloth/models/_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d9d75bbd..0ab3ecfc 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -452,9 +452,8 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, replace = "for obj in args:" s = " "*spaces prepare = prepare.replace(replace, f'print(self.distributed_type)\n{s}{replace}', 1) -prepare = prepare.replace("prepare", "_fast_prepare") exec(prepare, globals()) -accelerate.accelerator.Accelerator.prepare = _fast_prepare +accelerate.accelerator.Accelerator.prepare = prepare # Offloading to disk for modules (lm_head, embed_tokens) From 90f631162c04217bc3835c3d5a2dce40462cf380 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:54:03 +1000 Subject: [PATCH 066/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0ab3ecfc..eb4fbac1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(replace, f'try: self.distributed_type = DistributedType.MULTI_CPU\n{s}except: pass\n{s}print(self.distributed_type)\n{s}{replace}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare From 7b84ff768d2970900e7eccdac22ad72864ab3c3a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 23:25:24 +1000 Subject: [PATCH 067/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index eb4fbac1..7b020426 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'try: self.distributed_type = DistributedType.MULTI_CPU\n{s}except: pass\n{s}print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(replace, f'self.state.distributed_type = DistributedType.NO\n{s}print(self.distributed_type)\n{s}{replace}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare From 60f4b9a1983c227c9918bfc2bd79452e9f65ad75 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 23:28:55 +1000 Subject: [PATCH 068/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 7b020426..173e5ddb 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'self.state.distributed_type = DistributedType.NO\n{s}print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(replace, 
f'self.state.distributed_type = DistributedType.MULTI_CPU\n{s}print(self.distributed_type)\n{s}{replace}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare From 3ebe5a5da4cce238da656d8b255b2c64099970f7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 23:36:40 +1000 Subject: [PATCH 069/153] Update _utils.py --- unsloth/models/_utils.py | 66 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 173e5ddb..bcd4a7b3 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -410,51 +410,51 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, """ - Remove warnings about missing kwargs + Remove warnings about missing kwargs and patch stuff """ -try: - from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod - from inspect import getsource - import re - BitsAndBytesConfig__init__ = getsource(BitsAndBytesConfig.__init__) - BitsAndBytesConfig__init__ = re.sub( - r"if[\s]{1,}kwargs\:[\s]{1,}.+?\n", - "", - BitsAndBytesConfig__init__, - flags = re.MULTILINE, - ) - BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.split("\n") - length_spaces = len(re.match(r"[\s]{1,}", BitsAndBytesConfig__init__[0]).group(0)) - BitsAndBytesConfig__init__ = "\n".join(x[length_spaces:] for x in BitsAndBytesConfig__init__) - BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.replace( - "__init__", - "_BitsAndBytesConfig__init__", - ) - exec(BitsAndBytesConfig__init__, globals()) - - import transformers.utils.quantization_config - transformers.utils.quantization_config.BitsAndBytesConfig.__init__ = _BitsAndBytesConfig__init__ -except: - logger.warning_once( - "Unsloth unsuccessfully patched bitsandbytes. Please file a bug report.\n"\ - "Luckily, your training run will still work in the meantime!" 
- ) +from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod +from inspect import getsource +from accelerate.utils.dataclasses import DistributedType +import re +BitsAndBytesConfig__init__ = getsource(BitsAndBytesConfig.__init__) +BitsAndBytesConfig__init__ = re.sub( + r"if[\s]{1,}kwargs\:[\s]{1,}.+?\n", + "", + BitsAndBytesConfig__init__, + flags = re.MULTILINE, +) +BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.split("\n") +length_spaces = len(re.match(r"[\s]{1,}", BitsAndBytesConfig__init__[0]).group(0)) +BitsAndBytesConfig__init__ = "\n".join(x[length_spaces:] for x in BitsAndBytesConfig__init__) +BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.replace( + "__init__", + "_BitsAndBytesConfig__init__", +) + +def _prepare_backend( + self, cpu: bool = False, sagemaker_dp = False, backend: str = None, +) -> tuple[str, DistributedType]: + return None, DistributedType.NO pass +import accelerate.state +accelerate.state.PartialState._prepare_backend = _prepare_backend - -# Fix up Accelerate import accelerate.accelerator -from accelerate.utils.dataclasses import DistributedType prepare = inspect.getsource(accelerate.accelerator.Accelerator.prepare) prepare = prepare.split("\n") spaces = prepare[0].find("def") prepare = "\n".join(x[spaces:] for x in prepare) -replace = "for obj in args:" +x = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'self.state.distributed_type = DistributedType.MULTI_CPU\n{s}print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(x, f'self.state.distributed_type = DistributedType.NO\n{s}{x}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare +exec(BitsAndBytesConfig__init__, globals()) + +import transformers.utils.quantization_config +transformers.utils.quantization_config.BitsAndBytesConfig.__init__ = _BitsAndBytesConfig__init__ + # Offloading to disk for modules (lm_head, embed_tokens) import os From 7bbc8cee218c66e8a9c2bd519c74098d477c6284 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:23:08 +1000 Subject: [PATCH 070/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 6e4d6910..7d003a1d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -579,7 +579,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) # Remove None items in actual_bad_tokens actual_bad_tokens = [x for x in actual_bad_tokens if x is not None] - + # Check if tokenizer and training datasets have bad tokens if_bad_first = False if_bad_second = False @@ -855,6 +855,7 @@ def patch_sft_trainer_tokenizer(): "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ "more_than = 0\n"\ + "print(n_devices)\n"\ "for j in range(n_devices):\n"\ " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ From 6f5c84c09a8859f429c4d889d14203993ed7c872 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:27:27 +1000 Subject: [PATCH 071/153] train_dataloader --- unsloth/models/llama.py | 3 ++- unsloth/tokenizer_utils.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7dec8624..2a184b8b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1236,6 +1236,7 @@ def 
from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz + print('N total devices = ', n_total_devices) if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" @@ -1349,7 +1350,7 @@ def post_patch(model): lm_head.in_features = lm_head.weight.shape[1] lm_head.out_features = lm_head.weight.shape[0] model.lm_head = lm_head - + # Also patch all dtypes - BnB seems to not allocate the correct type? # BnB default dtype seems to be float16! correct_dtype = lm_head.weight.dtype diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7d003a1d..6afea680 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -855,7 +855,6 @@ def patch_sft_trainer_tokenizer(): "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ "more_than = 0\n"\ - "print(n_devices)\n"\ "for j in range(n_devices):\n"\ " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ From 0d269ca54380fbc4e3a2d10aace295ee66d6b614 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:30:10 +1000 Subject: [PATCH 072/153] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2a184b8b..d0718310 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1237,6 +1237,9 @@ def from_pretrained( total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz print('N total devices = ', n_total_devices) + print('N total devices = ', n_total_devices) + print('N total devices = ', n_total_devices) + print('N total devices = ', n_total_devices) if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" From 6b7c142fd7fe575a44b6c4c4af1ee3c7b9b283bd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:32:18 +1000 Subject: [PATCH 073/153] Update llama.py --- unsloth/models/llama.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d0718310..bdb881f4 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1213,6 +1213,16 @@ def from_pretrained( "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) + n_devices = torch.cuda.device_count() + more_than = 0 + for j in range(n_devices): + vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024 + more_than += (vram > 4) + if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.') + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + debug_info =""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -1236,10 +1246,6 @@ def from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz - print('N total devices = ', n_total_devices) - print('N total devices = ', n_total_devices) - print('N total devices = ', n_total_devices) - print('N total devices = ', n_total_devices) if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" From 54f3a741df03f9440fe14e8630d1c6819a99690a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:34:30 +1000 Subject: [PATCH 074/153] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index bdb881f4..b9690b3d 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1214,15 +1214,16 @@ def from_pretrained( "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) n_devices = torch.cuda.device_count() + print(n_devices) more_than = 0 for j in range(n_devices): vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024 more_than += (vram > 4) + print(more_than) if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.') for _ in range(3): gc.collect() torch.cuda.empty_cache() - debug_info =""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) From 0bc96c54368377fcba90e09bb6d30d5d37d468ad Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:39:25 +1000 Subject: [PATCH 075/153] use_fast_convert --- unsloth/models/llama.py | 13 +------------ unsloth/save.py | 1 + 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index b9690b3d..7dec8624 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1213,17 +1213,6 @@ def from_pretrained( "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) - n_devices = torch.cuda.device_count() - print(n_devices) - more_than = 0 - for j in range(n_devices): - vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024 - more_than += (vram > 4) - print(more_than) - if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache() debug_info =""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -1360,7 +1349,7 @@ def post_patch(model): lm_head.in_features = lm_head.weight.shape[1] lm_head.out_features = lm_head.weight.shape[0] model.lm_head = lm_head - + # Also patch all dtypes - BnB seems to not allocate the correct type? # BnB default dtype seems to be float16! 
correct_dtype = lm_head.weight.dtype diff --git a/unsloth/save.py b/unsloth/save.py index 7af62809..5d6f925d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -975,6 +975,7 @@ def save_to_gguf( vocab_type = "bpe" pass + use_fast_convert = False if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ f"--outfile {final_location} --vocab-type {vocab_type} "\ From a8b5d894d18bc18f654701cf94f28a35b4280d86 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 03:56:12 +1000 Subject: [PATCH 076/153] Update save.py --- unsloth/save.py | 86 ++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 5d6f925d..93fc1b49 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1344,6 +1344,26 @@ def unsloth_save_pretrained_gguf( del arguments["quantization_method"] del arguments["first_conversion"] + # Check if BOS added already, then warn + fix_bos_token = False + chat_template = getattr(tokenizer, "chat_template", None) + new_chat_template = None + + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + + fix_bos_token = True + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." + ) + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) + tokenizer.chat_template = new_chat_template + + pass + pass + # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1386,26 +1406,17 @@ def unsloth_save_pretrained_gguf( pass pass + # Use old chat template if the bos is removed + if fix_bos_token: + tokenizer.chat_template = chat_template + pass + for _ in range(3): gc.collect() model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) - # Check if BOS added already, then warn - print_bos_token_message = False - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - print_bos_token_message = True - logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." - ) - pass - pass - # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, @@ -1422,13 +1433,6 @@ def unsloth_save_pretrained_gguf( new_save_directory.lstrip('/.') print(f"Saved GGUF to https://huggingface.co/{link}") pass - - if print_bos_token_message: - logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." 
- ) - pass pass @@ -1490,6 +1494,26 @@ def unsloth_push_to_hub_gguf( del arguments["quantization_method"] del arguments["first_conversion"] + # Check if BOS added already, then warn + fix_bos_token = False + chat_template = getattr(tokenizer, "chat_template", None) + new_chat_template = None + + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + + fix_bos_token = True + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." + ) + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) + tokenizer.chat_template = new_chat_template + + pass + pass + # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1532,26 +1556,17 @@ def unsloth_push_to_hub_gguf( pass pass + # Use old chat template if the bos is removed + if fix_bos_token: + tokenizer.chat_template = chat_template + pass + for _ in range(3): gc.collect() model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) - # Check if BOS added already, then warn - print_bos_token_message = False - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - print_bos_token_message = True - logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." - ) - pass - pass - # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, @@ -1579,7 +1594,6 @@ def unsloth_push_to_hub_gguf( def patch_saving_functions(model): import inspect - import re import types from typing import Callable, Optional, Union, List From 872d569f98304fe103effc478139f44155e36857 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 04:10:03 +1000 Subject: [PATCH 077/153] Update save.py --- unsloth/save.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 93fc1b49..b8e03bd2 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1355,7 +1355,7 @@ def unsloth_save_pretrained_gguf( fix_bos_token = True logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." ) new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) @@ -1505,7 +1505,7 @@ def unsloth_push_to_hub_gguf( fix_bos_token = True logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." 
) new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) From 3a1f5f27d782a3670cead876f70a601b62c27ec6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 04:26:37 +1000 Subject: [PATCH 078/153] Update save.py --- unsloth/save.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index b8e03bd2..574010ee 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1422,6 +1422,13 @@ def unsloth_save_pretrained_gguf( new_save_directory, quantization_method, first_conversion, makefile, ) + if fix_bos_token: + logger.warning( + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ + "Unsloth: ##### We removed in GGUF's chat template for you." + ) + pass + if push_to_hub: print("Unsloth: Uploading GGUF to Huggingface Hub...") username = upload_to_huggingface( @@ -1583,10 +1590,10 @@ def unsloth_push_to_hub_gguf( print(f"Saved GGUF to https://huggingface.co/{link}") - if print_bos_token_message: + if fix_bos_token: logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ + "Unsloth: ##### We removed in GGUF's chat template for you." ) pass pass From bcadc8cb997d803d84ed1834fbb0e253bd5255c8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 3 Jun 2024 02:52:30 +1000 Subject: [PATCH 079/153] Update save.py --- unsloth/save.py | 78 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 574010ee..1a3e532f 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1281,6 +1281,38 @@ def upload_to_huggingface( pass +def fix_tokenizer_bos_token(tokenizer): + # Check if BOS added already, then warn + fix_bos_token = False + chat_template = getattr(tokenizer, "chat_template", None) + + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + if chat_template is not None and \ + ( + tokenizer.bos_token in chat_template or \ + "{bos_token}" in chat_template.replace(" ", "") or \ + "{bos_token+" in chat_template.replace(" ", "") + ): + + fix_bos_token = True + logger.warning( + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ + "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." 
+ ) + + # Remove {{bos_token}} + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) + # Remove {{bos_token + + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template) + + tokenizer.chat_template = new_chat_template + + pass + pass + return fix_bos_token, chat_template +pass + + def unsloth_save_pretrained_gguf( self, save_directory : Union[str, os.PathLike], @@ -1344,25 +1376,8 @@ def unsloth_save_pretrained_gguf( del arguments["quantization_method"] del arguments["first_conversion"] - # Check if BOS added already, then warn - fix_bos_token = False - chat_template = getattr(tokenizer, "chat_template", None) - new_chat_template = None - - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - - fix_bos_token = True - logger.warning( - f"Unsloth: ##### The current model auto adds a BOS token.\n"\ - "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." - ) - new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) - tokenizer.chat_template = new_chat_template - - pass - pass + # Fix tokenizer adding an extra BOS token at the front + fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer) # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1408,7 +1423,7 @@ def unsloth_save_pretrained_gguf( # Use old chat template if the bos is removed if fix_bos_token: - tokenizer.chat_template = chat_template + tokenizer.chat_template = old_chat_template pass for _ in range(3): @@ -1501,25 +1516,8 @@ def unsloth_push_to_hub_gguf( del arguments["quantization_method"] del arguments["first_conversion"] - # Check if BOS added already, then warn - fix_bos_token = False - chat_template = getattr(tokenizer, "chat_template", None) - new_chat_template = None - - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - - fix_bos_token = True - logger.warning( - f"Unsloth: ##### The current model auto adds a BOS token.\n"\ - "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." 
- ) - new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) - tokenizer.chat_template = new_chat_template - - pass - pass + # Fix tokenizer adding an extra BOS token at the front + fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer) # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1565,7 +1563,7 @@ def unsloth_push_to_hub_gguf( # Use old chat template if the bos is removed if fix_bos_token: - tokenizer.chat_template = chat_template + tokenizer.chat_template = old_chat_template pass for _ in range(3): From 1381820342e3f04b2baeaef838555672bab160c8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 3 Jun 2024 03:22:59 +1000 Subject: [PATCH 080/153] remove_special_tokens --- unsloth/chat_templates.py | 26 +++++++++++++++++++++----- unsloth/models/_utils.py | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 3decdf7f..a5b7a196 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -16,6 +16,7 @@ "get_chat_template", "test_chat_templates", "test_hf_gguf_equivalence", + "remove_special_tokens", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -500,6 +501,19 @@ def get_chat_template( pass +def remove_special_tokens(tokenizer, prompt): + # Removes double BOS token + if tokenizer("A").input_ids[0] == tokenizer.bos_token_id: + input_ids = tokenizer(prompt).input_ids + for j, input_id in enumerate(input_ids): + if input_id != tokenizer.bos_token_id: break + input_ids = input_ids[j:] + prompt = tokenizer.decode(input_ids) + pass + return prompt +pass + + def create_stopping_criteria(tokenizer, stop_word = "eos_token"): class StoppingCriteriaSub(StoppingCriteria): __slots__ = "stop_token", "single_match", "length", @@ -670,7 +684,8 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") if tokenizer.chat_template is not None: prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) prompt = prompt.replace("'", "") # Subprocess does not like '' - prompts.append(prompts) + prompt = remove_special_tokens(tokenizer, prompt) + prompts.append(prompt) pass for prompt in prompts: @@ -688,9 +703,9 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") gguf_tokenized = re.findall("([\d]{1,}) \-\> \'([^\']{1,})\'", gguf_tokens, flags = re.MULTILINE) gguf_tokenized = [(int(x[0]), x[1],) for x in gguf_tokenized] input_ids = tokenizer(prompt).input_ids + tokens = tokenizer.batch_decode(input_ids) hf_tokenized = list(zip(input_ids, tokens)) - print(gguf_tokenized[:5]) # Compare to Huggingface for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)): @@ -698,9 +713,10 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") print("Failed GGUF != HF at", j) print("HF =", hf_token) print("GGUF =", gguf_token) - print(hf_tokenized[:j+1]) - print(gguf_tokenized[:j+1]) - print(gguf_tokens) + print(hf_tokenized) + print() + print(gguf_tokenized) + print() raise RuntimeError("Failed comparing GGUF to HF.") pass pass diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index bcd4a7b3..a6933893 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -31,7 +31,7 @@ import os import psutil -__version__ = "2024.5" +__version__ = "2024.6" # Get Flash Attention v2 if Ampere (RTX 30xx, A100) major_version, minor_version = 
torch.cuda.get_device_capability() From e01b87da7c81b4666b8f140dd62c7ccd93fce571 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 3 Jun 2024 04:55:35 +1000 Subject: [PATCH 081/153] Ollama --- unsloth/chat_templates.py | 165 ++++++++++++++++++++++++++++++++------ 1 file changed, 139 insertions(+), 26 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index a5b7a196..6d473f60 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -30,6 +30,7 @@ CHAT_TEMPLATES = {} +# =========================================== Unsloth # Unsloth efficient template leverages from Zephyr unsloth_template = \ "{{ bos_token }}"\ @@ -54,8 +55,9 @@ "{% endif %}" unsloth_eos_token = "eos_token" CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False,) +pass - +# =========================================== Zephyr # Zephyr has no BOS! zephyr_template = \ "{% for message in messages %}"\ @@ -72,8 +74,9 @@ "{% endif %}" zephyr_eos_token = "eos_token" CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False,) +pass - +# =========================================== ChatML # ChatML has no BOS and not EOS! Rather <|im_start|> and <|im_end|> acts as BOS / EOS. chatml_template = \ "{% for message in messages %}"\ @@ -88,10 +91,27 @@ "{% if add_generation_prompt %}"\ "{{ '<|im_start|>assistant\n' }}"\ "{% endif %}" -chatml_eos_token = "<|im_end|>" -CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True,) +pass +chatml_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ .Response }}<|im_end|> +""" +PARAMETER stop <|im_start|> +PARAMETER stop <|im_end|> +''' +chatml_eos_token = "<|im_end|>" +CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True, chatml_ollama,) +pass + +# =========================================== Mistral-1 # Mistral Instruct doesn't allow system prompts, so we append it to the user message. mistral_template = \ "{{ bos_token }}"\ @@ -117,8 +137,9 @@ "{% endfor %}" mistral_eos_token = "eos_token" CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False,) +pass - +# =========================================== Llama-2 # Adds BOS to every convo! And weird <> system messages. 
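# (For clarity: the "weird" system markers are the <<SYS>> ... <</SYS>> block that Llama-2
#  chat expects inside the first [INST] turn, and, as the note above says, a BOS token is
#  added for every conversation turn.)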
llama_template = \ "{% if messages[0]['role'] == 'system' %}"\ @@ -143,8 +164,9 @@ "{% endfor %}" llama_eos_token = "eos_token" CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False,) +pass - +# =========================================== Vicuna # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template vicuna_template = \ "{{ bos_token }}"\ @@ -169,8 +191,9 @@ "{% endif %}" vicuna_eos_token = "eos_token" CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False,) +pass - +# =========================================== Vicuna Old # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template vicuna_old_template = \ "{{ bos_token }}"\ @@ -195,8 +218,9 @@ "{% endif %}" vicuna_old_eos_token = "eos_token" CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False,) +pass - +# =========================================== Alpaca multi turn # https://github.com/tatsu-lab/stanford_alpaca Changed for multi-turn convos alpaca_template = \ "{{ bos_token }}"\ @@ -219,42 +243,98 @@ "{% if add_generation_prompt %}"\ "{{ '### Response:\n' }}"\ "{% endif %}" -alpaca_eos_token = "eos_token" -CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False,) +pass +alpaca_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} + +{{ end }}{{ if .Prompt }}### Instruction: +{{ .Prompt }} + +{{ end }}### Response: +{{ .Response }}{__EOS_TOKEN__} + +""" +PARAMETER stop {__EOS_TOKEN__} +''' + +alpaca_eos_token = "eos_token" +CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False, alpaca_ollama,) +pass +# =========================================== Gemma # https://huggingface.co/google/gemma-7b-it # Notice we must use |trim for lstrip and rstrip. maps to 106. # maps to 107. user and model are normal 1 word tokens. gemma_template = \ "{{ bos_token }}"\ + "{% if messages[0]['role'] == 'system' %}"\ + "{{'user\n' + messages[0]['content'] | trim + ' ' + messages[1]['content'] | trim + '\n'}}"\ + "{% set loop_messages = messages[2:] %}"\ + "{% endif %}"\ "{% for message in messages %}"\ "{% if message['role'] == 'user' %}"\ "{{'user\n' + message['content'] | trim + '\n'}}"\ "{% elif message['role'] == 'assistant' %}"\ "{{'model\n' + message['content'] | trim + '\n' }}"\ "{% else %}"\ - "{{ 'system\n' + message['content'] | trim + '\n' }}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ "{% endif %}"\ "{% endfor %}"\ "{% if add_generation_prompt %}"\ "{{ 'model\n' }}"\ "{% endif %}" -gemma_eos_token = "" -CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True,) +pass +gemma_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """user +{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} +model +{{ .Response }} +""" +PARAMETER repeat_penalty 1 +PARAMETER stop +PARAMETER stop +PARAMETER penalize_newline false +''' -# Gemma with ChatML instead +gemma_eos_token = "" +CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True, gemma_ollama,) +pass + +# =========================================== Gemma with ChatML instead # We find using is still more appropriate! 
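# (The Gemma turn markers referenced above are <start_of_turn>user / <start_of_turn>model,
#  closed by <end_of_turn>, so gemma_eos_token corresponds to <end_of_turn>. The ChatML
#  variant below instead remaps <start_of_turn> to <|im_start|> and the <eos> token to
#  <|im_end|>.)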
gemma_chatml_template = "{{ bos_token }}" + chatml_template +pass + +gemma_chatml_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ .Response }}<|im_end|> +""" +PARAMETER repeat_penalty 1 +PARAMETER stop <|im_start|> +PARAMETER stop <|im_end|> +PARAMETER penalize_newline false +''' + gemma_chatml_eos_token = ( {"" : "<|im_start|>", "" : "<|im_end|>"}, "<|im_end|>", ) -CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True,) - +CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True, gemma_chatml_ollama,) +pass -# Llama-3 +# =========================================== Llama-3 # Weirdly \n\n is needed? llama3_template = \ "{{ bos_token }}"\ @@ -270,11 +350,30 @@ "{% if add_generation_prompt %}"\ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ "{% endif %}" +pass + +llama3_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> + +{{ .Response }}<|eot_id|>""" +PARAMETER stop "<|start_header_id|>" +PARAMETER stop "<|end_header_id|>" +PARAMETER stop "<|eot_id|>" +PARAMETER stop "<|reserved_special_token" +''' + llama3_template_eos_token = "eos_token" -CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False,) +CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False, llama3_ollama,) +pass -# Phi-3 +# =========================================== Phi-3 phi3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ @@ -289,8 +388,26 @@ "{% if add_generation_prompt %}"\ "{{ '<|assistant|>\n' }}"\ "{% endif %}" +pass + +phi3_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|system|> +{{ .System }}<|end|> +{{ end }}{{ if .Prompt }}<|user|> +{{ .Prompt }}<|end|> +{{ end }}<|assistant|> +{{ .Response }}<|end|> +""" +PARAMETER stop <|end|> +PARAMETER stop <|user|> +PARAMETER stop <|assistant|> +''' + phi3_template_eos_token = "<|end|>" -CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False,) +CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,) +pass def get_chat_template( @@ -503,12 +620,8 @@ def get_chat_template( def remove_special_tokens(tokenizer, prompt): # Removes double BOS token - if tokenizer("A").input_ids[0] == tokenizer.bos_token_id: - input_ids = tokenizer(prompt).input_ids - for j, input_id in enumerate(input_ids): - if input_id != tokenizer.bos_token_id: break - input_ids = input_ids[j:] - prompt = tokenizer.decode(input_ids) + if prompt.startswith(tokenizer.bos_token): + prompt = prompt[len(tokenizer.bos_token):] pass return prompt pass From b3479c7bfb55bb23e841488bc610b6c939e17bd2 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 4 Jun 2024 03:36:03 +1000 Subject: [PATCH 082/153] Update chat_templates.py --- unsloth/chat_templates.py | 121 +++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 7 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 6d473f60..2509b09e 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -17,6 +17,7 @@ "test_chat_templates", "test_hf_gguf_equivalence", "remove_special_tokens", + 
"create_ollama_modelfile", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -53,8 +54,20 @@ "{% if add_generation_prompt %}"\ "{{ '>>> Assistant: ' }}"\ "{% endif %}" +pass + +unsloth_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} +{{ end }}{{ if .Prompt }}>>> User: {{ .Prompt }} +{{ end }}>>> Assistant: {{ .Response }}{__EOS_TOKEN__} +""" +PARAMETER stop {__EOS_TOKEN__} +''' + unsloth_eos_token = "eos_token" -CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False,) +CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False, unsloth_ollama,) pass # =========================================== Zephyr @@ -72,8 +85,23 @@ "{% if add_generation_prompt %}"\ "{{ '<|assistant|>\n' }}"\ "{% endif %}" +pass + +zephyr_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|system|> +{{ .System }}{__EOS_TOKEN__} +{{ end }}{{ if .Prompt }}<|user|> +{{ .Prompt }}{__EOS_TOKEN__} +{{ end }}<|assistant|> +{{ .Response }}{__EOS_TOKEN__} +""" +PARAMETER stop {__EOS_TOKEN__} +''' + zephyr_eos_token = "eos_token" -CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False,) +CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False, zephyr_ollama,) pass # =========================================== ChatML @@ -135,8 +163,17 @@ "{{ raise_exception('Only user and assistant roles are supported!') }}"\ "{% endif %}"\ "{% endfor %}" +pass + +mistral_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]""" +PARAMETER stop {__EOS_TOKEN__} +''' + mistral_eos_token = "eos_token" -CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False,) +CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False, mistral_ollama,) pass # =========================================== Llama-2 @@ -162,8 +199,19 @@ "{{ raise_exception('Only user and assistant roles are supported!') }}"\ "{% endif %}"\ "{% endfor %}" +pass + +llama_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """[INST] <>{{ .System }}<> + +{{ .Prompt }} [/INST]""" +PARAMETER stop {__EOS_TOKEN__} +''' + llama_eos_token = "eos_token" -CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False,) +CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False, llama_ollama,) pass # =========================================== Vicuna @@ -189,8 +237,17 @@ "{% if add_generation_prompt %}"\ "{{ 'ASSISTANT:' }}"\ "{% endif %}" +pass + +vicuna_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }} {__EOS_TOKEN__}""" +PARAMETER stop {__EOS_TOKEN__} +''' + vicuna_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False,) +CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False, vicuna_ollama,) pass # =========================================== Vicuna Old @@ -216,8 +273,20 @@ "{% if add_generation_prompt %}"\ "{{ '### Assistant:' }}"\ "{% endif %}" +pass + +vicuna_old_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} +{{ end }}{{ if .Prompt }}### Human: {{ .Prompt }} +{{ end }}### Assistant: {{ .Response }}{__EOS_TOKEN__} +""" +PARAMETER stop {__EOS_TOKEN__} +''' + vicuna_old_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False,) +CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False, 
vicuna_old_ollama,) pass # =========================================== Alpaca multi turn @@ -415,6 +484,7 @@ def get_chat_template( chat_template = "chatml", mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, map_eos_token = True, + system_message = None, ): assert(type(map_eos_token) is bool) old_tokenizer = tokenizer @@ -449,7 +519,7 @@ def get_chat_template( elif type(chat_template) is str: - chat_template, stop_word, yes_map_eos_token = CHAT_TEMPLATES[chat_template] + chat_template, stop_word, yes_map_eos_token, ollama_modelfile = CHAT_TEMPLATES[chat_template] # Check mapping to eos_token if not map_eos_token and yes_map_eos_token: map_eos_token = True @@ -614,6 +684,9 @@ def get_chat_template( # Patch saving functions tokenizer = patch_saving_functions(tokenizer) + # Add Ollama + tokenizer._ollama_modelfile = ollama_modelfile + tokenizer._system_message = system_message return tokenizer#, stopping_criteria pass @@ -627,6 +700,40 @@ def remove_special_tokens(tokenizer, prompt): pass +def create_ollama_modelfile(tokenizer, gguf_location): + + modelfile = getattr(tokenizer, "ollama_modelfile", None) + if modelfile is None: + raise RuntimeError( + "Unsloth: Tokenizer does not have a `ollama_modelfile` attribute.\n"\ + "Please use get_chat_template(...)." + ) + pass + + system_message = getattr(tokenizer, "_system_message", None) + if system_message is None: + __SYSTEM_MESSAGE__ = "" + else: + __SYSTEM_MESSAGE__ = f'SYSTEM """{system_message}"""' + pass + + modelfile = modelfile\ + .replace("{{", "⚫@✅#🦥")\ + .replace("}}", "⚡@🦥#⛵")\ + .format( + __FILE_LOCATION__ = gguf_location, + __SYSTEM_MESSAGE__ = __SYSTEM_MESSAGE__, + __EOS_TOKEN__ = tokenizer.eos_token, + )\ + .replace("⚫@✅#🦥", "{{")\ + .replace("⚡@🦥#⛵", "}}")\ + .rstrip() + pass + + return modelfile +pass + + def create_stopping_criteria(tokenizer, stop_word = "eos_token"): class StoppingCriteriaSub(StoppingCriteria): __slots__ = "stop_token", "single_match", "length", From 86804dc98c9c0c8860545691e429a9e8cdf8ba28 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 4 Jun 2024 03:37:08 +1000 Subject: [PATCH 083/153] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 2509b09e..b8713789 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -702,7 +702,7 @@ def remove_special_tokens(tokenizer, prompt): def create_ollama_modelfile(tokenizer, gguf_location): - modelfile = getattr(tokenizer, "ollama_modelfile", None) + modelfile = getattr(tokenizer, "_ollama_modelfile", None) if modelfile is None: raise RuntimeError( "Unsloth: Tokenizer does not have a `ollama_modelfile` attribute.\n"\ From 87fdd3a4a414d62b6e1f8e68a72b544207819bc1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 4 Jun 2024 03:43:34 +1000 Subject: [PATCH 084/153] Update chat_templates.py --- unsloth/chat_templates.py | 82 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index b8713789..fb9c929c 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -18,6 +18,7 @@ "test_hf_gguf_equivalence", "remove_special_tokens", "create_ollama_modelfile", + "standardize_dataset", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -700,8 +701,87 @@ def remove_special_tokens(tokenizer, prompt): pass -def create_ollama_modelfile(tokenizer, 
gguf_location): +def standardize_dataset( + dataset, + conversation_key = "conversations", + system_message = None, + aliases_for_system = ["system",], + aliases_for_user = ["user", "human", "input",], + aliases_for_assistant = ["gpt", "assistant", "output",], +): + """ + Standardizes ShareGPT and other formats to user/assistant Hugging Face format. + """ + import collections + import itertools + + convos = dataset[:10][conversation_key] + uniques = collections.defaultdict(list) + for convo in convos: + for message in convo: + for key, value in message.items(): + uniques[key].append(value) + pass + + # Must be only 2 entries + assert(len(uniques.keys()) == 2) + keys = list(uniques.keys()) + length_first = len(set(uniques[keys[0]])) + length_second = len(set(uniques[keys[1]])) + + if length_first < length_second: + # Role is assigned to the first element + role_key = keys[0] + content_key = keys[1] + else: + role_key = keys[1] + content_key = keys[0] + pass + + # Check roles are in aliases + all_aliases = set(aliases_for_system + aliases_for_user + aliases_for_assistant) + roles = set(uniques[role_key]) + leftover_aliases = (all_aliases | roles) - all_aliases + if len(leftover_aliases) != 0: + raise TypeError( + f"Unsloth: {list(leftover_aliases)} are not in aliases. Please update aliases." + ) + pass + + # Mapping for aliases + aliases_mapping = {} + for x in aliases_for_system: aliases_mapping[x] = "system" + for x in aliases_for_user: aliases_mapping[x] = "user" + for x in aliases_for_assistant: aliases_mapping[x] = "assistant" + + def _standardize_dataset(examples): + convos = examples[conversation_key] + all_convos = [] + for convo in convos: + new_convo = [] + if len(convo) == 0: continue + has_system = aliases_mapping[convo[0][role_key]] == "system" + if not has_system and system_message is not None: + new_convo.append({ "role" : "system", "content" : system_message, }) + for message in convo: + role = aliases_mapping[message[role_key]] + new_convo.append({ "role" : role, "content" : message[content_key], }) + pass + all_convos.append(new_convo) + pass + return { conversation_key : all_convos, } + pass + + return dataset.map(_standardize_dataset, batched = True,) +pass + + +def create_ollama_modelfile(tokenizer, gguf_location): + """ + Creates an Ollama Modelfile. 
+ Use ollama.create(model = "new_ollama_model", modelfile = modelfile) + """ modelfile = getattr(tokenizer, "_ollama_modelfile", None) if modelfile is None: raise RuntimeError( From 6386d9439e8eeef95f16382a4c8d43364db0f498 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 8 Jun 2024 04:18:25 +1000 Subject: [PATCH 085/153] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0e860b9d..dd7f6ba1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -209,8 +209,9 @@ def LlamaAttention_fast_forward_inference( # Attention if bsz == 1: + A *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 + # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) - A *= self.scalar # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) A = torch.matmul(A, Vnn, out = Qn) From b1a95516d7ed8f992272bcc9e73662110c15ea34 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 9 Jun 2024 20:18:01 +1000 Subject: [PATCH 086/153] Update chat_templates.py --- unsloth/chat_templates.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index fb9c929c..9c6a3a77 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -64,7 +64,8 @@ {{ end }}{{ if .Prompt }}>>> User: {{ .Prompt }} {{ end }}>>> Assistant: {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" +SYSTEM """You are a helpful assistant to the user""" ''' unsloth_eos_token = "eos_token" @@ -98,7 +99,7 @@ {{ end }}<|assistant|> {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' zephyr_eos_token = "eos_token" @@ -132,8 +133,8 @@ {{ end }}<|im_start|>assistant {{ .Response }}<|im_end|> """ -PARAMETER stop <|im_start|> -PARAMETER stop <|im_end|> +PARAMETER stop "<|im_start|>" +PARAMETER stop "<|im_end|>" ''' chatml_eos_token = "<|im_end|>" @@ -166,11 +167,12 @@ "{% endfor %}" pass +# Ollama from https://www.ollama.com/library/mistral mistral_ollama = \ ''' FROM {__FILE_LOCATION__} TEMPLATE """[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]""" -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' mistral_eos_token = "eos_token" @@ -202,13 +204,14 @@ "{% endfor %}" pass +# Ollama from https://www.ollama.com/library/llama3 llama_ollama = \ ''' FROM {__FILE_LOCATION__} TEMPLATE """[INST] <>{{ .System }}<> {{ .Prompt }} [/INST]""" -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' llama_eos_token = "eos_token" @@ -240,11 +243,12 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/vicuna vicuna_ollama = \ ''' FROM {__FILE_LOCATION__} TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }} {__EOS_TOKEN__}""" -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' vicuna_eos_token = "eos_token" @@ -283,7 +287,8 @@ {{ end }}{{ if .Prompt }}### Human: {{ .Prompt }} {{ end }}### Assistant: {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" 
+SYSTEM """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.""" ''' vicuna_old_eos_token = "eos_token" @@ -298,7 +303,7 @@ "{{ messages[0]['content'] + '\n\n' }}"\ "{% set loop_messages = messages[1:] %}"\ "{% else %}"\ - "{{ 'Below are some instructions that describes some tasks. Write responses that appropriately completes each request.\n\n' }}"\ + "{{ 'Below are some instructions that describe some tasks. Write responses that appropriately complete each request.\n\n' }}"\ "{% set loop_messages = messages %}"\ "{% endif %}"\ "{% for message in loop_messages %}"\ @@ -321,13 +326,14 @@ TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction: -{{ .Prompt }} +{{ .Prompt }}{{ end }} -{{ end }}### Response: +### Response: {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" +SYSTEM """Below are some instructions that describe some tasks. Write responses that appropriately complete each request.""" ''' alpaca_eos_token = "eos_token" @@ -358,6 +364,7 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/gemma gemma_ollama = \ ''' FROM {__FILE_LOCATION__} @@ -367,8 +374,8 @@ {{ .Response }} """ PARAMETER repeat_penalty 1 -PARAMETER stop -PARAMETER stop +PARAMETER stop "" +PARAMETER stop "" PARAMETER penalize_newline false ''' @@ -392,8 +399,8 @@ {{ .Response }}<|im_end|> """ PARAMETER repeat_penalty 1 -PARAMETER stop <|im_start|> -PARAMETER stop <|im_end|> +PARAMETER stop "<|im_start|>" +PARAMETER stop "<|im_end|>" PARAMETER penalize_newline false ''' @@ -422,6 +429,7 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/llama3 llama3_ollama = \ ''' FROM {__FILE_LOCATION__} @@ -435,7 +443,6 @@ PARAMETER stop "<|start_header_id|>" PARAMETER stop "<|end_header_id|>" PARAMETER stop "<|eot_id|>" -PARAMETER stop "<|reserved_special_token" ''' llama3_template_eos_token = "eos_token" @@ -460,6 +467,7 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/phi3 phi3_ollama = \ ''' FROM {__FILE_LOCATION__} @@ -470,9 +478,9 @@ {{ end }}<|assistant|> {{ .Response }}<|end|> """ -PARAMETER stop <|end|> -PARAMETER stop <|user|> -PARAMETER stop <|assistant|> +PARAMETER stop "<|end|>" +PARAMETER stop "<|user|>" +PARAMETER stop "<|assistant|>" ''' phi3_template_eos_token = "<|end|>" From 344a05d467eadd6e35f83b6a67d21d6b8cbd8475 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 9 Jun 2024 20:34:19 +1000 Subject: [PATCH 087/153] Support bfloat16 GGUF --- unsloth/save.py | 97 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 20 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 1a3e532f..6cef5b6d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -59,7 +59,8 @@ "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.", "quantized" : "Recommended. Slow conversion. Fast inference, small files.", "f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.", - "f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.", + "bf16" : "Bfloat16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.", + "f16" : "Float16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.", "q8_0" : "Fast conversion. High resource use, but generally acceptable.", "q4_k_m" : "Recommended. 
Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K", "q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K", @@ -102,7 +103,7 @@ def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencep if os.path.isfile(f"{file_location}/tokenizer.model"): sentencepiece_model = True pass - shutil.rmtree(file_location) + shutil.rmtree(file_location, ignore_errors = True) return sentencepiece_model pass @@ -700,7 +701,7 @@ def unsloth_save_model( # Remove temporary location import shutil - shutil.rmtree(temporary_location) + shutil.rmtree(temporary_location, ignore_errors = True) for _ in range(3): torch.cuda.empty_cache() @@ -763,7 +764,7 @@ def install_llama_cpp_old(version = -10): print(f"**[WARNING]** Deleting llama.cpp directory... {10-i} seconds left.") time.sleep(1) import shutil - shutil.rmtree("llama.cpp") + shutil.rmtree("llama.cpp", ignore_errors = True) pass # Clone a specific commit @@ -866,10 +867,11 @@ def _fix_gemma_gguf(): def save_to_gguf( model_type : str, + model_dtype : str, is_sentencepiece : bool = False, model_directory : str = "unsloth_finetuned_model", quantization_method : str = "fast_quantized", - first_conversion : str = "f16", + first_conversion : str = None, _run_installer = None, # Non blocking install of llama.cpp ): # logger.warning( @@ -877,6 +879,22 @@ def save_to_gguf( # "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ # "Please be patient - GGUF saving should still work, but might not work as well." # ) + assert(model_dtype == "float16" or model_dtype == "bfloat16") + model_dtype = "f16" if model_dtype == "float16" else "bf16" + + # Check if bfloat16 is supported + if model_dtype == "bf16" and not torch.cuda.is_bf16_supported(): + logger.warning( + "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\ + "We shall switch instead to f16." + ) + model_dtype = "f16" + pass + + # Check first_conversion as well + if first_conversion is None: + first_conversion = model_dtype + pass if quantization_method.startswith("iq2"): raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") @@ -889,7 +907,7 @@ def save_to_gguf( pass logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.") - if quantization_method == "not_quantized": quantization_method = "f16" + if quantization_method == "not_quantized": quantization_method = model_dtype elif quantization_method == "fast_quantized": quantization_method = "q8_0" elif quantization_method == "quantized": quantization_method = "q4_k_m" elif quantization_method is None: quantization_method = "q8_0" @@ -911,12 +929,13 @@ def save_to_gguf( print(print_info) # Check first_conversion format - if first_conversion == "f16" : pass - elif first_conversion == "f32" : pass - elif first_conversion == "q8_0": pass + if first_conversion == "f16" : pass + if first_conversion == "bf16" : pass + elif first_conversion == "f32" : pass + elif first_conversion == "q8_0" : pass else: raise RuntimeError( - f"Unsloth: `first_conversion` can only be one of ['f16', 'f32', 'q8_0'] and not `{first_conversion}`." + f"Unsloth: `first_conversion` can only be one of ['f16', 'bf16', 'f32', 'q8_0'] and not `{first_conversion}`." 
) pass @@ -935,11 +954,13 @@ def save_to_gguf( if quantization_method == "f32": first_conversion = "f32" elif quantization_method == "f16": first_conversion = "f16" + elif quantization_method == "bf16": first_conversion = "bf16" elif quantization_method == "q8_0": first_conversion = "q8_0" else: # Quantized models must have f16 as the default argument - if first_conversion == "f32" : pass - elif first_conversion == "f16" : pass + if first_conversion == "f32" : pass + elif first_conversion == "f16" : pass + elif first_conversion == "bf16" : pass elif first_conversion == "q8_0": logger.warning_once( "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\ @@ -950,8 +971,22 @@ def save_to_gguf( pass # Non llama/mistral needs can only use f32 or f16 - if not use_fast_convert and (first_conversion != "f16" or first_conversion != "f32"): - logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.") + if not use_fast_convert and \ + (first_conversion != "f16" or first_conversion != "bf16" or first_conversion != "f32"): + + pass + # Latest llama.cpp works for all models for q8_0! + + # logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.") + # first_conversion = "f16" + pass + + # Check if bfloat16 is supported + if first_conversion == "bf16" and not torch.cuda.is_bf16_supported(): + logger.warning( + "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\ + "We shall switch instead to f16." + ) first_conversion = "f16" pass @@ -1318,7 +1353,7 @@ def unsloth_save_pretrained_gguf( save_directory : Union[str, os.PathLike], tokenizer = None, quantization_method : str = "fast_quantized", - first_conversion : str = "f16", + first_conversion : str = None, push_to_hub : bool = False, token : Optional[Union[str, bool]] = None, private : Optional[bool] = None, @@ -1429,11 +1464,22 @@ def unsloth_save_pretrained_gguf( for _ in range(3): gc.collect() - model_type = self.config.model_type + model_dtype = self.config.torch_dtype + model_type = self.config.model_type + if type(model_dtype) is str: + assert(model_dtype == "float16" or model_dtype == "bfloat16") + elif model_dtype == torch.float16: + model_dtype = "float16" + elif model_dtype == torch.bfloat16: + model_dtype = "bfloat16" + else: + raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16") + pass + is_sentencepiece_model = check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, is_sentencepiece_model, + file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) @@ -1463,7 +1509,7 @@ def unsloth_push_to_hub_gguf( repo_id : str, tokenizer = None, quantization_method : str = "fast_quantized", - first_conversion : str = "f16", + first_conversion : str = None, use_temp_dir : Optional[bool] = None, commit_message : Optional[str] = "Trained with Unsloth", private : Optional[bool] = None, @@ -1569,11 +1615,22 @@ def unsloth_push_to_hub_gguf( for _ in range(3): gc.collect() - model_type = self.config.model_type + model_dtype = self.config.torch_dtype + model_type = self.config.model_type + if type(model_dtype) is str: + assert(model_dtype == "float16" or model_dtype == "bfloat16") + elif model_dtype == torch.float16: + model_dtype = "float16" + elif model_dtype == torch.bfloat16: + model_dtype = "bfloat16" + else: + raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16") + pass + is_sentencepiece_model = 
check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, is_sentencepiece_model, + file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) From 6b11e0d6f443c14831490c0a67fb7d1491e8fad0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:20:22 +1000 Subject: [PATCH 088/153] Update save.py --- unsloth/save.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/save.py b/unsloth/save.py index 6cef5b6d..3ad2f346 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1010,6 +1010,7 @@ def save_to_gguf( vocab_type = "bpe" pass + # convert.py is deprecated! use_fast_convert = False if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ From c6e4b5ba33dce8e048465bf9623a3c333ef1a814 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:33:12 +1000 Subject: [PATCH 089/153] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index dd7f6ba1..0fa505fc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -209,7 +209,7 @@ def LlamaAttention_fast_forward_inference( # Attention if bsz == 1: - A *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 + Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched From 57f29ab0c6fe24e6df94bf3ee608538ea2895b2b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:40:31 +1000 Subject: [PATCH 090/153] fast_forward_inference --- unsloth/models/llama.py | 2 +- unsloth/models/qwen2.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0fa505fc..8f48cd12 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -792,7 +792,7 @@ def _CausalLM_fast_forward( *args, **kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: - if past_key_values is not None and self.config.model_type != "qwen2": + if past_key_values is not None: outputs = fast_forward_inference( self, input_ids, diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 76fe31a6..115bf3e0 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -13,7 +13,6 @@ # limitations under the License. 
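# Qwen2 follows the Llama decoder layout closely enough that this wrapper now delegates
# straight to FastLlamaModel.from_pretrained below, so no Mistral-specific path is kept.
#
# A minimal usage sketch (assumed top-level entry point, mirroring the other wrappers):
#
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name     = "Qwen/Qwen2-7B",
#       max_seq_length = 4096,
#       load_in_4bit   = True,
#   )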
from .llama import * -from .mistral import FastMistralModel import os from ._utils import __version__ @@ -60,7 +59,7 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "Qwen/Qwen1.5-7B", + model_name = "Qwen/Qwen2-7B", max_seq_length = 4096, dtype = None, load_in_4bit = True, @@ -73,7 +72,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( + return FastLlamaModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From d32e97264e4ecdebcc6126bc70b6c9bbe9406a26 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:46:54 +1000 Subject: [PATCH 091/153] Update mapper.py --- unsloth/models/mapper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 8808b855..73aa06ca 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -197,4 +197,12 @@ for value in values: FLOAT_TO_INT_MAPPER[value] = key pass + + # Get lowercased + lowered_key = key.lower() + INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower() + + for value in values: + FLOAT_TO_INT_MAPPER[value.lower()] = lowered_key + pass pass From e121fa5df2e15376a77505c8df6074186cfc694e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:47:33 +1000 Subject: [PATCH 092/153] Update loader.py --- unsloth/models/loader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b2f0e4ef..3bc091b3 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -33,6 +33,9 @@ def _get_model_name(model_name, load_in_4bit = True): + # First try replacing lowercase 'b' with uppercase 'B' + model_name = model_name.lower() + if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name] logger.warning_once( From 5eaa10f3a279eb6acf8ffb348935ce6e9b032ba8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:53:30 +1000 Subject: [PATCH 093/153] Update llama.py --- unsloth/models/llama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8f48cd12..6064af59 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1831,10 +1831,10 @@ def patch_peft_model( @staticmethod def for_inference(model): - if model.config.model_type == "qwen2": - FastLlamaModel.for_training(model) - return - pass + # if model.config.model_type == "qwen2": + # FastLlamaModel.for_training(model) + # return + # pass internal_model = model internal_model.gradient_checkpointing = False From f57d28d1460ca5c71b48c99438f5c96197db3c3b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 11 Jun 2024 03:56:52 +1000 Subject: [PATCH 094/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 6afea680..0e286166 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -193,17 +193,38 @@ def assert_same_tokenization(slow_tokenizer, fast_tokenizer): if x.endswith("_token") and x.count("_") == 1 ))) all_special_tokens = list(set(special_tokens + slow_tokenizer.all_special_tokens)) + + # Check if chat template is enabled! 
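    # If both tokenizers carry a chat template, the same templated conversation must also
    # tokenize to identical input_ids with each of them; the result is AND-ed with the
    # special-token string check further down before the tokenizers are deemed equivalent.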
+ check_chat_template = True + + if getattr(slow_tokenizer, "chat_template", None) is not None and \ + getattr(fast_tokenizer, "chat_template", None) is not None: + + # Check chat template! + messages = [ + {"role": "user", "content": " What is 2+2? "}, + {"role": "assistant", "content": " It's 4. "}, + ] + check_chat_template = \ + slow_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids == \ + fast_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids + pass + try: string = "\n".join(all_special_tokens) + \ "A quick brown fox jumps over the lazy dog!!\n\nHi\n\n" + \ "".join(all_special_tokens) - return slow_tokenizer(string).input_ids == fast_tokenizer(string).input_ids + check_special_tokens = \ + slow_tokenizer(string).input_ids == \ + fast_tokenizer(string).input_ids + + return check_chat_template and check_special_tokens except: # For eg see https://github.com/unslothai/unsloth/issues/292 # Sometimes tokenizer has weird tokens, causing a combined tokenization to fail. # [TODO] We temporarily disable this for CodeLlama tokenizers if slow_tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING: - return True + return check_chat_template else: return False pass From 893750707f0be1bdb6b04689dd94d02187ece7cf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 11 Jun 2024 19:53:29 +1000 Subject: [PATCH 095/153] info --- unsloth/models/llama.py | 26 ++++++++++----------- unsloth/models/mistral.py | 48 ++++++++++++++++----------------------- 2 files changed, 32 insertions(+), 42 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6064af59..e6f9e756 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1207,12 +1207,12 @@ def from_pretrained( debug_info = """n_total_devices = total_train_batch_size // \\ args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) debug_info =""" debug_info = debug_info.split('\n') @@ -1237,17 +1237,17 @@ def from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) - divisor = n_total_devices / 2 + divisor = n_total_devices / 1 bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 2: - divisor = n_total_devices / 2 + if total_batches // ga // bsz > 1: + divisor = n_total_devices / 1 ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" check_batches = check_batches.split('\n') check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 365d60a3..5c49c636 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -393,21 +393,6 @@ def from_pretrained( layer.self_attn.apply_o = original_apply_o pass - # Patch Trainer - from transformers.trainer import Trainer - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - try: - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - except: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - pass - pass - # Patch Trainer from transformers.trainer import Trainer try: @@ -419,7 +404,7 @@ def from_pretrained( except: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) @@ -447,7 +432,11 @@ def from_pretrained( f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning_once(debug_info)""" + logger.warning(debug_info) + import gc + for _ in range(3): + gc.collect() + torch.cuda.empty_cache()""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -455,12 +444,12 @@ def from_pretrained( debug_info = """n_total_devices = total_train_batch_size // \\ args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) debug_info =""" debug_info = debug_info.split('\n') @@ -485,16 +474,17 @@ def from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Please consider a commercial license - Unsloth was designed for the GPU Poor.\\n" - "The OSS currently works on 4 GPUs - we're a 2 person team, so please help fund\\n" - "our development costs by supporting us through Ko-fi or buying a license! Thanks!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) - divisor = n_total_devices / 2 + divisor = n_total_devices / 1 bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 2: - divisor = n_total_devices / 2 + if total_batches // ga // bsz > 1: + divisor = n_total_devices / 1 ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" check_batches = check_batches.split('\n') check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) From 8982edb6eac2bd2d4facb92a97c3e5add6216348 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 11 Jun 2024 20:21:28 +1000 Subject: [PATCH 096/153] edits --- unsloth/models/llama.py | 8 +++++++- unsloth/models/mistral.py | 8 +++++++- unsloth/tokenizer_utils.py | 14 +++++++++----- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index e6f9e756..4cbbcf0a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1196,7 +1196,13 @@ def from_pretrained( f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' logger.warning(debug_info) - import gc + import subprocess, re, gc + output = subprocess.check_output( + 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + if output > 1: raise RuntimeError( + 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') for _ in range(3): gc.collect() torch.cuda.empty_cache()""" diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 5c49c636..fc2e1a9f 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -433,7 +433,13 @@ def from_pretrained( f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' logger.warning(debug_info) - import gc + import subprocess, re, gc + output = subprocess.check_output( + 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + if output > 1: raise RuntimeError( + 'Error: More than 1 GPUs have a lot of VRAM usage. 
Please obtain a commercial license.') for _ in range(3): gc.collect() torch.cuda.empty_cache()""" diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0e286166..9ffd1d71 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -595,6 +595,8 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # Get set and actual tokens where_untrained = where_untrained.tolist() if len(where_untrained) == 0: return + + # Remove untrained indices where it's longer where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) @@ -875,11 +877,13 @@ def patch_sft_trainer_tokenizer(): " )\n"\ "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ - "more_than = 0\n"\ - "for j in range(n_devices):\n"\ - " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ - " more_than += (vram > 4)\n"\ - "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')\n"\ + "import subprocess, re\n"\ + "output = subprocess.check_output(\n"\ + " 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\ + "output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output)\n"\ + "output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output)\n"\ + "if output > 1: raise RuntimeError(\n"\ + " 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.')\n"\ "for _ in range(3):\n"\ " gc.collect()\n"\ " torch.cuda.empty_cache()\n"\ From 8904605967995db57cc133d0cb924c53cc006afd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 12 Jun 2024 06:30:12 +1000 Subject: [PATCH 097/153] Create chat template --- unsloth/chat_templates.py | 291 +++++++++++++++++++++++++++++++++++++- 1 file changed, 290 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 9c6a3a77..70391cc2 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -17,8 +17,11 @@ "test_chat_templates", "test_hf_gguf_equivalence", "remove_special_tokens", - "create_ollama_modelfile", "standardize_dataset", + + "construct_chat_template", + "test_construct_chat_template", + "create_ollama_modelfile", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -785,6 +788,292 @@ def _standardize_dataset(examples): pass +import re + +def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []): + added_tokens_decoder = tokenizer.added_tokens_decoder.values() + added_tokens_decoder = [str(x) for x in added_tokens_decoder] + + # Remove added_tokens_decoder duplicates + added_tokens_decoder = list(set(added_tokens_decoder) - set(extra_eos_tokens)) + + # Remove BOS + if getattr(tokenizer, "bos_token", None) is not None: + added_tokens_decoder = [x for x in added_tokens_decoder if x != tokenizer.bos_token] + pass + + repeatted_tokens = [] + # Join all vocab + joined_text = "\x01\x00".join(added_tokens_decoder) + for token in added_tokens_decoder: + n = len(token) + repeatted_counts = joined_text.count(token[:n//2]) + # Try finding longer than 1/2 of the token in the rest + # For eg <|reserved_special_token_0|>, <|reserved_special_token_1|> + if repeatted_counts > 2: + for j in range(n//2+1, n): + if joined_text.count(token[:j]) < repeatted_counts: + j -= 1 + # Remove repeatted tokens to reduce search space + joined_text = joined_text.replace(token[:j], "") + repeatted_tokens.append(token[:j]) + break + pass + pass + pass + + # Remove duplicates + splitted = joined_text.split("\x01\x00") + final_eos_tokens = [] + 
for old, new in zip(added_tokens_decoder, splitted): + if old == new: final_eos_tokens.append(old) + pass + final_eos_tokens += extra_eos_tokens + final_eos_tokens += repeatted_tokens + return final_eos_tokens +pass + + +def construct_chat_template( \ + +tokenizer = None, + +template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{OUTPUT}<|eot_id|>""", + +default_system_message = \ + "Below are some instructions that describe some tasks. Write responses that appropriately complete each request.", + +extra_eos_tokens = None, + +): + """ + Creates a Ollama modelfile and a HF Jinja template from a custom + template. You must provide 2x examples of an input & output. + There is an optional system message as well. + + You must use {INPUT}, {OUTPUT} twice, and {SYSTEM} is optional. + """ + assert(tokenizer is not None) + + if extra_eos_tokens is None: extra_eos_tokens = [] + + vocab = tokenizer.get_vocab() + for extra_eos in extra_eos_tokens: + assert(type(extra_eos) is str) + if extra_eos not in vocab: + raise ValueError(f"Unsloth: `{extra_eos}` is not a singular token in the tokenizer.") + pass + pass + + # O(N^2) search finding 2 repeatted pieces of text + j = len(template)-1 + at_least_one = False + while j > 0: + found = template.rfind(template[j:], 0, j) + if found == -1: break + j -= 1 + at_least_one = True + pass + if j > 0: j += 1 + else: raise + + if not at_least_one: raise + + # Repeatted text + instruction_response = template[j:] + if instruction_response.count("{INPUT}") != 1 or instruction_response.count("{OUTPUT}") != 1: + raise RuntimeError( + "Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\ + "and the assistant output {OUTPUT}\n\n"\ + "For example what is not allowed is just:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\ + "What is required is 2x of this:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n" + ) + pass + + # 1st System, Instruction, Output pair + left = template[:j] + # 2nd Instruction, Output pair + right = template[j:] + + # Isolate input + extra_eos_tokens_regex = "|".join(f"(?:{re.escape(x)})" for x in extra_eos_tokens) + if len(extra_eos_tokens_regex) != 0: + find_end = f"(?:{extra_eos_tokens_regex})?" + else: + find_end = "" + find_end = r"\{INPUT\}[\s\n]{0,}" + find_end + input_end = list(re.finditer(find_end, right)) + assert(len(input_end) == 1) + input_end = input_end[0] + input_end = input_end.span(0)[1] + input_part = right[:input_end] + + # Isolate output + output_part = right[input_end:] + + # Isolate system + system_part = left[:left.find(input_part)] + + # Check if the user provided a correct prompt + combined = system_part + input_part + output_part + if combined != left: + combined_changed = combined.replace('\n', '\\n') + left_changed = left .replace('\n', '\\n') + raise RuntimeError( + "Unsloth: The prompt template you provided isn't correct. 
You gave:\n"\ + f"{combined_changed}\n\n"\ + "But we require the following:\n"\ + f"{left_changed}" + ) + pass + + # Ollama modelfile parts + + # Check bos_token is in system prompt + ollama_system = system_part + has_bos_token = False + if tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None): + if ollama_system.startswith(tokenizer.bos_token): + has_bos_token = True + ollama_system = ollama_system[len(tokenizer.bos_token):] + pass + pass + system_modelfile = "{{ if .System }}" + ollama_system.replace("{SYSTEM}", "{{ .System }}") + "{{ end }}" + input_modelfile = "{{ if .Prompt }}" + input_part .replace("{INPUT}", "{{ .Prompt }}") + "{{ end }}" + output_modelfile = output_part.replace("{OUTPUT}", "{{ .Response }}") + + # Check if EOS token is at the end of the output + if not output_modelfile.endswith(tuple(extra_eos_tokens)): + output_modelfile += "{__EOS_TOKEN__}" + pass + + # Ollama EOS + ollama_eos = get_ollama_eos_tokens(tokenizer, extra_eos_tokens) + ollama_eos = '\n'.join(f'PARAMETER stop "{eos}"' for eos in ollama_eos) + + # Ollama modelfile + modelfile = 'FROM {__FILE_LOCATION__}\n\n'\ + 'TEMPLATE """' + system_modelfile + input_modelfile + output_modelfile + \ + '"""\n\n' + ollama_eos + + # HF Jinja Chat template + def process(part, which, content = "message['content']"): + if part.endswith(which): + part = "'" + part[:part.find(which)] + f"' + {content}" + elif part.startswith(which): + part = f"{content} + '" + part[part.find(which):] + "'" + else: + part = "'" + part.replace(which, f"' + {content} + '") + "'" + if part.startswith("'' + "): part = part[5:] + return part + pass + input_jinja = process(input_part, "{INPUT}") + output_jinja = process(output_part, "{OUTPUT}") + pass + + jinja_template = \ + "{% for message in loop_messages %}"\ + "{% if message['role'] == 'user' %}"\ + "{{ " + input_jinja + " }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ " + output_jinja + " }}"\ + "{% else %}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ + "{% endif %}"\ + "{% endfor %}"\ + "{% if add_generation_prompt %}"\ + "{{ '" + output_part[:output_part.find("{OUTPUT}")] + "' }}"\ + "{% endif %}" + pass + + # Now add system prompt to jinja + if len(system_part) != 0: + partial_system = process(system_part, "{SYSTEM}", "messages[0]['content']") + partial_system = partial_system.replace("{SYSTEM}", "") + + # Separate the BOS + if has_bos_token: + partial_system = partial_system.replace(tokenizer.bos_token, "", 1) + pass + + partial_system = \ + "{% if messages[0]['role'] == 'system' %}"\ + "{{ " + partial_system + " }}"\ + "{% set loop_messages = messages[1:] %}" + if default_system_message is not None: + partial_system += "{% else %}"\ + "{{ '" + system_part.replace("{SYSTEM}", default_system_message) + "' }}"\ + "{% set loop_messages = messages %}"\ + "{% endif %}" + else: + partial_system += "{% endif %}" + pass + + jinja_template = partial_system + jinja_template + + if has_bos_token: + jinja_template = "{{ bos_token }}" + jinja_template + pass + + return modelfile, jinja_template +pass + + +def test_construct_chat_template(): + token = "hf_" + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token = token) + + template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + 
+{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{OUTPUT}<|eot_id|>""" + + default_system_message = \ + "Below are some instructions that describe some tasks. Write responses that appropriately complete each request." + + extra_eos_tokens = None + + modelfile, jinja_template = construct_chat_template(template, default_system_message, extra_eos_tokens) + + messages = [ + {"role": "system", "content": "You are an assistant"}, + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "It's 4."}, + {"role": "user", "content": "Ok!"}, + {"role": "assistant", "content": "Anything else?"}, + {"role": "user", "content": "What's 2x2?"}, + ] + correct_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + + tokenizer.chat_template = jinja_template + new_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + + assert(correct_output == new_output) + pass +pass + + def create_ollama_modelfile(tokenizer, gguf_location): """ Creates an Ollama Modelfile. From 2a374c23683a4b39910b23559eb99a9dd0af7e2d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 03:25:36 +1000 Subject: [PATCH 098/153] Fix tokenizer --- unsloth/chat_templates.py | 26 +++--- unsloth/tokenizer_utils.py | 166 ++++++++++++++++++++++++++++++++----- 2 files changed, 159 insertions(+), 33 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 70391cc2..4c782326 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -788,8 +788,6 @@ def _standardize_dataset(examples): pass -import re - def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []): added_tokens_decoder = tokenizer.added_tokens_decoder.values() added_tokens_decoder = [str(x) for x in added_tokens_decoder] @@ -875,6 +873,15 @@ def construct_chat_template( \ pass pass + error_msg = \ + "Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\ + "and the assistant output {OUTPUT}\n\n"\ + "For example what is not allowed is just:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\ + "What is required is 2x of this:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n" + # O(N^2) search finding 2 repeatted pieces of text j = len(template)-1 at_least_one = False @@ -885,22 +892,15 @@ def construct_chat_template( \ at_least_one = True pass if j > 0: j += 1 - else: raise + else: raise RuntimeError(error_msg) + - if not at_least_one: raise + if not at_least_one: raise RuntimeError(error_msg) # Repeatted text instruction_response = template[j:] if instruction_response.count("{INPUT}") != 1 or instruction_response.count("{OUTPUT}") != 1: - raise RuntimeError( - "Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\ - "and the assistant output {OUTPUT}\n\n"\ - "For example what is not allowed is just:\n"\ - "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\ - "What is required is 2x of this:\n"\ - "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\ - "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n" - ) + raise RuntimeError(error_msg) pass # 1st System, Instruction, Output pair diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 9ffd1d71..f10b2c0a 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -185,6 +185,111 @@ def 
convert_to_fast_tokenizer( pass +# Check Mistral chat template without BOS / EOS +mistral_template = \ + "{% if messages[0]['role'] == 'system' %}"\ + "{% if messages[1]['role'] == 'user' %}"\ + "{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[2:] %}"\ + "{% else %}"\ + "{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[1:] %}"\ + "{% endif %}"\ + "{% else %}"\ + "{% set loop_messages = messages %}"\ + "{% endif %}"\ + "{% for message in loop_messages %}"\ + "{% if message['role'] == 'user' %}"\ + "{{ '[INST] ' + message['content'] + ' [/INST]' }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ message['content'] }}"\ + "{% else %}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ + "{% endif %}"\ + "{% endfor %}" +pass + +# Check Llama chat template without BOS / EOS +llama_template = \ + "{% if messages[0]['role'] == 'system' %}"\ + "{% if messages[1]['role'] == 'user' %}"\ + "{{ '[INST] <>\n' + messages[0]['content'] + '\n<>\n\n' + messages[1]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[2:] %}"\ + "{% else %}"\ + "{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[1:] %}"\ + "{% endif %}"\ + "{% else %}"\ + "{% set loop_messages = messages %}"\ + "{% endif %}"\ + "{% for message in loop_messages %}"\ + "{% if message['role'] == 'user' %}"\ + "{{ '[INST] ' + message['content'].strip() + ' [/INST]' }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ ' ' + message['content'].strip() + ' ' }}"\ + "{% else %}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ + "{% endif %}"\ + "{% endfor %}" +pass + + +def select_correct_slow_tokenizer( + tokenizer_name, + model_max_length = None, + padding_side = "right", + token = None, + trust_remote_code = False, + cache_dir = "huggingface_tokenizers_cache", +): + """ + Returns 'correct' tokenizer by checking if the chat templates are + actually tokenized correctly. + """ + messages = [ + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "It's 4."}, + ] + + settings = ( + (False, False, True,), + (False, True, True,), + (True, False, True,), + (True, False, False,), + ) + + for (use_fast, legacy, from_slow,) in settings: + # Default as mentioned by Arthur from HF: + slow_tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, + trust_remote_code = trust_remote_code, + # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 + use_fast = use_fast, + legacy = legacy, + from_slow = from_slow, + cache_dir = cache_dir, + ) + slow_tokenizer_chat_template = slow_tokenizer.chat_template + + slow_tokenizer.chat_template = llama_template + result1 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) + slow_tokenizer.chat_template = mistral_template + result2 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) + + # If 2 spaces seen, normally wrong! 
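The `mistral_template` and `llama_template` strings above are plain Jinja, so their output can be previewed without loading any tokenizer. A small illustration using jinja2 directly, with a simplified user/assistant-only template (the system-message branch is dropped for brevity; this is not the exact template from the patch):

```python
from jinja2 import Template

# Simplified user/assistant-only version of the Mistral-style template above.
simple_mistral = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "{{ '[INST] ' + message['content'] + ' [/INST]' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ message['content'] }}"
    "{% endif %}"
    "{% endfor %}"
)

messages = [
    {"role": "user", "content": "What is 2+2?"},
    {"role": "assistant", "content": "It's 4."},
]
print(Template(simple_mistral).render(messages = messages))
# [INST] What is 2+2? [/INST]It's 4.
```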
+ if " "*2 not in result1 and " "*2 not in result2: + slow_tokenizer.chat_template = slow_tokenizer_chat_template + return slow_tokenizer + pass + pass + # Return fast version as default + return slow_tokenizer +pass + + def assert_same_tokenization(slow_tokenizer, fast_tokenizer): # Get eos_token, bos_token etc dir_names = dir(slow_tokenizer) @@ -195,21 +300,44 @@ def assert_same_tokenization(slow_tokenizer, fast_tokenizer): all_special_tokens = list(set(special_tokens + slow_tokenizer.all_special_tokens)) # Check if chat template is enabled! - check_chat_template = True + check_chat_template1 = True + check_chat_template2 = True + check_chat_template3 = True + slow_chat_template = getattr(slow_tokenizer, "chat_template", None) + fast_chat_template = getattr(fast_tokenizer, "chat_template", None) + messages = [ + {"role": "user", "content": " What is 2+2? "}, + {"role": "assistant", "content": " It's 4. "}, + ] + # Check the tokenizer's own chat template + if slow_chat_template is not None and fast_chat_template is not None: + check_chat_template1 = \ + slow_tokenizer.apply_chat_template(messages) == \ + fast_tokenizer.apply_chat_template(messages) + pass - if getattr(slow_tokenizer, "chat_template", None) is not None and \ - getattr(fast_tokenizer, "chat_template", None) is not None: + # Check Mistral chat template without BOS / EOS + slow_tokenizer.chat_template = mistral_template + fast_tokenizer.chat_template = mistral_template + check_chat_template2 = \ + slow_tokenizer.apply_chat_template(messages) == \ + fast_tokenizer.apply_chat_template(messages) + pass - # Check chat template! - messages = [ - {"role": "user", "content": " What is 2+2? "}, - {"role": "assistant", "content": " It's 4. "}, - ] - check_chat_template = \ - slow_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids == \ - fast_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids + # Check Llama chat template without BOS / EOS + slow_tokenizer.chat_template = llama_template + fast_tokenizer.chat_template = llama_template + check_chat_template3 = \ + slow_tokenizer.apply_chat_template(messages) == \ + fast_tokenizer.apply_chat_template(messages) pass + # Combine them all and revert chat templates + check_chat_template = check_chat_template1 and check_chat_template2 and check_chat_template3 + slow_tokenizer.chat_template = slow_chat_template + fast_tokenizer.chat_template = fast_chat_template + + # Try special tokens try: string = "\n".join(all_special_tokens) + \ "A quick brown fox jumps over the lazy dog!!\n\nHi\n\n" + \ @@ -227,6 +355,7 @@ def assert_same_tokenization(slow_tokenizer, fast_tokenizer): return check_chat_template else: return False + pass pass @@ -379,17 +508,13 @@ def load_correct_tokenizer( # Mainly to solve Deepseek models with no tokenizer.model file slow_tokenizer = None try: - slow_tokenizer = AutoTokenizer.from_pretrained( + slow_tokenizer = select_correct_slow_tokenizer( tokenizer_name, - model_max_length = model_max_length, - padding_side = padding_side, - token = token, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, trust_remote_code = trust_remote_code, - # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 - use_fast = False, - legacy = False, - from_slow = True, - cache_dir = cache_dir, + cache_dir = cache_dir, ) except: pass @@ -418,6 +543,7 @@ def load_correct_tokenizer( if assert_same_tokenization(slow_tokenizer, fast_tokenizer): return fast_tokenizer else: + 
logger.warning(f"Unsloth: Will load {tokenizer_name} as a legacy tokenizer.") return convert_to_fast_tokenizer(slow_tokenizer) pass else: From 8176155d0099d60a3e084a8a04934b674a89c169 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 16:48:50 +1000 Subject: [PATCH 099/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 70 +++++--------------------------------- 1 file changed, 9 insertions(+), 61 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f10b2c0a..df88170e 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -232,62 +232,6 @@ def convert_to_fast_tokenizer( "{% endif %}"\ "{% endfor %}" pass - - -def select_correct_slow_tokenizer( - tokenizer_name, - model_max_length = None, - padding_side = "right", - token = None, - trust_remote_code = False, - cache_dir = "huggingface_tokenizers_cache", -): - """ - Returns 'correct' tokenizer by checking if the chat templates are - actually tokenized correctly. - """ - messages = [ - {"role": "user", "content": "What is 2+2?"}, - {"role": "assistant", "content": "It's 4."}, - ] - - settings = ( - (False, False, True,), - (False, True, True,), - (True, False, True,), - (True, False, False,), - ) - - for (use_fast, legacy, from_slow,) in settings: - # Default as mentioned by Arthur from HF: - slow_tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name, - model_max_length = model_max_length, - padding_side = padding_side, - token = token, - trust_remote_code = trust_remote_code, - # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 - use_fast = use_fast, - legacy = legacy, - from_slow = from_slow, - cache_dir = cache_dir, - ) - slow_tokenizer_chat_template = slow_tokenizer.chat_template - - slow_tokenizer.chat_template = llama_template - result1 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) - slow_tokenizer.chat_template = mistral_template - result2 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) - - # If 2 spaces seen, normally wrong! 
- if " "*2 not in result1 and " "*2 not in result2: - slow_tokenizer.chat_template = slow_tokenizer_chat_template - return slow_tokenizer - pass - pass - # Return fast version as default - return slow_tokenizer -pass def assert_same_tokenization(slow_tokenizer, fast_tokenizer): @@ -508,13 +452,17 @@ def load_correct_tokenizer( # Mainly to solve Deepseek models with no tokenizer.model file slow_tokenizer = None try: - slow_tokenizer = select_correct_slow_tokenizer( + slow_tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, - model_max_length = model_max_length, - padding_side = padding_side, - token = token, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, trust_remote_code = trust_remote_code, - cache_dir = cache_dir, + # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 + use_fast = False, + legacy = False, + from_slow = True, + cache_dir = cache_dir, ) except: pass From 21a99f1d0c1b1866ae35353669d5b64c80d3804b Mon Sep 17 00:00:00 2001 From: Eliot Hall <60240707+chrehall68@users.noreply.github.com> Date: Wed, 12 Jun 2024 23:52:28 -0700 Subject: [PATCH 100/153] fix case where gguf saving fails due to first_conversion dtype (#630) --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index 3ad2f346..c521799f 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -930,7 +930,7 @@ def save_to_gguf( # Check first_conversion format if first_conversion == "f16" : pass - if first_conversion == "bf16" : pass + elif first_conversion == "bf16" : pass elif first_conversion == "f32" : pass elif first_conversion == "q8_0" : pass else: From dbf2dcff1a9297040ddfa039a581ef6b630fda37 Mon Sep 17 00:00:00 2001 From: Eliot Hall <60240707+chrehall68@users.noreply.github.com> Date: Wed, 12 Jun 2024 23:53:29 -0700 Subject: [PATCH 101/153] Support revision parameter in FastLanguageModel.from_pretrained (#629) * support `revision` parameter * match unsloth formatting of named parameters --- unsloth/models/loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3bc091b3..190c026a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -81,6 +81,7 @@ def from_pretrained( trust_remote_code = False, use_gradient_checkpointing = True, resize_model_vocab = None, + revision = None, *args, **kwargs, ): if token is None and "HF_TOKEN" in os.environ: @@ -95,12 +96,12 @@ def from_pretrained( # First check if it's a normal model via AutoConfig is_peft = False try: - model_config = AutoConfig.from_pretrained(model_name, token = token) + model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_peft = False except: try: # Most likely a PEFT model - peft_config = PeftConfig.from_pretrained(model_name, token = token) + peft_config = PeftConfig.from_pretrained(model_name, token = token, revision = revision) except: raise RuntimeError(f"Unsloth: `{model_name}` is not a full model or a PEFT model.") @@ -154,6 +155,7 @@ def from_pretrained( model_patcher = dispatch_model, tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, + revision = revision if not is_peft else None, *args, **kwargs, ) @@ -189,7 +191,7 @@ def from_pretrained( if is_peft: # Now add PEFT adapters - model = PeftModel.from_pretrained(model, old_model_name, token = token) + model = PeftModel.from_pretrained(model, old_model_name, token = token, revision = 
revision) # Patch it as well! model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing) pass From 9016171682289d413ee305863a8c476663268118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Ed=C3=A9n?= Date: Thu, 13 Jun 2024 08:55:57 +0200 Subject: [PATCH 102/153] clears any selected_adapters before calling internal_model.save_pretrained (#609) --- unsloth/save.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index c521799f..682dd530 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -643,7 +643,8 @@ def unsloth_save_model( model.config = new_config # Save! - + + save_pretrained_settings["selected_adapters"] = None # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username): print(f"Unsloth: Saving to organization with address {new_save_directory}") From 0428920ecde77494a4442b8e8584be7a40dc4d37 Mon Sep 17 00:00:00 2001 From: XiaoYang Date: Thu, 13 Jun 2024 14:57:23 +0800 Subject: [PATCH 103/153] Update __init__.py (#602) Check for incompatible modules before importing unsloth --- unsloth/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index d85eca00..bb997147 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -14,8 +14,17 @@ import os import warnings import importlib +import sys -# Currently only supports 1 GPU, or else seg faults will occur. +# Define a list of modules to check +MODULES_TO_CHECK = ["peft", "bitsandbytes"] + +# Check if any of the modules in the list have been imported +for module in MODULES_TO_CHECK: + if module in sys.modules: + raise ImportError(f"Please import unsloth before {module}.") + +# Currently only supports 1 GPU, or else seg faults will occur. if "CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" devices = os.environ["CUDA_VISIBLE_DEVICES"] From 9fdd847dab60086355c2fe2bffcb7cd1c0b24461 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 13 Jun 2024 15:22:34 +0800 Subject: [PATCH 104/153] Fixed unsloth/tokenizer_utils.py for chat training (#604) --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index df88170e..5941623b 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -920,7 +920,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ + "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])[0]\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ "has_bos_token_already = (test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template) "\ From b5fc6aa0089050a31c734e9fa24b4e69e9d83200 Mon Sep 17 00:00:00 2001 From: mahiatlinux <110882203+mahiatlinux@users.noreply.github.com> Date: Thu, 13 Jun 2024 19:27:03 +1200 Subject: [PATCH 105/153] Add GGML saving option to Unsloth for easier Ollama model creation and testing. (#345) * Add save to llama.cpp GGML to save.py. * Fix conversion command and path of convert to GGML function. 
* Add autosaving lora to the GGML function * Create lora save function for conversion to GGML * Test fix #2 for saving lora * Test fix #3 to save the lora adapters to convert to GGML * Remove unwated tokenizer saving for conversion to ggml and added a few print statements. * Needed tokenizer for saving, added it back, also made it more unslothy style by having positional arguments, and added a few messages. * Positional arguments didn't work out, so reverted to older version of the code, and added a few comments. * Test fix 1 for arch * Test fix 2 new Mistral error. * Test fix 3 * Revert to old version for testing. * Upload issue test fix 1 * Fix 2 uploading ggml * Positional ags added. * Temporray remove positional args * Fix upload again!!! * Add print statements and fix link * Make the calling name better * Create local saving for GGML * Add choosing directory to save local GGML. * Fix lil variable error in the save_to_custom_dir func --- unsloth/save.py | 144 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 682dd530..9c1380c4 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1655,6 +1655,140 @@ def unsloth_push_to_hub_gguf( pass pass +# Corrected function to save LoRA to a custom directory +def save_lora_to_custom_dir(model, tokenizer, save_directory): + # Create the custom directory if it doesn't exist + os.makedirs(save_directory, exist_ok=True) + + # Call the unsloth_save_model function with the custom directory + unsloth_save_model( + model, + tokenizer, + save_directory=save_directory, + save_method="lora", + push_to_hub=False, + ) + +# Corrected method within the model class to convert LoRA to GGML and push to Hugging Face Hub +def unsloth_convert_lora_to_ggml_and_push_to_hub( + self, + tokenizer, + repo_id: str, + use_temp_dir: Optional[bool] = None, + commit_message: Optional[str] = "Converted LoRA to GGML with Unsloth", + private: Optional[bool] = None, + token: Union[bool, str, None] = None, + create_pr: bool = False, + revision: str = None, + commit_description: str = "Convert LoRA to GGML format using Unsloth", + temporary_location: str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage: float = 0.85, +): + if not os.path.exists("llama.cpp"): + if IS_KAGGLE_ENVIRONMENT: + python_install = install_python_non_blocking(["protobuf"]) + python_install.wait() + install_llama_cpp_blocking(use_cuda=False) + makefile = None + else: + git_clone = install_llama_cpp_clone_non_blocking() + python_install = install_python_non_blocking(["protobuf"]) + git_clone.wait() + makefile = install_llama_cpp_make_non_blocking() + python_install.wait() + else: + makefile = None + + for _ in range(3): + gc.collect() + + lora_directory_push = "lora-to-ggml-push" + save_lora_to_custom_dir(self, tokenizer, lora_directory_push) + + model_type = self.config.model_type + output_file = os.path.join(lora_directory_push, "ggml-adapter-model.bin") + + print(f"Unsloth: Converting auto-saved LoRA adapters at {lora_directory_push} to GGML format.") + print(f"The output file will be {output_file}") + + command = f"python3 llama.cpp/convert-lora-to-ggml.py {lora_directory_push} {output_file} llama" + + try: + with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp: + for line in sp.stdout: + print(line, end="", flush=True) + for line in sp.stderr: + print(line, end="", flush=True) + sp.wait() + if sp.returncode != 0: + 
raise subprocess.CalledProcessError(sp.returncode, command) + except subprocess.CalledProcessError as e: + print(f"Error: Conversion failed with return code {e.returncode}") + return + + print(f"Unsloth: Conversion completed! Output file: {output_file}") + + print("Unsloth: Uploading GGML file to Hugging Face Hub...") + username = upload_to_huggingface( + self, repo_id, token, + "GGML converted LoRA", "ggml", output_file, None, private, + ) + link = f"{repo_id.lstrip('/')}" + print("Unsloth: Done.") + print(f"Converted LoRA to GGML and uploaded to https://huggingface.co/{link}") + print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!") + +def unsloth_convert_lora_to_ggml_and_save_locally( + self, + save_directory: str, # Added parameter for the folder name + tokenizer, + temporary_location: str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage: float = 0.85, +): + if not os.path.exists("llama.cpp"): + if IS_KAGGLE_ENVIRONMENT: + python_install = install_python_non_blocking(["protobuf"]) + python_install.wait() + install_llama_cpp_blocking(use_cuda=False) + makefile = None + else: + git_clone = install_llama_cpp_clone_non_blocking() + python_install = install_python_non_blocking(["protobuf"]) + git_clone.wait() + makefile = install_llama_cpp_make_non_blocking() + python_install.wait() + else: + makefile = None + + for _ in range(3): + gc.collect() + + # Use the provided save_directory for local saving + save_lora_to_custom_dir(self, tokenizer, save_directory) + + model_type = self.config.model_type + output_file = os.path.join(save_directory, "ggml-adapter-model.bin") + + print(f"Unsloth: Converting auto-saved LoRA adapters at {save_directory} to GGML format.") + print(f"The output file will be {output_file}") + + command = f"python3 llama.cpp/convert-lora-to-ggml.py {save_directory} {output_file} llama" + + try: + with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp: + for line in sp.stdout: + print(line, end="", flush=True) + for line in sp.stderr: + print(line, end="", flush=True) + sp.wait() + if sp.returncode != 0: + raise subprocess.CalledProcessError(sp.returncode, command) + except subprocess.CalledProcessError as e: + print(f"Error: Conversion failed with return code {e.returncode}") + return + print("Unsloth: Done.") + print(f"Unsloth: Conversion completed! Output file: {output_file}") + print("\nThis GGML making function was made by Maheswar. 
Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!") def patch_saving_functions(model): import inspect @@ -1747,10 +1881,12 @@ def patch_saving_functions(model): # Add saving methods to top level model if hasattr(model, "config"): # Counteract tokenizers - model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) - model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) - model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) - model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) + model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) + model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) + model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) + model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) + model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) + model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) pass return model pass From 3fafbf7dc7c010c7ff6df34afe30514fc2871d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20De=20Greef?= Date: Thu, 13 Jun 2024 00:30:37 -0700 Subject: [PATCH 106/153] docs: Add LoraConfig parameters documentation (#619) --- PARAMETERS.md | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 PARAMETERS.md diff --git a/PARAMETERS.md b/PARAMETERS.md new file mode 100644 index 00000000..94d63798 --- /dev/null +++ b/PARAMETERS.md @@ -0,0 +1,87 @@ +## LoraConfig Parameters + +Adjusting the `LoraConfig` parameters allows you to balance model performance and computational efficiency in Low-Rank Adaptation (LoRA). Here’s a concise breakdown of key parameters: + +**r** +- **Description**: Rank of the low-rank decomposition for factorizing weight matrices. +- **Impact**: + - **Higher**: Retains more information, increases computational load. + - **Lower**: Fewer parameters, more efficient training, potential performance drop if too small. + + +**lora_alpha** +- **Description**: Scaling factor for the low-rank matrices' contribution. +- **Impact**: + - **Higher**: Increases influence, speeds up convergence, risks instability or overfitting. + - **Lower**: Subtler effect, may require more training steps. + +**lora_dropout** +- **Description**: Probability of zeroing out elements in low-rank matrices for regularization. +- **Impact**: + - **Higher**: More regularization, prevents overfitting, may slow training and degrade performance. + - **Lower**: Less regularization, may speed up training, risks overfitting. + +**loftq_config** +- **Description**: Configuration for LoftQ, a quantization method for the backbone weights and initialization of LoRA layers. +- **Impact**: + - **Not None**: If specified, LoftQ will quantize the backbone weights and initialize the LoRA layers. It requires setting `init_lora_weights='loftq'`. + - **None**: LoftQ quantization is not applied. + - **Note**: Do not pass an already quantized model when using LoftQ as LoftQ handles the quantization process itself. + + +**use_rslora** +- **Description**: Enables Rank-Stabilized LoRA (RSLora). 
+- **Impact**: + - **True**: Uses Rank-Stabilized LoRA, setting the adapter scaling factor to `lora_alpha/math.sqrt(r)`, which has been proven to work better as per the [Rank-Stabilized LoRA paper](https://doi.org/10.48550/arXiv.2312.03732). + - **False**: Uses the original default scaling factor `lora_alpha/r`. + +**gradient_accumulation_steps** +- **Default**: 1 +- **Description**: The number of steps to accumulate gradients before performing a backpropagation update. +- **Impact**: + - **Higher**: Accumulate gradients over multiple steps, effectively increasing the batch size without requiring additional memory. This can improve training stability and convergence, especially with large models and limited hardware. + - **Lower**: Faster updates but may require more memory per step and can be less stable. + +**weight_decay** +- **Default**: 0.01 +- **Description**: Regularization technique that applies a small penalty to the weights during training. +- **Impact**: + - **Non-zero Value (e.g., 0.01)**: Adds a penalty proportional to the magnitude of the weights to the loss function, helping to prevent overfitting by discouraging large weights. + - **Zero**: No weight decay is applied, which can lead to overfitting, especially in large models or with small datasets. + +**learning_rate** +- **Default**: 2e-4 +- **Description**: The rate at which the model updates its parameters during training. +- **Impact**: + - **Higher**: Faster convergence but risks overshooting optimal parameters and causing instability in training. + - **Lower**: More stable and precise updates but may slow down convergence, requiring more training steps to achieve good performance. + +## Target Modules + +**q_proj (query projection)** +- **Description**: Part of the attention mechanism in transformer models, responsible for projecting the input into the query space. +- **Impact**: Transforms the input into query vectors that are used to compute attention scores. + +**k_proj (key projection)** +- **Description**: Projects the input into the key space in the attention mechanism. +- **Impact**: Produces key vectors that are compared with query vectors to determine attention weights. + +**v_proj (value projection)** +- **Description**: Projects the input into the value space in the attention mechanism. +- **Impact**: Produces value vectors that are weighted by the attention scores and combined to form the output. + +**o_proj (output projection)** +- **Description**: Projects the output of the attention mechanism back into the original space. +- **Impact**: Transforms the combined weighted value vectors back to the input dimension, integrating attention results into the model. + +**gate_proj (gate projection)** +- **Description**: Typically used in gated mechanisms within neural networks, such as gating units in gated recurrent units (GRUs) or other gating mechanisms. +- **Impact**: Controls the flow of information through the gate, allowing selective information passage based on learned weights. + +**up_proj (up projection)** +- **Description**: Used for up-projection, typically increasing the dimensionality of the input. +- **Impact**: Expands the input to a higher-dimensional space, often used in feedforward layers or when transitioning between different layers with differing dimensionalities. + +**down_proj (down projection)** +- **Description**: Used for down-projection, typically reducing the dimensionality of the input. 
+- **Impact**: Compresses the input to a lower-dimensional space, useful for reducing computational complexity and controlling the model size. From 273a871c3bd63acc449c253a6371f8cebb7e29e3 Mon Sep 17 00:00:00 2001 From: Alberto Ferrer Date: Thu, 13 Jun 2024 02:16:20 -0600 Subject: [PATCH 107/153] llama.cpp failing (#371) llama.cpp is failing to generate quantize versions for the trained models. Error: ```bash You might have to compile llama.cpp yourself, then run this again. You do not need to close this Python program. Run the following commands in a new terminal: You must run this in the same folder as you're saving your model. git clone https://github.com/ggerganov/llama.cpp cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j Once that's done, redo the quantization. ``` But when i do clone this with recursive it works. Co-authored-by: Daniel Han From b312b3fa38aabd3e8386301ee2181927187f0dcb Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Thu, 13 Jun 2024 10:37:57 +0200 Subject: [PATCH 108/153] fix libcuda_dirs import for triton 3.0 (#227) * fix libcuda_dirs import for triton 3.0 * Update __init__.py * Update __init__.py --------- Co-authored-by: Daniel Han --- unsloth/__init__.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index bb997147..428c9873 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -15,6 +15,7 @@ import warnings import importlib import sys +from packaging.version import Version # Define a list of modules to check MODULES_TO_CHECK = ["peft", "bitsandbytes"] @@ -75,8 +76,14 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Try loading bitsandbytes and triton import bitsandbytes as bnb + import triton -from triton.common.build import libcuda_dirs +libcuda_dirs = lambda: None +if Version(triton.__version__) >= Version("3.0.0"): + try: from triton.backends.nvidia.driver import libcuda_dirs + except: pass +else: from triton.common.build import libcuda_dirs + import os import re import numpy as np @@ -112,8 +119,11 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 importlib.reload(bnb) importlib.reload(triton) try: - import bitsandbytes as bnb - from triton.common.build import libcuda_dirs + libcuda_dirs = lambda: None + if Version(triton.__version__) >= Version("3.0.0"): + try: from triton.backends.nvidia.driver import libcuda_dirs + except: pass + else: from triton.common.build import libcuda_dirs cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() except: From 1601dca031238f46745207b643f90336c3ac1151 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:23:31 +1000 Subject: [PATCH 109/153] Update save.py --- unsloth/save.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 9c1380c4..26880e5e 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -22,7 +22,7 @@ import pickle import gc from transformers.models.llama.modeling_llama import logger -from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters +from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters_bias import subprocess import psutil import re @@ -132,9 +132,10 @@ def _free_cached_model(model): def _merge_lora(layer, name): + bias = None if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)): # Is LoRA so we need to merge! 
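The `_merge_lora` change in PATCH 109 above additionally returns the layer bias, but the merge itself is the usual LoRA fold-in. A standalone sketch of that fold-in on dense float tensors (shapes and the scaling value are made up; the real function also dequantizes 4-bit weights before merging):

```python
import torch

d_out, d_in, r = 8, 16, 4
scaling = 0.5                       # typically lora_alpha / r

W = torch.randn(d_out, d_in)        # frozen base weight
A = torch.randn(r, d_in)            # lora_A
B = torch.randn(d_out, r)           # lora_B

# Folding the adapter into the base weight: W_merged = W + scaling * (B @ A)
W_merged = W + scaling * (B @ A)

x = torch.randn(d_in)
# The merged weight reproduces base output plus the adapter applied separately.
print(torch.allclose(W_merged @ x, W @ x + scaling * (B @ (A @ x)), atol = 1e-5))  # True
```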
- W, quant_state, A, B, s = get_lora_parameters(layer) + W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer) if quant_state is not None: dtype = quant_state.dtype if type(quant_state) is not list else quant_state[2] W = fast_dequantize(W, quant_state) @@ -156,7 +157,7 @@ def _merge_lora(layer, name): W = W.t().to(dtype) else: W = layer.weight - return W + return W, bias pass @@ -527,7 +528,12 @@ def unsloth_save_model( for item in LLAMA_WEIGHTS: proj = eval(f"layer.{item}") name = f"model.layers.{j}.{item}.weight" - W = _merge_lora(proj, name) + W, bias = _merge_lora(proj, name) + + # Bias term + if bias is not None: + state_dict[f"model.layers.{j}.{item}.bias"] = bias + pass if (torch.cuda.memory_allocated() + W.nbytes) < max_vram: # Save to GPU memory From 26dc50294a0fe62a85fdb46c76f70148c567739c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:25:27 +1000 Subject: [PATCH 110/153] Update __init__.py --- unsloth/kernels/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/kernels/__init__.py b/unsloth/kernels/__init__.py index b1fdba83..ebea02af 100644 --- a/unsloth/kernels/__init__.py +++ b/unsloth/kernels/__init__.py @@ -24,6 +24,7 @@ ) from .fast_lora import ( get_lora_parameters, + get_lora_parameters_bias, apply_lora_mlp_swiglu, apply_lora_mlp_geglu_exact, apply_lora_mlp_geglu_approx, From 6a516573d01c75fe980fba3f5188eb72fca6274a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:26:33 +1000 Subject: [PATCH 111/153] Update fast_lora.py --- unsloth/kernels/fast_lora.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/kernels/fast_lora.py b/unsloth/kernels/fast_lora.py index aba44f02..8f7aea58 100644 --- a/unsloth/kernels/fast_lora.py +++ b/unsloth/kernels/fast_lora.py @@ -13,7 +13,13 @@ # limitations under the License. import torch -from .utils import fast_dequantize, QUANT_STATE, get_lora_parameters, matmul_lora +from .utils import ( + fast_dequantize, + QUANT_STATE, + get_lora_parameters, + get_lora_parameters_bias, + matmul_lora, +) class LoRA_MLP(torch.autograd.Function): From 4a8ba90605d4b41a92797759be963e19d4e30438 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:28:48 +1000 Subject: [PATCH 112/153] Update save.py --- unsloth/save.py | 48 +----------------------------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 26880e5e..536dc78d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -829,49 +829,6 @@ def install_llama_cpp_blocking(use_cuda = True): pass -def _fix_gemma_gguf(): - # Fixes Gemma saving to GGUF to float32 instead of float16! - with open("llama.cpp/convert-hf-to-gguf.py", "rb") as file: - text = file.read() - pass - - gemma_start = text.find(b"class GemmaModel(Model):") - if gemma_start == -1: return - - gemma_end = text.find(b"self.gguf_writer.add_tensor(new_name, data)", gemma_start) - if gemma_end == -1: return - - gemma_text = text[gemma_start : gemma_end] - bad_text = \ -b""" data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16)""" - good_text = \ -b""" # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16)""" - find_bad = gemma_text.find(bad_text) - if find_bad == -1: return - - gemma_text = gemma_text[:find_bad] + good_text + gemma_text[find_bad + len(bad_text):] - text = text[:gemma_start] + gemma_text + text[gemma_end:] - - with open("llama.cpp/convert-hf-to-gguf.py", "w+b") as file: - file.write(text) - pass -pass - - def save_to_gguf( model_type : str, model_dtype : str, @@ -1024,9 +981,6 @@ def save_to_gguf( f"--outfile {final_location} --vocab-type {vocab_type} "\ f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab" else: - # Need to fix convert-hf-to-gguf.py for some models! - # _fix_gemma_gguf() - command = f"python llama.cpp/convert-hf-to-gguf.py {model_directory} "\ f"--outfile {final_location} "\ f"--outtype {first_conversion}" @@ -1425,7 +1379,7 @@ def unsloth_save_pretrained_gguf( # Non blocking install GGUF first if not os.path.exists("llama.cpp"): - if IS_KAGGLE_ENVIRONMENT: + if True:#IS_KAGGLE_ENVIRONMENT: # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() From 0abb5ba23ef4a28b149f8cb2a136b761de67fce1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:39:04 +1000 Subject: [PATCH 113/153] Update save.py --- unsloth/save.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 536dc78d..d314d39d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -801,7 +801,7 @@ def install_llama_cpp_old(version = -10): pass -def install_llama_cpp_blocking(use_cuda = True): +def install_llama_cpp_blocking(use_cuda = False): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? # use_cuda = "LLAMA_CUDA=1" if use_cuda else "" @@ -911,6 +911,10 @@ def save_to_gguf( install_llama_cpp_blocking() pass # Check if successful. If not install 10th latest release + print("====================================") + print(error) + print(os.path.exists("llama.cpp/quantize")) + print("====================================") if error != 0 or not os.path.exists("llama.cpp/quantize"): print(f"Unsloth: llama.cpp error code = {error}.") install_llama_cpp_old(-10) @@ -1383,7 +1387,6 @@ def unsloth_save_pretrained_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1404,7 +1407,6 @@ def unsloth_save_pretrained_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1534,7 +1536,6 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? 
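These save-path functions sit behind the GGUF export helpers that Unsloth patches onto the model. A usage sketch, assuming a `model`/`tokenizer` pair from `FastLanguageModel`; the output directory, repo name, and quantization preset are placeholders:

```python
# Export the finetuned model to GGUF locally; "q4_k_m" is one llama.cpp quantization preset.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method = "q4_k_m")

# Or build the GGUF and upload it to the Hugging Face Hub in one call.
model.push_to_hub_gguf(
    "your-username/model-gguf", tokenizer,
    quantization_method = "q4_k_m", token = "hf_...",
)
```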
python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1555,7 +1556,6 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: From b24dd050bfd57a922894b7d6d82ecf6754effc91 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:44:51 +1000 Subject: [PATCH 114/153] Update save.py --- unsloth/save.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/save.py b/unsloth/save.py index d314d39d..fdfb7397 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -914,6 +914,7 @@ def save_to_gguf( print("====================================") print(error) print(os.path.exists("llama.cpp/quantize")) + raise print("====================================") if error != 0 or not os.path.exists("llama.cpp/quantize"): print(f"Unsloth: llama.cpp error code = {error}.") From 48c6d6dbeec5bed5cd45ff024af8d2e683d54d97 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:47:17 +1000 Subject: [PATCH 115/153] Update loader.py --- unsloth/models/loader.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 190c026a..ff64360c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -33,11 +33,8 @@ def _get_model_name(model_name, load_in_4bit = True): - # First try replacing lowercase 'b' with uppercase 'B' - model_name = model_name.lower() - if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: - model_name = INT_TO_FLOAT_MAPPER[model_name] + model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ f"4bit loading.\nThe minimum required version is 4.37.\n"\ @@ -47,7 +44,7 @@ def _get_model_name(model_name, load_in_4bit = True): ) elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: - new_model_name = INT_TO_FLOAT_MAPPER[model_name] + new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." @@ -55,7 +52,7 @@ def _get_model_name(model_name, load_in_4bit = True): model_name = new_model_name elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: - new_model_name = FLOAT_TO_INT_MAPPER[model_name] + new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." From e35f6082336c5bb69a727cda74771a3339971a30 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 20:12:50 +1000 Subject: [PATCH 116/153] Update save.py --- unsloth/save.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index fdfb7397..d73b6833 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -812,7 +812,7 @@ def install_llama_cpp_blocking(use_cuda = False): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? 
# f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", - f"make all -j{psutil.cpu_count()*2} -C llama.cpp", + f"make -j{psutil.cpu_count()*2} quantize -C llama.cpp", "pip install gguf protobuf", ] if os.path.exists("llama.cpp"): return @@ -914,7 +914,6 @@ def save_to_gguf( print("====================================") print(error) print(os.path.exists("llama.cpp/quantize")) - raise print("====================================") if error != 0 or not os.path.exists("llama.cpp/quantize"): print(f"Unsloth: llama.cpp error code = {error}.") From 4822eaeffa2377721ef623592ceaf48fb54aad88 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 22:30:50 +1000 Subject: [PATCH 117/153] Update save.py --- unsloth/save.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index d73b6833..89862f9a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -812,7 +812,7 @@ def install_llama_cpp_blocking(use_cuda = False): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? # f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", - f"make -j{psutil.cpu_count()*2} quantize -C llama.cpp", + f"make all -j{psutil.cpu_count()*2} -C llama.cpp", "pip install gguf protobuf", ] if os.path.exists("llama.cpp"): return @@ -915,10 +915,10 @@ def save_to_gguf( print(error) print(os.path.exists("llama.cpp/quantize")) print("====================================") - if error != 0 or not os.path.exists("llama.cpp/quantize"): - print(f"Unsloth: llama.cpp error code = {error}.") - install_llama_cpp_old(-10) - pass + # if error != 0 or not os.path.exists("llama.cpp/quantize"): + # print(f"Unsloth: llama.cpp error code = {error}.") + # install_llama_cpp_old(-10) + # pass if quantization_method == "f32": first_conversion = "f32" elif quantization_method == "f16": first_conversion = "f16" @@ -1030,7 +1030,7 @@ def save_to_gguf( print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. This will take 20 minutes...") final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf" - command = f"./llama.cpp/quantize {old_location} "\ + command = f"./llama.cpp/examples/quantize {old_location} "\ f"{final_location} {quantization_method} {n_cpus}" # quantize uses stderr From 7d847ed3185b340fc5e457f15aa4ffc3a664e26f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 22:50:08 +1000 Subject: [PATCH 118/153] quantize now llama-quantize --- unsloth/save.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 89862f9a..cae59cae 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -792,7 +792,7 @@ def install_llama_cpp_old(version = -10): pass pass # Check if successful - if not os.path.exists("llama.cpp/quantize"): + if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"): raise RuntimeError( "Unsloth: llama.cpp GGUF seems to be too buggy to install.\n"\ "File a report to llama.cpp's main repo since this is not an Unsloth issue." @@ -910,15 +910,23 @@ def save_to_gguf( error = 0 install_llama_cpp_blocking() pass + # Check if successful. 
If not install 10th latest release - print("====================================") - print(error) - print(os.path.exists("llama.cpp/quantize")) - print("====================================") - # if error != 0 or not os.path.exists("llama.cpp/quantize"): - # print(f"Unsloth: llama.cpp error code = {error}.") - # install_llama_cpp_old(-10) - # pass + + # Careful llama.cpp/quantize changed to llama.cpp/llama-quantize + # and llama.cpp/main changed to llama.cpp/llama-cli + # See https://github.com/ggerganov/llama.cpp/pull/7809 + quantize_location = None + if os.path.exists("llama.cpp/quantize"): + quantize_location = "llama.cpp/quantize" + elif os.path.exists("llama.cpp/llama-quantize"): + quantize_location = "llama.cpp/llama-quantize" + pass + + if error != 0 or quantize_location is None: + print(f"Unsloth: llama.cpp error code = {error}.") + install_llama_cpp_old(-10) + pass if quantization_method == "f32": first_conversion = "f32" elif quantization_method == "f16": first_conversion = "f16" @@ -1030,7 +1038,7 @@ def save_to_gguf( print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. This will take 20 minutes...") final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf" - command = f"./llama.cpp/examples/quantize {old_location} "\ + command = f"./{quantize_location} {old_location} "\ f"{final_location} {quantization_method} {n_cpus}" # quantize uses stderr @@ -1383,10 +1391,11 @@ def unsloth_save_pretrained_gguf( # Non blocking install GGUF first if not os.path.exists("llama.cpp"): - if True:#IS_KAGGLE_ENVIRONMENT: + if IS_KAGGLE_ENVIRONMENT: # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1407,6 +1416,7 @@ def unsloth_save_pretrained_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1536,6 +1546,7 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1556,6 +1567,7 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? 
python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: From 82f10cbaacc178dba95f9eb468c137b211282f0f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 22:50:38 +1000 Subject: [PATCH 119/153] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 4c782326..2e3761f5 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -1286,7 +1286,7 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") pass for prompt in prompts: - command = f"./llama.cpp/main -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\ + command = f"./llama.cpp/llama-cli -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\ f"--check-tensors -p '{prompt}'" datas = [] From 08424f04c9972eff7684b04c81b4559197bda712 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 23:58:48 +1000 Subject: [PATCH 120/153] Update loader.py --- unsloth/models/loader.py | 57 +++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ff64360c..de1e2e57 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -67,18 +67,18 @@ def _get_model_name(model_name, load_in_4bit = True): class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( - model_name = "unsloth/llama-3-8b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, - fix_tokenizer = True, - trust_remote_code = False, - use_gradient_checkpointing = True, - resize_model_vocab = None, - revision = None, + model_name = "unsloth/llama-3-8b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, + fix_tokenizer = True, + trust_remote_code = False, + use_gradient_checkpointing = "unsloth", + resize_model_vocab = None, + revision = None, *args, **kwargs, ): if token is None and "HF_TOKEN" in os.environ: @@ -141,23 +141,24 @@ def from_pretrained( pass model, tokenizer = dispatch_model.from_pretrained( - model_name = model_name, - max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, - token = token, - device_map = device_map, - rope_scaling = rope_scaling, - fix_tokenizer = fix_tokenizer, - model_patcher = dispatch_model, - tokenizer_name = tokenizer_name, + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + model_patcher = dispatch_model, + tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, - revision = revision if not is_peft else None, + revision = revision if not is_peft else None, *args, **kwargs, ) if resize_model_vocab is not None: model.resize_token_embeddings(resize_model_vocab) + pass # In case the model supports tagging, add the unsloth tag. 
if hasattr(model, "add_model_tags"): @@ -187,8 +188,16 @@ def from_pretrained( pass if is_peft: + # From https://github.com/huggingface/peft/issues/184 # Now add PEFT adapters - model = PeftModel.from_pretrained(model, old_model_name, token = token, revision = revision) + model.enable_input_require_grads() + model = PeftModel.from_pretrained( + model, + old_model_name, + token = token, + revision = revision, + is_trainable = True, + ) # Patch it as well! model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing) pass From eb906d04bf615f817f1c18ef2a332e02f435718a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 00:00:59 +1000 Subject: [PATCH 121/153] Update mapper.py --- unsloth/models/mapper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 73aa06ca..5ef75839 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -186,6 +186,9 @@ "unsloth/Qwen2-70B-Instruct-bnb-4bit" : ( "Qwen/Qwen2-70B-Instruct", ), + "mistralai/Codestral-22B-v0.1" : ( + "mistral-community/Codestral-22B-v0.1", + ), } INT_TO_FLOAT_MAPPER = {} From 0a304aefc01ba0dccff3d56d28e37eea4160bf90 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 00:07:42 +1000 Subject: [PATCH 122/153] Update __init__.py --- unsloth/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 428c9873..a6d6ff92 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -23,7 +23,9 @@ # Check if any of the modules in the list have been imported for module in MODULES_TO_CHECK: if module in sys.modules: - raise ImportError(f"Please import unsloth before {module}.") + raise ImportError(f"Unsloth: Please import Unsloth before {module}.") + pass +pass # Currently only supports 1 GPU, or else seg faults will occur. if "CUDA_VISIBLE_DEVICES" in os.environ: From 71edc42ed37425c30fbc66bddebbefec0fc0ba42 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 02:39:07 +1000 Subject: [PATCH 123/153] embedding size --- unsloth/__init__.py | 2 +- unsloth/tokenizer_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index a6d6ff92..93960e2f 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -26,7 +26,7 @@ raise ImportError(f"Unsloth: Please import Unsloth before {module}.") pass pass - + # Currently only supports 1 GPU, or else seg faults will occur. 
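The loader change above re-attaches a previously saved LoRA adapter as trainable so finetuning can resume; the equivalent standalone PEFT pattern is sketched below (the adapter path is a placeholder):

```python
from peft import PeftModel

# Keep embedding outputs requiring grad (needed for gradient checkpointing with a frozen base),
# then load the saved adapter in trainable mode for continued finetuning.
base_model.enable_input_require_grads()
model = PeftModel.from_pretrained(base_model, "path/to/lora_adapter", is_trainable = True)
```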
if "CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 5941623b..395c3b73 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -734,7 +734,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): pass # Count all the possible bad tokens - final_counts = np.zeros(len(tokenizer), dtype = np.int64) + final_counts = np.zeros(max(len(tokenizer), embedding_matrix.shape[0]), dtype = np.int64) def mapping(examples): input_ids = examples["input_ids"] counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype = np.int32) From b74e321f92e7ad1f51ce0be2e4962b8cb68d82b8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 04:44:52 +1000 Subject: [PATCH 124/153] Update qwen2.py --- unsloth/models/qwen2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 115bf3e0..907148df 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -15,6 +15,7 @@ from .llama import * import os from ._utils import __version__ +from .mistral import FastMistralModel from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -72,7 +73,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastLlamaModel.from_pretrained( + return FastMistralModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From b82277f71078ea202d9136aab1e1b1f0d158bc00 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 18:05:39 +1000 Subject: [PATCH 125/153] docs --- PARAMETERS.md | 87 --------------------------------------------------- README.md | 2 +- 2 files changed, 1 insertion(+), 88 deletions(-) delete mode 100644 PARAMETERS.md diff --git a/PARAMETERS.md b/PARAMETERS.md deleted file mode 100644 index 94d63798..00000000 --- a/PARAMETERS.md +++ /dev/null @@ -1,87 +0,0 @@ -## LoraConfig Parameters - -Adjusting the `LoraConfig` parameters allows you to balance model performance and computational efficiency in Low-Rank Adaptation (LoRA). Here’s a concise breakdown of key parameters: - -**r** -- **Description**: Rank of the low-rank decomposition for factorizing weight matrices. -- **Impact**: - - **Higher**: Retains more information, increases computational load. - - **Lower**: Fewer parameters, more efficient training, potential performance drop if too small. - - -**lora_alpha** -- **Description**: Scaling factor for the low-rank matrices' contribution. -- **Impact**: - - **Higher**: Increases influence, speeds up convergence, risks instability or overfitting. - - **Lower**: Subtler effect, may require more training steps. - -**lora_dropout** -- **Description**: Probability of zeroing out elements in low-rank matrices for regularization. -- **Impact**: - - **Higher**: More regularization, prevents overfitting, may slow training and degrade performance. - - **Lower**: Less regularization, may speed up training, risks overfitting. - -**loftq_config** -- **Description**: Configuration for LoftQ, a quantization method for the backbone weights and initialization of LoRA layers. -- **Impact**: - - **Not None**: If specified, LoftQ will quantize the backbone weights and initialize the LoRA layers. It requires setting `init_lora_weights='loftq'`. - - **None**: LoftQ quantization is not applied. 
- - **Note**: Do not pass an already quantized model when using LoftQ as LoftQ handles the quantization process itself. - - -**use_rslora** -- **Description**: Enables Rank-Stabilized LoRA (RSLora). -- **Impact**: - - **True**: Uses Rank-Stabilized LoRA, setting the adapter scaling factor to `lora_alpha/math.sqrt(r)`, which has been proven to work better as per the [Rank-Stabilized LoRA paper](https://doi.org/10.48550/arXiv.2312.03732). - - **False**: Uses the original default scaling factor `lora_alpha/r`. - -**gradient_accumulation_steps** -- **Default**: 1 -- **Description**: The number of steps to accumulate gradients before performing a backpropagation update. -- **Impact**: - - **Higher**: Accumulate gradients over multiple steps, effectively increasing the batch size without requiring additional memory. This can improve training stability and convergence, especially with large models and limited hardware. - - **Lower**: Faster updates but may require more memory per step and can be less stable. - -**weight_decay** -- **Default**: 0.01 -- **Description**: Regularization technique that applies a small penalty to the weights during training. -- **Impact**: - - **Non-zero Value (e.g., 0.01)**: Adds a penalty proportional to the magnitude of the weights to the loss function, helping to prevent overfitting by discouraging large weights. - - **Zero**: No weight decay is applied, which can lead to overfitting, especially in large models or with small datasets. - -**learning_rate** -- **Default**: 2e-4 -- **Description**: The rate at which the model updates its parameters during training. -- **Impact**: - - **Higher**: Faster convergence but risks overshooting optimal parameters and causing instability in training. - - **Lower**: More stable and precise updates but may slow down convergence, requiring more training steps to achieve good performance. - -## Target Modules - -**q_proj (query projection)** -- **Description**: Part of the attention mechanism in transformer models, responsible for projecting the input into the query space. -- **Impact**: Transforms the input into query vectors that are used to compute attention scores. - -**k_proj (key projection)** -- **Description**: Projects the input into the key space in the attention mechanism. -- **Impact**: Produces key vectors that are compared with query vectors to determine attention weights. - -**v_proj (value projection)** -- **Description**: Projects the input into the value space in the attention mechanism. -- **Impact**: Produces value vectors that are weighted by the attention scores and combined to form the output. - -**o_proj (output projection)** -- **Description**: Projects the output of the attention mechanism back into the original space. -- **Impact**: Transforms the combined weighted value vectors back to the input dimension, integrating attention results into the model. - -**gate_proj (gate projection)** -- **Description**: Typically used in gated mechanisms within neural networks, such as gating units in gated recurrent units (GRUs) or other gating mechanisms. -- **Impact**: Controls the flow of information through the gate, allowing selective information passage based on learned weights. - -**up_proj (up projection)** -- **Description**: Used for up-projection, typically increasing the dimensionality of the input. -- **Impact**: Expands the input to a higher-dimensional space, often used in feedforward layers or when transitioning between different layers with differing dimensionalities. 
- -**down_proj (down projection)** -- **Description**: Used for down-projection, typically reducing the dimensionality of the input. -- **Impact**: Compresses the input to a lower-dimensional space, useful for reducing computational complexity and controlling the model size. diff --git a/README.md b/README.md index 2c50f457..dab899cf 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ ## ✨ Finetune for Free -All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. +All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation. | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| From d98e45e41b90d5df56432c764135466ee7597e38 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 18:06:12 +1000 Subject: [PATCH 126/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dab899cf..0a2cd1fa 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ ## ✨ Finetune for Free -All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation. +All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| @@ -35,7 +35,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral 7B v3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - +- Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News - 📣 NEW! Continued Pretraining [notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) for other languages like Korean! 
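The LoRA knobs documented in the removed `PARAMETERS.md` (now covered by the wiki) all surface in the same two places in practice: the `FastLanguageModel.get_peft_model` call and the trainer arguments. A hedged sketch of how they map onto code; the values are illustrative, not tuned recommendations:

```python
from unsloth import FastLanguageModel
from transformers import TrainingArguments

model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit", max_seq_length = 2048, load_in_4bit = True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,              # rank of the low-rank decomposition
    lora_alpha = 16,     # scaling factor for the LoRA update
    lora_dropout = 0,    # dropout on the low-rank matrices
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    use_rslora = False,  # True scales by lora_alpha / sqrt(r) instead of lora_alpha / r
    loftq_config = {},   # pass a LoftQ config here to quantize and initialize via LoftQ
)

training_args = TrainingArguments(
    output_dir = "outputs",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,   # accumulate gradients to raise the effective batch size
    learning_rate = 2e-4,
    weight_decay = 0.01,
    max_steps = 60,
)
```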
From b6f0fdb53d40918aa05ec4dfbd789681ba7879a5 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 18:07:42 +1000 Subject: [PATCH 127/153] Update qwen2.py --- unsloth/models/qwen2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 907148df..115bf3e0 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -15,7 +15,6 @@ from .llama import * import os from ._utils import __version__ -from .mistral import FastMistralModel from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -73,7 +72,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( + return FastLlamaModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From 6c031e4a32f06ed77b86958d1f2f97047ef42ccc Mon Sep 17 00:00:00 2001 From: Walter Korman Date: Fri, 14 Jun 2024 01:11:28 -0700 Subject: [PATCH 128/153] README: Fix minor typo. (#559) * README: Fix minor typo. One-character typo fix while reading. * Update README.md --------- Co-authored-by: Daniel Han --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0a2cd1fa..a56dea5c 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ model = FastLanguageModel.get_peft_model( ## 🥇 Performance Benchmarking -- For the full list of **reproducable** benchmarking tables, [go to our website](https://unsloth.ai/blog/mistral-benchmark#Benchmark%20tables) +- For the full list of **reproducible** benchmarking tables, [go to our website](https://unsloth.ai/blog/mistral-benchmark#Benchmark%20tables) | 1 A100 40GB | 🤗Hugging Face | Flash Attention | 🦥Unsloth Open Source | 🦥[Unsloth Pro](https://unsloth.ai/pricing) | |--------------|--------------|-----------------|---------------------|-----------------| @@ -257,7 +257,7 @@ trainer.train() # (1) Saving to GGUF / merging to 16bit for vLLM # (2) Continued training from a saved LoRA adapter # (3) Adding an evaluation loop / OOMs -# (4) Cutomized chat templates +# (4) Customized chat templates ``` From 2401dee8ff7d23a09a619f5a0feb14bbe59f8b2c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:15:47 +1000 Subject: [PATCH 129/153] Update mistral.py --- unsloth/models/mistral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index fc2e1a9f..ff2e909f 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -512,7 +512,7 @@ def from_pretrained( if "n_total_devices >" not in inner_training_loop: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) @@ -521,6 +521,7 @@ def from_pretrained( "is_sagemaker_mp_enabled()", "False", ) + exec(inner_training_loop, globals()) Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length @@ -560,6 +561,7 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + Trainer._inner_training_loop = _fast_inner_training_loop # Save tokenizer for inference purposes tokenizer.padding_side = "left" # Force inference From 1b93d7eee6c980ca1ce521aca27624b7dc075bbd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:16:58 +1000 Subject: [PATCH 130/153] Update qwen2.py --- unsloth/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 115bf3e0..96fcf5d9 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -72,7 +72,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastLlamaModel.from_pretrained( + return FastMistralModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From 358103718f265fea647a99c5ffcb6aff490201ad Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:19:37 +1000 Subject: [PATCH 131/153] Update qwen2.py --- unsloth/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 96fcf5d9..04f888b2 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .llama import * +from .mistral import * import os from ._utils import __version__ From b56b8b84dcd560865c3aa9a8c7fbbbaf7eb86e4a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:24:06 +1000 Subject: [PATCH 132/153] Update qwen2.py --- unsloth/models/qwen2.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 04f888b2..2973bd02 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .mistral import * -import os -from ._utils import __version__ +from .llama import * from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -72,7 +70,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( + return FastLlamaModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From fe8c06496f0ebd81932b25fbc48cd673f42920d0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:32:01 +1000 Subject: [PATCH 133/153] Update llama.py --- unsloth/models/llama.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4cbbcf0a..0cbab0bc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -59,6 +59,11 @@ from ..save import patch_saving_functions import re, os, inspect, math, sys +from inspect import currentframe, getframeinfo +def DEBUG(): + frameinfo = getframeinfo(currentframe()) + print(frameinfo.filename, frameinfo.lineno) +pass def original_apply_qkv(self, X): Q = self.q_proj(X) @@ -289,6 +294,7 @@ def LlamaAttention_fast_forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # Clear inference + DEBUG() if hasattr(self, "paged_attention"): del self.paged_attention_K del self.paged_attention_V @@ -330,6 +336,7 @@ def LlamaAttention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None + DEBUG() # Attention module if (not HAS_FLASH_ATTENTION and attention_mask is None): @@ -338,6 +345,7 @@ def LlamaAttention_fast_forward( Q = Q.transpose(1, 2) K = K.transpose(1, 2) V = V.transpose(1, 2) + DEBUG() # Group query attention if n_groups != 1: @@ -353,6 +361,7 @@ def LlamaAttention_fast_forward( pass A = xformers_attention(Q, K, V, attn_bias = causal_mask) A = A.view(bsz, q_len, n_heads, head_dim) + DEBUG() elif HAS_FLASH_ATTENTION and attention_mask is None: Q = Q.transpose(1, 2) @@ -379,6 +388,7 @@ def LlamaAttention_fast_forward( attn_output = A.reshape(bsz, q_len, n_heads*head_dim) attn_output = self.apply_o(self, attn_output) attn_weights = None + DEBUG() return attn_output, attn_weights, past_key_value pass @@ -430,8 +440,10 @@ def LlamaDecoderLayer_fast_forward( hidden_states = fast_swiglu_inference(self.mlp, hidden_states) hidden_states += residual else: + DEBUG() residual = hidden_states hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states) + DEBUG() hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, causal_mask=causal_mask, @@ -442,13 +454,18 @@ def LlamaDecoderLayer_fast_forward( use_cache=use_cache, padding_mask=padding_mask, ) + DEBUG() hidden_states = residual + hidden_states + DEBUG() # Fully Connected residual = hidden_states hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states) + DEBUG() hidden_states = self.mlp(hidden_states) + DEBUG() hidden_states = residual + hidden_states + DEBUG() pass outputs = (hidden_states,) @@ -473,7 +490,8 @@ def LlamaModel_fast_forward( return_dict: Optional[bool] = None, *args, **kwargs, ) -> Union[Tuple, BaseModelOutputWithPast]: - + + DEBUG() output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions assert(output_attentions is False) output_hidden_states = ( @@ -508,6 +526,7 @@ def LlamaModel_fast_forward( inputs_embeds = inputs_embeds[:,:self.max_seq_length,:] 
pass pass + DEBUG() past_key_values_length = 0 @@ -515,6 +534,7 @@ def LlamaModel_fast_forward( past_key_values_length = past_key_values[0][0].shape[2] seq_length_with_past = seq_length_with_past + past_key_values_length pass + DEBUG() # We already handle KV cache position_ids ourselves. if False:#(past_key_values_length != 0): @@ -529,11 +549,13 @@ def LlamaModel_fast_forward( else: position_ids = None pass + DEBUG() if position_ids is not None: if position_ids.shape[0] != batch_size: position_ids = position_ids.repeat((batch_size, 1)) pass + DEBUG() # Embed positions if inputs_embeds is None: @@ -544,6 +566,7 @@ def LlamaModel_fast_forward( # Normalized from Gemma IS_GEMMA = self.config.model_type == "gemma" train_embed_tokens = self.embed_tokens.weight.requires_grad + DEBUG() if IS_GEMMA: # Match Gemma exactly by casting to bfloat16 / float16 @@ -568,6 +591,7 @@ def LlamaModel_fast_forward( if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass pass + DEBUG() # Fix up attention mask by setting elements to 0 # Specifically for DPO @@ -585,6 +609,7 @@ def LlamaModel_fast_forward( inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2) if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass + DEBUG() # Ignore attention_mask if attention_mask is None: @@ -606,6 +631,7 @@ def LlamaModel_fast_forward( sliding_window = getattr(self.config, "sliding_window", None), ) pass + DEBUG() hidden_states = inputs_embeds @@ -629,6 +655,7 @@ def LlamaModel_fast_forward( else: boundaries = None pass + DEBUG() # Check checkpointing method gradient_checkpointing = False @@ -641,6 +668,7 @@ def LlamaModel_fast_forward( if output_attentions is False and hasattr(self, "_offloaded_gradient_checkpointing"): offloaded_gradient_checkpointing = True pass + DEBUG() # Go through every layer! 
for idx, decoder_layer in enumerate(self.layers): @@ -648,6 +676,7 @@ def LlamaModel_fast_forward( if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None + DEBUG() if offloaded_gradient_checkpointing: hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply( decoder_layer, @@ -703,10 +732,12 @@ def custom_forward(*inputs): else: hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA) pass + DEBUG() if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None + DEBUG() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -801,6 +832,7 @@ def _CausalLM_fast_forward( attention_mask = attention_mask, ) else: + DEBUG() causal_mask = xformers.attn_bias.LowerTriangularMask() output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -812,6 +844,7 @@ def _CausalLM_fast_forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) self.model._has_no_labels = labels is None + DEBUG() outputs = self.model( input_ids=input_ids, causal_mask=causal_mask, @@ -826,6 +859,7 @@ def _CausalLM_fast_forward( ) pass + DEBUG() hidden_states = outputs[0] bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight @@ -837,6 +871,8 @@ def _CausalLM_fast_forward( pass logits = logits.to(self.config.torch_dtype) + DEBUG() + loss = None if labels is not None: shift_logits = logits @@ -851,6 +887,7 @@ def _CausalLM_fast_forward( labels = shift_labels, ) pass + DEBUG() if not return_dict: output = (logits,) + outputs[1:] @@ -881,6 +918,7 @@ def PeftModelForCausalLM_fast_forward( task_ids=None, **kwargs, ): + DEBUG() return self.base_model( input_ids=input_ids, causal_mask=causal_mask, From d8d332ac8332107ad89e41b79b106f1efd34d69a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:37:10 +1000 Subject: [PATCH 134/153] Update llama.py --- unsloth/models/llama.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0cbab0bc..0b0ce437 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -59,10 +59,9 @@ from ..save import patch_saving_functions import re, os, inspect, math, sys -from inspect import currentframe, getframeinfo +import sys def DEBUG(): - frameinfo = getframeinfo(currentframe()) - print(frameinfo.filename, frameinfo.lineno) + print(sys._getframe().f_back.f_lineno) pass def original_apply_qkv(self, X): From cdb1dbb5ff7627d35f727ba2a795be6856bd9e80 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:42:19 +1000 Subject: [PATCH 135/153] Update llama.py --- unsloth/models/llama.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0b0ce437..3097c5dd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -470,6 +470,7 @@ def LlamaDecoderLayer_fast_forward( outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) if use_cache: outputs += (present_key_value,) + DEBUG() return outputs pass @@ -687,6 +688,7 @@ def LlamaModel_fast_forward( output_attentions, use_cache, )[0] + DEBUG() elif gradient_checkpointing: def create_custom_forward(module): @@ -694,7 +696,7 @@ def custom_forward(*inputs): return module(*inputs, 
past_key_value, output_attentions, padding_mask = padding_mask) return custom_forward pass - + DEBUG() layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, @@ -705,7 +707,7 @@ def custom_forward(*inputs): preserve_rng_state = False, ) hidden_states = layer_outputs[0] - + DEBUG() else: layer_outputs = decoder_layer( hidden_states, @@ -718,6 +720,7 @@ def custom_forward(*inputs): padding_mask=padding_mask, ) hidden_states = layer_outputs[0] + DEBUG() pass if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) From e8b3cf01b1132a62eff2d0cbe88661fc42376c82 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:53:51 +1000 Subject: [PATCH 136/153] Update README.md --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a56dea5c..534079ed 100644 --- a/README.md +++ b/README.md @@ -100,14 +100,16 @@ model = FastLanguageModel.get_peft_model( ### Conda Installation Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. See this [Github issue](https://github.com/unslothai/unsloth/issues/73) for help on debugging Conda installs. ```bash -conda create --name unsloth_env python=3.10 +conda create --name unsloth_env \ + python=3.10 \ + pytorch-cuda=<11.8/12.1> \ + pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \ + -y conda activate unsloth_env -conda install pytorch-cuda=<12.1/11.8> pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers - pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps trl peft accelerate bitsandbytes +pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes ``` ### Pip Installation @@ -162,7 +164,7 @@ pip install --no-deps packaging ninja einops flash-attn xformers trl peft accele # Pre Ampere RTX 2080, T4, GTX 1080 GPUs: pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps xformers trl peft accelerate bitsandbytes +pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes ``` 7. For Pytorch 2.3.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. 
```bash From 7e6f000146df73d47b77387726739a5c8d55ca02 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:54:47 +1000 Subject: [PATCH 137/153] FastMistralModel --- unsloth/models/llama.py | 46 +++-------------------------------------- unsloth/models/qwen2.py | 6 +++--- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3097c5dd..4cbbcf0a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -59,10 +59,6 @@ from ..save import patch_saving_functions import re, os, inspect, math, sys -import sys -def DEBUG(): - print(sys._getframe().f_back.f_lineno) -pass def original_apply_qkv(self, X): Q = self.q_proj(X) @@ -293,7 +289,6 @@ def LlamaAttention_fast_forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # Clear inference - DEBUG() if hasattr(self, "paged_attention"): del self.paged_attention_K del self.paged_attention_V @@ -335,7 +330,6 @@ def LlamaAttention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None - DEBUG() # Attention module if (not HAS_FLASH_ATTENTION and attention_mask is None): @@ -344,7 +338,6 @@ def LlamaAttention_fast_forward( Q = Q.transpose(1, 2) K = K.transpose(1, 2) V = V.transpose(1, 2) - DEBUG() # Group query attention if n_groups != 1: @@ -360,7 +353,6 @@ def LlamaAttention_fast_forward( pass A = xformers_attention(Q, K, V, attn_bias = causal_mask) A = A.view(bsz, q_len, n_heads, head_dim) - DEBUG() elif HAS_FLASH_ATTENTION and attention_mask is None: Q = Q.transpose(1, 2) @@ -387,7 +379,6 @@ def LlamaAttention_fast_forward( attn_output = A.reshape(bsz, q_len, n_heads*head_dim) attn_output = self.apply_o(self, attn_output) attn_weights = None - DEBUG() return attn_output, attn_weights, past_key_value pass @@ -439,10 +430,8 @@ def LlamaDecoderLayer_fast_forward( hidden_states = fast_swiglu_inference(self.mlp, hidden_states) hidden_states += residual else: - DEBUG() residual = hidden_states hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states) - DEBUG() hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, causal_mask=causal_mask, @@ -453,24 +442,18 @@ def LlamaDecoderLayer_fast_forward( use_cache=use_cache, padding_mask=padding_mask, ) - DEBUG() hidden_states = residual + hidden_states - DEBUG() # Fully Connected residual = hidden_states hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states) - DEBUG() hidden_states = self.mlp(hidden_states) - DEBUG() hidden_states = residual + hidden_states - DEBUG() pass outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) if use_cache: outputs += (present_key_value,) - DEBUG() return outputs pass @@ -490,8 +473,7 @@ def LlamaModel_fast_forward( return_dict: Optional[bool] = None, *args, **kwargs, ) -> Union[Tuple, BaseModelOutputWithPast]: - - DEBUG() + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions assert(output_attentions is False) output_hidden_states = ( @@ -526,7 +508,6 @@ def LlamaModel_fast_forward( inputs_embeds = inputs_embeds[:,:self.max_seq_length,:] pass pass - DEBUG() past_key_values_length = 0 @@ -534,7 +515,6 @@ def LlamaModel_fast_forward( past_key_values_length = past_key_values[0][0].shape[2] seq_length_with_past = seq_length_with_past + past_key_values_length pass - DEBUG() # We already handle KV cache position_ids ourselves. 
if False:#(past_key_values_length != 0): @@ -549,13 +529,11 @@ def LlamaModel_fast_forward( else: position_ids = None pass - DEBUG() if position_ids is not None: if position_ids.shape[0] != batch_size: position_ids = position_ids.repeat((batch_size, 1)) pass - DEBUG() # Embed positions if inputs_embeds is None: @@ -566,7 +544,6 @@ def LlamaModel_fast_forward( # Normalized from Gemma IS_GEMMA = self.config.model_type == "gemma" train_embed_tokens = self.embed_tokens.weight.requires_grad - DEBUG() if IS_GEMMA: # Match Gemma exactly by casting to bfloat16 / float16 @@ -591,7 +568,6 @@ def LlamaModel_fast_forward( if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass pass - DEBUG() # Fix up attention mask by setting elements to 0 # Specifically for DPO @@ -609,7 +585,6 @@ def LlamaModel_fast_forward( inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2) if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass - DEBUG() # Ignore attention_mask if attention_mask is None: @@ -631,7 +606,6 @@ def LlamaModel_fast_forward( sliding_window = getattr(self.config, "sliding_window", None), ) pass - DEBUG() hidden_states = inputs_embeds @@ -655,7 +629,6 @@ def LlamaModel_fast_forward( else: boundaries = None pass - DEBUG() # Check checkpointing method gradient_checkpointing = False @@ -668,7 +641,6 @@ def LlamaModel_fast_forward( if output_attentions is False and hasattr(self, "_offloaded_gradient_checkpointing"): offloaded_gradient_checkpointing = True pass - DEBUG() # Go through every layer! for idx, decoder_layer in enumerate(self.layers): @@ -676,7 +648,6 @@ def LlamaModel_fast_forward( if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None - DEBUG() if offloaded_gradient_checkpointing: hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply( decoder_layer, @@ -688,7 +659,6 @@ def LlamaModel_fast_forward( output_attentions, use_cache, )[0] - DEBUG() elif gradient_checkpointing: def create_custom_forward(module): @@ -696,7 +666,7 @@ def custom_forward(*inputs): return module(*inputs, past_key_value, output_attentions, padding_mask = padding_mask) return custom_forward pass - DEBUG() + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, @@ -707,7 +677,7 @@ def custom_forward(*inputs): preserve_rng_state = False, ) hidden_states = layer_outputs[0] - DEBUG() + else: layer_outputs = decoder_layer( hidden_states, @@ -720,7 +690,6 @@ def custom_forward(*inputs): padding_mask=padding_mask, ) hidden_states = layer_outputs[0] - DEBUG() pass if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) @@ -734,12 +703,10 @@ def custom_forward(*inputs): else: hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA) pass - DEBUG() if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None - DEBUG() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -834,7 +801,6 @@ def _CausalLM_fast_forward( attention_mask = attention_mask, ) else: - DEBUG() causal_mask = xformers.attn_bias.LowerTriangularMask() output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -846,7 +812,6 @@ def _CausalLM_fast_forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, 
dec_attn) self.model._has_no_labels = labels is None - DEBUG() outputs = self.model( input_ids=input_ids, causal_mask=causal_mask, @@ -861,7 +826,6 @@ def _CausalLM_fast_forward( ) pass - DEBUG() hidden_states = outputs[0] bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight @@ -873,8 +837,6 @@ def _CausalLM_fast_forward( pass logits = logits.to(self.config.torch_dtype) - DEBUG() - loss = None if labels is not None: shift_logits = logits @@ -889,7 +851,6 @@ def _CausalLM_fast_forward( labels = shift_labels, ) pass - DEBUG() if not return_dict: output = (logits,) + outputs[1:] @@ -920,7 +881,6 @@ def PeftModelForCausalLM_fast_forward( task_ids=None, **kwargs, ): - DEBUG() return self.base_model( input_ids=input_ids, causal_mask=causal_mask, diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 2973bd02..47327280 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .llama import * +from .mistral import * from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -32,7 +32,7 @@ pass -class FastQwen2Model(FastLlamaModel): +class FastQwen2Model(FastMistralModel): @staticmethod def pre_patch(): @@ -70,7 +70,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastLlamaModel.from_pretrained( + return FastMistralModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From 28995abd7a1d131402078e40f389c7cbdf7d3728 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:05:24 +1000 Subject: [PATCH 138/153] Update mistral.py --- unsloth/models/mistral.py | 570 +++++++++++++++++++------------------- 1 file changed, 285 insertions(+), 285 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index ff2e909f..d0af320e 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -287,291 +287,291 @@ def pre_patch(): pass - @staticmethod - def from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, # Mistral does not support RoPE scaling - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, - trust_remote_code = False, - **kwargs, - ): - if token is None and "HF_TOKEN" in os.environ: - token = os.environ["HF_TOKEN"] - - if token is None and "HUGGINGFACE_TOKEN" in os.environ: - token = os.environ["HUGGINGFACE_TOKEN"] - - if model_patcher is None: model_patcher = FastMistralModel - # Mistral does NOT support RoPE Scaling! - if rope_scaling is not None: - logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") - pass - - SUPPORTS_BFLOAT16 = is_bfloat16_supported() - gpu_stats = torch.cuda.get_device_properties(0) - max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) - - statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ - f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ - f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ - f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. 
FA = {HAS_FLASH_ATTENTION}.\n"\ - f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' - print(statistics) - model_patcher.pre_patch() - # get_statistics() - - if dtype is None: - dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 - elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: - logger.warning_once("Device does not support bfloat16. Will change to float16.") - dtype = torch.float16 - - assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - - # Check max sequence length - model_config = AutoConfig.from_pretrained(model_name, token = token) - model_max_seq_length = model_config.max_position_embeddings - - # If max_seq_length is not specified, use maximum fron config - if max_seq_length is None: - max_seq_length = model_max_seq_length - pass - - # Mistral does NOT support RoPE Scaling sadly so we have to error out. - if max_seq_length > model_max_seq_length: - raise RuntimeError( - f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ - f"The maximum sequence length supported is {model_max_seq_length}.", - ) - pass - - bnb_config = None - if load_in_4bit: - bnb_config = BitsAndBytesConfig( - load_in_4bit = True, - bnb_4bit_use_double_quant = True, - bnb_4bit_quant_type = "nf4", - bnb_4bit_compute_dtype = dtype, - ) - - max_position_embeddings = max(max_seq_length, model_max_seq_length) - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - # rope_scaling = rope_scaling, - trust_remote_code = trust_remote_code, - **kwargs, - ) - - # Counteract saved tokenizers - tokenizer_name = model_name if tokenizer_name is None else tokenizer_name - tokenizer = load_correct_tokenizer( - tokenizer_name, - model_max_length = max_position_embeddings, - padding_side = "right", - token = token, - trust_remote_code = trust_remote_code, - ) - - model, tokenizer = patch_tokenizer(model, tokenizer) - model = model_patcher.post_patch(model) - - # Patch up QKV / O and MLP - for idx, layer in enumerate(model.model.layers): - layer.self_attn.apply_qkv = original_apply_qkv - layer.self_attn.apply_o = original_apply_o - pass - - # Patch Trainer - from transformers.trainer import Trainer - try: - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - Trainer._original_training_loop = inner_training_loop - else: - inner_training_loop = Trainer._original_training_loop - except: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - pass - - import transformers.trainer - items_in_trainer = dir(transformers.trainer) - good_items = [] - for item in items_in_trainer: - # TODO: Support Deepspeed - if item.startswith(("deepspeed", "xm", "met", "smp")): continue - if item in inner_training_loop: good_items.append(item) - pass - exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - end = inner_training_loop.find("\n\n", start) - original_debug = inner_training_loop[start:end] - spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - debug_info = """debug_info = \\ - f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ - f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning(debug_info) - import subprocess, re, gc - output = subprocess.check_output( - 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) - output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) - if output > 1: raise RuntimeError( - 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache()""" - - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - debug_info = """n_total_devices = total_train_batch_size // \\ - args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - debug_info =""" - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - "raise RuntimeError('Unsloth: TPUs are not yet supported!')" - ) - inner_training_loop = inner_training_loop.replace( - "self.accelerator.free_memory()", - "self.accelerator.free_memory()\n" + \ - front_spaces + "if self.is_deepspeed_enabled:"\ - "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - ) - - check_batches = """train_dataloader = self.get_train_dataloader() - ga = args.gradient_accumulation_steps - bsz = self._train_batch_size - total_batches = bsz * ga * args.world_size - n_total_devices = total_batches // ga // bsz - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - divisor = n_total_devices / 1 - bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 1: - divisor = n_total_devices / 1 - ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - check_batches = check_batches.split('\n') - check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = self.get_train_dataloader()", - check_batches, 1, - ) - inner_training_loop = inner_training_loop.replace( - "_inner_training_loop", - "_fast_inner_training_loop", 1, - ) - exec(inner_training_loop, globals()) - - Trainer._inner_training_loop = _fast_inner_training_loop - inner_training_loop = inner_training_loop.replace( - "is_torch_tpu_available()", - "False", - ) - if "n_total_devices >" not in inner_training_loop: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - pass - inner_training_loop = inner_training_loop.replace( - "is_sagemaker_mp_enabled()", - "False", - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save max_seq_length - max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) - model.max_seq_length = max_position_embeddings - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_position_embeddings - internal_model = internal_model.model - pass - internal_model.max_seq_length = max_position_embeddings - - # We check the tokenizer first for errors - if fix_tokenizer: - tokenizer = check_tokenizer( - model = model, - tokenizer = tokenizer, - model_name = model_name, - model_max_length = max_position_embeddings, - padding_side = "right", - token = token, - ) - pass - patch_saving_functions(tokenizer) - - # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37 - if False: - name = model.config._name_or_path - if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - name = name[:len(name) - len("-bnb-4bit")] - model.config.update({"_name_or_path" : name}) - pass + # @staticmethod + # def from_pretrained( + # model_name = "unsloth/mistral-7b-bnb-4bit", + # max_seq_length = None, + # dtype = None, + # load_in_4bit = True, + # token = None, + # device_map = "sequential", + # rope_scaling = None, # Mistral does not support RoPE scaling + # fix_tokenizer = True, + # model_patcher = None, + # tokenizer_name = None, + # trust_remote_code = False, + # **kwargs, + # ): + # if token is None and "HF_TOKEN" in os.environ: + # token = os.environ["HF_TOKEN"] + + # if token is None and "HUGGINGFACE_TOKEN" in os.environ: + # token = os.environ["HUGGINGFACE_TOKEN"] + + # if model_patcher is None: model_patcher = FastMistralModel + # # Mistral does NOT support RoPE Scaling! + # if rope_scaling is not None: + # logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") + # pass + + # SUPPORTS_BFLOAT16 = is_bfloat16_supported() + # gpu_stats = torch.cuda.get_device_properties(0) + # max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) + + # statistics = \ + # f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + # f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ + # f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ + # f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ + # f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' + # print(statistics) + # model_patcher.pre_patch() + # # get_statistics() + + # if dtype is None: + # dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 + # elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: + # logger.warning_once("Device does not support bfloat16. 
Will change to float16.") + # dtype = torch.float16 + + # assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) + + # # Check max sequence length + # model_config = AutoConfig.from_pretrained(model_name, token = token) + # model_max_seq_length = model_config.max_position_embeddings + + # # If max_seq_length is not specified, use maximum fron config + # if max_seq_length is None: + # max_seq_length = model_max_seq_length + # pass + + # # Mistral does NOT support RoPE Scaling sadly so we have to error out. + # if max_seq_length > model_max_seq_length: + # raise RuntimeError( + # f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ + # f"The maximum sequence length supported is {model_max_seq_length}.", + # ) + # pass + + # bnb_config = None + # if load_in_4bit: + # bnb_config = BitsAndBytesConfig( + # load_in_4bit = True, + # bnb_4bit_use_double_quant = True, + # bnb_4bit_quant_type = "nf4", + # bnb_4bit_compute_dtype = dtype, + # ) + + # max_position_embeddings = max(max_seq_length, model_max_seq_length) + # model = AutoModelForCausalLM.from_pretrained( + # model_name, + # device_map = device_map, + # torch_dtype = dtype, + # quantization_config = bnb_config, + # token = token, + # # rope_scaling = rope_scaling, + # trust_remote_code = trust_remote_code, + # **kwargs, + # ) + + # # Counteract saved tokenizers + # tokenizer_name = model_name if tokenizer_name is None else tokenizer_name + # tokenizer = load_correct_tokenizer( + # tokenizer_name, + # model_max_length = max_position_embeddings, + # padding_side = "right", + # token = token, + # trust_remote_code = trust_remote_code, + # ) + + # model, tokenizer = patch_tokenizer(model, tokenizer) + # model = model_patcher.post_patch(model) + + # # Patch up QKV / O and MLP + # for idx, layer in enumerate(model.model.layers): + # layer.self_attn.apply_qkv = original_apply_qkv + # layer.self_attn.apply_o = original_apply_o + # pass + + # # Patch Trainer + # from transformers.trainer import Trainer + # try: + # if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": + # inner_training_loop = inspect.getsource(Trainer._inner_training_loop) + # Trainer._original_training_loop = inner_training_loop + # else: + # inner_training_loop = Trainer._original_training_loop + # except: + # raise RuntimeError( + # "Our OSS was designed for people with few GPU resources to level the playing field.\n" + # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + # ) + # pass + + # import transformers.trainer + # items_in_trainer = dir(transformers.trainer) + # good_items = [] + # for item in items_in_trainer: + # # TODO: Support Deepspeed + # if item.startswith(("deepspeed", "xm", "met", "smp")): continue + # if item in inner_training_loop: good_items.append(item) + # pass + # exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) + + # start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] + # end = inner_training_loop.find("\n\n", start) + # original_debug = inner_training_loop[start:end] + # spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] + # front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) + + # debug_info = """debug_info = \\ + # f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ + # f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ + # f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ + # f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ + # f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' + # logger.warning(debug_info) + # import subprocess, re, gc + # output = subprocess.check_output( + # 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + # output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + # output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + # if output > 1: raise RuntimeError( + # 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') + # for _ in range(3): + # gc.collect() + # torch.cuda.empty_cache()""" + + # debug_info = debug_info.split('\n') + # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + # inner_training_loop = inner_training_loop.replace(original_debug, debug_info) + + # debug_info = """n_total_devices = total_train_batch_size // \\ + # args.gradient_accumulation_steps // self._train_batch_size + # if n_total_devices > 1: + # logger.warning_once( + # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + # "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + # ) + # debug_info =""" + # debug_info = debug_info.split('\n') + # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + # inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) + + # front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) + # inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) + # inner_training_loop = inner_training_loop.replace( + # "train_dataloader = tpu_spmd_dataloader(train_dataloader)", + # "raise RuntimeError('Unsloth: TPUs are not yet supported!')" + # ) + # inner_training_loop = inner_training_loop.replace( + # "self.accelerator.free_memory()", + # "self.accelerator.free_memory()\n" + \ + # front_spaces + "if self.is_deepspeed_enabled:"\ + # "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, + # ) + + # check_batches = """train_dataloader = self.get_train_dataloader() + # ga = args.gradient_accumulation_steps + # bsz = self._train_batch_size + # total_batches = bsz * ga * args.world_size + # n_total_devices = total_batches // ga // bsz + # if n_total_devices > 1: + # logger.warning_once( + # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + # "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + # ) + # divisor = n_total_devices / 1 + # bsz = self._train_batch_size = max(int(bsz / divisor), 1) + # if total_batches // ga // bsz > 1: + # divisor = n_total_devices / 1 + # ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" + # check_batches = check_batches.split('\n') + # check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) + # inner_training_loop = inner_training_loop.replace( + # "train_dataloader = self.get_train_dataloader()", + # check_batches, 1, + # ) + # inner_training_loop = inner_training_loop.replace( + # "_inner_training_loop", + # "_fast_inner_training_loop", 1, + # ) + # exec(inner_training_loop, globals()) + + # Trainer._inner_training_loop = _fast_inner_training_loop + # inner_training_loop = inner_training_loop.replace( + # "is_torch_tpu_available()", + # "False", + # ) + # if "n_total_devices >" not in inner_training_loop: + # raise RuntimeError( + # "Our OSS was designed for people with few GPU resources to level the playing field.\n" + # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + # ) + # pass + # inner_training_loop = inner_training_loop.replace( + # "is_sagemaker_mp_enabled()", + # "False", + # ) + # exec(inner_training_loop, globals()) + # Trainer._inner_training_loop = _fast_inner_training_loop + + # # Save max_seq_length + # max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) + # model.max_seq_length = max_position_embeddings + # internal_model = model + # while hasattr(internal_model, "model"): + # internal_model.max_seq_length = max_position_embeddings + # internal_model = internal_model.model + # pass + # internal_model.max_seq_length = max_position_embeddings + + # # We check the tokenizer first for errors + # if fix_tokenizer: + # tokenizer = check_tokenizer( + # model = model, + # tokenizer = tokenizer, + # model_name = model_name, + # model_max_length = max_position_embeddings, + # padding_side = "right", + # token = token, + # ) + # pass + # patch_saving_functions(tokenizer) + + # # Fix up config for transformers uploading PEFT + # # Not necessary anymore since we require transformers>=4.37 + # if False: + # name = model.config._name_or_path + # if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): + # name = name[:len(name) - len("-bnb-4bit")] + # model.config.update({"_name_or_path" : name}) + # pass - # Log Unsloth version for future fastpaths for inference - model.config.update({"unsloth_version" : __version__}) - - # Add save modules - patch_saving_functions(model) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save tokenizer for inference purposes - tokenizer.padding_side = "left" # Force inference - internal_model = model - while hasattr(internal_model, "model"): - internal_model._saved_temp_tokenizer = tokenizer - internal_model = internal_model.model - pass - internal_model._saved_temp_tokenizer = tokenizer + # # Log Unsloth version for future fastpaths for inference + # model.config.update({"unsloth_version" : __version__}) + + # # Add save modules + # patch_saving_functions(model) + # Trainer._inner_training_loop = _fast_inner_training_loop + + # # Save tokenizer for inference purposes + # tokenizer.padding_side = "left" # Force inference + # internal_model = model + # while hasattr(internal_model, "model"): + # internal_model._saved_temp_tokenizer = tokenizer + # internal_model = internal_model.model + # pass + # internal_model._saved_temp_tokenizer = tokenizer - return model, tokenizer - pass + # return model, tokenizer + # pass pass From 515b1ae45d031657bfc930b52e4ee63191f8c6c2 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:09:21 +1000 Subject: [PATCH 139/153] Update mistral.py --- unsloth/models/mistral.py | 596 ++++++++++++++++++++------------------ 1 file changed, 310 insertions(+), 286 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index d0af320e..1b89929d 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -287,291 +287,315 @@ def pre_patch(): pass - # @staticmethod - # def from_pretrained( - # model_name = "unsloth/mistral-7b-bnb-4bit", - # max_seq_length = None, - # dtype = None, - # load_in_4bit = True, - # token = None, - # device_map = "sequential", - # rope_scaling = None, # Mistral does not support RoPE scaling - # fix_tokenizer = True, - # model_patcher = None, - # tokenizer_name = None, - # trust_remote_code = False, - # **kwargs, - # ): - # if token is None and "HF_TOKEN" in os.environ: - # token = os.environ["HF_TOKEN"] - - # if token is None and "HUGGINGFACE_TOKEN" 
in os.environ: - # token = os.environ["HUGGINGFACE_TOKEN"] - - # if model_patcher is None: model_patcher = FastMistralModel - # # Mistral does NOT support RoPE Scaling! - # if rope_scaling is not None: - # logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") - # pass - - # SUPPORTS_BFLOAT16 = is_bfloat16_supported() - # gpu_stats = torch.cuda.get_device_properties(0) - # max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) - - # statistics = \ - # f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ - # f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ - # f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ - # f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ - # f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' - # print(statistics) - # model_patcher.pre_patch() - # # get_statistics() - - # if dtype is None: - # dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 - # elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: - # logger.warning_once("Device does not support bfloat16. Will change to float16.") - # dtype = torch.float16 - - # assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - - # # Check max sequence length - # model_config = AutoConfig.from_pretrained(model_name, token = token) - # model_max_seq_length = model_config.max_position_embeddings - - # # If max_seq_length is not specified, use maximum fron config - # if max_seq_length is None: - # max_seq_length = model_max_seq_length - # pass - - # # Mistral does NOT support RoPE Scaling sadly so we have to error out. 
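The dtype fallback that keeps reappearing in these from_pretrained variants can be isolated into a few lines. A minimal sketch, assuming only torch is installed and substituting torch.cuda.is_bf16_supported() for Unsloth's own is_bfloat16_supported helper (the helper name pick_compute_dtype is illustrative, not part of the codebase):

import torch

def pick_compute_dtype(requested = None):
    # Prefer bfloat16 on GPUs that support it, otherwise fall back to float16,
    # mirroring the dtype selection inside from_pretrained.
    supports_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    if requested is None:
        return torch.bfloat16 if supports_bf16 else torch.float16
    if requested == torch.bfloat16 and not supports_bf16:
        print("Device does not support bfloat16. Will change to float16.")
        return torch.float16
    # Anything other than fp16 / bf16 / fp32 is rejected early, as in the patch.
    assert requested in (torch.float16, torch.bfloat16, torch.float32)
    return requested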
- # if max_seq_length > model_max_seq_length: - # raise RuntimeError( - # f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ - # f"The maximum sequence length supported is {model_max_seq_length}.", - # ) - # pass - - # bnb_config = None - # if load_in_4bit: - # bnb_config = BitsAndBytesConfig( - # load_in_4bit = True, - # bnb_4bit_use_double_quant = True, - # bnb_4bit_quant_type = "nf4", - # bnb_4bit_compute_dtype = dtype, - # ) - - # max_position_embeddings = max(max_seq_length, model_max_seq_length) - # model = AutoModelForCausalLM.from_pretrained( - # model_name, - # device_map = device_map, - # torch_dtype = dtype, - # quantization_config = bnb_config, - # token = token, - # # rope_scaling = rope_scaling, - # trust_remote_code = trust_remote_code, - # **kwargs, - # ) - - # # Counteract saved tokenizers - # tokenizer_name = model_name if tokenizer_name is None else tokenizer_name - # tokenizer = load_correct_tokenizer( - # tokenizer_name, - # model_max_length = max_position_embeddings, - # padding_side = "right", - # token = token, - # trust_remote_code = trust_remote_code, - # ) - - # model, tokenizer = patch_tokenizer(model, tokenizer) - # model = model_patcher.post_patch(model) - - # # Patch up QKV / O and MLP - # for idx, layer in enumerate(model.model.layers): - # layer.self_attn.apply_qkv = original_apply_qkv - # layer.self_attn.apply_o = original_apply_o - # pass - - # # Patch Trainer - # from transformers.trainer import Trainer - # try: - # if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - # inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - # Trainer._original_training_loop = inner_training_loop - # else: - # inner_training_loop = Trainer._original_training_loop - # except: - # raise RuntimeError( - # "Our OSS was designed for people with few GPU resources to level the playing field.\n" - # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - # ) - # pass - - # import transformers.trainer - # items_in_trainer = dir(transformers.trainer) - # good_items = [] - # for item in items_in_trainer: - # # TODO: Support Deepspeed - # if item.startswith(("deepspeed", "xm", "met", "smp")): continue - # if item in inner_training_loop: good_items.append(item) - # pass - # exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - # start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - # end = inner_training_loop.find("\n\n", start) - # original_debug = inner_training_loop[start:end] - # spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - # front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - # debug_info = """debug_info = \\ - # f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - # f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - # f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - # f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ - # f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - # logger.warning(debug_info) - # import subprocess, re, gc - # output = subprocess.check_output( - # 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - # output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) - # output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) - # if output > 1: raise RuntimeError( - # 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') - # for _ in range(3): - # gc.collect() - # torch.cuda.empty_cache()""" - - # debug_info = debug_info.split('\n') - # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - # inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - # debug_info = """n_total_devices = total_train_batch_size // \\ - # args.gradient_accumulation_steps // self._train_batch_size - # if n_total_devices > 1: - # logger.warning_once( - # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - # "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - # ) - # debug_info =""" - # debug_info = debug_info.split('\n') - # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - # inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - # front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - # inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - # inner_training_loop = inner_training_loop.replace( - # "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - # "raise RuntimeError('Unsloth: TPUs are not yet supported!')" - # ) - # inner_training_loop = inner_training_loop.replace( - # "self.accelerator.free_memory()", - # "self.accelerator.free_memory()\n" + \ - # front_spaces + "if self.is_deepspeed_enabled:"\ - # "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - # ) - - # check_batches = """train_dataloader = self.get_train_dataloader() - # ga = args.gradient_accumulation_steps - # bsz = self._train_batch_size - # total_batches = bsz * ga * args.world_size - # n_total_devices = total_batches // ga // bsz - # if n_total_devices > 1: - # logger.warning_once( - # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - # "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - # ) - # divisor = n_total_devices / 1 - # bsz = self._train_batch_size = max(int(bsz / divisor), 1) - # if total_batches // ga // bsz > 1: - # divisor = n_total_devices / 1 - # ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - # check_batches = check_batches.split('\n') - # check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - # inner_training_loop = inner_training_loop.replace( - # "train_dataloader = self.get_train_dataloader()", - # check_batches, 1, - # ) - # inner_training_loop = inner_training_loop.replace( - # "_inner_training_loop", - # "_fast_inner_training_loop", 1, - # ) - # exec(inner_training_loop, globals()) - - # Trainer._inner_training_loop = _fast_inner_training_loop - # inner_training_loop = inner_training_loop.replace( - # "is_torch_tpu_available()", - # "False", - # ) - # if "n_total_devices >" not in inner_training_loop: - # raise RuntimeError( - # "Our OSS was designed for people with few GPU resources to level the playing field.\n" - # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - # ) - # pass - # inner_training_loop = inner_training_loop.replace( - # "is_sagemaker_mp_enabled()", - # "False", - # ) - # exec(inner_training_loop, globals()) - # Trainer._inner_training_loop = _fast_inner_training_loop - - # # Save max_seq_length - # max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) - # model.max_seq_length = max_position_embeddings - # internal_model = model - # while hasattr(internal_model, "model"): - # internal_model.max_seq_length = max_position_embeddings - # internal_model = internal_model.model - # pass - # internal_model.max_seq_length = max_position_embeddings - - # # We check the tokenizer first for errors - # if fix_tokenizer: - # tokenizer = check_tokenizer( - # model = model, - # tokenizer = tokenizer, - # model_name = model_name, - # model_max_length = max_position_embeddings, - # padding_side = "right", - # token = token, - # ) - # pass - # patch_saving_functions(tokenizer) - - # # Fix up config for transformers uploading PEFT - # # Not necessary anymore since we require transformers>=4.37 - # if False: - # name = model.config._name_or_path - # if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - # name = name[:len(name) - len("-bnb-4bit")] - # model.config.update({"_name_or_path" : name}) - # pass - - # # Log Unsloth version for future fastpaths for inference - # model.config.update({"unsloth_version" : __version__}) - - # # Add save modules - # patch_saving_functions(model) - # Trainer._inner_training_loop = _fast_inner_training_loop - - # # Save tokenizer for inference purposes - # tokenizer.padding_side = "left" # Force inference - # internal_model = model - # while hasattr(internal_model, "model"): - # internal_model._saved_temp_tokenizer = tokenizer - # internal_model = internal_model.model - # pass - # internal_model._saved_temp_tokenizer = tokenizer + @staticmethod + def from_pretrained( + model_name = "unsloth/llama-2-7b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, + trust_remote_code = False, + **kwargs, + ): + if token is None and "HF_TOKEN" in os.environ: + token = os.environ["HF_TOKEN"] + + if token is None and "HUGGINGFACE_TOKEN" in os.environ: + token = os.environ["HUGGINGFACE_TOKEN"] + + if model_patcher is None: model_patcher = FastLlamaModel + SUPPORTS_BFLOAT16 = is_bfloat16_supported() + gpu_stats = torch.cuda.get_device_properties(0) + max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) + + statistics = \ + f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ + f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ + f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ + f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' + print(statistics) + model_patcher.pre_patch() + # get_statistics() + + if dtype is None: + dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 + elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: + logger.warning_once("Device does not support bfloat16. 
Will change to float16.") + dtype = torch.float16 + + assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) + + # RoPE scaling + model_max_seq_length = \ + AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings + + # If max_seq_length is not specified, use maximum fron config + if max_seq_length is None: + max_seq_length = model_max_seq_length + pass + + if (rope_scaling is None) and (max_seq_length > model_max_seq_length): + rope_scaling = max_seq_length / model_max_seq_length + logger.warning_once( + f"Unsloth: {model_name} can only handle sequence lengths of at most "\ + f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ + f"{round(rope_scaling, 3)}, it can be magically be extended to "\ + f"{max_seq_length}!" + ) + rope_scaling = {"type": "linear", "factor": rope_scaling,} + pass + + bnb_config = None + if load_in_4bit: + bnb_config = BitsAndBytesConfig( + load_in_4bit = True, + bnb_4bit_use_double_quant = True, + bnb_4bit_quant_type = "nf4", + bnb_4bit_compute_dtype = dtype, + ) + pass + + # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 + # RoPE Scaling's max_position_embeddings must be updated + max_position_embeddings = max(max_seq_length, model_max_seq_length) + try: + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + rope_scaling = rope_scaling, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + except Exception as error: + if "rope_scaling" in str(error): + if rope_scaling is not None: + raise TypeError("Unsloth: {model_name} does not support rope_scaling.") + pass + + # Counteract missing rope_scaling + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + else: + raise error + pass + pass + + # Counteract saved tokenizers + tokenizer_name = model_name if tokenizer_name is None else tokenizer_name + tokenizer = load_correct_tokenizer( + tokenizer_name = tokenizer_name, + model_max_length = max_position_embeddings, + padding_side = "right", + token = token, + trust_remote_code = trust_remote_code, + ) + + model, tokenizer = patch_tokenizer(model, tokenizer) + model = model_patcher.post_patch(model) + + # Patch up QKV / O and MLP + for idx, layer in enumerate(model.model.layers): + layer.self_attn.apply_qkv = original_apply_qkv + layer.self_attn.apply_o = original_apply_o + pass + + # Patch Trainer + from transformers.trainer import Trainer + try: + if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": + inner_training_loop = inspect.getsource(Trainer._inner_training_loop) + Trainer._original_training_loop = inner_training_loop + else: + inner_training_loop = Trainer._original_training_loop + except: + raise RuntimeError( + "Our OSS was designed for people with few GPU resources to level the playing field.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + ) + pass + + import transformers.trainer + items_in_trainer = dir(transformers.trainer) + good_items = [] + for item in items_in_trainer: + # TODO: Support Deepspeed + if item.startswith(("deepspeed", "xm", "met", "smp")): continue + if item in inner_training_loop: good_items.append(item) + pass + exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) + + start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] + end = inner_training_loop.find("\n\n", start) + original_debug = inner_training_loop[start:end] + spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] + front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) + + debug_info = """debug_info = \\ + f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ + f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ + f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ + f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ + f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' + logger.warning(debug_info) + import subprocess, re, gc + output = subprocess.check_output( + 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + if output > 1: raise RuntimeError( + 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') + for _ in range(3): + gc.collect() + torch.cuda.empty_cache()""" + + debug_info = debug_info.split('\n') + debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + inner_training_loop = inner_training_loop.replace(original_debug, debug_info) + + debug_info = """n_total_devices = total_train_batch_size // \\ + args.gradient_accumulation_steps // self._train_batch_size + if n_total_devices > 1: + logger.warning_once( + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + ) + debug_info =""" + debug_info = debug_info.split('\n') + debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) + + front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) + inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) + inner_training_loop = inner_training_loop.replace( + "train_dataloader = tpu_spmd_dataloader(train_dataloader)", + "raise RuntimeError('Unsloth: TPUs are not yet supported!')" + ) + inner_training_loop = inner_training_loop.replace( + "self.accelerator.free_memory()", + "self.accelerator.free_memory()\n" + \ + front_spaces + "if self.is_deepspeed_enabled:"\ + "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, + ) + + check_batches = """train_dataloader = self.get_train_dataloader() + ga = args.gradient_accumulation_steps + bsz = self._train_batch_size + total_batches = bsz * ga * args.world_size + n_total_devices = total_batches // ga // bsz + if n_total_devices > 1: + logger.warning_once( + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + ) + divisor = n_total_devices / 1 + bsz = self._train_batch_size = max(int(bsz / divisor), 1) + if total_batches // ga // bsz > 1: + divisor = n_total_devices / 1 + ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" + check_batches = check_batches.split('\n') + check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) + inner_training_loop = inner_training_loop.replace( + "train_dataloader = self.get_train_dataloader()", + check_batches, 1, + ) + inner_training_loop = inner_training_loop.replace( + "_inner_training_loop", + "_fast_inner_training_loop", 1, + ) + exec(inner_training_loop, globals()) + + Trainer._inner_training_loop = _fast_inner_training_loop + inner_training_loop = inner_training_loop.replace( + "is_torch_tpu_available()", + "False", + ) + if "n_total_devices >" not in inner_training_loop: + raise RuntimeError( + "Our OSS was designed for people with few GPU resources to level the playing field.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + ) + pass + inner_training_loop = inner_training_loop.replace( + "is_sagemaker_mp_enabled()", + "False", + ) + exec(inner_training_loop, globals()) + Trainer._inner_training_loop = _fast_inner_training_loop + + # Save max_seq_length + model.max_seq_length = max_position_embeddings + internal_model = model + while hasattr(internal_model, "model"): + internal_model.max_seq_length = max_position_embeddings + internal_model = internal_model.model + pass + internal_model.max_seq_length = max_position_embeddings + + # We check the tokenizer first for errors + if fix_tokenizer: + tokenizer = check_tokenizer( + model = model, + tokenizer = tokenizer, + model_name = model_name, + model_max_length = max_position_embeddings, + padding_side = "right", + token = token, + ) + pass + patch_saving_functions(tokenizer) + + # Fix up config for transformers uploading PEFT + # Not necessary anymore since we require transformers>=4.37! + if False: + name = model.config._name_or_path + if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): + name = name[:len(name) - len("-bnb-4bit")] + model.config.update({"_name_or_path" : name}) + pass + pass + + # Log Unsloth version for future fastpaths for inference + model.config.update({"unsloth_version" : __version__}) + + # Add save modules + patch_saving_functions(model) + Trainer._inner_training_loop = _fast_inner_training_loop + + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference + internal_model = model + while hasattr(internal_model, "model"): + internal_model._saved_temp_tokenizer = tokenizer + internal_model = internal_model.model + pass + internal_model._saved_temp_tokenizer = tokenizer - # return model, tokenizer - # pass + return model, tokenizer + pass pass From 7f28209ac5263ad997806da01dde2159185526f2 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:29:18 +1000 Subject: [PATCH 140/153] Update mistral.py --- unsloth/models/mistral.py | 84 ++++++++++++++------------------------- 1 file changed, 30 insertions(+), 54 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 1b89929d..ff2e909f 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -289,13 +289,13 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/llama-2-7b-bnb-4bit", + model_name = "unsloth/mistral-7b-bnb-4bit", max_seq_length = None, dtype = None, load_in_4bit = True, token = None, device_map = "sequential", - rope_scaling = None, + rope_scaling = None, # Mistral does not support RoPE scaling fix_tokenizer = True, model_patcher = None, tokenizer_name = None, @@ -308,7 +308,12 @@ def from_pretrained( if token is None and "HUGGINGFACE_TOKEN" in os.environ: token = os.environ["HUGGINGFACE_TOKEN"] - if model_patcher is None: model_patcher = FastLlamaModel + if model_patcher is None: model_patcher = FastMistralModel + # Mistral does NOT support RoPE Scaling! 
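Most of the from_pretrained body reinstated above is spent rewriting Trainer._inner_training_loop: grab its source, edit it as text, exec the result, and bind it back. A stripped-down sketch of that source-patching technique, under the assumption that the target method has no closures and that the replacements do not rename the def itself (patch_method is a hypothetical helper, not Unsloth API):

import inspect
import textwrap

def patch_method(cls, method_name, replacements, new_name = None):
    # inspect.getsource returns the method still indented by the class body,
    # so dedent it before applying plain textual replacements.
    source = textwrap.dedent(inspect.getsource(getattr(cls, method_name)))
    for old, new in replacements.items():
        source = source.replace(old, new)
    # Re-exec the rewritten function against the class's own module globals so
    # every name it refers to still resolves, then bind it back on the class.
    namespace = {}
    exec(source, vars(inspect.getmodule(cls)), namespace)
    patched = namespace[method_name]
    setattr(cls, new_name or method_name, patched)
    return patched

Executing against the class's own module globals avoids re-importing every name the function uses; the block above instead pulls the needed names out of transformers.trainer explicitly, replaces is_torch_tpu_available() and is_sagemaker_mp_enabled() with the literal False, splices extra guards around the dataloader setup, and rebinds the result as _fast_inner_training_loop.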
+ if rope_scaling is not None: + logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") + pass + SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) @@ -331,24 +336,21 @@ def from_pretrained( assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - # RoPE scaling - model_max_seq_length = \ - AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings + # Check max sequence length + model_config = AutoConfig.from_pretrained(model_name, token = token) + model_max_seq_length = model_config.max_position_embeddings # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: max_seq_length = model_max_seq_length pass - if (rope_scaling is None) and (max_seq_length > model_max_seq_length): - rope_scaling = max_seq_length / model_max_seq_length - logger.warning_once( - f"Unsloth: {model_name} can only handle sequence lengths of at most "\ - f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ - f"{round(rope_scaling, 3)}, it can be magically be extended to "\ - f"{max_seq_length}!" + # Mistral does NOT support RoPE Scaling sadly so we have to error out. + if max_seq_length > model_max_seq_length: + raise RuntimeError( + f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ + f"The maximum sequence length supported is {model_max_seq_length}.", ) - rope_scaling = {"type": "linear", "factor": rope_scaling,} pass bnb_config = None @@ -359,49 +361,23 @@ def from_pretrained( bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = dtype, ) - pass - # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 - # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) - try: - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - rope_scaling = rope_scaling, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - except Exception as error: - if "rope_scaling" in str(error): - if rope_scaling is not None: - raise TypeError("Unsloth: {model_name} does not support rope_scaling.") - pass - - # Counteract missing rope_scaling - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - else: - raise error - pass - pass + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + # rope_scaling = rope_scaling, + trust_remote_code = trust_remote_code, + **kwargs, + ) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name tokenizer = load_correct_tokenizer( - tokenizer_name = tokenizer_name, + tokenizer_name, model_max_length = max_position_embeddings, padding_side = "right", token = token, @@ -549,6 +525,7 @@ def from_pretrained( Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length + max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) model.max_seq_length = max_position_embeddings 
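The 4-bit loading path in every variant of from_pretrained boils down to a single BitsAndBytesConfig. A self-contained sketch of that configuration (the checkpoint name is taken from the diff and is only an example; any causal LM on the Hub works):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# NF4 weights, double quantization of the quantization constants, and a
# half-precision compute dtype -- the same settings used in the patch.
bnb_config = BitsAndBytesConfig(
    load_in_4bit              = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type       = "nf4",
    bnb_4bit_compute_dtype    = torch.bfloat16,  # or torch.float16 on older GPUs
)

model = AutoModelForCausalLM.from_pretrained(
    "unsloth/mistral-7b-bnb-4bit",   # example checkpoint from the diff
    device_map          = "sequential",
    torch_dtype         = torch.bfloat16,
    quantization_config = bnb_config,
)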
internal_model = model while hasattr(internal_model, "model"): @@ -571,15 +548,14 @@ def from_pretrained( patch_saving_functions(tokenizer) # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37! + # Not necessary anymore since we require transformers>=4.37 if False: name = model.config._name_or_path if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): name = name[:len(name) - len("-bnb-4bit")] model.config.update({"_name_or_path" : name}) pass - pass - + # Log Unsloth version for future fastpaths for inference model.config.update({"unsloth_version" : __version__}) From 453cc48660967ddcf4e1eeb707d2f29e77dde7b0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:32:58 +1000 Subject: [PATCH 141/153] Update mistral.py --- unsloth/models/mistral.py | 84 +++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index ff2e909f..6bf3fc84 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -289,13 +289,13 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", + model_name = "unsloth/llama-2-7b-bnb-4bit", max_seq_length = None, dtype = None, load_in_4bit = True, token = None, device_map = "sequential", - rope_scaling = None, # Mistral does not support RoPE scaling + rope_scaling = None, fix_tokenizer = True, model_patcher = None, tokenizer_name = None, @@ -308,12 +308,7 @@ def from_pretrained( if token is None and "HUGGINGFACE_TOKEN" in os.environ: token = os.environ["HUGGINGFACE_TOKEN"] - if model_patcher is None: model_patcher = FastMistralModel - # Mistral does NOT support RoPE Scaling! - if rope_scaling is not None: - logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") - pass - + if model_patcher is None: model_patcher = FastLlamaModel SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) @@ -336,21 +331,24 @@ def from_pretrained( assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - # Check max sequence length - model_config = AutoConfig.from_pretrained(model_name, token = token) - model_max_seq_length = model_config.max_position_embeddings + # RoPE scaling + model_max_seq_length = \ + AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: max_seq_length = model_max_seq_length pass - # Mistral does NOT support RoPE Scaling sadly so we have to error out. - if max_seq_length > model_max_seq_length: - raise RuntimeError( - f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ - f"The maximum sequence length supported is {model_max_seq_length}.", + if (rope_scaling is None) and (max_seq_length > model_max_seq_length): + rope_scaling = max_seq_length / model_max_seq_length + logger.warning_once( + f"Unsloth: {model_name} can only handle sequence lengths of at most "\ + f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ + f"{round(rope_scaling, 3)}, it can be magically be extended to "\ + f"{max_seq_length}!" 
) + rope_scaling = {"type": "linear", "factor": rope_scaling,} pass bnb_config = None @@ -361,23 +359,49 @@ def from_pretrained( bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = dtype, ) + pass + # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 + # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - # rope_scaling = rope_scaling, - trust_remote_code = trust_remote_code, - **kwargs, - ) + try: + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + # rope_scaling = rope_scaling, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + except Exception as error: + if "rope_scaling" in str(error): + if rope_scaling is not None: + raise TypeError("Unsloth: {model_name} does not support rope_scaling.") + pass + + # Counteract missing rope_scaling + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + else: + raise error + pass + pass # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name tokenizer = load_correct_tokenizer( - tokenizer_name, + tokenizer_name = tokenizer_name, model_max_length = max_position_embeddings, padding_side = "right", token = token, @@ -525,7 +549,6 @@ def from_pretrained( Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length - max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) model.max_seq_length = max_position_embeddings internal_model = model while hasattr(internal_model, "model"): @@ -548,14 +571,15 @@ def from_pretrained( patch_saving_functions(tokenizer) # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37 + # Not necessary anymore since we require transformers>=4.37! 
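The RoPE-scaling decision that patches 139 through 142 keep moving between llama.py and mistral.py reduces to a ratio between the requested context length and the checkpoint's native max_position_embeddings. A standalone sketch under illustrative assumptions (the model name comes from the diff; the target length of 8192 is made up):

from transformers import AutoConfig, AutoModelForCausalLM

model_name     = "unsloth/llama-2-7b-bnb-4bit"   # example checkpoint from the diff
max_seq_length = 8192                            # assumed target context length

native_len = AutoConfig.from_pretrained(model_name).max_position_embeddings

rope_scaling = None
if max_seq_length > native_len:
    # Linear (kaiokendev-style) RoPE scaling: positions are stretched by a
    # constant factor so the longer context fits the trained position range.
    rope_scaling = {"type": "linear", "factor": max_seq_length / native_len}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    rope_scaling            = rope_scaling,
    # RoPE scaling's max_position_embeddings must be updated as well.
    max_position_embeddings = max(max_seq_length, native_len),
)

The try/except around from_pretrained in patch 141 appears to exist because some architectures reject an explicit rope_scaling keyword; patch 143 replaces that catch-and-retry with an upfront check of the modeling source instead.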
if False: name = model.config._name_or_path if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): name = name[:len(name) - len("-bnb-4bit")] model.config.update({"_name_or_path" : name}) pass - + pass + # Log Unsloth version for future fastpaths for inference model.config.update({"unsloth_version" : __version__}) From 6633d4a9bba8d94af0c6d1387a123edcd3aeb0b0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:38:46 +1000 Subject: [PATCH 142/153] Update mistral.py --- unsloth/models/mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 6bf3fc84..1b89929d 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -371,7 +371,7 @@ def from_pretrained( torch_dtype = dtype, quantization_config = bnb_config, token = token, - # rope_scaling = rope_scaling, + rope_scaling = rope_scaling, max_position_embeddings = max_position_embeddings, trust_remote_code = trust_remote_code, **kwargs, From e5bf125140975f709ca0d69dd5d2b5ca2e0d8e06 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 00:31:38 +1000 Subject: [PATCH 143/153] Auto check rope scaling --- unsloth/models/llama.py | 94 +++++------ unsloth/models/mistral.py | 323 +++----------------------------------- unsloth/models/qwen2.py | 46 +++--- 3 files changed, 91 insertions(+), 372 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4cbbcf0a..9e84c6ee 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -51,6 +51,7 @@ pass from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from transformers import set_seed as transformers_set_seed from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model from peft import PeftModelForCausalLM @@ -1028,16 +1029,16 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/llama-2-7b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, + model_name = "unsloth/llama-3-8b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, trust_remote_code = False, **kwargs, ): @@ -1070,9 +1071,17 @@ def from_pretrained( assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - # RoPE scaling - model_max_seq_length = \ - AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings + # RoPE Scaling + model_config = AutoConfig.from_pretrained(model_name, token = token) + model_max_seq_length = model_config.max_position_embeddings + + # Check if RoPE Scaling is even allowed + model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__] + has_rope_scaling = False + try: + with open(inspect.getfile(model_function), "r") as file: + has_rope_scaling = "self.config.rope_scaling" in file.read() + except: pass # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: @@ -1080,6 +1089,18 @@ def from_pretrained( pass if (rope_scaling is None) and (max_seq_length > model_max_seq_length): + + # Warn RoPE scaling isn't allowed + if not has_rope_scaling: + raise RuntimeError( + f"Unsloth: {model_name} 
can only handle sequence lengths of at most "\ + f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ + f"{round(rope_scaling, 3)}, it should be magically be extended to "\ + f"{max_seq_length}. However, {model_name} doesn't support RoPE Scaling!\n"\ + "Please file a feature request at https://github.com/unslothai/unsloth." + ) + pass + rope_scaling = max_seq_length / model_max_seq_length logger.warning_once( f"Unsloth: {model_name} can only handle sequence lengths of at most "\ @@ -1088,6 +1109,9 @@ def from_pretrained( f"{max_seq_length}!" ) rope_scaling = {"type": "linear", "factor": rope_scaling,} + + # Add to kwargs + kwargs["rope_scaling"] = rope_scaling pass bnb_config = None @@ -1103,39 +1127,16 @@ def from_pretrained( # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) - try: - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - rope_scaling = rope_scaling, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - except Exception as error: - if "rope_scaling" in str(error): - if rope_scaling is not None: - raise TypeError("Unsloth: {model_name} does not support rope_scaling.") - pass - - # Counteract missing rope_scaling - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - else: - raise error - pass - pass + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name @@ -1423,7 +1424,6 @@ def get_peft_model( if loftq_config is None: loftq_config = {} - import inspect signature = str(inspect.signature(LoraConfig)) SUPPORTS_LOFTQ = "loftq_config" in signature SUPPORTS_RSLORA = "use_rslora" in signature diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 1b89929d..291f0aa5 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -289,313 +289,32 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/llama-2-7b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, + model_name = "unsloth/mistral-7b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # Mistral does not support RoPE scaling + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, trust_remote_code = False, **kwargs, ): - if token is None and "HF_TOKEN" in os.environ: - token = os.environ["HF_TOKEN"] - - if token is None and "HUGGINGFACE_TOKEN" in os.environ: - token = os.environ["HUGGINGFACE_TOKEN"] - - if model_patcher is None: model_patcher = FastLlamaModel - SUPPORTS_BFLOAT16 = is_bfloat16_supported() - gpu_stats = 
torch.cuda.get_device_properties(0) - max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) - - statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ - f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ - f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ - f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ - f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' - print(statistics) - model_patcher.pre_patch() - # get_statistics() - - if dtype is None: - dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 - elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: - logger.warning_once("Device does not support bfloat16. Will change to float16.") - dtype = torch.float16 - - assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - - # RoPE scaling - model_max_seq_length = \ - AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings - - # If max_seq_length is not specified, use maximum fron config - if max_seq_length is None: - max_seq_length = model_max_seq_length - pass - - if (rope_scaling is None) and (max_seq_length > model_max_seq_length): - rope_scaling = max_seq_length / model_max_seq_length - logger.warning_once( - f"Unsloth: {model_name} can only handle sequence lengths of at most "\ - f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ - f"{round(rope_scaling, 3)}, it can be magically be extended to "\ - f"{max_seq_length}!" - ) - rope_scaling = {"type": "linear", "factor": rope_scaling,} - pass - - bnb_config = None - if load_in_4bit: - bnb_config = BitsAndBytesConfig( - load_in_4bit = True, - bnb_4bit_use_double_quant = True, - bnb_4bit_quant_type = "nf4", - bnb_4bit_compute_dtype = dtype, - ) - pass - - # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 - # RoPE Scaling's max_position_embeddings must be updated - max_position_embeddings = max(max_seq_length, model_max_seq_length) - try: - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - rope_scaling = rope_scaling, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - except Exception as error: - if "rope_scaling" in str(error): - if rope_scaling is not None: - raise TypeError("Unsloth: {model_name} does not support rope_scaling.") - pass - - # Counteract missing rope_scaling - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - else: - raise error - pass - pass - - # Counteract saved tokenizers - tokenizer_name = model_name if tokenizer_name is None else tokenizer_name - tokenizer = load_correct_tokenizer( - tokenizer_name = tokenizer_name, - model_max_length = max_position_embeddings, - padding_side = "right", + return FastLlamaModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + 
model_patcher = FastMistralModel, + tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, + **kwargs, ) - - model, tokenizer = patch_tokenizer(model, tokenizer) - model = model_patcher.post_patch(model) - - # Patch up QKV / O and MLP - for idx, layer in enumerate(model.model.layers): - layer.self_attn.apply_qkv = original_apply_qkv - layer.self_attn.apply_o = original_apply_o - pass - - # Patch Trainer - from transformers.trainer import Trainer - try: - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - Trainer._original_training_loop = inner_training_loop - else: - inner_training_loop = Trainer._original_training_loop - except: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - pass - - import transformers.trainer - items_in_trainer = dir(transformers.trainer) - good_items = [] - for item in items_in_trainer: - # TODO: Support Deepspeed - if item.startswith(("deepspeed", "xm", "met", "smp")): continue - if item in inner_training_loop: good_items.append(item) - pass - exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - end = inner_training_loop.find("\n\n", start) - original_debug = inner_training_loop[start:end] - spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - debug_info = """debug_info = \\ - f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ - f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning(debug_info) - import subprocess, re, gc - output = subprocess.check_output( - 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) - output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) - if output > 1: raise RuntimeError( - 'Error: More than 1 GPUs have a lot of VRAM usage. 
Please obtain a commercial license.') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache()""" - - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - debug_info = """n_total_devices = total_train_batch_size // \\ - args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - debug_info =""" - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - "raise RuntimeError('Unsloth: TPUs are not yet supported!')" - ) - inner_training_loop = inner_training_loop.replace( - "self.accelerator.free_memory()", - "self.accelerator.free_memory()\n" + \ - front_spaces + "if self.is_deepspeed_enabled:"\ - "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - ) - - check_batches = """train_dataloader = self.get_train_dataloader() - ga = args.gradient_accumulation_steps - bsz = self._train_batch_size - total_batches = bsz * ga * args.world_size - n_total_devices = total_batches // ga // bsz - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - divisor = n_total_devices / 1 - bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 1: - divisor = n_total_devices / 1 - ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - check_batches = check_batches.split('\n') - check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = self.get_train_dataloader()", - check_batches, 1, - ) - inner_training_loop = inner_training_loop.replace( - "_inner_training_loop", - "_fast_inner_training_loop", 1, - ) - exec(inner_training_loop, globals()) - - Trainer._inner_training_loop = _fast_inner_training_loop - inner_training_loop = inner_training_loop.replace( - "is_torch_tpu_available()", - "False", - ) - if "n_total_devices >" not in inner_training_loop: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - pass - inner_training_loop = inner_training_loop.replace( - "is_sagemaker_mp_enabled()", - "False", - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save max_seq_length - model.max_seq_length = max_position_embeddings - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_position_embeddings - internal_model = internal_model.model - pass - internal_model.max_seq_length = max_position_embeddings - - # We check the tokenizer first for errors - if fix_tokenizer: - tokenizer = check_tokenizer( - model = model, - tokenizer = tokenizer, - model_name = model_name, - model_max_length = max_position_embeddings, - padding_side = "right", - token = token, - ) - pass - patch_saving_functions(tokenizer) - - # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37! - if False: - name = model.config._name_or_path - if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - name = name[:len(name) - len("-bnb-4bit")] - model.config.update({"_name_or_path" : name}) - pass - pass - - # Log Unsloth version for future fastpaths for inference - model.config.update({"unsloth_version" : __version__}) - - # Add save modules - patch_saving_functions(model) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save tokenizer for inference purposes - tokenizer.padding_side = "left" # Force inference - internal_model = model - while hasattr(internal_model, "model"): - internal_model._saved_temp_tokenizer = tokenizer - internal_model = internal_model.model - pass - internal_model._saved_temp_tokenizer = tokenizer - - return model, tokenizer pass pass diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 47327280..984bf7ca 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .mistral import * +from .llama import * from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -32,7 +32,7 @@ pass -class FastQwen2Model(FastMistralModel): +class FastQwen2Model(FastLlamaModel): @staticmethod def pre_patch(): @@ -57,30 +57,30 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "Qwen/Qwen2-7B", - max_seq_length = 4096, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, # Qwen2 does not support RoPE scaling - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, + model_name = "Qwen/Qwen2-7B", + max_seq_length = 4096, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # Qwen2 does not support RoPE scaling + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( - model_name = model_name, - max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, - token = token, - device_map = device_map, - rope_scaling = rope_scaling, - fix_tokenizer = fix_tokenizer, - model_patcher = FastQwen2Model, - tokenizer_name = tokenizer_name, + return FastLlamaModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + model_patcher = FastQwen2Model, + tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, **kwargs, ) From 341565bba38753031369890bdfb561f1c6017217 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 00:35:24 +1000 Subject: [PATCH 144/153] Update llama.py --- unsloth/models/llama.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9e84c6ee..7cbdcfbd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1090,24 +1090,23 @@ def from_pretrained( if (rope_scaling is None) and (max_seq_length > model_max_seq_length): - # Warn RoPE scaling isn't allowed - if not has_rope_scaling: - raise RuntimeError( - f"Unsloth: {model_name} can only handle sequence lengths of at most "\ - f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ - f"{round(rope_scaling, 3)}, it should be magically be extended to "\ - f"{max_seq_length}. However, {model_name} doesn't support RoPE Scaling!\n"\ - "Please file a feature request at https://github.com/unslothai/unsloth." - ) - pass - rope_scaling = max_seq_length / model_max_seq_length + logger.warning_once( f"Unsloth: {model_name} can only handle sequence lengths of at most "\ f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ f"{round(rope_scaling, 3)}, it can be magically be extended to "\ f"{max_seq_length}!" ) + + # Warn RoPE scaling isn't allowed + if not has_rope_scaling: + raise RuntimeError( + "However, {model_name} doesn't support RoPE Scaling!\n"\ + "Please file a feature request at https://github.com/unslothai/unsloth." 
+ ) + pass + rope_scaling = {"type": "linear", "factor": rope_scaling,} # Add to kwargs From dd3c6b1d39dc253c50b71dcd133479c512a1cf35 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 17:44:23 +1000 Subject: [PATCH 145/153] Update llama.py --- unsloth/models/llama.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7cbdcfbd..6702aefb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -51,7 +51,6 @@ pass from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from transformers import set_seed as transformers_set_seed from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model from peft import PeftModelForCausalLM @@ -1076,12 +1075,7 @@ def from_pretrained( model_max_seq_length = model_config.max_position_embeddings # Check if RoPE Scaling is even allowed - model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__] - has_rope_scaling = False - try: - with open(inspect.getfile(model_function), "r") as file: - has_rope_scaling = "self.config.rope_scaling" in file.read() - except: pass + has_rope_scaling = hasattr(model_config, "rope_scaling") # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: From 6d1ae234a4cf95f3dca9fb17967c4ba04a0e3408 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 17:46:45 +1000 Subject: [PATCH 146/153] Update llama.py --- unsloth/models/llama.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6702aefb..7cbdcfbd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -51,6 +51,7 @@ pass from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from transformers import set_seed as transformers_set_seed from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model from peft import PeftModelForCausalLM @@ -1075,7 +1076,12 @@ def from_pretrained( model_max_seq_length = model_config.max_position_embeddings # Check if RoPE Scaling is even allowed - has_rope_scaling = hasattr(model_config, "rope_scaling") + model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__] + has_rope_scaling = False + try: + with open(inspect.getfile(model_function), "r") as file: + has_rope_scaling = "self.config.rope_scaling" in file.read() + except: pass # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: From d855ef9f620b4d1e2c4ce3aa405185f68e59fe19 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 18:14:21 +1000 Subject: [PATCH 147/153] GPU support --- unsloth/models/_utils.py | 7 +++++-- unsloth/models/gemma.py | 11 ++++++++--- unsloth/models/llama.py | 27 +++++++++++++++++---------- unsloth/models/mistral.py | 4 +++- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a6933893..09d59c0a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -372,6 +372,10 @@ def prepare_n_gradient_checkpoints( pass +# Unsloth only works on NVIDIA GPUs for now +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device = f"cuda:{device_ids[:device_ids.find(',')]}" + class 
Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): """ Saves VRAM by smartly offloading to RAM. @@ -393,7 +397,7 @@ def forward(ctx, forward_function, hidden_states, *args): @torch.cuda.amp.custom_bwd def backward(ctx, dY): (hidden_states,) = ctx.saved_tensors - hidden_states = hidden_states.to("cuda", non_blocking = True).detach() + hidden_states = hidden_states.to(device, non_blocking = True).detach() hidden_states.requires_grad = True with torch.enable_grad(): (output,) = ctx.forward_function(hidden_states, *ctx.args) @@ -457,7 +461,6 @@ def _prepare_backend( # Offloading to disk for modules (lm_head, embed_tokens) -import os import pickle def offload_to_disk(W, model, name, temporary_location : str = "_unsloth_temporary_saved_buffers"): diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 5dd2a5ab..02355a42 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -38,6 +38,11 @@ GemmaFlashAttention2 = GemmaAttention pass +# Unsloth currently only works on one GPU +import os +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device = f"cuda:{device_ids[:device_ids.find(',')]}" +# Please obtain a commercial license torch_nn_functional_gelu = torch.nn.functional.gelu def fast_geglu_inference(self, X): @@ -45,7 +50,7 @@ def fast_geglu_inference(self, X): # up = self.up_proj(X) bsz, _, hd = X.shape # mlp_size = self.config.intermediate_size - # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda") + # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = device) gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0]) up = fast_linear_forward(self. up_proj, X)#, out = temp[1]) @@ -72,7 +77,7 @@ def GemmaDecoderLayer_fast_forward( *args, **kwargs, ): if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None: - out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = "cuda") + out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = device) # Self Attention residual = hidden_states @@ -134,7 +139,7 @@ def GemmaModel_fast_forward_inference( position_ids, attention_mask = None, ): - out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = "cuda") + out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = device) input_ids = input_ids[:,:self.max_seq_length] hidden_states = self.model.embed_tokens(input_ids) hidden_states = hidden_states.to(self.config.torch_dtype) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7cbdcfbd..274e6bfd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -74,6 +74,9 @@ def original_apply_o(self, X): return O pass +import os # Unsloth only works on NVIDIA GPUs for now +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device = f"cuda:{device_ids[:device_ids.find(',')]}" from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size @@ -132,15 +135,15 @@ def LlamaAttention_fast_forward_inference( # Prefill phase # if not hasattr(self, "paged_attention"): if do_prefill: - self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda") + self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = device) self.paged_attention_K = self.paged_attention[:,0] self.paged_attention_V = 
self.paged_attention[:,1] self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3) self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3) - self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda") - self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda") - self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda") - self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda") + self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = device) + self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = device) + self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = device) + self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = device) self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 elif kv_seq_len >= self.paged_attention.shape[0]: @@ -170,7 +173,7 @@ def LlamaAttention_fast_forward_inference( Qn *= cos Qn.addcmul_(RH_Q, sin) - RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda") + RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = device) RH_K[:,:,:,:h] = Kn[:,:,:,h:] RH_K[:,:,:,h:] = Kn[:,:,:,:h] torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h]) @@ -232,7 +235,7 @@ def fast_swiglu_inference(self, X): # up = self.up_proj(X) bsz, _, hd = X.shape # mlp_size = self.config.intermediate_size - # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda") + # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = device) gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0]) up = fast_linear_forward(self. 
up_proj, X)#, out = temp[1]) @@ -522,7 +525,7 @@ def LlamaModel_fast_forward( position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype = torch.int32, - device = "cuda", + device = device, ) position_ids = position_ids.unsqueeze(0).view(-1, seq_length) elif position_ids is not None: @@ -842,8 +845,10 @@ def _CausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 - self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda") + self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]])) @@ -1822,7 +1827,9 @@ def patch_peft_model( # Patch cross entropy loss labels # Fixes https://github.com/unslothai/unsloth/issues/10 max_seq_length = model.max_seq_length - extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = device) model.model.extra_ignored_labels = extra_ignored_labels internal_model = model while hasattr(internal_model, "model"): diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 291f0aa5..d41de54d 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -239,8 +239,10 @@ def MistralForCausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 - self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda") + self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]])) From 66564461513dee04e897f83dbc3dd16c1cd82550 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 18:21:33 +1000 Subject: [PATCH 148/153] Typo --- unsloth/models/_utils.py | 2 +- unsloth/models/gemma.py | 2 +- unsloth/models/llama.py | 6 +++--- unsloth/models/mistral.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 09d59c0a..1b122fc8 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -373,7 +373,7 @@ def prepare_n_gradient_checkpoints( # Unsloth only works on NVIDIA GPUs for now -device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" class Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 02355a42..98502836 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -40,7 +40,7 @@ # Unsloth currently only works on one GPU import os -device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") 
+ "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Please obtain a commercial license diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 274e6bfd..9327b1bb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -75,7 +75,7 @@ def original_apply_o(self, X): pass import os # Unsloth only works on NVIDIA GPUs for now -device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" from math import sqrt as math_sqrt @@ -845,7 +845,7 @@ def _CausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): - device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) @@ -1827,7 +1827,7 @@ def patch_peft_model( # Patch cross entropy loss labels # Fixes https://github.com/unslothai/unsloth/issues/10 max_seq_length = model.max_seq_length - device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = device) model.model.extra_ignored_labels = extra_ignored_labels diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index d41de54d..e147f215 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -239,7 +239,7 @@ def MistralForCausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): - device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) From 9bd5fad07d5961069b895eca530e09913d15812d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 22:04:08 +1000 Subject: [PATCH 149/153] Update gemma.py --- unsloth/models/gemma.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 98502836..0cc047d2 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -38,11 +38,9 @@ GemmaFlashAttention2 = GemmaAttention pass -# Unsloth currently only works on one GPU import os device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," -device = f"cuda:{device_ids[:device_ids.find(',')]}" -# Please obtain a commercial license +device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now torch_nn_functional_gelu = torch.nn.functional.gelu def fast_geglu_inference(self, X): From a3061b624baebf6fa99eeff1417f0595b8085883 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 22:10:05 +1000 Subject: [PATCH 150/153] gpu --- unsloth/models/_utils.py | 3 ++- unsloth/models/llama.py | 9 ++++++--- unsloth/models/mistral.py | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 1b122fc8..49b8ba39 100644 
--- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -374,7 +374,8 @@ def prepare_n_gradient_checkpoints( # Unsloth only works on NVIDIA GPUs for now device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," -device = f"cuda:{device_ids[:device_ids.find(',')]}" +device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now +device = f"cuda:{device if device.isdigit() else '0'}" class Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): """ diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9327b1bb..f2f79de8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -76,7 +76,8 @@ def original_apply_o(self, X): import os # Unsloth only works on NVIDIA GPUs for now device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," -device = f"cuda:{device_ids[:device_ids.find(',')]}" +device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now +device = f"cuda:{device if device.isdigit() else '0'}" from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size @@ -846,7 +847,8 @@ def _CausalLM_fast_forward( shift_logits = logits if not hasattr(self, "extra_ignored_labels"): device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," - device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now + device = f"cuda:{device if device.isdigit() else '0'}" # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass @@ -1828,7 +1830,8 @@ def patch_peft_model( # Fixes https://github.com/unslothai/unsloth/issues/10 max_seq_length = model.max_seq_length device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," - device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now + device = f"cuda:{device if device.isdigit() else '0'}" extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = device) model.model.extra_ignored_labels = extra_ignored_labels internal_model = model diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index e147f215..832189be 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -240,7 +240,8 @@ def MistralForCausalLM_fast_forward( shift_logits = logits if not hasattr(self, "extra_ignored_labels"): device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," - device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now + device = f"cuda:{device if device.isdigit() else '0'}" # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass From 513bd4d28981893327f0c0ff49af0b529522994a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 16 Jun 2024 00:06:05 +1000 Subject: [PATCH 151/153] Multiple GGUF saving --- unsloth/save.py | 224 +++++++++++++++++++++++++++++------------------- 1 file changed, 134 insertions(+), 90 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index cae59cae..17d6962f 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -418,6 +418,11 @@ def unsloth_save_model( print("Unsloth: Saving model...", end = "") if save_method != 
"lora": print(" This might take 10 minutes for Llama-7b...", end = "") + # [TODO] Is this correct? + if save_method == "lora": + save_pretrained_settings["selected_adapters"] = None + pass + model.save_pretrained(**save_pretrained_settings) if push_to_hub and hasattr(model, "config"): @@ -649,8 +654,9 @@ def unsloth_save_model( model.config = new_config # Save! - + # [TODO] --> is this correct? save_pretrained_settings["selected_adapters"] = None + # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username): print(f"Unsloth: Saving to organization with address {new_save_directory}") @@ -834,7 +840,7 @@ def save_to_gguf( model_dtype : str, is_sentencepiece : bool = False, model_directory : str = "unsloth_finetuned_model", - quantization_method : str = "fast_quantized", + quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"] first_conversion : str = None, _run_installer = None, # Non blocking install of llama.cpp ): @@ -846,6 +852,10 @@ def save_to_gguf( assert(model_dtype == "float16" or model_dtype == "bfloat16") model_dtype = "f16" if model_dtype == "float16" else "bf16" + # Convert quantization_method to list + quantization_method = \ + quantization_method if type(quantization_method) is list else list(quantization_method) + # Check if bfloat16 is supported if model_dtype == "bf16" and not torch.cuda.is_bf16_supported(): logger.warning( @@ -860,8 +870,11 @@ def save_to_gguf( first_conversion = model_dtype pass - if quantization_method.startswith("iq2"): - raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") + # Check I quants + for quant_method in quantization_method: + if quant_method.startswith("iq2"): + raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") + pass # Careful convert.py is only for Llama / Mistral based archs use_fast_convert = False @@ -871,25 +884,32 @@ def save_to_gguf( pass logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.") - if quantization_method == "not_quantized": quantization_method = model_dtype - elif quantization_method == "fast_quantized": quantization_method = "q8_0" - elif quantization_method == "quantized": quantization_method = "q4_k_m" - elif quantization_method is None: quantization_method = "q8_0" - pass + # Map quant methods + new_quantization_method = [] + for quant_method in quantization_method: + if quant_method == "not_quantized": quantization_method = model_dtype + elif quant_method == "fast_quantized": quantization_method = "q8_0" + elif quant_method == "quantized": quantization_method = "q4_k_m" + elif quant_method is None: quantization_method = "q8_0" + + # Check if wrong method + if quantization_method not in ALLOWED_QUANTS.keys(): + error = f"Unsloth: Quant method = [{quantization_method}] not supported. Choose from below:\n" + for key, value in ALLOWED_QUANTS.items(): + error += f"[{key}] => {value}\n" + raise RuntimeError(error) + pass - if quantization_method not in ALLOWED_QUANTS.keys(): - error = f"Unsloth: Quant method = [{quantization_method}] not supported. 
Choose from below:\n" - for key, value in ALLOWED_QUANTS.items(): - error += f"[{key}] => {value}\n" - raise RuntimeError(error) + new_quantization_method.append(quant_method) pass + quantization_method = new_quantization_method print_info = \ f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\ f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\ f"O^O/ \_/ \\ [1] Converting HF to GUUF 16bits will take 3 minutes.\n"\ - f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 20 minutes.\n"\ - f' "-____-" In total, you will have to wait around 26 minutes.\n' + f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 10 minutes each.\n"\ + f' "-____-" In total, you will have to wait at least 16 minutes.\n' print(print_info) # Check first_conversion format @@ -928,24 +948,37 @@ def save_to_gguf( install_llama_cpp_old(-10) pass - if quantization_method == "f32": first_conversion = "f32" - elif quantization_method == "f16": first_conversion = "f16" - elif quantization_method == "bf16": first_conversion = "bf16" - elif quantization_method == "q8_0": first_conversion = "q8_0" - else: - # Quantized models must have f16 as the default argument - if first_conversion == "f32" : pass - elif first_conversion == "f16" : pass - elif first_conversion == "bf16" : pass - elif first_conversion == "q8_0": - logger.warning_once( - "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\ - "but saves disk space!" - ) - # first_conversion = "f16" + # Determine maximum first_conversion state + if first_conversion == "f32" : strength = 3 + elif first_conversion == "f16" : strength = 2 + elif first_conversion == "bf16" : strength = 1 + elif first_conversion == "q8_0" : strength = 0 + + for quant_method in quantization_method: + if quant_method == "f32": strength = max(strength, 3) + elif quant_method == "f16": strength = max(strength, 2) + elif quant_method == "bf16": strength = max(strength, 1) + elif quant_method == "q8_0": strength = max(strength, 0) + else: + # Quantized models must have f16 as the default argument + if first_conversion == "f32" : pass + elif first_conversion == "f16" : pass + elif first_conversion == "bf16" : pass + elif first_conversion == "q8_0": + logger.warning_once( + "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\ + "but saves disk space!" + ) + # first_conversion = "f16" + pass pass pass + if strength >= 3: first_conversion = "f32" + elif strength >= 2: first_conversion = "f16" + elif strength >= 1: first_conversion = "bf16" + else: first_conversion = "q8_0" + # Non llama/mistral needs can only use f32 or f16 if not use_fast_convert and \ (first_conversion != "f16" or first_conversion != "bf16" or first_conversion != "f32"): @@ -1033,52 +1066,58 @@ def save_to_gguf( pass print(f"Unsloth: Conversion completed! Output location: {final_location}") - if quantization_method != first_conversion: - old_location = final_location - print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. 
This will take 20 minutes...") - final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf" + full_precision_location = final_location - command = f"./{quantize_location} {old_location} "\ - f"{final_location} {quantization_method} {n_cpus}" - - # quantize uses stderr - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: - for line in sp.stdout: - line = line.decode("utf-8", errors = "replace") - if "undefined reference" in line: - raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!") - print(line, flush = True, end = "") - if sp.returncode is not None and sp.returncode != 0: - raise subprocess.CalledProcessError(sp.returncode, sp.args) - pass + all_saved_locations = [] + # Convert each type! + for quant_method in quantization_method: + if quant_method != first_conversion: + print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This will take 20 minutes...") + final_location = f"./{model_directory}-unsloth.{quant_method.upper()}.gguf" - # Check if quantization succeeded! - if not os.path.isfile(final_location): - if IS_KAGGLE_ENVIRONMENT: - raise RuntimeError( - f"Unsloth: Quantization failed for {final_location}\n"\ - "You are in a Kaggle environment, which might be the reason this is failing.\n"\ - "Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\ - "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\ - "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\ - "I suggest you to save the 16bit model first, then use manual llama.cpp conversion." - ) - else: - raise RuntimeError( - "Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\ - "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ - "You must run this in the same folder as you're saving your model.\n"\ - "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ - "cd llama.cpp && make clean && make all -j\n"\ - "Once that's done, redo the quantization." - ) + command = f"./{quantize_location} {full_precision_location} "\ + f"{final_location} {quant_method} {n_cpus}" + + # quantize uses stderr + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: + for line in sp.stdout: + line = line.decode("utf-8", errors = "replace") + if "undefined reference" in line: + raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!") + print(line, flush = True, end = "") + if sp.returncode is not None and sp.returncode != 0: + raise subprocess.CalledProcessError(sp.returncode, sp.args) pass - pass - print(f"Unsloth: Conversion completed! Output location: {final_location}") + # Check if quantization succeeded! + if not os.path.isfile(final_location): + if IS_KAGGLE_ENVIRONMENT: + raise RuntimeError( + f"Unsloth: Quantization failed for {final_location}\n"\ + "You are in a Kaggle environment, which might be the reason this is failing.\n"\ + "Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\ + "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\ + "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\ + "I suggest you to save the 16bit model first, then use manual llama.cpp conversion." + ) + else: + raise RuntimeError( + "Unsloth: Quantization failed! 
You might have to compile llama.cpp yourself, then run this again.\n"\ + "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ + "You must run this in the same folder as you're saving your model.\n"\ + "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ + "cd llama.cpp && make clean && make all -j\n"\ + "Once that's done, redo the quantization." + ) + pass + pass + + print(f"Unsloth: Conversion completed! Output location: {final_location}") + all_saved_locations.append(final_location) + pass pass - return final_location + return all_saved_locations pass @@ -1453,7 +1492,7 @@ def unsloth_save_pretrained_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, + all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) @@ -1466,14 +1505,17 @@ def unsloth_save_pretrained_gguf( if push_to_hub: print("Unsloth: Uploading GGUF to Huggingface Hub...") - username = upload_to_huggingface( - self, save_directory, token, - "GGUF converted", "gguf", file_location, old_username, private, - ) - link = f"{username}/{new_save_directory.lstrip('/.')}" \ - if username not in new_save_directory else \ - new_save_directory.lstrip('/.') - print(f"Saved GGUF to https://huggingface.co/{link}") + + for file_location in all_file_locations: + username = upload_to_huggingface( + self, save_directory, token, + "GGUF converted", "gguf", file_location, old_username, private, + ) + link = f"{username}/{new_save_directory.lstrip('/.')}" \ + if username not in new_save_directory else \ + new_save_directory.lstrip('/.') + print(f"Saved GGUF to https://huggingface.co/{link}") + pass pass pass @@ -1604,20 +1646,22 @@ def unsloth_push_to_hub_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, + all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) - print("Unsloth: Uploading GGUF to Huggingface Hub...") - username = upload_to_huggingface( - self, repo_id, token, - "GGUF converted", "gguf", file_location, old_username, private, - ) - link = f"{username}/{new_save_directory.lstrip('/.')}" \ - if username not in new_save_directory else \ - new_save_directory.lstrip('/.') + for file_location in all_file_locations: + print("Unsloth: Uploading GGUF to Huggingface Hub...") + username = upload_to_huggingface( + self, repo_id, token, + "GGUF converted", "gguf", file_location, old_username, private, + ) + link = f"{username}/{new_save_directory.lstrip('/.')}" \ + if username not in new_save_directory else \ + new_save_directory.lstrip('/.') - print(f"Saved GGUF to https://huggingface.co/{link}") + print(f"Saved GGUF to https://huggingface.co/{link}") + pass if fix_bos_token: logger.warning( From fb54fbbc005595468ddc5e1d4f3107395a7d69c6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 16 Jun 2024 01:16:05 +1000 Subject: [PATCH 152/153] Update save.py --- unsloth/save.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 17d6962f..f7efcc44 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -893,8 +893,8 @@ def save_to_gguf( elif quant_method is None: quantization_method = "q8_0" # Check if wrong method - if 
quantization_method not in ALLOWED_QUANTS.keys(): - error = f"Unsloth: Quant method = [{quantization_method}] not supported. Choose from below:\n" + if quant_method not in ALLOWED_QUANTS.keys(): + error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n" for key, value in ALLOWED_QUANTS.items(): error += f"[{key}] => {value}\n" raise RuntimeError(error) From 4cba3e2c6c1c844821cf3225fe9dbeb9a23004ae Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 16 Jun 2024 02:57:33 +1000 Subject: [PATCH 153/153] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index f7efcc44..940feb40 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -655,7 +655,7 @@ def unsloth_save_model( # Save! # [TODO] --> is this correct? - save_pretrained_settings["selected_adapters"] = None + # save_pretrained_settings["selected_adapters"] = None # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username):
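The final patches (151-153) extend GGUF saving so `quantization_method` may be either a single string or a list of quant names, with each entry converted in turn. A minimal sketch of the normalisation step follows; the helper name, the default dtype, and the alias table are assumptions for illustration only, mirroring the mapping used inside `save_to_gguf`.

def normalize_quant_methods(quantization_method, model_dtype="f16"):
    # Hypothetical helper sketching the list handling in save_to_gguf.
    # Wrap a bare string in a list explicitly: list("q8_0") would split it
    # into individual characters, which is not the intent here.
    if not isinstance(quantization_method, (list, tuple)):
        quantization_method = [quantization_method]
    # Map the friendly aliases onto concrete llama.cpp quant names.
    aliases = {
        None             : "q8_0",
        "not_quantized"  : model_dtype,
        "fast_quantized" : "q8_0",
        "quantized"      : "q4_k_m",
    }
    return [aliases.get(method, method) for method in quantization_method]

# Example usage:
# normalize_quant_methods(["fast_quantized", "q5_k_m"])  ->  ["q8_0", "q5_k_m"]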