From 7df08c4ac10ad8d95c8123da366e02af94fe8e4c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 16:22:58 +1000 Subject: [PATCH 001/153] Update llama.py --- unsloth/models/llama.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index f7fd5f13..77d0a6ab 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1490,19 +1490,19 @@ def get_peft_model( final_modules = [] for module in target_modules: if module == "lm_head": - logger.warning_once( - "Unsloth: `lm_head` should be placed in `modules_to_save` and not `target_modules`. "\ - "Luckily, we shall do it for you!" - ) + # logger.warning_once( + # "Unsloth: `lm_head` should be placed in `modules_to_save` and not `target_modules`. "\ + # "Luckily, we shall do it for you!" + # ) train_lm_head = True if modules_to_save is None: modules_to_save = ["lm_head"] else: modules_to_save.append("lm_head") elif module == "embed_tokens": - logger.warning_once( - "Unsloth: `embed_tokens` should be placed in `modules_to_save` and not `target_modules`. "\ - "Luckily, we shall do it for you!" - ) + # logger.warning_once( + # "Unsloth: `embed_tokens` should be placed in `modules_to_save` and not `target_modules`. "\ + # "Luckily, we shall do it for you!" + # ) train_embed_tokens = True if modules_to_save is None: modules_to_save = ["embed_tokens"] else: modules_to_save.append("embed_tokens") From ba5b6ce37a528464305f1f08af05d50a2f5f9188 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:08:31 +1000 Subject: [PATCH 002/153] offload --- unsloth/models/_utils.py | 46 ++++++++++++++++++++++++++++++++++++++++ unsloth/models/llama.py | 21 +++++++++++++++--- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a53de42c..4d82d067 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -72,6 +72,9 @@ "patch_tokenizer", "get_statistics", "Unsloth_Offloaded_Gradient_Checkpointer", + "offload_to_disk", + "offload_input_embeddings", + "offload_output_embeddings", ] @@ -421,3 +424,46 @@ def backward(ctx, dY): "Luckily, your training run will still work in the meantime!" 
) pass + + +# Offloading to disk for modules (lm_head, embed_tokens) +import os +import pickle + +def offload_to_disk(W, model, name, temporary_location : str = "_unsloth_temporary_saved_buffers"): + file_location = os.path.join(temporary_location, model.config._name_or_path) + if not os.path.exists(file_location): + os.makedirs(file_location) + pass + + filename = os.path.join(file_location, f"{name}.pt") + W = W.weight if hasattr(W, "weight") else W + torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,) + offloaded_W = torch.load(filename, map_location = "cpu", mmap = True) + offloaded_W._offloaded_file_location = filename + return offloaded_W +pass + + +def offload_input_embeddings(model, temporary_location : str = "_unsloth_temporary_saved_buffers"): + offloaded_W = offload_to_disk(model.get_input_embeddings(), model, "input_embeddings", temporary_location) + new_input_embeddings = torch.nn.Embedding.from_pretrained(offloaded_W) + new_input_embeddings._offloaded_file_location = offloaded_W._offloaded_file_location + model.set_input_embeddings(new_input_embeddings) + return +pass + + +def offload_output_embeddings(model, temporary_location : str = "_unsloth_temporary_saved_buffers"): + offloaded_W = offload_to_disk(model.get_output_embeddings(), model, "output_embeddings", temporary_location) + + new_output_embeddings = torch.nn.Linear(1, 1, bias = None) + del new_output_embeddings.weight + new_output_embeddings.weight = offloaded_W + new_output_embeddings.in_features = offloaded_W.shape[1] + new_output_embeddings.out_features = offloaded_W.shape[0] + + new_output_embeddings._offloaded_file_location = offloaded_W._offloaded_file_location + model.set_output_embeddings(new_output_embeddings) + return +pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 77d0a6ab..4f754c6a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1396,6 +1396,7 @@ def get_peft_model( modules_to_save = None, init_lora_weights = True, loftq_config = {}, + temporary_location = "_unsloth_temporary_saved_buffers", **kwargs, ): transformers_set_seed(random_state) @@ -1579,7 +1580,19 @@ def get_peft_model( _saved_temp_tokenizer = model._saved_temp_tokenizer lora_config = LoraConfig(**arguments) - model = _get_peft_model(model, lora_config) + + # First offload lm_head and embed_tokens to disk + original_device = model.get_input_embeddings.device + if use_gradient_checkpointing == "unsloth": + if train_embed_tokens: + print("Unsloth: Offloading input_embeddings to disk to save VRAM") + offload_input_embeddings(model, temporary_location) + pass + if train_lm_head: + print("Unsloth: Offloading output_embeddings to disk to save VRAM") + offload_output_embeddings(model, temporary_location) + pass + pass model._saved_temp_tokenizer = _saved_temp_tokenizer @@ -1589,14 +1602,16 @@ def get_peft_model( if train_embed_tokens: print("Unsloth: Casting embed_tokens to float32") assert(hasattr(model.model.model.embed_tokens, "modules_to_save")) - model.model.model.embed_tokens.modules_to_save.default.to(torch.float32) + model.model.model.embed_tokens.modules_to_save.default\ + .to(torch.float32, device = original_device, non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) pass if train_lm_head: print("Unsloth: Casting lm_head to float32") assert(hasattr(model.model.lm_head, "modules_to_save")) - model.model.lm_head.modules_to_save.default.to(torch.float32) + model.model.lm_head.modules_to_save.default\ + 
.to(torch.float32, device = original_device, non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) pass From a07057e6b8ea66aaa98b7f839933532e960c6c5c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:09:21 +1000 Subject: [PATCH 003/153] Update llama.py --- unsloth/models/llama.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4f754c6a..31f73fb1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -13,6 +13,7 @@ # limitations under the License. import torch +import gc from typing import Optional, Tuple, List, Union from torch.nn.functional import scaled_dot_product_attention from transformers.models.llama.modeling_llama import ( @@ -1370,7 +1371,6 @@ def post_patch(model): pass # Clear deleted GPU items - import gc for _ in range(3): gc.collect() torch.cuda.empty_cache() @@ -1592,6 +1592,12 @@ def get_peft_model( print("Unsloth: Offloading output_embeddings to disk to save VRAM") offload_output_embeddings(model, temporary_location) pass + + # Remove old items to save VRAM + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass pass model._saved_temp_tokenizer = _saved_temp_tokenizer From 4be9063a46a987d2c6a7c0f3a3852fa499711206 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:10:36 +1000 Subject: [PATCH 004/153] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 31f73fb1..5276ec9c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1582,7 +1582,7 @@ def get_peft_model( lora_config = LoraConfig(**arguments) # First offload lm_head and embed_tokens to disk - original_device = model.get_input_embeddings.device + original_device = model.get_input_embeddings().weight.device if use_gradient_checkpointing == "unsloth": if train_embed_tokens: print("Unsloth: Offloading input_embeddings to disk to save VRAM") From 3dc3d3ff7109ccc4a9db943477c8bc29571d2499 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:13:02 +1000 Subject: [PATCH 005/153] Update llama.py --- unsloth/models/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 5276ec9c..133138d4 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1599,6 +1599,8 @@ def get_peft_model( torch.cuda.empty_cache() pass pass + + model = _get_peft_model(model, lora_config) model._saved_temp_tokenizer = _saved_temp_tokenizer From f1cc1e8e4c3fe6f30ac2eae4f2ba4226ea791fcd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 18:14:32 +1000 Subject: [PATCH 006/153] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 133138d4..0fc4257e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1599,7 +1599,7 @@ def get_peft_model( torch.cuda.empty_cache() pass pass - + model = _get_peft_model(model, lora_config) model._saved_temp_tokenizer = _saved_temp_tokenizer @@ -1611,7 +1611,7 @@ def get_peft_model( print("Unsloth: Casting embed_tokens to float32") assert(hasattr(model.model.model.embed_tokens, "modules_to_save")) model.model.model.embed_tokens.modules_to_save.default\ - .to(torch.float32, device = original_device, non_blocking = True) + .to(device = original_device, dtype = torch.float32, 
non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) pass @@ -1619,7 +1619,7 @@ def get_peft_model( print("Unsloth: Casting lm_head to float32") assert(hasattr(model.model.lm_head, "modules_to_save")) model.model.lm_head.modules_to_save.default\ - .to(torch.float32, device = original_device, non_blocking = True) + .to(device = original_device, dtype = torch.float32, non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) pass From 5cb531a3ddca5b37714495050725cb5cec39b742 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 19:03:23 +1000 Subject: [PATCH 007/153] Update llama.py --- unsloth/models/llama.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0fc4257e..eff35eef 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1588,6 +1588,13 @@ def get_peft_model( print("Unsloth: Offloading input_embeddings to disk to save VRAM") offload_input_embeddings(model, temporary_location) pass + + # Remove old items to save VRAM + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass + if train_lm_head: print("Unsloth: Offloading output_embeddings to disk to save VRAM") offload_output_embeddings(model, temporary_location) From 6bd8e600d72aeccb1108c83d50df07471ad0d400 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 19 May 2024 19:04:01 +1000 Subject: [PATCH 008/153] Update llama.py --- unsloth/models/llama.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index eff35eef..cad4c6a0 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1582,7 +1582,9 @@ def get_peft_model( lora_config = LoraConfig(**arguments) # First offload lm_head and embed_tokens to disk - original_device = model.get_input_embeddings().weight.device + input_embeddings_device = model. 
get_input_embeddings().weight.device + output_embeddings_device = model.get_output_embeddings().weight.device + if use_gradient_checkpointing == "unsloth": if train_embed_tokens: print("Unsloth: Offloading input_embeddings to disk to save VRAM") @@ -1594,7 +1596,7 @@ def get_peft_model( gc.collect() torch.cuda.empty_cache() pass - + if train_lm_head: print("Unsloth: Offloading output_embeddings to disk to save VRAM") offload_output_embeddings(model, temporary_location) @@ -1618,7 +1620,7 @@ def get_peft_model( print("Unsloth: Casting embed_tokens to float32") assert(hasattr(model.model.model.embed_tokens, "modules_to_save")) model.model.model.embed_tokens.modules_to_save.default\ - .to(device = original_device, dtype = torch.float32, non_blocking = True) + .to(device = input_embeddings_device, dtype = torch.float32, non_blocking = True) model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True) pass @@ -1626,7 +1628,7 @@ def get_peft_model( print("Unsloth: Casting lm_head to float32") assert(hasattr(model.model.lm_head, "modules_to_save")) model.model.lm_head.modules_to_save.default\ - .to(device = original_device, dtype = torch.float32, non_blocking = True) + .to(device = output_embeddings_device, dtype = torch.float32, non_blocking = True) model.model.lm_head.modules_to_save.default.requires_grad_(True) pass From d1d57ff99079d0ada0fde31cb67c637dd7ac27cc Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 20 May 2024 02:08:26 +1000 Subject: [PATCH 009/153] Update llama.py --- unsloth/models/llama.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index cad4c6a0..175a62ec 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1194,7 +1194,11 @@ def from_pretrained( f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning_once(debug_info)""" + logger.warning_once(debug_info) + import gc + for _ in range(3): + gc.collect() + torch.cuda.empty_cache()""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -1644,6 +1648,12 @@ def get_peft_model( internal_model._saved_temp_tokenizer.padding_side = "right" pass + # Clear deleted GPU items + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass + return model pass From 7470f672bd596373a931d9a5d13c8b31eb57141b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 21 May 2024 03:17:13 +1000 Subject: [PATCH 010/153] continued pretraining trainer --- unsloth/__init__.py | 1 + unsloth/trainer.py | 94 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 unsloth/trainer.py diff --git a/unsloth/__init__.py b/unsloth/__init__.py index d4ca45d7..2dcf1e6a 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -114,3 +114,4 @@ from .save import * from .chat_templates import * from .tokenizer_utils import * +from .trainer import * diff --git a/unsloth/trainer.py b/unsloth/trainer.py new file mode 100644 index 00000000..226eb4ed --- /dev/null +++ b/unsloth/trainer.py @@ -0,0 +1,94 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 
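The `offload_to_disk` helpers added in the patches above come down to writing a weight out with `torch.save` and reading it back memory-mapped with `torch.load(..., mmap = True)`, so the tensor is backed by a file on disk instead of GPU VRAM. A minimal sketch of that idea on a toy tensor (the file name and shapes here are made up for illustration, and it assumes a PyTorch build that accepts `mmap = True`, which the patch itself relies on):

```python
import os
import torch

# Toy weight standing in for embed_tokens / lm_head.
W = torch.randn(10, 4)

temporary_location = "_unsloth_temporary_saved_buffers"
os.makedirs(temporary_location, exist_ok = True)
filename = os.path.join(temporary_location, "toy_weight.pt")

# Save once, then reload memory-mapped: the data stays in the file and is paged in on access.
torch.save(W, filename)
offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)

print(offloaded_W.shape)     # same values as W, but backed by the on-disk file
print(offloaded_W[0, :2])
```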
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from dataclasses import dataclass, field +from typing import Optional +from transformers import TrainingArguments +from trl import SFTTrainer + +__all__ = [ + "UnslothTrainingArguments", + "UnslothTrainer", +] + + +@dataclass +class UnslothTrainingArguments(TrainingArguments): + embedding_learning_rate : Optional[float] = field( + default = None, + metadata = {"help" : "Different learning rates for embeddings and lm_head."} + ) +pass + + +def _create_unsloth_optimizer( + model, + optimizer_cls, + optimizer_kwargs, + embedding_lr = 5e-5, +): + lr = optimizer_kwargs["lr"] + weight_decay = optimizer_kwargs.get("weight_decay", 0.0) + + param_groups = \ + { + "non_embeddings" : {}, + "embeddings" : {}, + } + + for name, param in model.named_parameters(): + if not param.requires_grad: continue + if "modules_to_save.default" in name: + print(f"Unsloth: Setting lr = {embedding_lr} instead of {lr} for {name}.") + param_groups["embeddings"] [name] = param + else: + param_groups["non_embeddings"][name] = param + pass + pass + + optimizer_grouped_parameters = [ + { + "params" : list(param_groups["non_embeddings"].values()), + "weight_decay" : weight_decay, + "lr" : lr, + }, + { + "params" : list(param_groups["embeddings"].values()), + "weight_decay" : weight_decay, + "lr" : embedding_lr, + }, + ] + optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + return optimizer +pass + + +class UnslothTrainer(SFTTrainer): + def create_optimizer(self): + embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) + if embedding_learning_rate is None: return super().create_optimizer() + + if self.optimizer is None: + optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) + self.optimizer = _create_unsloth_optimizer( + self.model, + optimizer_cls, + optimizer_kwargs, + embedding_learning_rate, + ) + pass + return self.optimizer + pass +pass From da9c1a602c3a1b29e08dab013ccf43ee5ad64fe9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 21 May 2024 03:51:37 +1000 Subject: [PATCH 011/153] Update trainer.py --- unsloth/trainer.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 226eb4ed..150124b9 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
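To make `_create_unsloth_optimizer` above concrete: it builds two optimizer parameter groups so that `embed_tokens` and `lm_head` (stored by PEFT under `modules_to_save.default`) train with a smaller learning rate than everything else. A self-contained sketch with toy modules (the real code keys on the PEFT parameter names, while this sketch just matches on `embed_tokens`):

```python
import torch

# Toy model: one embedding-like module and one ordinary layer.
model = torch.nn.ModuleDict({
    "embed_tokens" : torch.nn.Embedding(100, 16),
    "proj"         : torch.nn.Linear(16, 16),
})

lr, embedding_lr, weight_decay = 5e-4, 5e-5, 0.0
embeddings, others = [], []
for name, param in model.named_parameters():
    if not param.requires_grad: continue
    (embeddings if "embed_tokens" in name else others).append(param)

optimizer = torch.optim.AdamW([
    {"params" : others,     "lr" : lr,           "weight_decay" : weight_decay},
    {"params" : embeddings, "lr" : embedding_lr, "weight_decay" : weight_decay},
])
print([group["lr"] for group in optimizer.param_groups])   # [0.0005, 5e-05]
```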
- from dataclasses import dataclass, field from typing import Optional from transformers import TrainingArguments @@ -26,10 +25,11 @@ @dataclass class UnslothTrainingArguments(TrainingArguments): - embedding_learning_rate : Optional[float] = field( - default = None, - metadata = {"help" : "Different learning rates for embeddings and lm_head."} - ) + pass + # embedding_learning_rate : Optional[float] = field( + # default = None, + # metadata = {"help" : "Different learning rates for embeddings and lm_head."} + # ) pass @@ -76,19 +76,20 @@ def _create_unsloth_optimizer( class UnslothTrainer(SFTTrainer): - def create_optimizer(self): - embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) - if embedding_learning_rate is None: return super().create_optimizer() - - if self.optimizer is None: - optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) - self.optimizer = _create_unsloth_optimizer( - self.model, - optimizer_cls, - optimizer_kwargs, - embedding_learning_rate, - ) - pass - return self.optimizer pass + # def create_optimizer(self): + # embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) + # if embedding_learning_rate is None: return super().create_optimizer() + + # if self.optimizer is None: + # optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) + # self.optimizer = _create_unsloth_optimizer( + # self.model, + # optimizer_cls, + # optimizer_kwargs, + # embedding_learning_rate, + # ) + # pass + # return self.optimizer + # pass pass From 2c68f5635a325b3847aa585d57083050b5dfbe6b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 21 May 2024 03:58:21 +1000 Subject: [PATCH 012/153] Update trainer.py --- unsloth/trainer.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 150124b9..63a4398a 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -25,11 +25,10 @@ @dataclass class UnslothTrainingArguments(TrainingArguments): - pass - # embedding_learning_rate : Optional[float] = field( - # default = None, - # metadata = {"help" : "Different learning rates for embeddings and lm_head."} - # ) + embedding_learning_rate : Optional[float] = field( + default = None, + metadata = {"help" : "Different learning rates for embeddings and lm_head."} + ) pass @@ -76,20 +75,19 @@ def _create_unsloth_optimizer( class UnslothTrainer(SFTTrainer): - pass - # def create_optimizer(self): - # embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) - # if embedding_learning_rate is None: return super().create_optimizer() + def create_optimizer(self): + embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None) + if embedding_learning_rate is None: return super().create_optimizer() - # if self.optimizer is None: - # optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) - # self.optimizer = _create_unsloth_optimizer( - # self.model, - # optimizer_cls, - # optimizer_kwargs, - # embedding_learning_rate, - # ) - # pass - # return self.optimizer - # pass + if self.optimizer is None: + optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args) + self.optimizer = _create_unsloth_optimizer( + self.model, + optimizer_cls, + optimizer_kwargs, + embedding_learning_rate, + ) + pass + return self.optimizer + pass pass From 217bf9d9eed9b6706b67917fd27b32923d50594f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen 
Date: Tue, 21 May 2024 04:27:15 +1000 Subject: [PATCH 013/153] Update trainer.py --- unsloth/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 63a4398a..85e47aa0 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -50,7 +50,7 @@ def _create_unsloth_optimizer( for name, param in model.named_parameters(): if not param.requires_grad: continue if "modules_to_save.default" in name: - print(f"Unsloth: Setting lr = {embedding_lr} instead of {lr} for {name}.") + print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {name}.") param_groups["embeddings"] [name] = param else: param_groups["non_embeddings"][name] = param From 6e85384ab2e2ad0aaa3b4c5090f87d6cb0d83256 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 00:21:09 +1000 Subject: [PATCH 014/153] Update trainer.py --- unsloth/trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 85e47aa0..b234a98d 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -49,8 +49,10 @@ def _create_unsloth_optimizer( for name, param in model.named_parameters(): if not param.requires_grad: continue - if "modules_to_save.default" in name: - print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {name}.") + if name.endswith("modules_to_save.default.weight"): + partial_name = name[:-len(".modules_to_save.default.weight")] + partial_name = partial_name[partial_name.rfind(".")+1:] + print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {partial_name}.") param_groups["embeddings"] [name] = param else: param_groups["non_embeddings"][name] = param From 77f9c516050e1997cbf16c2d9db0ee886b1ff222 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:35:36 +1000 Subject: [PATCH 015/153] is_bfloat16_supported --- unsloth/models/_utils.py | 10 ++++++++++ unsloth/models/llama.py | 2 +- unsloth/models/mistral.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4d82d067..2c1eb4d5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -35,7 +35,10 @@ # Get Flash Attention v2 if Ampere (RTX 30xx, A100) major_version, minor_version = torch.cuda.get_device_capability() +SUPPORTS_BFLOAT16 = False + if major_version >= 8: + SUPPORTS_BFLOAT16 = True try: from flash_attn import flash_attn_func # Check for CUDA linking errors "undefined symbol: _ZNK3c106SymIntltEl" @@ -75,6 +78,7 @@ "offload_to_disk", "offload_input_embeddings", "offload_output_embeddings", + "is_bfloat16_supported", ] @@ -467,3 +471,9 @@ def offload_output_embeddings(model, temporary_location : str = "_unsloth_tempor model.set_output_embeddings(new_output_embeddings) return pass + + +# Fixes a weird Torch 2.3 bug which says T4s have bfloat16 +def is_bfloat16_supported(): + return SUPPORTS_BFLOAT16 +pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 175a62ec..2d6021a3 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1047,7 +1047,7 @@ def from_pretrained( token = os.environ["HUGGINGFACE_TOKEN"] if model_patcher is None: model_patcher = FastLlamaModel - SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 
4594919b..365d60a3 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -314,7 +314,7 @@ def from_pretrained( logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") pass - SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) From c0e1d27b7e56b2778c295e1e6b0152ff52d18a44 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:41:01 +1000 Subject: [PATCH 016/153] Update __init__.py --- unsloth/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py index ff7129e0..e67a9e5f 100644 --- a/unsloth/models/__init__.py +++ b/unsloth/models/__init__.py @@ -17,3 +17,4 @@ from .mistral import FastMistralModel from .qwen2 import FastQwen2Model from .dpo import PatchDPOTrainer +from ._utils import is_bfloat16_supported From 2b23b9357aba25ab2f3a49d899045547d7dde1d7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:44:04 +1000 Subject: [PATCH 017/153] Update README.md --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ca5b6533..4d934eb0 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,8 @@ python -m bitsandbytes - We're in 🤗Hugging Face's official docs! Check out the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth)! ```python -from unsloth import FastLanguageModel +from unsloth import FastLanguageModel +from unsloth import is_bfloat16_supported import torch from trl import SFTTrainer from transformers import TrainingArguments @@ -238,8 +239,8 @@ trainer = SFTTrainer( gradient_accumulation_steps = 4, warmup_steps = 10, max_steps = 60, - fp16 = not torch.cuda.is_bf16_supported(), - bf16 = torch.cuda.is_bf16_supported(), + fp16 = not is_bfloat16_supported(), + bf16 = is_bfloat16_supported(), logging_steps = 1, output_dir = "outputs", optim = "adamw_8bit", @@ -263,6 +264,7 @@ We're in 🤗Hugging Face's official docs! 
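The `is_bfloat16_supported()` helper that these README snippets switch to (added in `_utils.py` and `models/__init__.py` above) reduces to a one-time device-capability check: bfloat16 needs compute capability 8.0 or newer (Ampere, e.g. RTX 30xx / A100), which T4s at 7.5 lack. A standalone sketch, assuming a CUDA device is visible:

```python
import torch

# Compute capability >= 8.0 means the GPU supports bfloat16; T4s (7.5) do not.
major_version, minor_version = torch.cuda.get_device_capability()
SUPPORTS_BFLOAT16 = major_version >= 8

def is_bfloat16_supported() -> bool:
    return SUPPORTS_BFLOAT16

print("bfloat16 supported:", is_bfloat16_supported())
```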
We're on the [SFT docs](https://huggi ```python from unsloth import FastLanguageModel, PatchDPOTrainer +from unsloth import is_bfloat16_supported PatchDPOTrainer() import torch from transformers import TrainingArguments @@ -298,8 +300,8 @@ dpo_trainer = DPOTrainer( gradient_accumulation_steps = 8, warmup_ratio = 0.1, num_train_epochs = 3, - fp16 = not torch.cuda.is_bf16_supported(), - bf16 = torch.cuda.is_bf16_supported(), + fp16 = not is_bfloat16_supported(), + bf16 = is_bfloat16_supported(), logging_steps = 1, optim = "adamw_8bit", seed = 42, From 902e23af08e63790bc4b1801f3366c76e88b4d83 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 04:45:32 +1000 Subject: [PATCH 018/153] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2d6021a3..1d6a282a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1757,7 +1757,7 @@ def patch_peft_model( n_mlp += 1 else: logger.warning_once( - "Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters\n"\ + "Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters\n"\ "are not enabled or a bias term (like in Qwen) is used." ) pass @@ -1780,7 +1780,7 @@ def patch_peft_model( n_qkv += 1 else: logger.warning_once( - "Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters\n"\ + "Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters\n"\ "are not enabled or a bias term (like in Qwen) is used." ) pass @@ -1795,7 +1795,7 @@ def patch_peft_model( n_o += 1 else: logger.warning_once( - "Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters\n"\ + "Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters\n"\ "are not enabled or a bias term (like in Qwen) is used." ) pass From 3193cac8813d38cb9f7c57cb02ad7c09fb8b5b51 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 20:35:24 +1000 Subject: [PATCH 019/153] is_bfloat16_supported --- unsloth/__init__.py | 6 ++++++ unsloth/trainer.py | 1 + 2 files changed, 7 insertions(+) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 2dcf1e6a..05755d43 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -38,6 +38,12 @@ try: import torch + + # Fix up is_bf16_supported https://github.com/unslothai/unsloth/issues/504 + major_version, minor_version = torch.cuda.get_device_capability() + SUPPORTS_BFLOAT16 = (major_version >= 8) + def is_bf16_supported(): return SUPPORTS_BFLOAT16 + torch.cuda.is_bf16_supported = is_bf16_supported except: raise ImportError("Pytorch is not installed. Go to https://pytorch.org/.\n"\ "We have some installation instructions on our Github page.") diff --git a/unsloth/trainer.py b/unsloth/trainer.py index b234a98d..c8e00be2 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -16,6 +16,7 @@ from typing import Optional from transformers import TrainingArguments from trl import SFTTrainer +from . 
import is_bfloat16_supported __all__ = [ "UnslothTrainingArguments", From dfeaf4bf116226cdcae339135d90168c7e45f582 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 22 May 2024 20:37:48 +1000 Subject: [PATCH 020/153] Update __init__.py --- unsloth/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 05755d43..c8f4ca10 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -38,15 +38,16 @@ try: import torch - - # Fix up is_bf16_supported https://github.com/unslothai/unsloth/issues/504 - major_version, minor_version = torch.cuda.get_device_capability() - SUPPORTS_BFLOAT16 = (major_version >= 8) - def is_bf16_supported(): return SUPPORTS_BFLOAT16 - torch.cuda.is_bf16_supported = is_bf16_supported except: raise ImportError("Pytorch is not installed. Go to https://pytorch.org/.\n"\ "We have some installation instructions on our Github page.") +pass + +# Fix up is_bf16_supported https://github.com/unslothai/unsloth/issues/504 +major_version, minor_version = torch.cuda.get_device_capability() +SUPPORTS_BFLOAT16 = (major_version >= 8) +def is_bf16_supported(): return SUPPORTS_BFLOAT16 +torch.cuda.is_bf16_supported = is_bf16_supported # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 From 1e84090231fa2157bb2695b91044e398c2fa9b6d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 23 May 2024 04:12:45 +1000 Subject: [PATCH 021/153] Mistral v3 --- unsloth/__init__.py | 1 + unsloth/models/mapper.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index c8f4ca10..d85eca00 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -17,6 +17,7 @@ # Currently only supports 1 GPU, or else seg faults will occur. if "CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" devices = os.environ["CUDA_VISIBLE_DEVICES"] # Check if there are multiple cuda devices set in env if not devices.isdigit(): diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index b4fbe573..29896ef2 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -144,6 +144,14 @@ "unsloth/Phi-3-mini-4k-instruct", "microsoft/Phi-3-mini-4k-instruct", ), + "unsloth/mistral-7b-v0.3-bnb-4bit" : ( + "unsloth/mistral-7b-v0.3", + "mistralai/Mistral-7B-v0.3", + ), + "unsloth/mistral-7b-instruct-v0.3-bnb-4bit" : ( + "unsloth/mistral-7b-instruct-v0.3", + "mistralai/Mistral-7B-Instruct-v0.3", + ), } INT_TO_FLOAT_MAPPER = {} From 57ad8e784645d3b5f437d5edfa8486d8998a9829 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 24 May 2024 03:03:56 +1000 Subject: [PATCH 022/153] Phi 3 medium --- unsloth/models/_utils.py | 13 +++++++++---- unsloth/models/mapper.py | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2c1eb4d5..0217c7b5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -159,7 +159,7 @@ def patch_tokenizer(model, tokenizer): Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! 
Fixes https://github.com/unslothai/unsloth/issues/5 """ - possible_reserved_tokens = ("<|reserved", "<|placeholder",) + possible_reserved_tokens = ("<|reserved", "<|placeholder", "[control") if model is not None: model.config.update({"unsloth_version" : __version__}) @@ -176,14 +176,19 @@ def patch_tokenizer(model, tokenizer): if bad_pad_token: # Find a better pad token - added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] + aadded_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] possible_pad_token = None + n_possible_pad_tokens = 0 for added_token in added_tokens[::-1]: if added_token.startswith(possible_reserved_tokens): - possible_pad_token = added_token - break + if possible_pad_token is None: possible_pad_token = added_token + n_possible_pad_tokens += 1 + # We must see at least 3 of the reserved tokens + if n_possible_pad_tokens >= 3: break pass pass + if n_possible_pad_tokens < 3: possible_pad_token = None + if possible_pad_token is None: # Try unk_token possible_pad_token = tokenizer.unk_token diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 29896ef2..777f310c 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -152,6 +152,10 @@ "unsloth/mistral-7b-instruct-v0.3", "mistralai/Mistral-7B-Instruct-v0.3", ), + "unsloth/Phi-3-medium-4k-instruct-bnb-4bit" : ( + "unsloth/Phi-3-medium-4k-instruct", + "microsoft/Phi-3-medium-4k-instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 2b994b2ef0c4fe8cbb3c8fada9c1f1fc6c6bc46a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 24 May 2024 03:49:50 +1000 Subject: [PATCH 023/153] Update chat_templates.py --- unsloth/chat_templates.py | 40 ++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 3af4c4e9..e6b981ad 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -52,7 +52,7 @@ "{{ '>>> Assistant: ' }}"\ "{% endif %}" unsloth_eos_token = "eos_token" -CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token,) +CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False,) # Zephyr has no BOS! @@ -70,7 +70,7 @@ "{{ '<|assistant|>\n' }}"\ "{% endif %}" zephyr_eos_token = "eos_token" -CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token,) +CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False,) # ChatML has no BOS and not EOS! Rather <|im_start|> and <|im_end|> acts as BOS / EOS. @@ -88,7 +88,7 @@ "{{ '<|im_start|>assistant\n' }}"\ "{% endif %}" chatml_eos_token = "<|im_end|>" -CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token,) +CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True,) # Mistral Instruct doesn't allow system prompts, so we append it to the user message. @@ -115,7 +115,7 @@ "{% endif %}"\ "{% endfor %}" mistral_eos_token = "eos_token" -CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token,) +CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False,) # Adds BOS to every convo! And weird <> system messages. 
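The third element added to every `CHAT_TEMPLATES` entry above records whether the template's stop word (for example `<|im_end|>`) should be mapped onto the tokenizer's EOS token. A hedged usage sketch of `get_chat_template` as it stands in this patch (the tokenizer load below is only illustrative, since Unsloth normally returns the tokenizer from `FastLanguageModel.from_pretrained`, and argument defaults may differ in later versions):

```python
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template

tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-bnb-4bit")

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",   # looked up as (template, stop_word, map_eos_token)
    mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"},
)

messages = [
    {"role" : "user",      "content" : "Hello!"},
    {"role" : "assistant", "content" : "Hi there, how can I help?"},
]
text = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
print(text)
```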
@@ -141,7 +141,7 @@ "{% endif %}"\ "{% endfor %}" llama_eos_token = "eos_token" -CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token,) +CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False,) # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template @@ -167,7 +167,7 @@ "{{ 'ASSISTANT:' }}"\ "{% endif %}" vicuna_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token,) +CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False,) # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template @@ -193,7 +193,7 @@ "{{ '### Assistant:' }}"\ "{% endif %}" vicuna_old_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token,) +CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False,) # https://github.com/tatsu-lab/stanford_alpaca Changed for multi-turn convos @@ -219,7 +219,7 @@ "{{ '### Response:\n' }}"\ "{% endif %}" alpaca_eos_token = "eos_token" -CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token,) +CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False,) # https://huggingface.co/google/gemma-7b-it @@ -240,7 +240,7 @@ "{{ 'model\n' }}"\ "{% endif %}" gemma_eos_token = "" -CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token,) +CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True,) # Gemma with ChatML instead @@ -250,7 +250,7 @@ {"" : "<|im_start|>", "" : "<|im_end|>"}, "<|im_end|>", ) -CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token,) +CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True,) # Llama-3 @@ -264,27 +264,33 @@ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ "{% endif %}" llama3_template_eos_token = "eos_token" -CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token,) +CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False,) # Phi-3 phi3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ - "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% if message['role'] == 'user' %}"\ + "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% else %}"\ + "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% endif %}"\ "{% endfor %}"\ "{% if add_generation_prompt %}"\ "{{ '<|assistant|>\n' }}"\ "{% endif %}" phi3_template_eos_token = "<|end|>" -CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token,) +CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False,) def get_chat_template( tokenizer, chat_template = "chatml", mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, - map_eos_token = True, + map_eos_token = False, ): assert(type(map_eos_token) is bool) old_tokenizer = tokenizer @@ -319,7 +325,11 @@ def get_chat_template( elif type(chat_template) is str: - chat_template, stop_word = CHAT_TEMPLATES[chat_template] + chat_template, stop_word, yes_map_eos_token = CHAT_TEMPLATES[chat_template] + + # Check mapping to eos_token + if not map_eos_token and yes_map_eos_token: map_eos_token = True + if not yes_map_eos_token and map_eos_token: map_eos_token = False if type(stop_word) in (list, tuple,): token_mapping, stop_word = stop_word From ff8171fc1bb3fa23d1855bed71442bff2ea38b1f Mon Sep 17 00:00:00 2001 
From: Daniel Han-Chen Date: Fri, 24 May 2024 04:07:02 +1000 Subject: [PATCH 024/153] Update chat_templates.py --- unsloth/chat_templates.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index e6b981ad..3decdf7f 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -258,7 +258,13 @@ llama3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ - "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% if message['role'] == 'user' %}"\ + "{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% else %}"\ + "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% endif %}"\ "{% endfor %}"\ "{% if add_generation_prompt %}"\ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ @@ -290,7 +296,7 @@ def get_chat_template( tokenizer, chat_template = "chatml", mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, - map_eos_token = False, + map_eos_token = True, ): assert(type(map_eos_token) is bool) old_tokenizer = tokenizer From 5ca8b58b63585caac050cf0f84414c0dc7ec7281 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 24 May 2024 04:50:12 +1000 Subject: [PATCH 025/153] Phi-3 --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index ab06abe9..1d335101 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ -### Finetune Llama 3, Mistral & Gemma 2-5x faster with 80% less memory! +### Finetune Llama 3, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory! ![](https://i.ibb.co/sJ7RhGG/image-41.png) @@ -24,24 +24,24 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and |-----------|---------|--------|----------| | **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral v3 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) | 2.2x faster | 73% less | -| **Mistral v1 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Phi-3 (medium)** | [▶️ Start for free](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) | 2x faster | 50% less | +| **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | | **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **ORPO** | [▶️ Start for free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 2x faster | 50% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | -- Benchmarking compared to FA2 + Hugging Face combined. -- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- Also [Llama-3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing). [Mistral 7b v1 ChatML](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). [Mistral 7b v3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing). -- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. +- **Kaggle Notebooks** for [Llama 3 8B](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7B](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7B](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral 7B v3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) +- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text + ## 🦥 Unsloth.ai News -- 📣 NEW! Mistral v3 Base and Instruct now supported! 2x faster, 70% less VRAM notebooks for the [base model](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [instruct with ShareGPT](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) +- 📣 NEW! 
[Phi-3 medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) and [Phi-3 mini](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) support is here! +- 📣 NEW! [Mistral v3 Base](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [Mistral v3 Instruct](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) support is here! - 📣 NEW! Qwen1.5-7B, Qwen1.5-14B, Qwen1.5-32B, Qwen1.5-72B now work, courtesy of Firefly's PR [#428](https://github.com/unslothai/unsloth/pull/428) - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! -- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -195,15 +195,15 @@ dataset = load_dataset("json", data_files = {"train" : url}, split = "train") # 4bit pre quantized models we support for 4x faster downloading + no OOMs. fourbit_models = [ + "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster! + "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", + "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster! + "unsloth/llama-3-8b-Instruct-bnb-4bit", + "unsloth/llama-3-70b-bnb-4bit", + "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster! + "unsloth/Phi-3-medium-4k-instruct", "unsloth/mistral-7b-bnb-4bit", - "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", - "unsloth/llama-2-7b-bnb-4bit", - "unsloth/gemma-7b-bnb-4bit", - "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b - "unsloth/gemma-2b-bnb-4bit", - "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b - "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 - "unsloth/Phi-3-mini-4k-instruct-bnb-4bit", + "unsloth/gemma-7b-bnb-4bit", # Gemma 2.2x faster! 
] # More models at https://huggingface.co/unsloth model, tokenizer = FastLanguageModel.from_pretrained( From a1328f619d92b7ec391f40979129b0d1e78d714e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 24 May 2024 20:08:05 +1000 Subject: [PATCH 026/153] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index 033f6eb1..304d3bee 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -94,7 +94,7 @@ def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencep temp_tokenizer = model._saved_temp_tokenizer sentencepiece_model = False - file_location = f"{temporary_location}/{temp_tokenizer.name_or_path}" + file_location = os.path.join(temporary_location, temp_tokenizer.name_or_path) if not os.path.exists(file_location): os.makedirs(file_location) pass From fb296737878e747b804a43258ecef0eb0b0e6ef0 Mon Sep 17 00:00:00 2001 From: Michael Han <107991372+shimmyshimmer@users.noreply.github.com> Date: Sat, 25 May 2024 23:56:36 +1000 Subject: [PATCH 027/153] Update README.md Mistral v3 to Mistral v0.3 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d335101..3537b8de 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| | **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral v3 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) | 2.2x faster | 73% less | +| **Mistral v0.3 (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) | 2.2x faster | 73% less | | **Phi-3 (medium)** | [▶️ Start for free](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) | 2x faster | 50% less | | **Phi-3 (mini)** | [▶️ Start for free](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) | 2x faster | 50% less | | **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | @@ -38,7 +38,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ## 🦥 Unsloth.ai News - 📣 NEW! [Phi-3 medium](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing) and [Phi-3 mini](https://colab.research.google.com/drive/1lN6hPQveB_mHSnTOYifygFcrO8C1bxq4?usp=sharing) support is here! -- 📣 NEW! [Mistral v3 Base](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [Mistral v3 Instruct](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) support is here! +- 📣 NEW! [Mistral v0.3 Base](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and [Mistral v0.3 Instruct](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) support is here! - 📣 NEW! Qwen1.5-7B, Qwen1.5-14B, Qwen1.5-32B, Qwen1.5-72B now work, courtesy of Firefly's PR [#428](https://github.com/unslothai/unsloth/pull/428) - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! 
Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! From fa85556f638ab7eadb7e483936740f08e2b5a42d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:02:30 +1000 Subject: [PATCH 028/153] Untrained tokens --- unsloth/models/loader.py | 16 ++--- unsloth/tokenizer_utils.py | 133 +++++++++++++++++++++++++++++-------- 2 files changed, 112 insertions(+), 37 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 86a0f5d7..b2f0e4ef 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -45,18 +45,18 @@ def _get_model_name(model_name, load_in_4bit = True): elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: new_model_name = INT_TO_FLOAT_MAPPER[model_name] - logger.warning_once( - f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ - f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." - ) + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ + # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." + # ) model_name = new_model_name elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: new_model_name = FLOAT_TO_INT_MAPPER[model_name] - logger.warning_once( - f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ - f"We shall load `{new_model_name}` for 4x faster loading." - ) + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ + # f"We shall load `{new_model_name}` for 4x faster loading." + # ) model_name = new_model_name pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index a0349166..1240fc8e 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -554,7 +554,7 @@ def check_tokenizer( @torch.inference_mode -def fix_untrained_tokens(model, eps = 1e-16): +def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): """ Llama-3 for eg has untrained vectors in the base model. These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> @@ -568,30 +568,104 @@ def fix_untrained_tokens(model, eps = 1e-16): where_untrained = torch.where(indicator_untrained)[0] n_untrained = where_untrained.shape[0] n_trained = embedding_matrix.shape[0] - n_untrained - if n_untrained != 0: - print( - f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ - "We shall set them to the mean of the other trained tokens." 
+ + # Get set and actual tokens + where_untrained = where_untrained.tolist() + if len(where_untrained) == 0: return + + where_untrained_set = frozenset(where_untrained) + actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) + + # Check if tokenizer and training datasets have bad tokens + if_bad_first = False + if_bad_second = False + # Check tokenizer's chat template for any untrained tokens + if hasattr(tokenizer, "chat_template"): + chat_template = tokenizer.chat_template + if_bad_first = any(x in chat_template for x in actual_bad_tokens) + pass + + # Check the first 250, last 250 input_ids + size_dataset = len(train_dataset) + size = min(size_dataset, 250) + for j in range(size): + input_ids = train_dataset[j] + if "input_ids" in input_ids: + input_ids = input_ids["input_ids"] + if_bad = any(item in where_untrained_set for item in input_ids) + if if_bad: + if_bad_second = True + break + pass + pass + pass + + # Check last 250 + if not if_bad_second: + left = max(size_dataset-250, 0) + for j in range(left, size_dataset): + input_ids = train_dataset[j] + if "input_ids" in input_ids: + input_ids = input_ids["input_ids"] + if_bad = any(item in where_untrained_set for item in input_ids) + if if_bad: + if_bad_second = True + break + pass + pass + pass + pass + + # Check if bad tokens exists! + if not if_bad_first and not if_bad_second: return + + # Check if lm_head / embed_token are trainable! + bad_not_trainable = False + if not embedding_matrix.requires_grad: bad_not_trainable = True + if not lm_head_matrix .requires_grad: bad_not_trainable = True + + if bad_not_trainable: + raise ValueError( + 'Unsloth: Untrained tokens found, but embed_tokens & lm_head not trainable, causing NaNs. '\ + 'Restart then add `embed_tokens` & `lm_head` to '\ + '`FastLanguageModel.get_peft_model(target_modules = [..., "embed_tokens", "lm_head",])`', ) pass - # First set untrained to all 0s - sometimes it's not! 
1e-23 for bfloat16 - embedding_matrix[where_untrained] = 0 - lm_head_matrix [where_untrained] = 0 + # Get sum of all items + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) - # Find sum - sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) - sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + # Remove bad tokens + sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0) + sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0) # Find correct average by dividing by sum of trained tokens - mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) - mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + mean_embedding = (sum_embedding / n_trained) + mean_lm_head = (sum_lm_head / n_trained) + + # Scale by the smallest correct item to make distribution correct + smallest_items = torch.amin(embedding_matrix, axis = 1).abs() + smallest_items[where_untrained] = torch.inf + smallest_item = smallest_items.min().abs() + mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.1 + mean_embedding = mean_embedding.to(embedding_matrix.dtype) + + # Do for lm_head + smallest_items = torch.amin(lm_head_matrix, axis = 1).abs() + smallest_items[where_untrained] = torch.inf + smallest_item = smallest_items.min().abs() + mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.1 + mean_lm_head = mean_lm_head.to(lm_head_matrix.dtype) # Set them to the mean + logger.warning( + "Unsloth: Setting embed_tokens & lm_head untrained tokens to "\ + "mean(trained) to counteract NaNs during training." + ) embedding_matrix[where_untrained] = mean_embedding lm_head_matrix [where_untrained] = mean_lm_head - - return mean_embedding, mean_lm_head + return pass @@ -610,24 +684,24 @@ def mean_of_trained_tokens(model, eps = 1e-16): where_untrained = torch.where(indicator_untrained)[0] n_untrained = where_untrained.shape[0] n_trained = embedding_matrix.shape[0] - n_untrained - if n_untrained != 0: - print( - f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ - "We shall set them to the mean of the other trained tokens." - ) - pass + # if n_untrained != 0: + # print( + # f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + # "We shall set them to the mean of the other trained tokens." + # ) + # pass - # First set untrained to all 0s - sometimes it's not! 
1e-23 for bfloat16 - embedding_matrix[where_untrained] = 0 - lm_head_matrix [where_untrained] = 0 + # Get sum of all items + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) - # Find sum - sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) - sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + # Remove bad tokens + sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0) + sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0) # Find correct average by dividing by sum of trained tokens - mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) - mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + mean_embedding = (sum_embedding / n_trained) + mean_lm_head = (sum_lm_head / n_trained) return mean_embedding, mean_lm_head pass @@ -734,6 +808,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ + "print(self.model)\n"\ "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ From c511aca47ff849a522589af384cb1eaf45f27e09 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:08:42 +1000 Subject: [PATCH 029/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 1240fc8e..7dfa9d02 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -808,7 +808,6 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(self.model)\n"\ "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ @@ -824,6 +823,25 @@ def patch_sft_trainer_tokenizer(): exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass + + # Patch _prepare_dataset + replacer = "if dataset is None:" + function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer._prepare_dataset")) + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + + check_text = \ + "\n"\ + "print(dir(self))\n\n" + + check_text = check_text.split("\n") + check_text = "\n".join(" "*where + x for x in check_text) + + function = function.replace(replacer, check_text + replacer) + exec(function, globals()) + + exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass patch_sft_trainer_tokenizer() From 35e7355f7584dc00257c0a9697f0fb1ee79215e3 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:10:48 +1000 Subject: [PATCH 030/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7dfa9d02..376ae97f 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -825,8 +825,8 @@ def patch_sft_trainer_tokenizer(): pass # Patch _prepare_dataset - replacer = "if dataset is None:" - function = 
getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer._prepare_dataset")) + function_name, replacer = "_prepare_dataset", "if dataset is None:" + function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") function = function.split("\n") function = "\n".join(x[where:] for x in function) From cc0bf44e78b1bcc0322a029df9acea999a247c07 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:17:05 +1000 Subject: [PATCH 031/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 376ae97f..3ba60e06 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -824,8 +824,8 @@ def patch_sft_trainer_tokenizer(): exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass - # Patch _prepare_dataset - function_name, replacer = "_prepare_dataset", "if dataset is None:" + # Patch __init__ with fix_untrained_tokens + function_name, replacer = "__init__", "if self.args.max_steps > 0 and packing:" function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") function = function.split("\n") @@ -833,7 +833,8 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(dir(self))\n\n" + "print('Fixing!')\n" + "fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 674ba66d0556e71f0905b5709a488851910d1472 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:21:16 +1000 Subject: [PATCH 032/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 3ba60e06..2208d886 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -825,7 +825,7 @@ def patch_sft_trainer_tokenizer(): pass # Patch __init__ with fix_untrained_tokens - function_name, replacer = "__init__", "if self.args.max_steps > 0 and packing:" + function_name, replacer = "train", "if resume_from_checkpoint is False:" function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") function = function.split("\n") From 9823f52bd2aad44728d575738b32724c94298281 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:22:07 +1000 Subject: [PATCH 033/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 2208d886..ec7cc6cb 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ "print('Fixing!')\n" - "fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16)\n\n" + "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From c0c761b19423ed964c4b01a59b5c4baf3d652de7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:23:10 +1000 Subject: [PATCH 034/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index ec7cc6cb..bcdd2c65 
100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -792,6 +792,7 @@ def add_new_tokens( from inspect import getsource import trl.trainer.sft_trainer from trl.trainer.sft_trainer import * +from transformers.trainer import * def patch_sft_trainer_tokenizer(): """ From e2850c07a4f77dce31ae688c97621adcd776744c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:26:15 +1000 Subject: [PATCH 035/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index bcdd2c65..b1aad831 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,6 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print('Fixing!')\n" "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 8e12780357988263d02326dc949c3f1464982c8d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:26:49 +1000 Subject: [PATCH 036/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index b1aad831..f5a5cb9f 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,6 +834,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ + "print(self.train_dataset)\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 6f1855e50ff8053235cc3c901388bfd6c752dd8e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:29:21 +1000 Subject: [PATCH 037/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f5a5cb9f..5225e251 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -580,8 +580,8 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): if_bad_first = False if_bad_second = False # Check tokenizer's chat template for any untrained tokens - if hasattr(tokenizer, "chat_template"): - chat_template = tokenizer.chat_template + chat_template = getattr(tokenizer, "chat_template", None): + if chat_template is not None: if_bad_first = any(x in chat_template for x in actual_bad_tokens) pass From d27b173423c1d734b9e97e50cff96302ac3bbcfe Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:30:15 +1000 Subject: [PATCH 038/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 5225e251..f39508ca 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -580,7 +580,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): if_bad_first = False if_bad_second = False # Check tokenizer's chat template for any untrained tokens - chat_template = getattr(tokenizer, "chat_template", None): + chat_template = getattr(tokenizer, "chat_template", None) if chat_template is not None: if_bad_first = any(x in chat_template for x in actual_bad_tokens) pass From 7bf7399e179769bd947614a7a294a10633ac9b94 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:43:11 +1000 Subject: [PATCH 039/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f39508ca..0f799859 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -553,7 +553,7 @@ def check_tokenizer( pass -@torch.inference_mode +@torch.no_grad def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): """ Llama-3 for eg has untrained vectors in the base model. From 31ecef989a290cc53fe23fcc0d4626418eb606f8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:46:38 +1000 Subject: [PATCH 040/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0f799859..d74bedb4 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -553,15 +553,15 @@ def check_tokenizer( pass -@torch.no_grad +@torch.inference_mode def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): """ Llama-3 for eg has untrained vectors in the base model. These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> We reset them to the mean of the rest of the tokens """ - embedding_matrix = model.get_input_embeddings ().weight.data - lm_head_matrix = model.get_output_embeddings().weight.data + embedding_matrix = model.get_input_embeddings ().weight + lm_head_matrix = model.get_output_embeddings().weight # Get untrained tokens indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps @@ -676,8 +676,8 @@ def mean_of_trained_tokens(model, eps = 1e-16): These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> We reset them to the mean of the rest of the tokens """ - embedding_matrix = model.get_input_embeddings ().weight.data.clone() - lm_head_matrix = model.get_output_embeddings().weight.data.clone() + embedding_matrix = model.get_input_embeddings ().weight.clone() + lm_head_matrix = model.get_output_embeddings().weight.clone() # Get untrained tokens indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps From b67d93f7391d014b90263aee3c9461a01103a1d9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:47:17 +1000 Subject: [PATCH 041/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index d74bedb4..712c9db7 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -750,8 +750,8 @@ def add_new_tokens( # If we use interpolation, we interpolate between the mean embeddings and # the Word2Vec sum of the other vectors - embedding_matrix = model.get_input_embeddings ().weight.data - lm_head_matrix = model.get_output_embeddings().weight.data + embedding_matrix = model.get_input_embeddings ().weight + lm_head_matrix = model.get_output_embeddings().weight if method == "interpolation": print( From e874ccdbb39cfc63284fd3b953a61b77bd8da758 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 04:47:54 +1000 Subject: [PATCH 042/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 712c9db7..df2809e9 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -834,7 +834,6 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(self.train_dataset)\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" 
check_text = check_text.split("\n") From d7b54ffeee9fbd5be61c6a656737ac5fcd0ddaf7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 17:55:19 +1000 Subject: [PATCH 043/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index df2809e9..6a8dc954 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -648,14 +648,14 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): smallest_items = torch.amin(embedding_matrix, axis = 1).abs() smallest_items[where_untrained] = torch.inf smallest_item = smallest_items.min().abs() - mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.1 + mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.01 mean_embedding = mean_embedding.to(embedding_matrix.dtype) # Do for lm_head smallest_items = torch.amin(lm_head_matrix, axis = 1).abs() smallest_items[where_untrained] = torch.inf smallest_item = smallest_items.min().abs() - mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.1 + mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.01 mean_lm_head = mean_lm_head.to(lm_head_matrix.dtype) # Set them to the mean From 5a4a512db9afad17c3db1bd347199f5054a5168e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 19:59:06 +1000 Subject: [PATCH 044/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 48 ++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 6a8dc954..3e84f9ce 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -20,6 +20,10 @@ from transformers.models.llama.modeling_llama import logger from peft import PeftModelForCausalLM import torch +import itertools +import collections +import numpy as np +import gc __all__ = [ "load_correct_tokenizer", @@ -274,12 +278,10 @@ def fix_sentencepiece_gguf(saved_location): user defined tokens. 
Inspiration from https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py """ - import numpy as np from copy import deepcopy from transformers.utils import sentencepiece_model_pb2 import json from enum import IntEnum - import os class SentencePieceTokenTypes(IntEnum): NORMAL = 1 @@ -632,6 +634,15 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): ) pass + # Count all the possible bad tokens + final_counts = np.zeros(len(tokenizer), dtype = np.int64) + def mapping(examples): + input_ids = examples["input_ids"] + counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype = np.int32) + np.add.at(final_counts, counter, 1) + pass + train_dataset.map(mapping, batched = True, desc = "Counting untrained tokens") + # Get sum of all items sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) @@ -644,27 +655,28 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): mean_embedding = (sum_embedding / n_trained) mean_lm_head = (sum_lm_head / n_trained) - # Scale by the smallest correct item to make distribution correct - smallest_items = torch.amin(embedding_matrix, axis = 1).abs() - smallest_items[where_untrained] = torch.inf - smallest_item = smallest_items.min().abs() - mean_embedding *= (smallest_item / mean_embedding.abs()).min() * 0.01 - mean_embedding = mean_embedding.to(embedding_matrix.dtype) - - # Do for lm_head - smallest_items = torch.amin(lm_head_matrix, axis = 1).abs() - smallest_items[where_untrained] = torch.inf - smallest_item = smallest_items.min().abs() - mean_lm_head *= (smallest_item / mean_lm_head.abs()).min() * 0.01 - mean_lm_head = mean_lm_head.to(lm_head_matrix.dtype) + # Scale each to be equal to 1/max_frequency. Also set some to 0 if none seen + scaling = final_counts[where_untrained] / max(final_counts.max(), 1) + scaling = torch.tensor(scaling, device = mean_embedding.device).unsqueeze(1) + mean_embedding = mean_embedding.repeat((n_untrained, 1,)) * scaling + mean_lm_head = mean_lm_head .repeat((n_untrained, 1,)) * scaling + where_null = scaling.ravel() == 0 + mean_embedding[where_null] = 0 + mean_lm_head [where_null] = 0 # Set them to the mean logger.warning( "Unsloth: Setting embed_tokens & lm_head untrained tokens to "\ "mean(trained) to counteract NaNs during training." 
) - embedding_matrix[where_untrained] = mean_embedding - lm_head_matrix [where_untrained] = mean_lm_head + embedding_matrix[where_untrained] = mean_embedding.to(embedding_matrix.dtype) + lm_head_matrix [where_untrained] = mean_lm_head .to(lm_head_matrix .dtype) + + # Clean up + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass return pass @@ -825,7 +837,7 @@ def patch_sft_trainer_tokenizer(): exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) pass - # Patch __init__ with fix_untrained_tokens + # Patch train with fix_untrained_tokens function_name, replacer = "train", "if resume_from_checkpoint is False:" function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") From 82c040e3d634681206bd5e9a996e1d226c0c0958 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:14:09 +1000 Subject: [PATCH 045/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 3e84f9ce..4669f8bd 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,6 +846,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ + "print(dir(self))\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 8e227b2391958b2007144e145086c6f82c608e10 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:21:18 +1000 Subject: [PATCH 046/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 4669f8bd..0ca2967c 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,7 +846,12 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(dir(self))\n"\ + "if Trainer._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ + " raise RuntimeError(\n"\ + " 'You must not edit specific areas of Unsloth's codebase since you'll make it slower.\n'"\ + " 'Please revert your changes back otherwise you might get CUDA segfaults.\n'"\ + " )\n"\ + "pass\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") From 250d386060e50c2a80840b09bb20e33aa3a81620 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:24:11 +1000 Subject: [PATCH 047/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0ca2967c..7521e1f7 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -848,8 +848,7 @@ def patch_sft_trainer_tokenizer(): "\n"\ "if Trainer._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ " raise RuntimeError(\n"\ - " 'You must not edit specific areas of Unsloth's codebase since you'll make it slower.\n'"\ - " 'Please revert your changes back otherwise you might get CUDA segfaults.\n'"\ + " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ " )\n"\ "pass\n"\ "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" From e6db3bae1f211d70fc0ffb6665308d149d08cf1d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 27 May 2024 20:30:51 +1000 Subject: [PATCH 048/153] Update llama.py --- 
unsloth/models/llama.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 1d6a282a..31455630 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1166,7 +1166,7 @@ def from_pretrained( except: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) @@ -1194,7 +1194,7 @@ def from_pretrained( f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning_once(debug_info) + logger.warning(debug_info) import gc for _ in range(3): gc.collect() @@ -1209,7 +1209,7 @@ def from_pretrained( if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) @@ -1238,9 +1238,10 @@ def from_pretrained( n_total_devices = total_batches // ga // bsz if n_total_devices > 2: logger.warning_once( - "Please consider a commercial license - Unsloth was designed for the GPU Poor.\\n" - "The OSS currently works on 4 GPUs - we're a 2 person team, so please help fund\\n" - "our development costs by supporting us through Ko-fi or buying a license! Thanks!", + "Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) divisor = n_total_devices / 2 bsz = self._train_batch_size = max(int(bsz / divisor), 1) @@ -1267,7 +1268,7 @@ def from_pretrained( if "n_total_devices >" not in inner_training_loop: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) @@ -1703,7 +1704,7 @@ def patch_peft_model( if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) From e673fa26f38fd2c987401a04aadf7181c183e1a4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:05:26 +1000 Subject: [PATCH 049/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7521e1f7..d33ea621 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,7 +846,8 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "if Trainer._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ + "print(self.args)\n"\ + "if self._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ " raise RuntimeError(\n"\ " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ " )\n"\ From 222b8355bc1fe7b37be555fd590763be1972329f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:23:21 +1000 Subject: [PATCH 050/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index d33ea621..f9a175f9 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -852,7 +852,17 @@ def patch_sft_trainer_tokenizer(): " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ " )\n"\ "pass\n"\ - "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" + "n_devices = torch.cuda.device_count()\n"\ + "more_than = 0\n"\ + "for j in range(n_devices):\n"\ + " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ + " more_than += (vram > 4)\n"\ + "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')"\ + "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n"\ + "for _ in range(3):\n"\ + " gc.collect()\n"\ + " torch.cuda.empty_cache()\n"\ + "pass\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 6404aa563cbb29f8e02a2357cc3052272c7dfeb6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:23:35 +1000 Subject: [PATCH 051/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f9a175f9..903c2e85 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -857,7 +857,7 @@ def patch_sft_trainer_tokenizer(): "for j in range(n_devices):\n"\ " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ - "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')"\ + "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')\n"\ 
"fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n"\ "for _ in range(3):\n"\ " gc.collect()\n"\ From cfea7b2d237ad8da6ec8453acccaa1781fea9f55 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 00:25:24 +1000 Subject: [PATCH 052/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 903c2e85..8ed7142d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -846,10 +846,9 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(self.args)\n"\ - "if self._inner_training_loop.__name__ != '_fast_inner_training_loop':"\ + "if self._inner_training_loop.__name__ != '_fast_inner_training_loop':\n"\ " raise RuntimeError(\n"\ - " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'"\ + " 'Do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'\n"\ " )\n"\ "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ From 083e5ba5179bd0d3346879f74310787c5e57543d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 03:03:04 +1000 Subject: [PATCH 053/153] Update save.py --- unsloth/save.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index 304d3bee..8840dedc 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1390,6 +1390,20 @@ def unsloth_save_pretrained_gguf( model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) + + # Check if BOS added already, then warn + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + chat_template = getattr(tokenizer, "chat_template", None) + if chat_template is not None and tokenizer.bos_token in chat_template: + logger.warning( + "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." + ) + pass + pass + + # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) @@ -1513,6 +1527,20 @@ def unsloth_push_to_hub_gguf( model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) + + # Check if BOS added already, then warn + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + chat_template = getattr(tokenizer, "chat_template", None) + if chat_template is not None and tokenizer.bos_token in chat_template: + logger.warning( + "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." 
+ ) + pass + pass + + # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) From 6f2565cfe4d4f36c61e3a31ca757d5bce587533b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 03:23:40 +1000 Subject: [PATCH 054/153] Update save.py --- unsloth/save.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 8840dedc..74b06d3e 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1394,7 +1394,8 @@ def unsloth_save_pretrained_gguf( # Check if BOS added already, then warn if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and tokenizer.bos_token in chat_template: + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): logger.warning( "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ @@ -1531,7 +1532,8 @@ def unsloth_push_to_hub_gguf( # Check if BOS added already, then warn if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and tokenizer.bos_token in chat_template: + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): logger.warning( "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ From c19b04ecb4d27445747d2f76648271bf11b45af0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 03:39:45 +1000 Subject: [PATCH 055/153] Update save.py --- unsloth/save.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 74b06d3e..7af62809 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1392,14 +1392,15 @@ def unsloth_save_pretrained_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Check if BOS added already, then warn + print_bos_token_message = False if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) if chat_template is not None and \ (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + print_bos_token_message = True logger.warning( - "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." 
) pass pass @@ -1420,6 +1421,13 @@ def unsloth_save_pretrained_gguf( new_save_directory.lstrip('/.') print(f"Saved GGUF to https://huggingface.co/{link}") pass + + if print_bos_token_message: + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." + ) + pass pass @@ -1530,14 +1538,15 @@ def unsloth_push_to_hub_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Check if BOS added already, then warn + print_bos_token_message = False if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): chat_template = getattr(tokenizer, "chat_template", None) if chat_template is not None and \ (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + print_bos_token_message = True logger.warning( - "Unsloth: ##### Your tokenizer adds a BOS token, and your chat template has a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template.\n"\ - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token." + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." ) pass pass @@ -1555,7 +1564,15 @@ def unsloth_push_to_hub_gguf( link = f"{username}/{new_save_directory.lstrip('/.')}" \ if username not in new_save_directory else \ new_save_directory.lstrip('/.') + print(f"Saved GGUF to https://huggingface.co/{link}") + + if print_bos_token_message: + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." 
+ ) + pass pass From 64b12a2d2ec243f803f43dcdda9e1317ad2fd514 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 28 May 2024 20:23:02 +1000 Subject: [PATCH 056/153] checkpoint --- unsloth/models/_utils.py | 2 +- unsloth/models/llama.py | 2 +- unsloth/tokenizer_utils.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a2d4d50c..22fb5114 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -381,7 +381,7 @@ class Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): def forward(ctx, forward_function, hidden_states, *args): saved_hidden_states = hidden_states.to("cpu", non_blocking = True) with torch.no_grad(): - (output,) = forward_function(hidden_states, *args) + output = forward_function(hidden_states, *args) ctx.save_for_backward(saved_hidden_states) ctx.forward_function = forward_function ctx.args = args diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 31455630..9aeb55e4 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -657,7 +657,7 @@ def LlamaModel_fast_forward( past_key_values, output_attentions, use_cache, - ) + )[0] elif gradient_checkpointing: def create_custom_forward(module): diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 8ed7142d..03f3e341 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -857,11 +857,12 @@ def patch_sft_trainer_tokenizer(): " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')\n"\ - "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n"\ "for _ in range(3):\n"\ " gc.collect()\n"\ " torch.cuda.empty_cache()\n"\ - "pass\n\n" + "pass\n"\ + "\n"\ + "fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n" check_text = check_text.split("\n") check_text = "\n".join(" "*where + x for x in check_text) From 196faeca19336479ee3a10449a5538d5b1978d88 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 04:37:59 +1000 Subject: [PATCH 057/153] Update _utils.py --- unsloth/models/_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 22fb5114..b7333f00 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -79,6 +79,7 @@ "offload_input_embeddings", "offload_output_embeddings", "is_bfloat16_supported", + "unsloth_offloaded_gradient_checkpoint", ] @@ -402,6 +403,12 @@ def backward(ctx, dY): pass +@torch._disable_dynamo +def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, **kwargs): + return Unsloth_Offloaded_Gradient_Checkpointer.apply(function, *args) +pass + + """ Remove warnings about missing kwargs """ From 235be40450a4a766a5de66be15f77c09059b2081 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 14:18:27 +1000 Subject: [PATCH 058/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 03f3e341..f0ea73be 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -577,6 +577,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) + 
print(actual_bad_tokens) # Check if tokenizer and training datasets have bad tokens if_bad_first = False From cf9090acf80ada275286e90b11f1d351c6684bee Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 14:22:15 +1000 Subject: [PATCH 059/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f0ea73be..d7f86457 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -577,7 +577,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) - print(actual_bad_tokens) + print(where_untrained) # Check if tokenizer and training datasets have bad tokens if_bad_first = False From 1fb11107a92d6912556730736ec52c84689e9781 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 29 May 2024 14:28:17 +1000 Subject: [PATCH 060/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index d7f86457..6e4d6910 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -577,8 +577,9 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) - print(where_untrained) - + # Remove None items in actual_bad_tokens + actual_bad_tokens = [x for x in actual_bad_tokens if x is not None] + # Check if tokenizer and training datasets have bad tokens if_bad_first = False if_bad_second = False From d1bd60cb90d2c795e767fee71ca8266a666136f3 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 20:06:13 +1000 Subject: [PATCH 061/153] Update llama.py --- unsloth/models/llama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9aeb55e4..2390146f 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1336,18 +1336,18 @@ def post_patch(model): layers = model.model.layers # Torch.compile fails on embedding matrix?? - # Workaround randomnly fixes it for torch versions < 2.2 - model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight) + # Workaround randomnly fixes it for torch versions < 2. + model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight)) model.config.update({"unsloth_version" : __version__}) # We also do this for the lm_head lm_head = torch.nn.Linear(1, 1, bias = None) del lm_head.weight - lm_head.weight = model.lm_head.weight + lm_head.weight = model.get_output_embeddings().weight lm_head.in_features = lm_head.weight.shape[1] lm_head.out_features = lm_head.weight.shape[0] model.lm_head = lm_head - + # Also patch all dtypes - BnB seems to not allocate the correct type? # BnB default dtype seems to be float16! 
correct_dtype = lm_head.weight.dtype From 732ead0e4053bd7b36bfcf75f0fca76e1c0884e4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:29:36 +1000 Subject: [PATCH 062/153] accelerate --- unsloth/models/_utils.py | 14 ++++++++++++++ unsloth/models/llama.py | 2 ++ 2 files changed, 16 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index b7333f00..de6b864c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -442,6 +442,20 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, pass +# Fix up Accelerate +import accelerate.accelerator +prepare = inspect.getsource(accelerate.accelerator.Accelerator.prepare) +prepare = prepare.split("\n") +spaces = prepare[0].find("def") +prepare = "\n".join(x[spaces:] for x in prepare) +replace = "for obj in args:" +s = " "*spaces +prepare = prepare.replace(replace, f'self.distributed_type = DistributedType.MULTI_CPU\n{s}{replace}', 1) +prepare = prepare.replace("prepare", "_fast_prepare") +exec(prepare, globals()) +accelerate.accelerator.Accelerator.prepare = _fast_prepare + + # Offloading to disk for modules (lm_head, embed_tokens) import os import pickle diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2390146f..7dec8624 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1277,6 +1277,7 @@ def from_pretrained( "is_sagemaker_mp_enabled()", "False", ) + exec(inner_training_loop, globals()) Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length @@ -1316,6 +1317,7 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + Trainer._inner_training_loop = _fast_inner_training_loop # Save tokenizer for inference purposes tokenizer.padding_side = "left" # Force inference From 359ae5c134c8cfb037d8735d64d3cfa8e3866369 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:43:36 +1000 Subject: [PATCH 063/153] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index de6b864c..6ba0fd33 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -444,6 +444,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, # Fix up Accelerate import accelerate.accelerator +from accelerate.utils.dataclasses import DistributedType prepare = inspect.getsource(accelerate.accelerator.Accelerator.prepare) prepare = prepare.split("\n") spaces = prepare[0].find("def") From 8dcfad3ad7eb8fcfd097320fb1b29384ff70bd98 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:46:50 +1000 Subject: [PATCH 064/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 6ba0fd33..d9d75bbd 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'self.distributed_type = DistributedType.MULTI_CPU\n{s}{replace}', 1) +prepare = prepare.replace(replace, f'print(self.distributed_type)\n{s}{replace}', 1) prepare = prepare.replace("prepare", "_fast_prepare") exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = _fast_prepare From 2bafc57756d74ce4b771f3e6b71f7afcf8d54f25 Mon Sep 17 00:00:00 2001 
From: Daniel Han-Chen Date: Thu, 30 May 2024 22:49:59 +1000 Subject: [PATCH 065/153] Update _utils.py --- unsloth/models/_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d9d75bbd..0ab3ecfc 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -452,9 +452,8 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, replace = "for obj in args:" s = " "*spaces prepare = prepare.replace(replace, f'print(self.distributed_type)\n{s}{replace}', 1) -prepare = prepare.replace("prepare", "_fast_prepare") exec(prepare, globals()) -accelerate.accelerator.Accelerator.prepare = _fast_prepare +accelerate.accelerator.Accelerator.prepare = prepare # Offloading to disk for modules (lm_head, embed_tokens) From 90f631162c04217bc3835c3d5a2dce40462cf380 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 22:54:03 +1000 Subject: [PATCH 066/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0ab3ecfc..eb4fbac1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(replace, f'try: self.distributed_type = DistributedType.MULTI_CPU\n{s}except: pass\n{s}print(self.distributed_type)\n{s}{replace}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare From 7b84ff768d2970900e7eccdac22ad72864ab3c3a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 23:25:24 +1000 Subject: [PATCH 067/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index eb4fbac1..7b020426 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'try: self.distributed_type = DistributedType.MULTI_CPU\n{s}except: pass\n{s}print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(replace, f'self.state.distributed_type = DistributedType.NO\n{s}print(self.distributed_type)\n{s}{replace}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare From 60f4b9a1983c227c9918bfc2bd79452e9f65ad75 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 23:28:55 +1000 Subject: [PATCH 068/153] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 7b020426..173e5ddb 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -451,7 +451,7 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, prepare = "\n".join(x[spaces:] for x in prepare) replace = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'self.state.distributed_type = DistributedType.NO\n{s}print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(replace, 
f'self.state.distributed_type = DistributedType.MULTI_CPU\n{s}print(self.distributed_type)\n{s}{replace}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare From 3ebe5a5da4cce238da656d8b255b2c64099970f7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 30 May 2024 23:36:40 +1000 Subject: [PATCH 069/153] Update _utils.py --- unsloth/models/_utils.py | 66 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 173e5ddb..bcd4a7b3 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -410,51 +410,51 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None, """ - Remove warnings about missing kwargs + Remove warnings about missing kwargs and patch stuff """ -try: - from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod - from inspect import getsource - import re - BitsAndBytesConfig__init__ = getsource(BitsAndBytesConfig.__init__) - BitsAndBytesConfig__init__ = re.sub( - r"if[\s]{1,}kwargs\:[\s]{1,}.+?\n", - "", - BitsAndBytesConfig__init__, - flags = re.MULTILINE, - ) - BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.split("\n") - length_spaces = len(re.match(r"[\s]{1,}", BitsAndBytesConfig__init__[0]).group(0)) - BitsAndBytesConfig__init__ = "\n".join(x[length_spaces:] for x in BitsAndBytesConfig__init__) - BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.replace( - "__init__", - "_BitsAndBytesConfig__init__", - ) - exec(BitsAndBytesConfig__init__, globals()) - - import transformers.utils.quantization_config - transformers.utils.quantization_config.BitsAndBytesConfig.__init__ = _BitsAndBytesConfig__init__ -except: - logger.warning_once( - "Unsloth unsuccessfully patched bitsandbytes. Please file a bug report.\n"\ - "Luckily, your training run will still work in the meantime!" 
- ) +from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod +from inspect import getsource +from accelerate.utils.dataclasses import DistributedType +import re +BitsAndBytesConfig__init__ = getsource(BitsAndBytesConfig.__init__) +BitsAndBytesConfig__init__ = re.sub( + r"if[\s]{1,}kwargs\:[\s]{1,}.+?\n", + "", + BitsAndBytesConfig__init__, + flags = re.MULTILINE, +) +BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.split("\n") +length_spaces = len(re.match(r"[\s]{1,}", BitsAndBytesConfig__init__[0]).group(0)) +BitsAndBytesConfig__init__ = "\n".join(x[length_spaces:] for x in BitsAndBytesConfig__init__) +BitsAndBytesConfig__init__ = BitsAndBytesConfig__init__.replace( + "__init__", + "_BitsAndBytesConfig__init__", +) + +def _prepare_backend( + self, cpu: bool = False, sagemaker_dp = False, backend: str = None, +) -> tuple[str, DistributedType]: + return None, DistributedType.NO pass +import accelerate.state +accelerate.state.PartialState._prepare_backend = _prepare_backend - -# Fix up Accelerate import accelerate.accelerator -from accelerate.utils.dataclasses import DistributedType prepare = inspect.getsource(accelerate.accelerator.Accelerator.prepare) prepare = prepare.split("\n") spaces = prepare[0].find("def") prepare = "\n".join(x[spaces:] for x in prepare) -replace = "for obj in args:" +x = "for obj in args:" s = " "*spaces -prepare = prepare.replace(replace, f'self.state.distributed_type = DistributedType.MULTI_CPU\n{s}print(self.distributed_type)\n{s}{replace}', 1) +prepare = prepare.replace(x, f'self.state.distributed_type = DistributedType.NO\n{s}{x}', 1) exec(prepare, globals()) accelerate.accelerator.Accelerator.prepare = prepare +exec(BitsAndBytesConfig__init__, globals()) + +import transformers.utils.quantization_config +transformers.utils.quantization_config.BitsAndBytesConfig.__init__ = _BitsAndBytesConfig__init__ + # Offloading to disk for modules (lm_head, embed_tokens) import os From 7bbc8cee218c66e8a9c2bd519c74098d477c6284 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:23:08 +1000 Subject: [PATCH 070/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 6e4d6910..7d003a1d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -579,7 +579,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) # Remove None items in actual_bad_tokens actual_bad_tokens = [x for x in actual_bad_tokens if x is not None] - + # Check if tokenizer and training datasets have bad tokens if_bad_first = False if_bad_second = False @@ -855,6 +855,7 @@ def patch_sft_trainer_tokenizer(): "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ "more_than = 0\n"\ + "print(n_devices)\n"\ "for j in range(n_devices):\n"\ " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ From 6f5c84c09a8859f429c4d889d14203993ed7c872 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:27:27 +1000 Subject: [PATCH 071/153] train_dataloader --- unsloth/models/llama.py | 3 ++- unsloth/tokenizer_utils.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7dec8624..2a184b8b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1236,6 +1236,7 @@ def 
from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz + print('N total devices = ', n_total_devices) if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" @@ -1349,7 +1350,7 @@ def post_patch(model): lm_head.in_features = lm_head.weight.shape[1] lm_head.out_features = lm_head.weight.shape[0] model.lm_head = lm_head - + # Also patch all dtypes - BnB seems to not allocate the correct type? # BnB default dtype seems to be float16! correct_dtype = lm_head.weight.dtype diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 7d003a1d..6afea680 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -855,7 +855,6 @@ def patch_sft_trainer_tokenizer(): "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ "more_than = 0\n"\ - "print(n_devices)\n"\ "for j in range(n_devices):\n"\ " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ " more_than += (vram > 4)\n"\ From 0d269ca54380fbc4e3a2d10aace295ee66d6b614 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:30:10 +1000 Subject: [PATCH 072/153] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 2a184b8b..d0718310 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1237,6 +1237,9 @@ def from_pretrained( total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz print('N total devices = ', n_total_devices) + print('N total devices = ', n_total_devices) + print('N total devices = ', n_total_devices) + print('N total devices = ', n_total_devices) if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" From 6b7c142fd7fe575a44b6c4c4af1ee3c7b9b283bd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:32:18 +1000 Subject: [PATCH 073/153] Update llama.py --- unsloth/models/llama.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d0718310..bdb881f4 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1213,6 +1213,16 @@ def from_pretrained( "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) + n_devices = torch.cuda.device_count() + more_than = 0 + for j in range(n_devices): + vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024 + more_than += (vram > 4) + if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.') + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + debug_info =""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -1236,10 +1246,6 @@ def from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz - print('N total devices = ', n_total_devices) - print('N total devices = ', n_total_devices) - print('N total devices = ', n_total_devices) - print('N total devices = ', n_total_devices) if n_total_devices > 2: logger.warning_once( "Our OSS was designed for people with few GPU resources to level the playing field.\\n" From 54f3a741df03f9440fe14e8630d1c6819a99690a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:34:30 +1000 Subject: [PATCH 074/153] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index bdb881f4..b9690b3d 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1214,15 +1214,16 @@ def from_pretrained( "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) n_devices = torch.cuda.device_count() + print(n_devices) more_than = 0 for j in range(n_devices): vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024 more_than += (vram > 4) + print(more_than) if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.') for _ in range(3): gc.collect() torch.cuda.empty_cache() - debug_info =""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) From 0bc96c54368377fcba90e09bb6d30d5d37d468ad Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 00:39:25 +1000 Subject: [PATCH 075/153] use_fast_convert --- unsloth/models/llama.py | 13 +------------ unsloth/save.py | 1 + 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index b9690b3d..7dec8624 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1213,17 +1213,6 @@ def from_pretrained( "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) - n_devices = torch.cuda.device_count() - print(n_devices) - more_than = 0 - for j in range(n_devices): - vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024 - more_than += (vram > 4) - print(more_than) - if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache() debug_info =""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -1360,7 +1349,7 @@ def post_patch(model): lm_head.in_features = lm_head.weight.shape[1] lm_head.out_features = lm_head.weight.shape[0] model.lm_head = lm_head - + # Also patch all dtypes - BnB seems to not allocate the correct type? # BnB default dtype seems to be float16! 
correct_dtype = lm_head.weight.dtype diff --git a/unsloth/save.py b/unsloth/save.py index 7af62809..5d6f925d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -975,6 +975,7 @@ def save_to_gguf( vocab_type = "bpe" pass + use_fast_convert = False if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ f"--outfile {final_location} --vocab-type {vocab_type} "\ From a8b5d894d18bc18f654701cf94f28a35b4280d86 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 03:56:12 +1000 Subject: [PATCH 076/153] Update save.py --- unsloth/save.py | 86 ++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 5d6f925d..93fc1b49 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1344,6 +1344,26 @@ def unsloth_save_pretrained_gguf( del arguments["quantization_method"] del arguments["first_conversion"] + # Check if BOS added already, then warn + fix_bos_token = False + chat_template = getattr(tokenizer, "chat_template", None) + new_chat_template = None + + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + + fix_bos_token = True + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." + ) + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) + tokenizer.chat_template = new_chat_template + + pass + pass + # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1386,26 +1406,17 @@ def unsloth_save_pretrained_gguf( pass pass + # Use old chat template if the bos is removed + if fix_bos_token: + tokenizer.chat_template = chat_template + pass + for _ in range(3): gc.collect() model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) - # Check if BOS added already, then warn - print_bos_token_message = False - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - print_bos_token_message = True - logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." - ) - pass - pass - # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, @@ -1422,13 +1433,6 @@ def unsloth_save_pretrained_gguf( new_save_directory.lstrip('/.') print(f"Saved GGUF to https://huggingface.co/{link}") pass - - if print_bos_token_message: - logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." 
- ) - pass pass @@ -1490,6 +1494,26 @@ def unsloth_push_to_hub_gguf( del arguments["quantization_method"] del arguments["first_conversion"] + # Check if BOS added already, then warn + fix_bos_token = False + chat_template = getattr(tokenizer, "chat_template", None) + new_chat_template = None + + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + if chat_template is not None and \ + (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): + + fix_bos_token = True + logger.warning( + f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." + ) + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) + tokenizer.chat_template = new_chat_template + + pass + pass + # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1532,26 +1556,17 @@ def unsloth_push_to_hub_gguf( pass pass + # Use old chat template if the bos is removed + if fix_bos_token: + tokenizer.chat_template = chat_template + pass + for _ in range(3): gc.collect() model_type = self.config.model_type is_sentencepiece_model = check_if_sentencepiece_model(self) - # Check if BOS added already, then warn - print_bos_token_message = False - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - chat_template = getattr(tokenizer, "chat_template", None) - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - print_bos_token_message = True - logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." - ) - pass - pass - # Save to GGUF file_location = save_to_gguf(model_type, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, @@ -1579,7 +1594,6 @@ def unsloth_push_to_hub_gguf( def patch_saving_functions(model): import inspect - import re import types from typing import Callable, Optional, Union, List From 872d569f98304fe103effc478139f44155e36857 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 04:10:03 +1000 Subject: [PATCH 077/153] Update save.py --- unsloth/save.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 93fc1b49..b8e03bd2 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1355,7 +1355,7 @@ def unsloth_save_pretrained_gguf( fix_bos_token = True logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." ) new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) @@ -1505,7 +1505,7 @@ def unsloth_push_to_hub_gguf( fix_bos_token = True logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." 
) new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) From 3a1f5f27d782a3670cead876f70a601b62c27ec6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 31 May 2024 04:26:37 +1000 Subject: [PATCH 078/153] Update save.py --- unsloth/save.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index b8e03bd2..574010ee 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1422,6 +1422,13 @@ def unsloth_save_pretrained_gguf( new_save_directory, quantization_method, first_conversion, makefile, ) + if fix_bos_token: + logger.warning( + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ + "Unsloth: ##### We removed in GGUF's chat template for you." + ) + pass + if push_to_hub: print("Unsloth: Uploading GGUF to Huggingface Hub...") username = upload_to_huggingface( @@ -1583,10 +1590,10 @@ def unsloth_push_to_hub_gguf( print(f"Saved GGUF to https://huggingface.co/{link}") - if print_bos_token_message: + if fix_bos_token: logger.warning( - f"Unsloth: ##### The current model type of {model_type} auto adds a BOS token.\n"\ - "Unsloth: ##### If you're using Ollama or GGUF etc, do not add a BOS in the chat template." + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ + "Unsloth: ##### We removed in GGUF's chat template for you." ) pass pass From bcadc8cb997d803d84ed1834fbb0e253bd5255c8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 3 Jun 2024 02:52:30 +1000 Subject: [PATCH 079/153] Update save.py --- unsloth/save.py | 78 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 574010ee..1a3e532f 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1281,6 +1281,38 @@ def upload_to_huggingface( pass +def fix_tokenizer_bos_token(tokenizer): + # Check if BOS added already, then warn + fix_bos_token = False + chat_template = getattr(tokenizer, "chat_template", None) + + if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): + if chat_template is not None and \ + ( + tokenizer.bos_token in chat_template or \ + "{bos_token}" in chat_template.replace(" ", "") or \ + "{bos_token+" in chat_template.replace(" ", "") + ): + + fix_bos_token = True + logger.warning( + f"Unsloth: ##### The current model auto adds a BOS token.\n"\ + "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." 
+ ) + + # Remove {{bos_token}} + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) + # Remove {{bos_token + + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template) + + tokenizer.chat_template = new_chat_template + + pass + pass + return fix_bos_token, chat_template +pass + + def unsloth_save_pretrained_gguf( self, save_directory : Union[str, os.PathLike], @@ -1344,25 +1376,8 @@ def unsloth_save_pretrained_gguf( del arguments["quantization_method"] del arguments["first_conversion"] - # Check if BOS added already, then warn - fix_bos_token = False - chat_template = getattr(tokenizer, "chat_template", None) - new_chat_template = None - - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - - fix_bos_token = True - logger.warning( - f"Unsloth: ##### The current model auto adds a BOS token.\n"\ - "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." - ) - new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) - tokenizer.chat_template = new_chat_template - - pass - pass + # Fix tokenizer adding an extra BOS token at the front + fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer) # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1408,7 +1423,7 @@ def unsloth_save_pretrained_gguf( # Use old chat template if the bos is removed if fix_bos_token: - tokenizer.chat_template = chat_template + tokenizer.chat_template = old_chat_template pass for _ in range(3): @@ -1501,25 +1516,8 @@ def unsloth_push_to_hub_gguf( del arguments["quantization_method"] del arguments["first_conversion"] - # Check if BOS added already, then warn - fix_bos_token = False - chat_template = getattr(tokenizer, "chat_template", None) - new_chat_template = None - - if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): - if chat_template is not None and \ - (tokenizer.bos_token in chat_template or "{bos_token}" in chat_template.replace(" ", "")): - - fix_bos_token = True - logger.warning( - f"Unsloth: ##### The current model auto adds a BOS token.\n"\ - "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." 
- ) - new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) - tokenizer.chat_template = new_chat_template - - pass - pass + # Fix tokenizer adding an extra BOS token at the front + fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer) # Non blocking install GGUF first if not os.path.exists("llama.cpp"): @@ -1565,7 +1563,7 @@ def unsloth_push_to_hub_gguf( # Use old chat template if the bos is removed if fix_bos_token: - tokenizer.chat_template = chat_template + tokenizer.chat_template = old_chat_template pass for _ in range(3): From 1381820342e3f04b2baeaef838555672bab160c8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 3 Jun 2024 03:22:59 +1000 Subject: [PATCH 080/153] remove_special_tokens --- unsloth/chat_templates.py | 26 +++++++++++++++++++++----- unsloth/models/_utils.py | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 3decdf7f..a5b7a196 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -16,6 +16,7 @@ "get_chat_template", "test_chat_templates", "test_hf_gguf_equivalence", + "remove_special_tokens", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -500,6 +501,19 @@ def get_chat_template( pass +def remove_special_tokens(tokenizer, prompt): + # Removes double BOS token + if tokenizer("A").input_ids[0] == tokenizer.bos_token_id: + input_ids = tokenizer(prompt).input_ids + for j, input_id in enumerate(input_ids): + if input_id != tokenizer.bos_token_id: break + input_ids = input_ids[j:] + prompt = tokenizer.decode(input_ids) + pass + return prompt +pass + + def create_stopping_criteria(tokenizer, stop_word = "eos_token"): class StoppingCriteriaSub(StoppingCriteria): __slots__ = "stop_token", "single_match", "length", @@ -670,7 +684,8 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") if tokenizer.chat_template is not None: prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) prompt = prompt.replace("'", "") # Subprocess does not like '' - prompts.append(prompts) + prompt = remove_special_tokens(tokenizer, prompt) + prompts.append(prompt) pass for prompt in prompts: @@ -688,9 +703,9 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") gguf_tokenized = re.findall("([\d]{1,}) \-\> \'([^\']{1,})\'", gguf_tokens, flags = re.MULTILINE) gguf_tokenized = [(int(x[0]), x[1],) for x in gguf_tokenized] input_ids = tokenizer(prompt).input_ids + tokens = tokenizer.batch_decode(input_ids) hf_tokenized = list(zip(input_ids, tokens)) - print(gguf_tokenized[:5]) # Compare to Huggingface for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)): @@ -698,9 +713,10 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") print("Failed GGUF != HF at", j) print("HF =", hf_token) print("GGUF =", gguf_token) - print(hf_tokenized[:j+1]) - print(gguf_tokenized[:j+1]) - print(gguf_tokens) + print(hf_tokenized) + print() + print(gguf_tokenized) + print() raise RuntimeError("Failed comparing GGUF to HF.") pass pass diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index bcd4a7b3..a6933893 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -31,7 +31,7 @@ import os import psutil -__version__ = "2024.5" +__version__ = "2024.6" # Get Flash Attention v2 if Ampere (RTX 30xx, A100) major_version, minor_version = 
torch.cuda.get_device_capability() From e01b87da7c81b4666b8f140dd62c7ccd93fce571 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 3 Jun 2024 04:55:35 +1000 Subject: [PATCH 081/153] Ollama --- unsloth/chat_templates.py | 165 ++++++++++++++++++++++++++++++++------ 1 file changed, 139 insertions(+), 26 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index a5b7a196..6d473f60 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -30,6 +30,7 @@ CHAT_TEMPLATES = {} +# =========================================== Unsloth # Unsloth efficient template leverages from Zephyr unsloth_template = \ "{{ bos_token }}"\ @@ -54,8 +55,9 @@ "{% endif %}" unsloth_eos_token = "eos_token" CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False,) +pass - +# =========================================== Zephyr # Zephyr has no BOS! zephyr_template = \ "{% for message in messages %}"\ @@ -72,8 +74,9 @@ "{% endif %}" zephyr_eos_token = "eos_token" CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False,) +pass - +# =========================================== ChatML # ChatML has no BOS and not EOS! Rather <|im_start|> and <|im_end|> acts as BOS / EOS. chatml_template = \ "{% for message in messages %}"\ @@ -88,10 +91,27 @@ "{% if add_generation_prompt %}"\ "{{ '<|im_start|>assistant\n' }}"\ "{% endif %}" -chatml_eos_token = "<|im_end|>" -CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True,) +pass +chatml_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ .Response }}<|im_end|> +""" +PARAMETER stop <|im_start|> +PARAMETER stop <|im_end|> +''' +chatml_eos_token = "<|im_end|>" +CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True, chatml_ollama,) +pass + +# =========================================== Mistral-1 # Mistral Instruct doesn't allow system prompts, so we append it to the user message. mistral_template = \ "{{ bos_token }}"\ @@ -117,8 +137,9 @@ "{% endfor %}" mistral_eos_token = "eos_token" CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False,) +pass - +# =========================================== Llama-2 # Adds BOS to every convo! And weird <> system messages. 
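# (For clarity: the "weird" system markers are the <<SYS>> ... <</SYS>> block that Llama-2
#  chat expects inside the first [INST] turn, and, as the note above says, a BOS token is
#  added for every conversation turn.)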
llama_template = \ "{% if messages[0]['role'] == 'system' %}"\ @@ -143,8 +164,9 @@ "{% endfor %}" llama_eos_token = "eos_token" CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False,) +pass - +# =========================================== Vicuna # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template vicuna_template = \ "{{ bos_token }}"\ @@ -169,8 +191,9 @@ "{% endif %}" vicuna_eos_token = "eos_token" CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False,) +pass - +# =========================================== Vicuna Old # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template vicuna_old_template = \ "{{ bos_token }}"\ @@ -195,8 +218,9 @@ "{% endif %}" vicuna_old_eos_token = "eos_token" CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False,) +pass - +# =========================================== Alpaca multi turn # https://github.com/tatsu-lab/stanford_alpaca Changed for multi-turn convos alpaca_template = \ "{{ bos_token }}"\ @@ -219,42 +243,98 @@ "{% if add_generation_prompt %}"\ "{{ '### Response:\n' }}"\ "{% endif %}" -alpaca_eos_token = "eos_token" -CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False,) +pass +alpaca_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} + +{{ end }}{{ if .Prompt }}### Instruction: +{{ .Prompt }} + +{{ end }}### Response: +{{ .Response }}{__EOS_TOKEN__} + +""" +PARAMETER stop {__EOS_TOKEN__} +''' + +alpaca_eos_token = "eos_token" +CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False, alpaca_ollama,) +pass +# =========================================== Gemma # https://huggingface.co/google/gemma-7b-it # Notice we must use |trim for lstrip and rstrip. maps to 106. # maps to 107. user and model are normal 1 word tokens. gemma_template = \ "{{ bos_token }}"\ + "{% if messages[0]['role'] == 'system' %}"\ + "{{'user\n' + messages[0]['content'] | trim + ' ' + messages[1]['content'] | trim + '\n'}}"\ + "{% set loop_messages = messages[2:] %}"\ + "{% endif %}"\ "{% for message in messages %}"\ "{% if message['role'] == 'user' %}"\ "{{'user\n' + message['content'] | trim + '\n'}}"\ "{% elif message['role'] == 'assistant' %}"\ "{{'model\n' + message['content'] | trim + '\n' }}"\ "{% else %}"\ - "{{ 'system\n' + message['content'] | trim + '\n' }}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ "{% endif %}"\ "{% endfor %}"\ "{% if add_generation_prompt %}"\ "{{ 'model\n' }}"\ "{% endif %}" -gemma_eos_token = "" -CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True,) +pass +gemma_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """user +{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} +model +{{ .Response }} +""" +PARAMETER repeat_penalty 1 +PARAMETER stop +PARAMETER stop +PARAMETER penalize_newline false +''' -# Gemma with ChatML instead +gemma_eos_token = "" +CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True, gemma_ollama,) +pass + +# =========================================== Gemma with ChatML instead # We find using is still more appropriate! 
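# (The Gemma turn markers referenced above are <start_of_turn>user / <start_of_turn>model,
#  closed by <end_of_turn>, so gemma_eos_token corresponds to <end_of_turn>. The ChatML
#  variant below instead remaps <start_of_turn> to <|im_start|> and the <eos> token to
#  <|im_end|>.)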
gemma_chatml_template = "{{ bos_token }}" + chatml_template +pass + +gemma_chatml_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ .Response }}<|im_end|> +""" +PARAMETER repeat_penalty 1 +PARAMETER stop <|im_start|> +PARAMETER stop <|im_end|> +PARAMETER penalize_newline false +''' + gemma_chatml_eos_token = ( {"" : "<|im_start|>", "" : "<|im_end|>"}, "<|im_end|>", ) -CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True,) - +CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True, gemma_chatml_ollama,) +pass -# Llama-3 +# =========================================== Llama-3 # Weirdly \n\n is needed? llama3_template = \ "{{ bos_token }}"\ @@ -270,11 +350,30 @@ "{% if add_generation_prompt %}"\ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ "{% endif %}" +pass + +llama3_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> + +{{ .Response }}<|eot_id|>""" +PARAMETER stop "<|start_header_id|>" +PARAMETER stop "<|end_header_id|>" +PARAMETER stop "<|eot_id|>" +PARAMETER stop "<|reserved_special_token" +''' + llama3_template_eos_token = "eos_token" -CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False,) +CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False, llama3_ollama,) +pass -# Phi-3 +# =========================================== Phi-3 phi3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ @@ -289,8 +388,26 @@ "{% if add_generation_prompt %}"\ "{{ '<|assistant|>\n' }}"\ "{% endif %}" +pass + +phi3_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|system|> +{{ .System }}<|end|> +{{ end }}{{ if .Prompt }}<|user|> +{{ .Prompt }}<|end|> +{{ end }}<|assistant|> +{{ .Response }}<|end|> +""" +PARAMETER stop <|end|> +PARAMETER stop <|user|> +PARAMETER stop <|assistant|> +''' + phi3_template_eos_token = "<|end|>" -CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False,) +CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,) +pass def get_chat_template( @@ -503,12 +620,8 @@ def get_chat_template( def remove_special_tokens(tokenizer, prompt): # Removes double BOS token - if tokenizer("A").input_ids[0] == tokenizer.bos_token_id: - input_ids = tokenizer(prompt).input_ids - for j, input_id in enumerate(input_ids): - if input_id != tokenizer.bos_token_id: break - input_ids = input_ids[j:] - prompt = tokenizer.decode(input_ids) + if prompt.startswith(tokenizer.bos_token): + prompt = prompt[len(tokenizer.bos_token):] pass return prompt pass From b3479c7bfb55bb23e841488bc610b6c939e17bd2 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 4 Jun 2024 03:36:03 +1000 Subject: [PATCH 082/153] Update chat_templates.py --- unsloth/chat_templates.py | 121 +++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 7 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 6d473f60..2509b09e 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -17,6 +17,7 @@ "test_chat_templates", "test_hf_gguf_equivalence", "remove_special_tokens", + 
"create_ollama_modelfile", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -53,8 +54,20 @@ "{% if add_generation_prompt %}"\ "{{ '>>> Assistant: ' }}"\ "{% endif %}" +pass + +unsloth_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} +{{ end }}{{ if .Prompt }}>>> User: {{ .Prompt }} +{{ end }}>>> Assistant: {{ .Response }}{__EOS_TOKEN__} +""" +PARAMETER stop {__EOS_TOKEN__} +''' + unsloth_eos_token = "eos_token" -CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False,) +CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False, unsloth_ollama,) pass # =========================================== Zephyr @@ -72,8 +85,23 @@ "{% if add_generation_prompt %}"\ "{{ '<|assistant|>\n' }}"\ "{% endif %}" +pass + +zephyr_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}<|system|> +{{ .System }}{__EOS_TOKEN__} +{{ end }}{{ if .Prompt }}<|user|> +{{ .Prompt }}{__EOS_TOKEN__} +{{ end }}<|assistant|> +{{ .Response }}{__EOS_TOKEN__} +""" +PARAMETER stop {__EOS_TOKEN__} +''' + zephyr_eos_token = "eos_token" -CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False,) +CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False, zephyr_ollama,) pass # =========================================== ChatML @@ -135,8 +163,17 @@ "{{ raise_exception('Only user and assistant roles are supported!') }}"\ "{% endif %}"\ "{% endfor %}" +pass + +mistral_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]""" +PARAMETER stop {__EOS_TOKEN__} +''' + mistral_eos_token = "eos_token" -CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False,) +CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False, mistral_ollama,) pass # =========================================== Llama-2 @@ -162,8 +199,19 @@ "{{ raise_exception('Only user and assistant roles are supported!') }}"\ "{% endif %}"\ "{% endfor %}" +pass + +llama_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """[INST] <>{{ .System }}<> + +{{ .Prompt }} [/INST]""" +PARAMETER stop {__EOS_TOKEN__} +''' + llama_eos_token = "eos_token" -CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False,) +CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False, llama_ollama,) pass # =========================================== Vicuna @@ -189,8 +237,17 @@ "{% if add_generation_prompt %}"\ "{{ 'ASSISTANT:' }}"\ "{% endif %}" +pass + +vicuna_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }} {__EOS_TOKEN__}""" +PARAMETER stop {__EOS_TOKEN__} +''' + vicuna_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False,) +CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False, vicuna_ollama,) pass # =========================================== Vicuna Old @@ -216,8 +273,20 @@ "{% if add_generation_prompt %}"\ "{{ '### Assistant:' }}"\ "{% endif %}" +pass + +vicuna_old_ollama = \ +''' +FROM {__FILE_LOCATION__} +TEMPLATE """{{ if .System }}{{ .System }} +{{ end }}{{ if .Prompt }}### Human: {{ .Prompt }} +{{ end }}### Assistant: {{ .Response }}{__EOS_TOKEN__} +""" +PARAMETER stop {__EOS_TOKEN__} +''' + vicuna_old_eos_token = "eos_token" -CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False,) +CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False, 
vicuna_old_ollama,) pass # =========================================== Alpaca multi turn @@ -415,6 +484,7 @@ def get_chat_template( chat_template = "chatml", mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, map_eos_token = True, + system_message = None, ): assert(type(map_eos_token) is bool) old_tokenizer = tokenizer @@ -449,7 +519,7 @@ def get_chat_template( elif type(chat_template) is str: - chat_template, stop_word, yes_map_eos_token = CHAT_TEMPLATES[chat_template] + chat_template, stop_word, yes_map_eos_token, ollama_modelfile = CHAT_TEMPLATES[chat_template] # Check mapping to eos_token if not map_eos_token and yes_map_eos_token: map_eos_token = True @@ -614,6 +684,9 @@ def get_chat_template( # Patch saving functions tokenizer = patch_saving_functions(tokenizer) + # Add Ollama + tokenizer._ollama_modelfile = ollama_modelfile + tokenizer._system_message = system_message return tokenizer#, stopping_criteria pass @@ -627,6 +700,40 @@ def remove_special_tokens(tokenizer, prompt): pass +def create_ollama_modelfile(tokenizer, gguf_location): + + modelfile = getattr(tokenizer, "ollama_modelfile", None) + if modelfile is None: + raise RuntimeError( + "Unsloth: Tokenizer does not have a `ollama_modelfile` attribute.\n"\ + "Please use get_chat_template(...)." + ) + pass + + system_message = getattr(tokenizer, "_system_message", None) + if system_message is None: + __SYSTEM_MESSAGE__ = "" + else: + __SYSTEM_MESSAGE__ = f'SYSTEM """{system_message}"""' + pass + + modelfile = modelfile\ + .replace("{{", "⚫@✅#🦥")\ + .replace("}}", "⚡@🦥#⛵")\ + .format( + __FILE_LOCATION__ = gguf_location, + __SYSTEM_MESSAGE__ = __SYSTEM_MESSAGE__, + __EOS_TOKEN__ = tokenizer.eos_token, + )\ + .replace("⚫@✅#🦥", "{{")\ + .replace("⚡@🦥#⛵", "}}")\ + .rstrip() + pass + + return modelfile +pass + + def create_stopping_criteria(tokenizer, stop_word = "eos_token"): class StoppingCriteriaSub(StoppingCriteria): __slots__ = "stop_token", "single_match", "length", From 86804dc98c9c0c8860545691e429a9e8cdf8ba28 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 4 Jun 2024 03:37:08 +1000 Subject: [PATCH 083/153] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 2509b09e..b8713789 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -702,7 +702,7 @@ def remove_special_tokens(tokenizer, prompt): def create_ollama_modelfile(tokenizer, gguf_location): - modelfile = getattr(tokenizer, "ollama_modelfile", None) + modelfile = getattr(tokenizer, "_ollama_modelfile", None) if modelfile is None: raise RuntimeError( "Unsloth: Tokenizer does not have a `ollama_modelfile` attribute.\n"\ From 87fdd3a4a414d62b6e1f8e68a72b544207819bc1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 4 Jun 2024 03:43:34 +1000 Subject: [PATCH 084/153] Update chat_templates.py --- unsloth/chat_templates.py | 82 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index b8713789..fb9c929c 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -18,6 +18,7 @@ "test_hf_gguf_equivalence", "remove_special_tokens", "create_ollama_modelfile", + "standardize_dataset", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -700,8 +701,87 @@ def remove_special_tokens(tokenizer, prompt): pass -def create_ollama_modelfile(tokenizer, 
gguf_location): +def standardize_dataset( + dataset, + conversation_key = "conversations", + system_message = None, + aliases_for_system = ["system",], + aliases_for_user = ["user", "human", "input",], + aliases_for_assistant = ["gpt", "assistant", "output",], +): + """ + Standardizes ShareGPT and other formats to user/assistant Hugging Face format. + """ + import collections + import itertools + + convos = dataset[:10][conversation_key] + uniques = collections.defaultdict(list) + for convo in convos: + for message in convo: + for key, value in message.items(): + uniques[key].append(value) + pass + + # Must be only 2 entries + assert(len(uniques.keys()) == 2) + keys = list(uniques.keys()) + length_first = len(set(uniques[keys[0]])) + length_second = len(set(uniques[keys[1]])) + + if length_first < length_second: + # Role is assigned to the first element + role_key = keys[0] + content_key = keys[1] + else: + role_key = keys[1] + content_key = keys[0] + pass + + # Check roles are in aliases + all_aliases = set(aliases_for_system + aliases_for_user + aliases_for_assistant) + roles = set(uniques[role_key]) + leftover_aliases = (all_aliases | roles) - all_aliases + if len(leftover_aliases) != 0: + raise TypeError( + f"Unsloth: {list(leftover_aliases)} are not in aliases. Please update aliases." + ) + pass + + # Mapping for aliases + aliases_mapping = {} + for x in aliases_for_system: aliases_mapping[x] = "system" + for x in aliases_for_user: aliases_mapping[x] = "user" + for x in aliases_for_assistant: aliases_mapping[x] = "assistant" + + def _standardize_dataset(examples): + convos = examples[conversation_key] + all_convos = [] + for convo in convos: + new_convo = [] + if len(convo) == 0: continue + has_system = aliases_mapping[convo[0][role_key]] == "system" + if not has_system and system_message is not None: + new_convo.append({ "role" : "system", "content" : system_message, }) + for message in convo: + role = aliases_mapping[message[role_key]] + new_convo.append({ "role" : role, "content" : message[content_key], }) + pass + all_convos.append(new_convo) + pass + return { conversation_key : all_convos, } + pass + + return dataset.map(_standardize_dataset, batched = True,) +pass + + +def create_ollama_modelfile(tokenizer, gguf_location): + """ + Creates an Ollama Modelfile. 
+ Use ollama.create(model = "new_ollama_model", modelfile = modelfile) + """ modelfile = getattr(tokenizer, "_ollama_modelfile", None) if modelfile is None: raise RuntimeError( From 6386d9439e8eeef95f16382a4c8d43364db0f498 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 8 Jun 2024 04:18:25 +1000 Subject: [PATCH 085/153] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0e860b9d..dd7f6ba1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -209,8 +209,9 @@ def LlamaAttention_fast_forward_inference( # Attention if bsz == 1: + A *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 + # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) - A *= self.scalar # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype) A = torch.matmul(A, Vnn, out = Qn) From b1a95516d7ed8f992272bcc9e73662110c15ea34 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 9 Jun 2024 20:18:01 +1000 Subject: [PATCH 086/153] Update chat_templates.py --- unsloth/chat_templates.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index fb9c929c..9c6a3a77 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -64,7 +64,8 @@ {{ end }}{{ if .Prompt }}>>> User: {{ .Prompt }} {{ end }}>>> Assistant: {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" +SYSTEM """You are a helpful assistant to the user""" ''' unsloth_eos_token = "eos_token" @@ -98,7 +99,7 @@ {{ end }}<|assistant|> {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' zephyr_eos_token = "eos_token" @@ -132,8 +133,8 @@ {{ end }}<|im_start|>assistant {{ .Response }}<|im_end|> """ -PARAMETER stop <|im_start|> -PARAMETER stop <|im_end|> +PARAMETER stop "<|im_start|>" +PARAMETER stop "<|im_end|>" ''' chatml_eos_token = "<|im_end|>" @@ -166,11 +167,12 @@ "{% endfor %}" pass +# Ollama from https://www.ollama.com/library/mistral mistral_ollama = \ ''' FROM {__FILE_LOCATION__} TEMPLATE """[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]""" -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' mistral_eos_token = "eos_token" @@ -202,13 +204,14 @@ "{% endfor %}" pass +# Ollama from https://www.ollama.com/library/llama3 llama_ollama = \ ''' FROM {__FILE_LOCATION__} TEMPLATE """[INST] <>{{ .System }}<> {{ .Prompt }} [/INST]""" -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' llama_eos_token = "eos_token" @@ -240,11 +243,12 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/vicuna vicuna_ollama = \ ''' FROM {__FILE_LOCATION__} TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }} {__EOS_TOKEN__}""" -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" ''' vicuna_eos_token = "eos_token" @@ -283,7 +287,8 @@ {{ end }}{{ if .Prompt }}### Human: {{ .Prompt }} {{ end }}### Assistant: {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" 
+SYSTEM """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.""" ''' vicuna_old_eos_token = "eos_token" @@ -298,7 +303,7 @@ "{{ messages[0]['content'] + '\n\n' }}"\ "{% set loop_messages = messages[1:] %}"\ "{% else %}"\ - "{{ 'Below are some instructions that describes some tasks. Write responses that appropriately completes each request.\n\n' }}"\ + "{{ 'Below are some instructions that describe some tasks. Write responses that appropriately complete each request.\n\n' }}"\ "{% set loop_messages = messages %}"\ "{% endif %}"\ "{% for message in loop_messages %}"\ @@ -321,13 +326,14 @@ TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction: -{{ .Prompt }} +{{ .Prompt }}{{ end }} -{{ end }}### Response: +### Response: {{ .Response }}{__EOS_TOKEN__} """ -PARAMETER stop {__EOS_TOKEN__} +PARAMETER stop "{__EOS_TOKEN__}" +SYSTEM """Below are some instructions that describe some tasks. Write responses that appropriately complete each request.""" ''' alpaca_eos_token = "eos_token" @@ -358,6 +364,7 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/gemma gemma_ollama = \ ''' FROM {__FILE_LOCATION__} @@ -367,8 +374,8 @@ {{ .Response }} """ PARAMETER repeat_penalty 1 -PARAMETER stop -PARAMETER stop +PARAMETER stop "" +PARAMETER stop "" PARAMETER penalize_newline false ''' @@ -392,8 +399,8 @@ {{ .Response }}<|im_end|> """ PARAMETER repeat_penalty 1 -PARAMETER stop <|im_start|> -PARAMETER stop <|im_end|> +PARAMETER stop "<|im_start|>" +PARAMETER stop "<|im_end|>" PARAMETER penalize_newline false ''' @@ -422,6 +429,7 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/llama3 llama3_ollama = \ ''' FROM {__FILE_LOCATION__} @@ -435,7 +443,6 @@ PARAMETER stop "<|start_header_id|>" PARAMETER stop "<|end_header_id|>" PARAMETER stop "<|eot_id|>" -PARAMETER stop "<|reserved_special_token" ''' llama3_template_eos_token = "eos_token" @@ -460,6 +467,7 @@ "{% endif %}" pass +# Ollama from https://www.ollama.com/library/phi3 phi3_ollama = \ ''' FROM {__FILE_LOCATION__} @@ -470,9 +478,9 @@ {{ end }}<|assistant|> {{ .Response }}<|end|> """ -PARAMETER stop <|end|> -PARAMETER stop <|user|> -PARAMETER stop <|assistant|> +PARAMETER stop "<|end|>" +PARAMETER stop "<|user|>" +PARAMETER stop "<|assistant|>" ''' phi3_template_eos_token = "<|end|>" From 344a05d467eadd6e35f83b6a67d21d6b8cbd8475 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 9 Jun 2024 20:34:19 +1000 Subject: [PATCH 087/153] Support bfloat16 GGUF --- unsloth/save.py | 97 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 20 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 1a3e532f..6cef5b6d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -59,7 +59,8 @@ "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.", "quantized" : "Recommended. Slow conversion. Fast inference, small files.", "f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.", - "f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.", + "bf16" : "Bfloat16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.", + "f16" : "Float16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.", "q8_0" : "Fast conversion. High resource use, but generally acceptable.", "q4_k_m" : "Recommended. 
Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K", "q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K", @@ -102,7 +103,7 @@ def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencep if os.path.isfile(f"{file_location}/tokenizer.model"): sentencepiece_model = True pass - shutil.rmtree(file_location) + shutil.rmtree(file_location, ignore_errors = True) return sentencepiece_model pass @@ -700,7 +701,7 @@ def unsloth_save_model( # Remove temporary location import shutil - shutil.rmtree(temporary_location) + shutil.rmtree(temporary_location, ignore_errors = True) for _ in range(3): torch.cuda.empty_cache() @@ -763,7 +764,7 @@ def install_llama_cpp_old(version = -10): print(f"**[WARNING]** Deleting llama.cpp directory... {10-i} seconds left.") time.sleep(1) import shutil - shutil.rmtree("llama.cpp") + shutil.rmtree("llama.cpp", ignore_errors = True) pass # Clone a specific commit @@ -866,10 +867,11 @@ def _fix_gemma_gguf(): def save_to_gguf( model_type : str, + model_dtype : str, is_sentencepiece : bool = False, model_directory : str = "unsloth_finetuned_model", quantization_method : str = "fast_quantized", - first_conversion : str = "f16", + first_conversion : str = None, _run_installer = None, # Non blocking install of llama.cpp ): # logger.warning( @@ -877,6 +879,22 @@ def save_to_gguf( # "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ # "Please be patient - GGUF saving should still work, but might not work as well." # ) + assert(model_dtype == "float16" or model_dtype == "bfloat16") + model_dtype = "f16" if model_dtype == "float16" else "bf16" + + # Check if bfloat16 is supported + if model_dtype == "bf16" and not torch.cuda.is_bf16_supported(): + logger.warning( + "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\ + "We shall switch instead to f16." + ) + model_dtype = "f16" + pass + + # Check first_conversion as well + if first_conversion is None: + first_conversion = model_dtype + pass if quantization_method.startswith("iq2"): raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") @@ -889,7 +907,7 @@ def save_to_gguf( pass logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.") - if quantization_method == "not_quantized": quantization_method = "f16" + if quantization_method == "not_quantized": quantization_method = model_dtype elif quantization_method == "fast_quantized": quantization_method = "q8_0" elif quantization_method == "quantized": quantization_method = "q4_k_m" elif quantization_method is None: quantization_method = "q8_0" @@ -911,12 +929,13 @@ def save_to_gguf( print(print_info) # Check first_conversion format - if first_conversion == "f16" : pass - elif first_conversion == "f32" : pass - elif first_conversion == "q8_0": pass + if first_conversion == "f16" : pass + if first_conversion == "bf16" : pass + elif first_conversion == "f32" : pass + elif first_conversion == "q8_0" : pass else: raise RuntimeError( - f"Unsloth: `first_conversion` can only be one of ['f16', 'f32', 'q8_0'] and not `{first_conversion}`." + f"Unsloth: `first_conversion` can only be one of ['f16', 'bf16', 'f32', 'q8_0'] and not `{first_conversion}`." 
) pass @@ -935,11 +954,13 @@ def save_to_gguf( if quantization_method == "f32": first_conversion = "f32" elif quantization_method == "f16": first_conversion = "f16" + elif quantization_method == "bf16": first_conversion = "bf16" elif quantization_method == "q8_0": first_conversion = "q8_0" else: # Quantized models must have f16 as the default argument - if first_conversion == "f32" : pass - elif first_conversion == "f16" : pass + if first_conversion == "f32" : pass + elif first_conversion == "f16" : pass + elif first_conversion == "bf16" : pass elif first_conversion == "q8_0": logger.warning_once( "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\ @@ -950,8 +971,22 @@ def save_to_gguf( pass # Non llama/mistral needs can only use f32 or f16 - if not use_fast_convert and (first_conversion != "f16" or first_conversion != "f32"): - logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.") + if not use_fast_convert and \ + (first_conversion != "f16" or first_conversion != "bf16" or first_conversion != "f32"): + + pass + # Latest llama.cpp works for all models for q8_0! + + # logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.") + # first_conversion = "f16" + pass + + # Check if bfloat16 is supported + if first_conversion == "bf16" and not torch.cuda.is_bf16_supported(): + logger.warning( + "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\ + "We shall switch instead to f16." + ) first_conversion = "f16" pass @@ -1318,7 +1353,7 @@ def unsloth_save_pretrained_gguf( save_directory : Union[str, os.PathLike], tokenizer = None, quantization_method : str = "fast_quantized", - first_conversion : str = "f16", + first_conversion : str = None, push_to_hub : bool = False, token : Optional[Union[str, bool]] = None, private : Optional[bool] = None, @@ -1429,11 +1464,22 @@ def unsloth_save_pretrained_gguf( for _ in range(3): gc.collect() - model_type = self.config.model_type + model_dtype = self.config.torch_dtype + model_type = self.config.model_type + if type(model_dtype) is str: + assert(model_dtype == "float16" or model_dtype == "bfloat16") + elif model_dtype == torch.float16: + model_dtype = "float16" + elif model_dtype == torch.bfloat16: + model_dtype = "bfloat16" + else: + raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16") + pass + is_sentencepiece_model = check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, is_sentencepiece_model, + file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) @@ -1463,7 +1509,7 @@ def unsloth_push_to_hub_gguf( repo_id : str, tokenizer = None, quantization_method : str = "fast_quantized", - first_conversion : str = "f16", + first_conversion : str = None, use_temp_dir : Optional[bool] = None, commit_message : Optional[str] = "Trained with Unsloth", private : Optional[bool] = None, @@ -1569,11 +1615,22 @@ def unsloth_push_to_hub_gguf( for _ in range(3): gc.collect() - model_type = self.config.model_type + model_dtype = self.config.torch_dtype + model_type = self.config.model_type + if type(model_dtype) is str: + assert(model_dtype == "float16" or model_dtype == "bfloat16") + elif model_dtype == torch.float16: + model_dtype = "float16" + elif model_dtype == torch.bfloat16: + model_dtype = "bfloat16" + else: + raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16") + pass + is_sentencepiece_model = 
check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, is_sentencepiece_model, + file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) From 6b11e0d6f443c14831490c0a67fb7d1491e8fad0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:20:22 +1000 Subject: [PATCH 088/153] Update save.py --- unsloth/save.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/save.py b/unsloth/save.py index 6cef5b6d..3ad2f346 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1010,6 +1010,7 @@ def save_to_gguf( vocab_type = "bpe" pass + # convert.py is deprecated! use_fast_convert = False if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ From c6e4b5ba33dce8e048465bf9623a3c333ef1a814 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:33:12 +1000 Subject: [PATCH 089/153] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index dd7f6ba1..0fa505fc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -209,7 +209,7 @@ def LlamaAttention_fast_forward_inference( # Attention if bsz == 1: - A *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 + Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963 # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len]) # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched From 57f29ab0c6fe24e6df94bf3ee608538ea2895b2b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:40:31 +1000 Subject: [PATCH 090/153] fast_forward_inference --- unsloth/models/llama.py | 2 +- unsloth/models/qwen2.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0fa505fc..8f48cd12 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -792,7 +792,7 @@ def _CausalLM_fast_forward( *args, **kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: - if past_key_values is not None and self.config.model_type != "qwen2": + if past_key_values is not None: outputs = fast_forward_inference( self, input_ids, diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 76fe31a6..115bf3e0 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -13,7 +13,6 @@ # limitations under the License. 
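# Qwen2 follows the Llama decoder layout closely enough that this wrapper now delegates
# straight to FastLlamaModel.from_pretrained below, so no Mistral-specific path is kept.
#
# A minimal usage sketch (assumed top-level entry point, mirroring the other wrappers):
#
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name     = "Qwen/Qwen2-7B",
#       max_seq_length = 4096,
#       load_in_4bit   = True,
#   )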
from .llama import * -from .mistral import FastMistralModel import os from ._utils import __version__ @@ -60,7 +59,7 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "Qwen/Qwen1.5-7B", + model_name = "Qwen/Qwen2-7B", max_seq_length = 4096, dtype = None, load_in_4bit = True, @@ -73,7 +72,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( + return FastLlamaModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From d32e97264e4ecdebcc6126bc70b6c9bbe9406a26 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:46:54 +1000 Subject: [PATCH 091/153] Update mapper.py --- unsloth/models/mapper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 8808b855..73aa06ca 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -197,4 +197,12 @@ for value in values: FLOAT_TO_INT_MAPPER[value] = key pass + + # Get lowercased + lowered_key = key.lower() + INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower() + + for value in values: + FLOAT_TO_INT_MAPPER[value.lower()] = lowered_key + pass pass From e121fa5df2e15376a77505c8df6074186cfc694e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:47:33 +1000 Subject: [PATCH 092/153] Update loader.py --- unsloth/models/loader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b2f0e4ef..3bc091b3 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -33,6 +33,9 @@ def _get_model_name(model_name, load_in_4bit = True): + # First try replacing lowercase 'b' with uppercase 'B' + model_name = model_name.lower() + if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: model_name = INT_TO_FLOAT_MAPPER[model_name] logger.warning_once( From 5eaa10f3a279eb6acf8ffb348935ce6e9b032ba8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 10 Jun 2024 03:53:30 +1000 Subject: [PATCH 093/153] Update llama.py --- unsloth/models/llama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8f48cd12..6064af59 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1831,10 +1831,10 @@ def patch_peft_model( @staticmethod def for_inference(model): - if model.config.model_type == "qwen2": - FastLlamaModel.for_training(model) - return - pass + # if model.config.model_type == "qwen2": + # FastLlamaModel.for_training(model) + # return + # pass internal_model = model internal_model.gradient_checkpointing = False From f57d28d1460ca5c71b48c99438f5c96197db3c3b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 11 Jun 2024 03:56:52 +1000 Subject: [PATCH 094/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 6afea680..0e286166 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -193,17 +193,38 @@ def assert_same_tokenization(slow_tokenizer, fast_tokenizer): if x.endswith("_token") and x.count("_") == 1 ))) all_special_tokens = list(set(special_tokens + slow_tokenizer.all_special_tokens)) + + # Check if chat template is enabled! 
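    # If both tokenizers carry a chat template, the same templated conversation must also
    # tokenize to identical input_ids with each of them; the result is AND-ed with the
    # special-token string check further down before the tokenizers are deemed equivalent.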
+ check_chat_template = True + + if getattr(slow_tokenizer, "chat_template", None) is not None and \ + getattr(fast_tokenizer, "chat_template", None) is not None: + + # Check chat template! + messages = [ + {"role": "user", "content": " What is 2+2? "}, + {"role": "assistant", "content": " It's 4. "}, + ] + check_chat_template = \ + slow_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids == \ + fast_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids + pass + try: string = "\n".join(all_special_tokens) + \ "A quick brown fox jumps over the lazy dog!!\n\nHi\n\n" + \ "".join(all_special_tokens) - return slow_tokenizer(string).input_ids == fast_tokenizer(string).input_ids + check_special_tokens = \ + slow_tokenizer(string).input_ids == \ + fast_tokenizer(string).input_ids + + return check_chat_template and check_special_tokens except: # For eg see https://github.com/unslothai/unsloth/issues/292 # Sometimes tokenizer has weird tokens, causing a combined tokenization to fail. # [TODO] We temporarily disable this for CodeLlama tokenizers if slow_tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING: - return True + return check_chat_template else: return False pass From 893750707f0be1bdb6b04689dd94d02187ece7cf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 11 Jun 2024 19:53:29 +1000 Subject: [PATCH 095/153] info --- unsloth/models/llama.py | 26 ++++++++++----------- unsloth/models/mistral.py | 48 ++++++++++++++++----------------------- 2 files changed, 32 insertions(+), 42 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6064af59..e6f9e756 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1207,12 +1207,12 @@ def from_pretrained( debug_info = """n_total_devices = total_train_batch_size // \\ args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) debug_info =""" debug_info = debug_info.split('\n') @@ -1237,17 +1237,17 @@ def from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) - divisor = n_total_devices / 2 + divisor = n_total_devices / 1 bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 2: - divisor = n_total_devices / 2 + if total_batches // ga // bsz > 1: + divisor = n_total_devices / 1 ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" check_batches = check_batches.split('\n') check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 365d60a3..5c49c636 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -393,21 +393,6 @@ def from_pretrained( layer.self_attn.apply_o = original_apply_o pass - # Patch Trainer - from transformers.trainer import Trainer - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - try: - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - except: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - pass - pass - # Patch Trainer from transformers.trainer import Trainer try: @@ -419,7 +404,7 @@ def from_pretrained( except: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) @@ -447,7 +432,11 @@ def from_pretrained( f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning_once(debug_info)""" + logger.warning(debug_info) + import gc + for _ in range(3): + gc.collect() + torch.cuda.empty_cache()""" debug_info = debug_info.split('\n') debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) @@ -455,12 +444,12 @@ def from_pretrained( debug_info = """n_total_devices = total_train_batch_size // \\ args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", ) debug_info =""" debug_info = debug_info.split('\n') @@ -485,16 +474,17 @@ def from_pretrained( bsz = self._train_batch_size total_batches = bsz * ga * args.world_size n_total_devices = total_batches // ga // bsz - if n_total_devices > 2: + if n_total_devices > 1: logger.warning_once( - "Please consider a commercial license - Unsloth was designed for the GPU Poor.\\n" - "The OSS currently works on 4 GPUs - we're a 2 person team, so please help fund\\n" - "our development costs by supporting us through Ko-fi or buying a license! Thanks!", + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) - divisor = n_total_devices / 2 + divisor = n_total_devices / 1 bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 2: - divisor = n_total_devices / 2 + if total_batches // ga // bsz > 1: + divisor = n_total_devices / 1 ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" check_batches = check_batches.split('\n') check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) From 8982edb6eac2bd2d4facb92a97c3e5add6216348 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 11 Jun 2024 20:21:28 +1000 Subject: [PATCH 096/153] edits --- unsloth/models/llama.py | 8 +++++++- unsloth/models/mistral.py | 8 +++++++- unsloth/tokenizer_utils.py | 14 +++++++++----- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index e6f9e756..4cbbcf0a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1196,7 +1196,13 @@ def from_pretrained( f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' logger.warning(debug_info) - import gc + import subprocess, re, gc + output = subprocess.check_output( + 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + if output > 1: raise RuntimeError( + 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') for _ in range(3): gc.collect() torch.cuda.empty_cache()""" diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 5c49c636..fc2e1a9f 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -433,7 +433,13 @@ def from_pretrained( f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' logger.warning(debug_info) - import gc + import subprocess, re, gc + output = subprocess.check_output( + 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + if output > 1: raise RuntimeError( + 'Error: More than 1 GPUs have a lot of VRAM usage. 
Please obtain a commercial license.') for _ in range(3): gc.collect() torch.cuda.empty_cache()""" diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0e286166..9ffd1d71 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -595,6 +595,8 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): # Get set and actual tokens where_untrained = where_untrained.tolist() if len(where_untrained) == 0: return + + # Remove untrained indices where it's longer where_untrained_set = frozenset(where_untrained) actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained) @@ -875,11 +877,13 @@ def patch_sft_trainer_tokenizer(): " )\n"\ "pass\n"\ "n_devices = torch.cuda.device_count()\n"\ - "more_than = 0\n"\ - "for j in range(n_devices):\n"\ - " vram = torch.cuda.max_memory_reserved(torch.cuda.device(j)) / 1024 / 1024 / 1024\n"\ - " more_than += (vram > 4)\n"\ - "if more_than > 1: raise RuntimeError('Error: More than 1 GPUs have a lot of VRAM usage.')\n"\ + "import subprocess, re\n"\ + "output = subprocess.check_output(\n"\ + " 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\ + "output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output)\n"\ + "output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output)\n"\ + "if output > 1: raise RuntimeError(\n"\ + " 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.')\n"\ "for _ in range(3):\n"\ " gc.collect()\n"\ " torch.cuda.empty_cache()\n"\ From 8904605967995db57cc133d0cb924c53cc006afd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 12 Jun 2024 06:30:12 +1000 Subject: [PATCH 097/153] Create chat template --- unsloth/chat_templates.py | 291 +++++++++++++++++++++++++++++++++++++- 1 file changed, 290 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 9c6a3a77..70391cc2 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -17,8 +17,11 @@ "test_chat_templates", "test_hf_gguf_equivalence", "remove_special_tokens", - "create_ollama_modelfile", "standardize_dataset", + + "construct_chat_template", + "test_construct_chat_template", + "create_ollama_modelfile", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -785,6 +788,292 @@ def _standardize_dataset(examples): pass +import re + +def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []): + added_tokens_decoder = tokenizer.added_tokens_decoder.values() + added_tokens_decoder = [str(x) for x in added_tokens_decoder] + + # Remove added_tokens_decoder duplicates + added_tokens_decoder = list(set(added_tokens_decoder) - set(extra_eos_tokens)) + + # Remove BOS + if getattr(tokenizer, "bos_token", None) is not None: + added_tokens_decoder = [x for x in added_tokens_decoder if x != tokenizer.bos_token] + pass + + repeatted_tokens = [] + # Join all vocab + joined_text = "\x01\x00".join(added_tokens_decoder) + for token in added_tokens_decoder: + n = len(token) + repeatted_counts = joined_text.count(token[:n//2]) + # Try finding longer than 1/2 of the token in the rest + # For eg <|reserved_special_token_0|>, <|reserved_special_token_1|> + if repeatted_counts > 2: + for j in range(n//2+1, n): + if joined_text.count(token[:j]) < repeatted_counts: + j -= 1 + # Remove repeatted tokens to reduce search space + joined_text = joined_text.replace(token[:j], "") + repeatted_tokens.append(token[:j]) + break + pass + pass + pass + + # Remove duplicates + splitted = joined_text.split("\x01\x00") + final_eos_tokens = [] + 
for old, new in zip(added_tokens_decoder, splitted): + if old == new: final_eos_tokens.append(old) + pass + final_eos_tokens += extra_eos_tokens + final_eos_tokens += repeatted_tokens + return final_eos_tokens +pass + + +def construct_chat_template( \ + +tokenizer = None, + +template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{OUTPUT}<|eot_id|>""", + +default_system_message = \ + "Below are some instructions that describe some tasks. Write responses that appropriately complete each request.", + +extra_eos_tokens = None, + +): + """ + Creates a Ollama modelfile and a HF Jinja template from a custom + template. You must provide 2x examples of an input & output. + There is an optional system message as well. + + You must use {INPUT}, {OUTPUT} twice, and {SYSTEM} is optional. + """ + assert(tokenizer is not None) + + if extra_eos_tokens is None: extra_eos_tokens = [] + + vocab = tokenizer.get_vocab() + for extra_eos in extra_eos_tokens: + assert(type(extra_eos) is str) + if extra_eos not in vocab: + raise ValueError(f"Unsloth: `{extra_eos}` is not a singular token in the tokenizer.") + pass + pass + + # O(N^2) search finding 2 repeatted pieces of text + j = len(template)-1 + at_least_one = False + while j > 0: + found = template.rfind(template[j:], 0, j) + if found == -1: break + j -= 1 + at_least_one = True + pass + if j > 0: j += 1 + else: raise + + if not at_least_one: raise + + # Repeatted text + instruction_response = template[j:] + if instruction_response.count("{INPUT}") != 1 or instruction_response.count("{OUTPUT}") != 1: + raise RuntimeError( + "Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\ + "and the assistant output {OUTPUT}\n\n"\ + "For example what is not allowed is just:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\ + "What is required is 2x of this:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n" + ) + pass + + # 1st System, Instruction, Output pair + left = template[:j] + # 2nd Instruction, Output pair + right = template[j:] + + # Isolate input + extra_eos_tokens_regex = "|".join(f"(?:{re.escape(x)})" for x in extra_eos_tokens) + if len(extra_eos_tokens_regex) != 0: + find_end = f"(?:{extra_eos_tokens_regex})?" + else: + find_end = "" + find_end = r"\{INPUT\}[\s\n]{0,}" + find_end + input_end = list(re.finditer(find_end, right)) + assert(len(input_end) == 1) + input_end = input_end[0] + input_end = input_end.span(0)[1] + input_part = right[:input_end] + + # Isolate output + output_part = right[input_end:] + + # Isolate system + system_part = left[:left.find(input_part)] + + # Check if the user provided a correct prompt + combined = system_part + input_part + output_part + if combined != left: + combined_changed = combined.replace('\n', '\\n') + left_changed = left .replace('\n', '\\n') + raise RuntimeError( + "Unsloth: The prompt template you provided isn't correct. 
You gave:\n"\ + f"{combined_changed}\n\n"\ + "But we require the following:\n"\ + f"{left_changed}" + ) + pass + + # Ollama modelfile parts + + # Check bos_token is in system prompt + ollama_system = system_part + has_bos_token = False + if tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None): + if ollama_system.startswith(tokenizer.bos_token): + has_bos_token = True + ollama_system = ollama_system[len(tokenizer.bos_token):] + pass + pass + system_modelfile = "{{ if .System }}" + ollama_system.replace("{SYSTEM}", "{{ .System }}") + "{{ end }}" + input_modelfile = "{{ if .Prompt }}" + input_part .replace("{INPUT}", "{{ .Prompt }}") + "{{ end }}" + output_modelfile = output_part.replace("{OUTPUT}", "{{ .Response }}") + + # Check if EOS token is at the end of the output + if not output_modelfile.endswith(tuple(extra_eos_tokens)): + output_modelfile += "{__EOS_TOKEN__}" + pass + + # Ollama EOS + ollama_eos = get_ollama_eos_tokens(tokenizer, extra_eos_tokens) + ollama_eos = '\n'.join(f'PARAMETER stop "{eos}"' for eos in ollama_eos) + + # Ollama modelfile + modelfile = 'FROM {__FILE_LOCATION__}\n\n'\ + 'TEMPLATE """' + system_modelfile + input_modelfile + output_modelfile + \ + '"""\n\n' + ollama_eos + + # HF Jinja Chat template + def process(part, which, content = "message['content']"): + if part.endswith(which): + part = "'" + part[:part.find(which)] + f"' + {content}" + elif part.startswith(which): + part = f"{content} + '" + part[part.find(which):] + "'" + else: + part = "'" + part.replace(which, f"' + {content} + '") + "'" + if part.startswith("'' + "): part = part[5:] + return part + pass + input_jinja = process(input_part, "{INPUT}") + output_jinja = process(output_part, "{OUTPUT}") + pass + + jinja_template = \ + "{% for message in loop_messages %}"\ + "{% if message['role'] == 'user' %}"\ + "{{ " + input_jinja + " }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ " + output_jinja + " }}"\ + "{% else %}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ + "{% endif %}"\ + "{% endfor %}"\ + "{% if add_generation_prompt %}"\ + "{{ '" + output_part[:output_part.find("{OUTPUT}")] + "' }}"\ + "{% endif %}" + pass + + # Now add system prompt to jinja + if len(system_part) != 0: + partial_system = process(system_part, "{SYSTEM}", "messages[0]['content']") + partial_system = partial_system.replace("{SYSTEM}", "") + + # Separate the BOS + if has_bos_token: + partial_system = partial_system.replace(tokenizer.bos_token, "", 1) + pass + + partial_system = \ + "{% if messages[0]['role'] == 'system' %}"\ + "{{ " + partial_system + " }}"\ + "{% set loop_messages = messages[1:] %}" + if default_system_message is not None: + partial_system += "{% else %}"\ + "{{ '" + system_part.replace("{SYSTEM}", default_system_message) + "' }}"\ + "{% set loop_messages = messages %}"\ + "{% endif %}" + else: + partial_system += "{% endif %}" + pass + + jinja_template = partial_system + jinja_template + + if has_bos_token: + jinja_template = "{{ bos_token }}" + jinja_template + pass + + return modelfile, jinja_template +pass + + +def test_construct_chat_template(): + token = "hf_" + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token = token) + + template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + 
+{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{OUTPUT}<|eot_id|>""" + + default_system_message = \ + "Below are some instructions that describe some tasks. Write responses that appropriately complete each request." + + extra_eos_tokens = None + + modelfile, jinja_template = construct_chat_template(template, default_system_message, extra_eos_tokens) + + messages = [ + {"role": "system", "content": "You are an assistant"}, + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "It's 4."}, + {"role": "user", "content": "Ok!"}, + {"role": "assistant", "content": "Anything else?"}, + {"role": "user", "content": "What's 2x2?"}, + ] + correct_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + + tokenizer.chat_template = jinja_template + new_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + + assert(correct_output == new_output) + pass +pass + + def create_ollama_modelfile(tokenizer, gguf_location): """ Creates an Ollama Modelfile. From 2a374c23683a4b39910b23559eb99a9dd0af7e2d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 03:25:36 +1000 Subject: [PATCH 098/153] Fix tokenizer --- unsloth/chat_templates.py | 26 +++--- unsloth/tokenizer_utils.py | 166 ++++++++++++++++++++++++++++++++----- 2 files changed, 159 insertions(+), 33 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 70391cc2..4c782326 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -788,8 +788,6 @@ def _standardize_dataset(examples): pass -import re - def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []): added_tokens_decoder = tokenizer.added_tokens_decoder.values() added_tokens_decoder = [str(x) for x in added_tokens_decoder] @@ -875,6 +873,15 @@ def construct_chat_template( \ pass pass + error_msg = \ + "Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\ + "and the assistant output {OUTPUT}\n\n"\ + "For example what is not allowed is just:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\ + "What is required is 2x of this:\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\ + "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n" + # O(N^2) search finding 2 repeatted pieces of text j = len(template)-1 at_least_one = False @@ -885,22 +892,15 @@ def construct_chat_template( \ at_least_one = True pass if j > 0: j += 1 - else: raise + else: raise RuntimeError(error_msg) + - if not at_least_one: raise + if not at_least_one: raise RuntimeError(error_msg) # Repeatted text instruction_response = template[j:] if instruction_response.count("{INPUT}") != 1 or instruction_response.count("{OUTPUT}") != 1: - raise RuntimeError( - "Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\ - "and the assistant output {OUTPUT}\n\n"\ - "For example what is not allowed is just:\n"\ - "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\ - "What is required is 2x of this:\n"\ - "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\ - "### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n" - ) + raise RuntimeError(error_msg) pass # 1st System, Instruction, Output pair diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 9ffd1d71..f10b2c0a 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -185,6 +185,111 @@ def 
convert_to_fast_tokenizer( pass +# Check Mistral chat template without BOS / EOS +mistral_template = \ + "{% if messages[0]['role'] == 'system' %}"\ + "{% if messages[1]['role'] == 'user' %}"\ + "{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[2:] %}"\ + "{% else %}"\ + "{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[1:] %}"\ + "{% endif %}"\ + "{% else %}"\ + "{% set loop_messages = messages %}"\ + "{% endif %}"\ + "{% for message in loop_messages %}"\ + "{% if message['role'] == 'user' %}"\ + "{{ '[INST] ' + message['content'] + ' [/INST]' }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ message['content'] }}"\ + "{% else %}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ + "{% endif %}"\ + "{% endfor %}" +pass + +# Check Llama chat template without BOS / EOS +llama_template = \ + "{% if messages[0]['role'] == 'system' %}"\ + "{% if messages[1]['role'] == 'user' %}"\ + "{{ '[INST] <>\n' + messages[0]['content'] + '\n<>\n\n' + messages[1]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[2:] %}"\ + "{% else %}"\ + "{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\ + "{% set loop_messages = messages[1:] %}"\ + "{% endif %}"\ + "{% else %}"\ + "{% set loop_messages = messages %}"\ + "{% endif %}"\ + "{% for message in loop_messages %}"\ + "{% if message['role'] == 'user' %}"\ + "{{ '[INST] ' + message['content'].strip() + ' [/INST]' }}"\ + "{% elif message['role'] == 'assistant' %}"\ + "{{ ' ' + message['content'].strip() + ' ' }}"\ + "{% else %}"\ + "{{ raise_exception('Only user and assistant roles are supported!') }}"\ + "{% endif %}"\ + "{% endfor %}" +pass + + +def select_correct_slow_tokenizer( + tokenizer_name, + model_max_length = None, + padding_side = "right", + token = None, + trust_remote_code = False, + cache_dir = "huggingface_tokenizers_cache", +): + """ + Returns 'correct' tokenizer by checking if the chat templates are + actually tokenized correctly. + """ + messages = [ + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "It's 4."}, + ] + + settings = ( + (False, False, True,), + (False, True, True,), + (True, False, True,), + (True, False, False,), + ) + + for (use_fast, legacy, from_slow,) in settings: + # Default as mentioned by Arthur from HF: + slow_tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, + trust_remote_code = trust_remote_code, + # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 + use_fast = use_fast, + legacy = legacy, + from_slow = from_slow, + cache_dir = cache_dir, + ) + slow_tokenizer_chat_template = slow_tokenizer.chat_template + + slow_tokenizer.chat_template = llama_template + result1 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) + slow_tokenizer.chat_template = mistral_template + result2 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) + + # If 2 spaces seen, normally wrong! 
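The `mistral_template` and `llama_template` strings above are plain Jinja, so their output can be previewed without loading any tokenizer. A small illustration using jinja2 directly, with a simplified user/assistant-only template (the system-message branch is dropped for brevity; this is not the exact template from the patch):

```python
from jinja2 import Template

# Simplified user/assistant-only version of the Mistral-style template above.
simple_mistral = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "{{ '[INST] ' + message['content'] + ' [/INST]' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ message['content'] }}"
    "{% endif %}"
    "{% endfor %}"
)

messages = [
    {"role": "user", "content": "What is 2+2?"},
    {"role": "assistant", "content": "It's 4."},
]
print(Template(simple_mistral).render(messages = messages))
# [INST] What is 2+2? [/INST]It's 4.
```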
+ if " "*2 not in result1 and " "*2 not in result2: + slow_tokenizer.chat_template = slow_tokenizer_chat_template + return slow_tokenizer + pass + pass + # Return fast version as default + return slow_tokenizer +pass + + def assert_same_tokenization(slow_tokenizer, fast_tokenizer): # Get eos_token, bos_token etc dir_names = dir(slow_tokenizer) @@ -195,21 +300,44 @@ def assert_same_tokenization(slow_tokenizer, fast_tokenizer): all_special_tokens = list(set(special_tokens + slow_tokenizer.all_special_tokens)) # Check if chat template is enabled! - check_chat_template = True + check_chat_template1 = True + check_chat_template2 = True + check_chat_template3 = True + slow_chat_template = getattr(slow_tokenizer, "chat_template", None) + fast_chat_template = getattr(fast_tokenizer, "chat_template", None) + messages = [ + {"role": "user", "content": " What is 2+2? "}, + {"role": "assistant", "content": " It's 4. "}, + ] + # Check the tokenizer's own chat template + if slow_chat_template is not None and fast_chat_template is not None: + check_chat_template1 = \ + slow_tokenizer.apply_chat_template(messages) == \ + fast_tokenizer.apply_chat_template(messages) + pass - if getattr(slow_tokenizer, "chat_template", None) is not None and \ - getattr(fast_tokenizer, "chat_template", None) is not None: + # Check Mistral chat template without BOS / EOS + slow_tokenizer.chat_template = mistral_template + fast_tokenizer.chat_template = mistral_template + check_chat_template2 = \ + slow_tokenizer.apply_chat_template(messages) == \ + fast_tokenizer.apply_chat_template(messages) + pass - # Check chat template! - messages = [ - {"role": "user", "content": " What is 2+2? "}, - {"role": "assistant", "content": " It's 4. "}, - ] - check_chat_template = \ - slow_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids == \ - fast_tokenizer(slow_tokenizer.apply_chat_template(messages)).input_ids + # Check Llama chat template without BOS / EOS + slow_tokenizer.chat_template = llama_template + fast_tokenizer.chat_template = llama_template + check_chat_template3 = \ + slow_tokenizer.apply_chat_template(messages) == \ + fast_tokenizer.apply_chat_template(messages) pass + # Combine them all and revert chat templates + check_chat_template = check_chat_template1 and check_chat_template2 and check_chat_template3 + slow_tokenizer.chat_template = slow_chat_template + fast_tokenizer.chat_template = fast_chat_template + + # Try special tokens try: string = "\n".join(all_special_tokens) + \ "A quick brown fox jumps over the lazy dog!!\n\nHi\n\n" + \ @@ -227,6 +355,7 @@ def assert_same_tokenization(slow_tokenizer, fast_tokenizer): return check_chat_template else: return False + pass pass @@ -379,17 +508,13 @@ def load_correct_tokenizer( # Mainly to solve Deepseek models with no tokenizer.model file slow_tokenizer = None try: - slow_tokenizer = AutoTokenizer.from_pretrained( + slow_tokenizer = select_correct_slow_tokenizer( tokenizer_name, - model_max_length = model_max_length, - padding_side = padding_side, - token = token, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, trust_remote_code = trust_remote_code, - # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 - use_fast = False, - legacy = False, - from_slow = True, - cache_dir = cache_dir, + cache_dir = cache_dir, ) except: pass @@ -418,6 +543,7 @@ def load_correct_tokenizer( if assert_same_tokenization(slow_tokenizer, fast_tokenizer): return fast_tokenizer else: + 
logger.warning(f"Unsloth: Will load {tokenizer_name} as a legacy tokenizer.") return convert_to_fast_tokenizer(slow_tokenizer) pass else: From 8176155d0099d60a3e084a8a04934b674a89c169 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 16:48:50 +1000 Subject: [PATCH 099/153] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 70 +++++--------------------------------- 1 file changed, 9 insertions(+), 61 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f10b2c0a..df88170e 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -232,62 +232,6 @@ def convert_to_fast_tokenizer( "{% endif %}"\ "{% endfor %}" pass - - -def select_correct_slow_tokenizer( - tokenizer_name, - model_max_length = None, - padding_side = "right", - token = None, - trust_remote_code = False, - cache_dir = "huggingface_tokenizers_cache", -): - """ - Returns 'correct' tokenizer by checking if the chat templates are - actually tokenized correctly. - """ - messages = [ - {"role": "user", "content": "What is 2+2?"}, - {"role": "assistant", "content": "It's 4."}, - ] - - settings = ( - (False, False, True,), - (False, True, True,), - (True, False, True,), - (True, False, False,), - ) - - for (use_fast, legacy, from_slow,) in settings: - # Default as mentioned by Arthur from HF: - slow_tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name, - model_max_length = model_max_length, - padding_side = padding_side, - token = token, - trust_remote_code = trust_remote_code, - # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 - use_fast = use_fast, - legacy = legacy, - from_slow = from_slow, - cache_dir = cache_dir, - ) - slow_tokenizer_chat_template = slow_tokenizer.chat_template - - slow_tokenizer.chat_template = llama_template - result1 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) - slow_tokenizer.chat_template = mistral_template - result2 = slow_tokenizer.decode(slow_tokenizer.apply_chat_template(messages)) - - # If 2 spaces seen, normally wrong! 
- if " "*2 not in result1 and " "*2 not in result2: - slow_tokenizer.chat_template = slow_tokenizer_chat_template - return slow_tokenizer - pass - pass - # Return fast version as default - return slow_tokenizer -pass def assert_same_tokenization(slow_tokenizer, fast_tokenizer): @@ -508,13 +452,17 @@ def load_correct_tokenizer( # Mainly to solve Deepseek models with no tokenizer.model file slow_tokenizer = None try: - slow_tokenizer = select_correct_slow_tokenizer( + slow_tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, - model_max_length = model_max_length, - padding_side = padding_side, - token = token, + model_max_length = model_max_length, + padding_side = padding_side, + token = token, trust_remote_code = trust_remote_code, - cache_dir = cache_dir, + # Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373 + use_fast = False, + legacy = False, + from_slow = True, + cache_dir = cache_dir, ) except: pass From 21a99f1d0c1b1866ae35353669d5b64c80d3804b Mon Sep 17 00:00:00 2001 From: Eliot Hall <60240707+chrehall68@users.noreply.github.com> Date: Wed, 12 Jun 2024 23:52:28 -0700 Subject: [PATCH 100/153] fix case where gguf saving fails due to first_conversion dtype (#630) --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index 3ad2f346..c521799f 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -930,7 +930,7 @@ def save_to_gguf( # Check first_conversion format if first_conversion == "f16" : pass - if first_conversion == "bf16" : pass + elif first_conversion == "bf16" : pass elif first_conversion == "f32" : pass elif first_conversion == "q8_0" : pass else: From dbf2dcff1a9297040ddfa039a581ef6b630fda37 Mon Sep 17 00:00:00 2001 From: Eliot Hall <60240707+chrehall68@users.noreply.github.com> Date: Wed, 12 Jun 2024 23:53:29 -0700 Subject: [PATCH 101/153] Support revision parameter in FastLanguageModel.from_pretrained (#629) * support `revision` parameter * match unsloth formatting of named parameters --- unsloth/models/loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3bc091b3..190c026a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -81,6 +81,7 @@ def from_pretrained( trust_remote_code = False, use_gradient_checkpointing = True, resize_model_vocab = None, + revision = None, *args, **kwargs, ): if token is None and "HF_TOKEN" in os.environ: @@ -95,12 +96,12 @@ def from_pretrained( # First check if it's a normal model via AutoConfig is_peft = False try: - model_config = AutoConfig.from_pretrained(model_name, token = token) + model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision) is_peft = False except: try: # Most likely a PEFT model - peft_config = PeftConfig.from_pretrained(model_name, token = token) + peft_config = PeftConfig.from_pretrained(model_name, token = token, revision = revision) except: raise RuntimeError(f"Unsloth: `{model_name}` is not a full model or a PEFT model.") @@ -154,6 +155,7 @@ def from_pretrained( model_patcher = dispatch_model, tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, + revision = revision if not is_peft else None, *args, **kwargs, ) @@ -189,7 +191,7 @@ def from_pretrained( if is_peft: # Now add PEFT adapters - model = PeftModel.from_pretrained(model, old_model_name, token = token) + model = PeftModel.from_pretrained(model, old_model_name, token = token, revision = 
revision) # Patch it as well! model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing) pass From 9016171682289d413ee305863a8c476663268118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rickard=20Ed=C3=A9n?= Date: Thu, 13 Jun 2024 08:55:57 +0200 Subject: [PATCH 102/153] clears any selected_adapters before calling internal_model.save_pretrained (#609) --- unsloth/save.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index c521799f..682dd530 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -643,7 +643,8 @@ def unsloth_save_model( model.config = new_config # Save! - + + save_pretrained_settings["selected_adapters"] = None # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username): print(f"Unsloth: Saving to organization with address {new_save_directory}") From 0428920ecde77494a4442b8e8584be7a40dc4d37 Mon Sep 17 00:00:00 2001 From: XiaoYang Date: Thu, 13 Jun 2024 14:57:23 +0800 Subject: [PATCH 103/153] Update __init__.py (#602) Check for incompatible modules before importing unsloth --- unsloth/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index d85eca00..bb997147 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -14,8 +14,17 @@ import os import warnings import importlib +import sys -# Currently only supports 1 GPU, or else seg faults will occur. +# Define a list of modules to check +MODULES_TO_CHECK = ["peft", "bitsandbytes"] + +# Check if any of the modules in the list have been imported +for module in MODULES_TO_CHECK: + if module in sys.modules: + raise ImportError(f"Please import unsloth before {module}.") + +# Currently only supports 1 GPU, or else seg faults will occur. if "CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" devices = os.environ["CUDA_VISIBLE_DEVICES"] From 9fdd847dab60086355c2fe2bffcb7cd1c0b24461 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 13 Jun 2024 15:22:34 +0800 Subject: [PATCH 104/153] Fixed unsloth/tokenizer_utils.py for chat training (#604) --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index df88170e..5941623b 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -920,7 +920,7 @@ def patch_sft_trainer_tokenizer(): check_text = \ "\n"\ - "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ + "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])[0]\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ "has_bos_token_already = (test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template) "\ From b5fc6aa0089050a31c734e9fa24b4e69e9d83200 Mon Sep 17 00:00:00 2001 From: mahiatlinux <110882203+mahiatlinux@users.noreply.github.com> Date: Thu, 13 Jun 2024 19:27:03 +1200 Subject: [PATCH 105/153] Add GGML saving option to Unsloth for easier Ollama model creation and testing. (#345) * Add save to llama.cpp GGML to save.py. * Fix conversion command and path of convert to GGML function. 
* Add autosaving lora to the GGML function * Create lora save function for conversion to GGML * Test fix #2 for saving lora * Test fix #3 to save the lora adapters to convert to GGML * Remove unwated tokenizer saving for conversion to ggml and added a few print statements. * Needed tokenizer for saving, added it back, also made it more unslothy style by having positional arguments, and added a few messages. * Positional arguments didn't work out, so reverted to older version of the code, and added a few comments. * Test fix 1 for arch * Test fix 2 new Mistral error. * Test fix 3 * Revert to old version for testing. * Upload issue test fix 1 * Fix 2 uploading ggml * Positional ags added. * Temporray remove positional args * Fix upload again!!! * Add print statements and fix link * Make the calling name better * Create local saving for GGML * Add choosing directory to save local GGML. * Fix lil variable error in the save_to_custom_dir func --- unsloth/save.py | 144 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 682dd530..9c1380c4 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1655,6 +1655,140 @@ def unsloth_push_to_hub_gguf( pass pass +# Corrected function to save LoRA to a custom directory +def save_lora_to_custom_dir(model, tokenizer, save_directory): + # Create the custom directory if it doesn't exist + os.makedirs(save_directory, exist_ok=True) + + # Call the unsloth_save_model function with the custom directory + unsloth_save_model( + model, + tokenizer, + save_directory=save_directory, + save_method="lora", + push_to_hub=False, + ) + +# Corrected method within the model class to convert LoRA to GGML and push to Hugging Face Hub +def unsloth_convert_lora_to_ggml_and_push_to_hub( + self, + tokenizer, + repo_id: str, + use_temp_dir: Optional[bool] = None, + commit_message: Optional[str] = "Converted LoRA to GGML with Unsloth", + private: Optional[bool] = None, + token: Union[bool, str, None] = None, + create_pr: bool = False, + revision: str = None, + commit_description: str = "Convert LoRA to GGML format using Unsloth", + temporary_location: str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage: float = 0.85, +): + if not os.path.exists("llama.cpp"): + if IS_KAGGLE_ENVIRONMENT: + python_install = install_python_non_blocking(["protobuf"]) + python_install.wait() + install_llama_cpp_blocking(use_cuda=False) + makefile = None + else: + git_clone = install_llama_cpp_clone_non_blocking() + python_install = install_python_non_blocking(["protobuf"]) + git_clone.wait() + makefile = install_llama_cpp_make_non_blocking() + python_install.wait() + else: + makefile = None + + for _ in range(3): + gc.collect() + + lora_directory_push = "lora-to-ggml-push" + save_lora_to_custom_dir(self, tokenizer, lora_directory_push) + + model_type = self.config.model_type + output_file = os.path.join(lora_directory_push, "ggml-adapter-model.bin") + + print(f"Unsloth: Converting auto-saved LoRA adapters at {lora_directory_push} to GGML format.") + print(f"The output file will be {output_file}") + + command = f"python3 llama.cpp/convert-lora-to-ggml.py {lora_directory_push} {output_file} llama" + + try: + with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp: + for line in sp.stdout: + print(line, end="", flush=True) + for line in sp.stderr: + print(line, end="", flush=True) + sp.wait() + if sp.returncode != 0: + 
raise subprocess.CalledProcessError(sp.returncode, command) + except subprocess.CalledProcessError as e: + print(f"Error: Conversion failed with return code {e.returncode}") + return + + print(f"Unsloth: Conversion completed! Output file: {output_file}") + + print("Unsloth: Uploading GGML file to Hugging Face Hub...") + username = upload_to_huggingface( + self, repo_id, token, + "GGML converted LoRA", "ggml", output_file, None, private, + ) + link = f"{repo_id.lstrip('/')}" + print("Unsloth: Done.") + print(f"Converted LoRA to GGML and uploaded to https://huggingface.co/{link}") + print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!") + +def unsloth_convert_lora_to_ggml_and_save_locally( + self, + save_directory: str, # Added parameter for the folder name + tokenizer, + temporary_location: str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage: float = 0.85, +): + if not os.path.exists("llama.cpp"): + if IS_KAGGLE_ENVIRONMENT: + python_install = install_python_non_blocking(["protobuf"]) + python_install.wait() + install_llama_cpp_blocking(use_cuda=False) + makefile = None + else: + git_clone = install_llama_cpp_clone_non_blocking() + python_install = install_python_non_blocking(["protobuf"]) + git_clone.wait() + makefile = install_llama_cpp_make_non_blocking() + python_install.wait() + else: + makefile = None + + for _ in range(3): + gc.collect() + + # Use the provided save_directory for local saving + save_lora_to_custom_dir(self, tokenizer, save_directory) + + model_type = self.config.model_type + output_file = os.path.join(save_directory, "ggml-adapter-model.bin") + + print(f"Unsloth: Converting auto-saved LoRA adapters at {save_directory} to GGML format.") + print(f"The output file will be {output_file}") + + command = f"python3 llama.cpp/convert-lora-to-ggml.py {save_directory} {output_file} llama" + + try: + with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp: + for line in sp.stdout: + print(line, end="", flush=True) + for line in sp.stderr: + print(line, end="", flush=True) + sp.wait() + if sp.returncode != 0: + raise subprocess.CalledProcessError(sp.returncode, command) + except subprocess.CalledProcessError as e: + print(f"Error: Conversion failed with return code {e.returncode}") + return + print("Unsloth: Done.") + print(f"Unsloth: Conversion completed! Output file: {output_file}") + print("\nThis GGML making function was made by Maheswar. 
Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!") def patch_saving_functions(model): import inspect @@ -1747,10 +1881,12 @@ def patch_saving_functions(model): # Add saving methods to top level model if hasattr(model, "config"): # Counteract tokenizers - model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) - model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) - model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) - model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) + model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) + model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) + model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) + model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) + model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) + model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) pass return model pass From 3fafbf7dc7c010c7ff6df34afe30514fc2871d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20De=20Greef?= Date: Thu, 13 Jun 2024 00:30:37 -0700 Subject: [PATCH 106/153] docs: Add LoraConfig parameters documentation (#619) --- PARAMETERS.md | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 PARAMETERS.md diff --git a/PARAMETERS.md b/PARAMETERS.md new file mode 100644 index 00000000..94d63798 --- /dev/null +++ b/PARAMETERS.md @@ -0,0 +1,87 @@ +## LoraConfig Parameters + +Adjusting the `LoraConfig` parameters allows you to balance model performance and computational efficiency in Low-Rank Adaptation (LoRA). Here’s a concise breakdown of key parameters: + +**r** +- **Description**: Rank of the low-rank decomposition for factorizing weight matrices. +- **Impact**: + - **Higher**: Retains more information, increases computational load. + - **Lower**: Fewer parameters, more efficient training, potential performance drop if too small. + + +**lora_alpha** +- **Description**: Scaling factor for the low-rank matrices' contribution. +- **Impact**: + - **Higher**: Increases influence, speeds up convergence, risks instability or overfitting. + - **Lower**: Subtler effect, may require more training steps. + +**lora_dropout** +- **Description**: Probability of zeroing out elements in low-rank matrices for regularization. +- **Impact**: + - **Higher**: More regularization, prevents overfitting, may slow training and degrade performance. + - **Lower**: Less regularization, may speed up training, risks overfitting. + +**loftq_config** +- **Description**: Configuration for LoftQ, a quantization method for the backbone weights and initialization of LoRA layers. +- **Impact**: + - **Not None**: If specified, LoftQ will quantize the backbone weights and initialize the LoRA layers. It requires setting `init_lora_weights='loftq'`. + - **None**: LoftQ quantization is not applied. + - **Note**: Do not pass an already quantized model when using LoftQ as LoftQ handles the quantization process itself. + + +**use_rslora** +- **Description**: Enables Rank-Stabilized LoRA (RSLora). 
+- **Impact**: + - **True**: Uses Rank-Stabilized LoRA, setting the adapter scaling factor to `lora_alpha/math.sqrt(r)`, which has been proven to work better as per the [Rank-Stabilized LoRA paper](https://doi.org/10.48550/arXiv.2312.03732). + - **False**: Uses the original default scaling factor `lora_alpha/r`. + +**gradient_accumulation_steps** +- **Default**: 1 +- **Description**: The number of steps to accumulate gradients before performing a backpropagation update. +- **Impact**: + - **Higher**: Accumulate gradients over multiple steps, effectively increasing the batch size without requiring additional memory. This can improve training stability and convergence, especially with large models and limited hardware. + - **Lower**: Faster updates but may require more memory per step and can be less stable. + +**weight_decay** +- **Default**: 0.01 +- **Description**: Regularization technique that applies a small penalty to the weights during training. +- **Impact**: + - **Non-zero Value (e.g., 0.01)**: Adds a penalty proportional to the magnitude of the weights to the loss function, helping to prevent overfitting by discouraging large weights. + - **Zero**: No weight decay is applied, which can lead to overfitting, especially in large models or with small datasets. + +**learning_rate** +- **Default**: 2e-4 +- **Description**: The rate at which the model updates its parameters during training. +- **Impact**: + - **Higher**: Faster convergence but risks overshooting optimal parameters and causing instability in training. + - **Lower**: More stable and precise updates but may slow down convergence, requiring more training steps to achieve good performance. + +## Target Modules + +**q_proj (query projection)** +- **Description**: Part of the attention mechanism in transformer models, responsible for projecting the input into the query space. +- **Impact**: Transforms the input into query vectors that are used to compute attention scores. + +**k_proj (key projection)** +- **Description**: Projects the input into the key space in the attention mechanism. +- **Impact**: Produces key vectors that are compared with query vectors to determine attention weights. + +**v_proj (value projection)** +- **Description**: Projects the input into the value space in the attention mechanism. +- **Impact**: Produces value vectors that are weighted by the attention scores and combined to form the output. + +**o_proj (output projection)** +- **Description**: Projects the output of the attention mechanism back into the original space. +- **Impact**: Transforms the combined weighted value vectors back to the input dimension, integrating attention results into the model. + +**gate_proj (gate projection)** +- **Description**: Typically used in gated mechanisms within neural networks, such as gating units in gated recurrent units (GRUs) or other gating mechanisms. +- **Impact**: Controls the flow of information through the gate, allowing selective information passage based on learned weights. + +**up_proj (up projection)** +- **Description**: Used for up-projection, typically increasing the dimensionality of the input. +- **Impact**: Expands the input to a higher-dimensional space, often used in feedforward layers or when transitioning between different layers with differing dimensionalities. + +**down_proj (down projection)** +- **Description**: Used for down-projection, typically reducing the dimensionality of the input. 
+- **Impact**: Compresses the input to a lower-dimensional space, useful for reducing computational complexity and controlling the model size. From 273a871c3bd63acc449c253a6371f8cebb7e29e3 Mon Sep 17 00:00:00 2001 From: Alberto Ferrer Date: Thu, 13 Jun 2024 02:16:20 -0600 Subject: [PATCH 107/153] llama.cpp failing (#371) llama.cpp is failing to generate quantize versions for the trained models. Error: ```bash You might have to compile llama.cpp yourself, then run this again. You do not need to close this Python program. Run the following commands in a new terminal: You must run this in the same folder as you're saving your model. git clone https://github.com/ggerganov/llama.cpp cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j Once that's done, redo the quantization. ``` But when i do clone this with recursive it works. Co-authored-by: Daniel Han From b312b3fa38aabd3e8386301ee2181927187f0dcb Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Thu, 13 Jun 2024 10:37:57 +0200 Subject: [PATCH 108/153] fix libcuda_dirs import for triton 3.0 (#227) * fix libcuda_dirs import for triton 3.0 * Update __init__.py * Update __init__.py --------- Co-authored-by: Daniel Han --- unsloth/__init__.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index bb997147..428c9873 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -15,6 +15,7 @@ import warnings import importlib import sys +from packaging.version import Version # Define a list of modules to check MODULES_TO_CHECK = ["peft", "bitsandbytes"] @@ -75,8 +76,14 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Try loading bitsandbytes and triton import bitsandbytes as bnb + import triton -from triton.common.build import libcuda_dirs +libcuda_dirs = lambda: None +if Version(triton.__version__) >= Version("3.0.0"): + try: from triton.backends.nvidia.driver import libcuda_dirs + except: pass +else: from triton.common.build import libcuda_dirs + import os import re import numpy as np @@ -112,8 +119,11 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 importlib.reload(bnb) importlib.reload(triton) try: - import bitsandbytes as bnb - from triton.common.build import libcuda_dirs + libcuda_dirs = lambda: None + if Version(triton.__version__) >= Version("3.0.0"): + try: from triton.backends.nvidia.driver import libcuda_dirs + except: pass + else: from triton.common.build import libcuda_dirs cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() except: From 1601dca031238f46745207b643f90336c3ac1151 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:23:31 +1000 Subject: [PATCH 109/153] Update save.py --- unsloth/save.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 9c1380c4..26880e5e 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -22,7 +22,7 @@ import pickle import gc from transformers.models.llama.modeling_llama import logger -from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters +from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters_bias import subprocess import psutil import re @@ -132,9 +132,10 @@ def _free_cached_model(model): def _merge_lora(layer, name): + bias = None if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)): # Is LoRA so we need to merge! 
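The `_merge_lora` change in PATCH 109 above additionally returns the layer bias, but the merge itself is the usual LoRA fold-in. A standalone sketch of that fold-in on dense float tensors (shapes and the scaling value are made up; the real function also dequantizes 4-bit weights before merging):

```python
import torch

d_out, d_in, r = 8, 16, 4
scaling = 0.5                       # typically lora_alpha / r

W = torch.randn(d_out, d_in)        # frozen base weight
A = torch.randn(r, d_in)            # lora_A
B = torch.randn(d_out, r)           # lora_B

# Folding the adapter into the base weight: W_merged = W + scaling * (B @ A)
W_merged = W + scaling * (B @ A)

x = torch.randn(d_in)
# The merged weight reproduces base output plus the adapter applied separately.
print(torch.allclose(W_merged @ x, W @ x + scaling * (B @ (A @ x)), atol = 1e-5))  # True
```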
- W, quant_state, A, B, s = get_lora_parameters(layer) + W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer) if quant_state is not None: dtype = quant_state.dtype if type(quant_state) is not list else quant_state[2] W = fast_dequantize(W, quant_state) @@ -156,7 +157,7 @@ def _merge_lora(layer, name): W = W.t().to(dtype) else: W = layer.weight - return W + return W, bias pass @@ -527,7 +528,12 @@ def unsloth_save_model( for item in LLAMA_WEIGHTS: proj = eval(f"layer.{item}") name = f"model.layers.{j}.{item}.weight" - W = _merge_lora(proj, name) + W, bias = _merge_lora(proj, name) + + # Bias term + if bias is not None: + state_dict[f"model.layers.{j}.{item}.bias"] = bias + pass if (torch.cuda.memory_allocated() + W.nbytes) < max_vram: # Save to GPU memory From 26dc50294a0fe62a85fdb46c76f70148c567739c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:25:27 +1000 Subject: [PATCH 110/153] Update __init__.py --- unsloth/kernels/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/kernels/__init__.py b/unsloth/kernels/__init__.py index b1fdba83..ebea02af 100644 --- a/unsloth/kernels/__init__.py +++ b/unsloth/kernels/__init__.py @@ -24,6 +24,7 @@ ) from .fast_lora import ( get_lora_parameters, + get_lora_parameters_bias, apply_lora_mlp_swiglu, apply_lora_mlp_geglu_exact, apply_lora_mlp_geglu_approx, From 6a516573d01c75fe980fba3f5188eb72fca6274a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:26:33 +1000 Subject: [PATCH 111/153] Update fast_lora.py --- unsloth/kernels/fast_lora.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/kernels/fast_lora.py b/unsloth/kernels/fast_lora.py index aba44f02..8f7aea58 100644 --- a/unsloth/kernels/fast_lora.py +++ b/unsloth/kernels/fast_lora.py @@ -13,7 +13,13 @@ # limitations under the License. import torch -from .utils import fast_dequantize, QUANT_STATE, get_lora_parameters, matmul_lora +from .utils import ( + fast_dequantize, + QUANT_STATE, + get_lora_parameters, + get_lora_parameters_bias, + matmul_lora, +) class LoRA_MLP(torch.autograd.Function): From 4a8ba90605d4b41a92797759be963e19d4e30438 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:28:48 +1000 Subject: [PATCH 112/153] Update save.py --- unsloth/save.py | 48 +----------------------------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 26880e5e..536dc78d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -829,49 +829,6 @@ def install_llama_cpp_blocking(use_cuda = True): pass -def _fix_gemma_gguf(): - # Fixes Gemma saving to GGUF to float32 instead of float16! - with open("llama.cpp/convert-hf-to-gguf.py", "rb") as file: - text = file.read() - pass - - gemma_start = text.find(b"class GemmaModel(Model):") - if gemma_start == -1: return - - gemma_end = text.find(b"self.gguf_writer.add_tensor(new_name, data)", gemma_start) - if gemma_end == -1: return - - gemma_text = text[gemma_start : gemma_end] - bad_text = \ -b""" data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16)""" - good_text = \ -b""" # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16)""" - find_bad = gemma_text.find(bad_text) - if find_bad == -1: return - - gemma_text = gemma_text[:find_bad] + good_text + gemma_text[find_bad + len(bad_text):] - text = text[:gemma_start] + gemma_text + text[gemma_end:] - - with open("llama.cpp/convert-hf-to-gguf.py", "w+b") as file: - file.write(text) - pass -pass - - def save_to_gguf( model_type : str, model_dtype : str, @@ -1024,9 +981,6 @@ def save_to_gguf( f"--outfile {final_location} --vocab-type {vocab_type} "\ f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab" else: - # Need to fix convert-hf-to-gguf.py for some models! - # _fix_gemma_gguf() - command = f"python llama.cpp/convert-hf-to-gguf.py {model_directory} "\ f"--outfile {final_location} "\ f"--outtype {first_conversion}" @@ -1425,7 +1379,7 @@ def unsloth_save_pretrained_gguf( # Non blocking install GGUF first if not os.path.exists("llama.cpp"): - if IS_KAGGLE_ENVIRONMENT: + if True:#IS_KAGGLE_ENVIRONMENT: # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() From 0abb5ba23ef4a28b149f8cb2a136b761de67fce1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:39:04 +1000 Subject: [PATCH 113/153] Update save.py --- unsloth/save.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 536dc78d..d314d39d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -801,7 +801,7 @@ def install_llama_cpp_old(version = -10): pass -def install_llama_cpp_blocking(use_cuda = True): +def install_llama_cpp_blocking(use_cuda = False): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? # use_cuda = "LLAMA_CUDA=1" if use_cuda else "" @@ -911,6 +911,10 @@ def save_to_gguf( install_llama_cpp_blocking() pass # Check if successful. If not install 10th latest release + print("====================================") + print(error) + print(os.path.exists("llama.cpp/quantize")) + print("====================================") if error != 0 or not os.path.exists("llama.cpp/quantize"): print(f"Unsloth: llama.cpp error code = {error}.") install_llama_cpp_old(-10) @@ -1383,7 +1387,6 @@ def unsloth_save_pretrained_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1404,7 +1407,6 @@ def unsloth_save_pretrained_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1534,7 +1536,6 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? 
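These save-path functions sit behind the GGUF export helpers that Unsloth patches onto the model. A usage sketch, assuming a `model`/`tokenizer` pair from `FastLanguageModel`; the output directory, repo name, and quantization preset are placeholders:

```python
# Export the finetuned model to GGUF locally; "q4_k_m" is one llama.cpp quantization preset.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method = "q4_k_m")

# Or build the GGUF and upload it to the Hugging Face Hub in one call.
model.push_to_hub_gguf(
    "your-username/model-gguf", tokenizer,
    quantization_method = "q4_k_m", token = "hf_...",
)
```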
python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1555,7 +1556,6 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() - install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: From b24dd050bfd57a922894b7d6d82ecf6754effc91 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:44:51 +1000 Subject: [PATCH 114/153] Update save.py --- unsloth/save.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/save.py b/unsloth/save.py index d314d39d..fdfb7397 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -914,6 +914,7 @@ def save_to_gguf( print("====================================") print(error) print(os.path.exists("llama.cpp/quantize")) + raise print("====================================") if error != 0 or not os.path.exists("llama.cpp/quantize"): print(f"Unsloth: llama.cpp error code = {error}.") From 48c6d6dbeec5bed5cd45ff024af8d2e683d54d97 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 19:47:17 +1000 Subject: [PATCH 115/153] Update loader.py --- unsloth/models/loader.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 190c026a..ff64360c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -33,11 +33,8 @@ def _get_model_name(model_name, load_in_4bit = True): - # First try replacing lowercase 'b' with uppercase 'B' - model_name = model_name.lower() - if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER: - model_name = INT_TO_FLOAT_MAPPER[model_name] + model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] logger.warning_once( f"Unsloth: Your transformers version of {transformers_version} does not support native "\ f"4bit loading.\nThe minimum required version is 4.37.\n"\ @@ -47,7 +44,7 @@ def _get_model_name(model_name, load_in_4bit = True): ) elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER: - new_model_name = INT_TO_FLOAT_MAPPER[model_name] + new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." @@ -55,7 +52,7 @@ def _get_model_name(model_name, load_in_4bit = True): model_name = new_model_name elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER: - new_model_name = FLOAT_TO_INT_MAPPER[model_name] + new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()] # logger.warning_once( # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ # f"We shall load `{new_model_name}` for 4x faster loading." From e35f6082336c5bb69a727cda74771a3339971a30 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 20:12:50 +1000 Subject: [PATCH 116/153] Update save.py --- unsloth/save.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index fdfb7397..d73b6833 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -812,7 +812,7 @@ def install_llama_cpp_blocking(use_cuda = False): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? 
# f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", - f"make all -j{psutil.cpu_count()*2} -C llama.cpp", + f"make -j{psutil.cpu_count()*2} quantize -C llama.cpp", "pip install gguf protobuf", ] if os.path.exists("llama.cpp"): return @@ -914,7 +914,6 @@ def save_to_gguf( print("====================================") print(error) print(os.path.exists("llama.cpp/quantize")) - raise print("====================================") if error != 0 or not os.path.exists("llama.cpp/quantize"): print(f"Unsloth: llama.cpp error code = {error}.") From 4822eaeffa2377721ef623592ceaf48fb54aad88 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 22:30:50 +1000 Subject: [PATCH 117/153] Update save.py --- unsloth/save.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index d73b6833..89862f9a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -812,7 +812,7 @@ def install_llama_cpp_blocking(use_cuda = False): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? # f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", - f"make -j{psutil.cpu_count()*2} quantize -C llama.cpp", + f"make all -j{psutil.cpu_count()*2} -C llama.cpp", "pip install gguf protobuf", ] if os.path.exists("llama.cpp"): return @@ -915,10 +915,10 @@ def save_to_gguf( print(error) print(os.path.exists("llama.cpp/quantize")) print("====================================") - if error != 0 or not os.path.exists("llama.cpp/quantize"): - print(f"Unsloth: llama.cpp error code = {error}.") - install_llama_cpp_old(-10) - pass + # if error != 0 or not os.path.exists("llama.cpp/quantize"): + # print(f"Unsloth: llama.cpp error code = {error}.") + # install_llama_cpp_old(-10) + # pass if quantization_method == "f32": first_conversion = "f32" elif quantization_method == "f16": first_conversion = "f16" @@ -1030,7 +1030,7 @@ def save_to_gguf( print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. This will take 20 minutes...") final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf" - command = f"./llama.cpp/quantize {old_location} "\ + command = f"./llama.cpp/examples/quantize {old_location} "\ f"{final_location} {quantization_method} {n_cpus}" # quantize uses stderr From 7d847ed3185b340fc5e457f15aa4ffc3a664e26f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 22:50:08 +1000 Subject: [PATCH 118/153] quantize now llama-quantize --- unsloth/save.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 89862f9a..cae59cae 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -792,7 +792,7 @@ def install_llama_cpp_old(version = -10): pass pass # Check if successful - if not os.path.exists("llama.cpp/quantize"): + if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"): raise RuntimeError( "Unsloth: llama.cpp GGUF seems to be too buggy to install.\n"\ "File a report to llama.cpp's main repo since this is not an Unsloth issue." @@ -910,15 +910,23 @@ def save_to_gguf( error = 0 install_llama_cpp_blocking() pass + # Check if successful. 
If not install 10th latest release - print("====================================") - print(error) - print(os.path.exists("llama.cpp/quantize")) - print("====================================") - # if error != 0 or not os.path.exists("llama.cpp/quantize"): - # print(f"Unsloth: llama.cpp error code = {error}.") - # install_llama_cpp_old(-10) - # pass + + # Careful llama.cpp/quantize changed to llama.cpp/llama-quantize + # and llama.cpp/main changed to llama.cpp/llama-cli + # See https://github.com/ggerganov/llama.cpp/pull/7809 + quantize_location = None + if os.path.exists("llama.cpp/quantize"): + quantize_location = "llama.cpp/quantize" + elif os.path.exists("llama.cpp/llama-quantize"): + quantize_location = "llama.cpp/llama-quantize" + pass + + if error != 0 or quantize_location is None: + print(f"Unsloth: llama.cpp error code = {error}.") + install_llama_cpp_old(-10) + pass if quantization_method == "f32": first_conversion = "f32" elif quantization_method == "f16": first_conversion = "f16" @@ -1030,7 +1038,7 @@ def save_to_gguf( print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. This will take 20 minutes...") final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf" - command = f"./llama.cpp/examples/quantize {old_location} "\ + command = f"./{quantize_location} {old_location} "\ f"{final_location} {quantization_method} {n_cpus}" # quantize uses stderr @@ -1383,10 +1391,11 @@ def unsloth_save_pretrained_gguf( # Non blocking install GGUF first if not os.path.exists("llama.cpp"): - if True:#IS_KAGGLE_ENVIRONMENT: + if IS_KAGGLE_ENVIRONMENT: # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1407,6 +1416,7 @@ def unsloth_save_pretrained_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1536,6 +1546,7 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: @@ -1556,6 +1567,7 @@ def unsloth_push_to_hub_gguf( # Kaggle is weird - no blocking installs, and no CUDA? 
python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() + install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: From 82f10cbaacc178dba95f9eb468c137b211282f0f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 22:50:38 +1000 Subject: [PATCH 119/153] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 4c782326..2e3761f5 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -1286,7 +1286,7 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") pass for prompt in prompts: - command = f"./llama.cpp/main -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\ + command = f"./llama.cpp/llama-cli -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\ f"--check-tensors -p '{prompt}'" datas = [] From 08424f04c9972eff7684b04c81b4559197bda712 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Thu, 13 Jun 2024 23:58:48 +1000 Subject: [PATCH 120/153] Update loader.py --- unsloth/models/loader.py | 57 +++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ff64360c..de1e2e57 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -67,18 +67,18 @@ def _get_model_name(model_name, load_in_4bit = True): class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( - model_name = "unsloth/llama-3-8b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, - fix_tokenizer = True, - trust_remote_code = False, - use_gradient_checkpointing = True, - resize_model_vocab = None, - revision = None, + model_name = "unsloth/llama-3-8b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, + fix_tokenizer = True, + trust_remote_code = False, + use_gradient_checkpointing = "unsloth", + resize_model_vocab = None, + revision = None, *args, **kwargs, ): if token is None and "HF_TOKEN" in os.environ: @@ -141,23 +141,24 @@ def from_pretrained( pass model, tokenizer = dispatch_model.from_pretrained( - model_name = model_name, - max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, - token = token, - device_map = device_map, - rope_scaling = rope_scaling, - fix_tokenizer = fix_tokenizer, - model_patcher = dispatch_model, - tokenizer_name = tokenizer_name, + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + model_patcher = dispatch_model, + tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, - revision = revision if not is_peft else None, + revision = revision if not is_peft else None, *args, **kwargs, ) if resize_model_vocab is not None: model.resize_token_embeddings(resize_model_vocab) + pass # In case the model supports tagging, add the unsloth tag. 
if hasattr(model, "add_model_tags"): @@ -187,8 +188,16 @@ def from_pretrained( pass if is_peft: + # From https://github.com/huggingface/peft/issues/184 # Now add PEFT adapters - model = PeftModel.from_pretrained(model, old_model_name, token = token, revision = revision) + model.enable_input_require_grads() + model = PeftModel.from_pretrained( + model, + old_model_name, + token = token, + revision = revision, + is_trainable = True, + ) # Patch it as well! model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing) pass From eb906d04bf615f817f1c18ef2a332e02f435718a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 00:00:59 +1000 Subject: [PATCH 121/153] Update mapper.py --- unsloth/models/mapper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 73aa06ca..5ef75839 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -186,6 +186,9 @@ "unsloth/Qwen2-70B-Instruct-bnb-4bit" : ( "Qwen/Qwen2-70B-Instruct", ), + "mistralai/Codestral-22B-v0.1" : ( + "mistral-community/Codestral-22B-v0.1", + ), } INT_TO_FLOAT_MAPPER = {} From 0a304aefc01ba0dccff3d56d28e37eea4160bf90 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 00:07:42 +1000 Subject: [PATCH 122/153] Update __init__.py --- unsloth/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 428c9873..a6d6ff92 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -23,7 +23,9 @@ # Check if any of the modules in the list have been imported for module in MODULES_TO_CHECK: if module in sys.modules: - raise ImportError(f"Please import unsloth before {module}.") + raise ImportError(f"Unsloth: Please import Unsloth before {module}.") + pass +pass # Currently only supports 1 GPU, or else seg faults will occur. if "CUDA_VISIBLE_DEVICES" in os.environ: From 71edc42ed37425c30fbc66bddebbefec0fc0ba42 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 02:39:07 +1000 Subject: [PATCH 123/153] embedding size --- unsloth/__init__.py | 2 +- unsloth/tokenizer_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index a6d6ff92..93960e2f 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -26,7 +26,7 @@ raise ImportError(f"Unsloth: Please import Unsloth before {module}.") pass pass - + # Currently only supports 1 GPU, or else seg faults will occur. 
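The loader change above re-attaches a previously saved LoRA adapter as trainable so finetuning can resume; the equivalent standalone PEFT pattern is sketched below (the adapter path is a placeholder):

```python
from peft import PeftModel

# Keep embedding outputs requiring grad (needed for gradient checkpointing with a frozen base),
# then load the saved adapter in trainable mode for continued finetuning.
base_model.enable_input_require_grads()
model = PeftModel.from_pretrained(base_model, "path/to/lora_adapter", is_trainable = True)
```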
if "CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 5941623b..395c3b73 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -734,7 +734,7 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16): pass # Count all the possible bad tokens - final_counts = np.zeros(len(tokenizer), dtype = np.int64) + final_counts = np.zeros(max(len(tokenizer), embedding_matrix.shape[0]), dtype = np.int64) def mapping(examples): input_ids = examples["input_ids"] counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype = np.int32) From b74e321f92e7ad1f51ce0be2e4962b8cb68d82b8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 04:44:52 +1000 Subject: [PATCH 124/153] Update qwen2.py --- unsloth/models/qwen2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 115bf3e0..907148df 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -15,6 +15,7 @@ from .llama import * import os from ._utils import __version__ +from .mistral import FastMistralModel from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -72,7 +73,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastLlamaModel.from_pretrained( + return FastMistralModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From b82277f71078ea202d9136aab1e1b1f0d158bc00 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 18:05:39 +1000 Subject: [PATCH 125/153] docs --- PARAMETERS.md | 87 --------------------------------------------------- README.md | 2 +- 2 files changed, 1 insertion(+), 88 deletions(-) delete mode 100644 PARAMETERS.md diff --git a/PARAMETERS.md b/PARAMETERS.md deleted file mode 100644 index 94d63798..00000000 --- a/PARAMETERS.md +++ /dev/null @@ -1,87 +0,0 @@ -## LoraConfig Parameters - -Adjusting the `LoraConfig` parameters allows you to balance model performance and computational efficiency in Low-Rank Adaptation (LoRA). Here’s a concise breakdown of key parameters: - -**r** -- **Description**: Rank of the low-rank decomposition for factorizing weight matrices. -- **Impact**: - - **Higher**: Retains more information, increases computational load. - - **Lower**: Fewer parameters, more efficient training, potential performance drop if too small. - - -**lora_alpha** -- **Description**: Scaling factor for the low-rank matrices' contribution. -- **Impact**: - - **Higher**: Increases influence, speeds up convergence, risks instability or overfitting. - - **Lower**: Subtler effect, may require more training steps. - -**lora_dropout** -- **Description**: Probability of zeroing out elements in low-rank matrices for regularization. -- **Impact**: - - **Higher**: More regularization, prevents overfitting, may slow training and degrade performance. - - **Lower**: Less regularization, may speed up training, risks overfitting. - -**loftq_config** -- **Description**: Configuration for LoftQ, a quantization method for the backbone weights and initialization of LoRA layers. -- **Impact**: - - **Not None**: If specified, LoftQ will quantize the backbone weights and initialize the LoRA layers. It requires setting `init_lora_weights='loftq'`. - - **None**: LoftQ quantization is not applied. 
- - **Note**: Do not pass an already quantized model when using LoftQ as LoftQ handles the quantization process itself. - - -**use_rslora** -- **Description**: Enables Rank-Stabilized LoRA (RSLora). -- **Impact**: - - **True**: Uses Rank-Stabilized LoRA, setting the adapter scaling factor to `lora_alpha/math.sqrt(r)`, which has been proven to work better as per the [Rank-Stabilized LoRA paper](https://doi.org/10.48550/arXiv.2312.03732). - - **False**: Uses the original default scaling factor `lora_alpha/r`. - -**gradient_accumulation_steps** -- **Default**: 1 -- **Description**: The number of steps to accumulate gradients before performing a backpropagation update. -- **Impact**: - - **Higher**: Accumulate gradients over multiple steps, effectively increasing the batch size without requiring additional memory. This can improve training stability and convergence, especially with large models and limited hardware. - - **Lower**: Faster updates but may require more memory per step and can be less stable. - -**weight_decay** -- **Default**: 0.01 -- **Description**: Regularization technique that applies a small penalty to the weights during training. -- **Impact**: - - **Non-zero Value (e.g., 0.01)**: Adds a penalty proportional to the magnitude of the weights to the loss function, helping to prevent overfitting by discouraging large weights. - - **Zero**: No weight decay is applied, which can lead to overfitting, especially in large models or with small datasets. - -**learning_rate** -- **Default**: 2e-4 -- **Description**: The rate at which the model updates its parameters during training. -- **Impact**: - - **Higher**: Faster convergence but risks overshooting optimal parameters and causing instability in training. - - **Lower**: More stable and precise updates but may slow down convergence, requiring more training steps to achieve good performance. - -## Target Modules - -**q_proj (query projection)** -- **Description**: Part of the attention mechanism in transformer models, responsible for projecting the input into the query space. -- **Impact**: Transforms the input into query vectors that are used to compute attention scores. - -**k_proj (key projection)** -- **Description**: Projects the input into the key space in the attention mechanism. -- **Impact**: Produces key vectors that are compared with query vectors to determine attention weights. - -**v_proj (value projection)** -- **Description**: Projects the input into the value space in the attention mechanism. -- **Impact**: Produces value vectors that are weighted by the attention scores and combined to form the output. - -**o_proj (output projection)** -- **Description**: Projects the output of the attention mechanism back into the original space. -- **Impact**: Transforms the combined weighted value vectors back to the input dimension, integrating attention results into the model. - -**gate_proj (gate projection)** -- **Description**: Typically used in gated mechanisms within neural networks, such as gating units in gated recurrent units (GRUs) or other gating mechanisms. -- **Impact**: Controls the flow of information through the gate, allowing selective information passage based on learned weights. - -**up_proj (up projection)** -- **Description**: Used for up-projection, typically increasing the dimensionality of the input. -- **Impact**: Expands the input to a higher-dimensional space, often used in feedforward layers or when transitioning between different layers with differing dimensionalities. 
- -**down_proj (down projection)** -- **Description**: Used for down-projection, typically reducing the dimensionality of the input. -- **Impact**: Compresses the input to a lower-dimensional space, useful for reducing computational complexity and controlling the model size. diff --git a/README.md b/README.md index 2c50f457..dab899cf 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ ## ✨ Finetune for Free -All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. +All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation. | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| From d98e45e41b90d5df56432c764135466ee7597e38 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 18:06:12 +1000 Subject: [PATCH 126/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dab899cf..0a2cd1fa 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ ## ✨ Finetune for Free -All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation. +All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| @@ -35,7 +35,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral 7B v3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing) - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text - This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language - +- Click [here](https://github.com/unslothai/unsloth/wiki) for detailed documentation for Unsloth. ## 🦥 Unsloth.ai News - 📣 NEW! Continued Pretraining [notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) for other languages like Korean! 
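The LoRA knobs documented in the removed `PARAMETERS.md` (now covered by the wiki) all surface in the same two places in practice: the `FastLanguageModel.get_peft_model` call and the trainer arguments. A hedged sketch of how they map onto code; the values are illustrative, not tuned recommendations:

```python
from unsloth import FastLanguageModel
from transformers import TrainingArguments

model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit", max_seq_length = 2048, load_in_4bit = True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,              # rank of the low-rank decomposition
    lora_alpha = 16,     # scaling factor for the LoRA update
    lora_dropout = 0,    # dropout on the low-rank matrices
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    use_rslora = False,  # True scales by lora_alpha / sqrt(r) instead of lora_alpha / r
    loftq_config = {},   # pass a LoftQ config here to quantize and initialize via LoftQ
)

training_args = TrainingArguments(
    output_dir = "outputs",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,   # accumulate gradients to raise the effective batch size
    learning_rate = 2e-4,
    weight_decay = 0.01,
    max_steps = 60,
)
```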
From b6f0fdb53d40918aa05ec4dfbd789681ba7879a5 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 18:07:42 +1000 Subject: [PATCH 127/153] Update qwen2.py --- unsloth/models/qwen2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 907148df..115bf3e0 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -15,7 +15,6 @@ from .llama import * import os from ._utils import __version__ -from .mistral import FastMistralModel from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -73,7 +72,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( + return FastLlamaModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From 6c031e4a32f06ed77b86958d1f2f97047ef42ccc Mon Sep 17 00:00:00 2001 From: Walter Korman Date: Fri, 14 Jun 2024 01:11:28 -0700 Subject: [PATCH 128/153] README: Fix minor typo. (#559) * README: Fix minor typo. One-character typo fix while reading. * Update README.md --------- Co-authored-by: Daniel Han --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0a2cd1fa..a56dea5c 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ model = FastLanguageModel.get_peft_model( ## 🥇 Performance Benchmarking -- For the full list of **reproducable** benchmarking tables, [go to our website](https://unsloth.ai/blog/mistral-benchmark#Benchmark%20tables) +- For the full list of **reproducible** benchmarking tables, [go to our website](https://unsloth.ai/blog/mistral-benchmark#Benchmark%20tables) | 1 A100 40GB | 🤗Hugging Face | Flash Attention | 🦥Unsloth Open Source | 🦥[Unsloth Pro](https://unsloth.ai/pricing) | |--------------|--------------|-----------------|---------------------|-----------------| @@ -257,7 +257,7 @@ trainer.train() # (1) Saving to GGUF / merging to 16bit for vLLM # (2) Continued training from a saved LoRA adapter # (3) Adding an evaluation loop / OOMs -# (4) Cutomized chat templates +# (4) Customized chat templates ``` From 2401dee8ff7d23a09a619f5a0feb14bbe59f8b2c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:15:47 +1000 Subject: [PATCH 129/153] Update mistral.py --- unsloth/models/mistral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index fc2e1a9f..ff2e909f 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -512,7 +512,7 @@ def from_pretrained( if "n_total_devices >" not in inner_training_loop: raise RuntimeError( "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" "We're a 2 person team, so we still have to fund our development costs - thanks!\n" "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", ) @@ -521,6 +521,7 @@ def from_pretrained( "is_sagemaker_mp_enabled()", "False", ) + exec(inner_training_loop, globals()) Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length @@ -560,6 +561,7 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + Trainer._inner_training_loop = _fast_inner_training_loop # Save tokenizer for inference purposes tokenizer.padding_side = "left" # Force inference From 1b93d7eee6c980ca1ce521aca27624b7dc075bbd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:16:58 +1000 Subject: [PATCH 130/153] Update qwen2.py --- unsloth/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 115bf3e0..96fcf5d9 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -72,7 +72,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastLlamaModel.from_pretrained( + return FastMistralModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From 358103718f265fea647a99c5ffcb6aff490201ad Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:19:37 +1000 Subject: [PATCH 131/153] Update qwen2.py --- unsloth/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 96fcf5d9..04f888b2 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .llama import * +from .mistral import * import os from ._utils import __version__ From b56b8b84dcd560865c3aa9a8c7fbbbaf7eb86e4a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:24:06 +1000 Subject: [PATCH 132/153] Update qwen2.py --- unsloth/models/qwen2.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 04f888b2..2973bd02 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .mistral import * -import os -from ._utils import __version__ +from .llama import * from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -72,7 +70,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( + return FastLlamaModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From fe8c06496f0ebd81932b25fbc48cd673f42920d0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:32:01 +1000 Subject: [PATCH 133/153] Update llama.py --- unsloth/models/llama.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4cbbcf0a..0cbab0bc 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -59,6 +59,11 @@ from ..save import patch_saving_functions import re, os, inspect, math, sys +from inspect import currentframe, getframeinfo +def DEBUG(): + frameinfo = getframeinfo(currentframe()) + print(frameinfo.filename, frameinfo.lineno) +pass def original_apply_qkv(self, X): Q = self.q_proj(X) @@ -289,6 +294,7 @@ def LlamaAttention_fast_forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # Clear inference + DEBUG() if hasattr(self, "paged_attention"): del self.paged_attention_K del self.paged_attention_V @@ -330,6 +336,7 @@ def LlamaAttention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None + DEBUG() # Attention module if (not HAS_FLASH_ATTENTION and attention_mask is None): @@ -338,6 +345,7 @@ def LlamaAttention_fast_forward( Q = Q.transpose(1, 2) K = K.transpose(1, 2) V = V.transpose(1, 2) + DEBUG() # Group query attention if n_groups != 1: @@ -353,6 +361,7 @@ def LlamaAttention_fast_forward( pass A = xformers_attention(Q, K, V, attn_bias = causal_mask) A = A.view(bsz, q_len, n_heads, head_dim) + DEBUG() elif HAS_FLASH_ATTENTION and attention_mask is None: Q = Q.transpose(1, 2) @@ -379,6 +388,7 @@ def LlamaAttention_fast_forward( attn_output = A.reshape(bsz, q_len, n_heads*head_dim) attn_output = self.apply_o(self, attn_output) attn_weights = None + DEBUG() return attn_output, attn_weights, past_key_value pass @@ -430,8 +440,10 @@ def LlamaDecoderLayer_fast_forward( hidden_states = fast_swiglu_inference(self.mlp, hidden_states) hidden_states += residual else: + DEBUG() residual = hidden_states hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states) + DEBUG() hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, causal_mask=causal_mask, @@ -442,13 +454,18 @@ def LlamaDecoderLayer_fast_forward( use_cache=use_cache, padding_mask=padding_mask, ) + DEBUG() hidden_states = residual + hidden_states + DEBUG() # Fully Connected residual = hidden_states hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states) + DEBUG() hidden_states = self.mlp(hidden_states) + DEBUG() hidden_states = residual + hidden_states + DEBUG() pass outputs = (hidden_states,) @@ -473,7 +490,8 @@ def LlamaModel_fast_forward( return_dict: Optional[bool] = None, *args, **kwargs, ) -> Union[Tuple, BaseModelOutputWithPast]: - + + DEBUG() output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions assert(output_attentions is False) output_hidden_states = ( @@ -508,6 +526,7 @@ def LlamaModel_fast_forward( inputs_embeds = inputs_embeds[:,:self.max_seq_length,:] 
pass pass + DEBUG() past_key_values_length = 0 @@ -515,6 +534,7 @@ def LlamaModel_fast_forward( past_key_values_length = past_key_values[0][0].shape[2] seq_length_with_past = seq_length_with_past + past_key_values_length pass + DEBUG() # We already handle KV cache position_ids ourselves. if False:#(past_key_values_length != 0): @@ -529,11 +549,13 @@ def LlamaModel_fast_forward( else: position_ids = None pass + DEBUG() if position_ids is not None: if position_ids.shape[0] != batch_size: position_ids = position_ids.repeat((batch_size, 1)) pass + DEBUG() # Embed positions if inputs_embeds is None: @@ -544,6 +566,7 @@ def LlamaModel_fast_forward( # Normalized from Gemma IS_GEMMA = self.config.model_type == "gemma" train_embed_tokens = self.embed_tokens.weight.requires_grad + DEBUG() if IS_GEMMA: # Match Gemma exactly by casting to bfloat16 / float16 @@ -568,6 +591,7 @@ def LlamaModel_fast_forward( if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass pass + DEBUG() # Fix up attention mask by setting elements to 0 # Specifically for DPO @@ -585,6 +609,7 @@ def LlamaModel_fast_forward( inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2) if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass + DEBUG() # Ignore attention_mask if attention_mask is None: @@ -606,6 +631,7 @@ def LlamaModel_fast_forward( sliding_window = getattr(self.config, "sliding_window", None), ) pass + DEBUG() hidden_states = inputs_embeds @@ -629,6 +655,7 @@ def LlamaModel_fast_forward( else: boundaries = None pass + DEBUG() # Check checkpointing method gradient_checkpointing = False @@ -641,6 +668,7 @@ def LlamaModel_fast_forward( if output_attentions is False and hasattr(self, "_offloaded_gradient_checkpointing"): offloaded_gradient_checkpointing = True pass + DEBUG() # Go through every layer! 
for idx, decoder_layer in enumerate(self.layers): @@ -648,6 +676,7 @@ def LlamaModel_fast_forward( if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None + DEBUG() if offloaded_gradient_checkpointing: hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply( decoder_layer, @@ -703,10 +732,12 @@ def custom_forward(*inputs): else: hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA) pass + DEBUG() if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None + DEBUG() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -801,6 +832,7 @@ def _CausalLM_fast_forward( attention_mask = attention_mask, ) else: + DEBUG() causal_mask = xformers.attn_bias.LowerTriangularMask() output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -812,6 +844,7 @@ def _CausalLM_fast_forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) self.model._has_no_labels = labels is None + DEBUG() outputs = self.model( input_ids=input_ids, causal_mask=causal_mask, @@ -826,6 +859,7 @@ def _CausalLM_fast_forward( ) pass + DEBUG() hidden_states = outputs[0] bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight @@ -837,6 +871,8 @@ def _CausalLM_fast_forward( pass logits = logits.to(self.config.torch_dtype) + DEBUG() + loss = None if labels is not None: shift_logits = logits @@ -851,6 +887,7 @@ def _CausalLM_fast_forward( labels = shift_labels, ) pass + DEBUG() if not return_dict: output = (logits,) + outputs[1:] @@ -881,6 +918,7 @@ def PeftModelForCausalLM_fast_forward( task_ids=None, **kwargs, ): + DEBUG() return self.base_model( input_ids=input_ids, causal_mask=causal_mask, From d8d332ac8332107ad89e41b79b106f1efd34d69a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:37:10 +1000 Subject: [PATCH 134/153] Update llama.py --- unsloth/models/llama.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0cbab0bc..0b0ce437 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -59,10 +59,9 @@ from ..save import patch_saving_functions import re, os, inspect, math, sys -from inspect import currentframe, getframeinfo +import sys def DEBUG(): - frameinfo = getframeinfo(currentframe()) - print(frameinfo.filename, frameinfo.lineno) + print(sys._getframe().f_back.f_lineno) pass def original_apply_qkv(self, X): From cdb1dbb5ff7627d35f727ba2a795be6856bd9e80 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:42:19 +1000 Subject: [PATCH 135/153] Update llama.py --- unsloth/models/llama.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 0b0ce437..3097c5dd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -470,6 +470,7 @@ def LlamaDecoderLayer_fast_forward( outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) if use_cache: outputs += (present_key_value,) + DEBUG() return outputs pass @@ -687,6 +688,7 @@ def LlamaModel_fast_forward( output_attentions, use_cache, )[0] + DEBUG() elif gradient_checkpointing: def create_custom_forward(module): @@ -694,7 +696,7 @@ def custom_forward(*inputs): return module(*inputs, 
past_key_value, output_attentions, padding_mask = padding_mask) return custom_forward pass - + DEBUG() layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, @@ -705,7 +707,7 @@ def custom_forward(*inputs): preserve_rng_state = False, ) hidden_states = layer_outputs[0] - + DEBUG() else: layer_outputs = decoder_layer( hidden_states, @@ -718,6 +720,7 @@ def custom_forward(*inputs): padding_mask=padding_mask, ) hidden_states = layer_outputs[0] + DEBUG() pass if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) From e8b3cf01b1132a62eff2d0cbe88661fc42376c82 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:53:51 +1000 Subject: [PATCH 136/153] Update README.md --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a56dea5c..534079ed 100644 --- a/README.md +++ b/README.md @@ -100,14 +100,16 @@ model = FastLanguageModel.get_peft_model( ### Conda Installation Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. See this [Github issue](https://github.com/unslothai/unsloth/issues/73) for help on debugging Conda installs. ```bash -conda create --name unsloth_env python=3.10 +conda create --name unsloth_env \ + python=3.10 \ + pytorch-cuda=<11.8/12.1> \ + pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \ + -y conda activate unsloth_env -conda install pytorch-cuda=<12.1/11.8> pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers - pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps trl peft accelerate bitsandbytes +pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes ``` ### Pip Installation @@ -162,7 +164,7 @@ pip install --no-deps packaging ninja einops flash-attn xformers trl peft accele # Pre Ampere RTX 2080, T4, GTX 1080 GPUs: pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -pip install --no-deps xformers trl peft accelerate bitsandbytes +pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes ``` 7. For Pytorch 2.3.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. 
```bash From 7e6f000146df73d47b77387726739a5c8d55ca02 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 20:54:47 +1000 Subject: [PATCH 137/153] FastMistralModel --- unsloth/models/llama.py | 46 +++-------------------------------------- unsloth/models/qwen2.py | 6 +++--- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3097c5dd..4cbbcf0a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -59,10 +59,6 @@ from ..save import patch_saving_functions import re, os, inspect, math, sys -import sys -def DEBUG(): - print(sys._getframe().f_back.f_lineno) -pass def original_apply_qkv(self, X): Q = self.q_proj(X) @@ -293,7 +289,6 @@ def LlamaAttention_fast_forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # Clear inference - DEBUG() if hasattr(self, "paged_attention"): del self.paged_attention_K del self.paged_attention_V @@ -335,7 +330,6 @@ def LlamaAttention_fast_forward( V = torch.cat([past_key_value[1], V], dim = 2) pass past_key_value = (K, V) if use_cache else None - DEBUG() # Attention module if (not HAS_FLASH_ATTENTION and attention_mask is None): @@ -344,7 +338,6 @@ def LlamaAttention_fast_forward( Q = Q.transpose(1, 2) K = K.transpose(1, 2) V = V.transpose(1, 2) - DEBUG() # Group query attention if n_groups != 1: @@ -360,7 +353,6 @@ def LlamaAttention_fast_forward( pass A = xformers_attention(Q, K, V, attn_bias = causal_mask) A = A.view(bsz, q_len, n_heads, head_dim) - DEBUG() elif HAS_FLASH_ATTENTION and attention_mask is None: Q = Q.transpose(1, 2) @@ -387,7 +379,6 @@ def LlamaAttention_fast_forward( attn_output = A.reshape(bsz, q_len, n_heads*head_dim) attn_output = self.apply_o(self, attn_output) attn_weights = None - DEBUG() return attn_output, attn_weights, past_key_value pass @@ -439,10 +430,8 @@ def LlamaDecoderLayer_fast_forward( hidden_states = fast_swiglu_inference(self.mlp, hidden_states) hidden_states += residual else: - DEBUG() residual = hidden_states hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states) - DEBUG() hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, causal_mask=causal_mask, @@ -453,24 +442,18 @@ def LlamaDecoderLayer_fast_forward( use_cache=use_cache, padding_mask=padding_mask, ) - DEBUG() hidden_states = residual + hidden_states - DEBUG() # Fully Connected residual = hidden_states hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states) - DEBUG() hidden_states = self.mlp(hidden_states) - DEBUG() hidden_states = residual + hidden_states - DEBUG() pass outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) if use_cache: outputs += (present_key_value,) - DEBUG() return outputs pass @@ -490,8 +473,7 @@ def LlamaModel_fast_forward( return_dict: Optional[bool] = None, *args, **kwargs, ) -> Union[Tuple, BaseModelOutputWithPast]: - - DEBUG() + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions assert(output_attentions is False) output_hidden_states = ( @@ -526,7 +508,6 @@ def LlamaModel_fast_forward( inputs_embeds = inputs_embeds[:,:self.max_seq_length,:] pass pass - DEBUG() past_key_values_length = 0 @@ -534,7 +515,6 @@ def LlamaModel_fast_forward( past_key_values_length = past_key_values[0][0].shape[2] seq_length_with_past = seq_length_with_past + past_key_values_length pass - DEBUG() # We already handle KV cache position_ids ourselves. 
if False:#(past_key_values_length != 0): @@ -549,13 +529,11 @@ def LlamaModel_fast_forward( else: position_ids = None pass - DEBUG() if position_ids is not None: if position_ids.shape[0] != batch_size: position_ids = position_ids.repeat((batch_size, 1)) pass - DEBUG() # Embed positions if inputs_embeds is None: @@ -566,7 +544,6 @@ def LlamaModel_fast_forward( # Normalized from Gemma IS_GEMMA = self.config.model_type == "gemma" train_embed_tokens = self.embed_tokens.weight.requires_grad - DEBUG() if IS_GEMMA: # Match Gemma exactly by casting to bfloat16 / float16 @@ -591,7 +568,6 @@ def LlamaModel_fast_forward( if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass pass - DEBUG() # Fix up attention mask by setting elements to 0 # Specifically for DPO @@ -609,7 +585,6 @@ def LlamaModel_fast_forward( inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2) if inputs_requires_grad: inputs_embeds.requires_grad_(True) pass - DEBUG() # Ignore attention_mask if attention_mask is None: @@ -631,7 +606,6 @@ def LlamaModel_fast_forward( sliding_window = getattr(self.config, "sliding_window", None), ) pass - DEBUG() hidden_states = inputs_embeds @@ -655,7 +629,6 @@ def LlamaModel_fast_forward( else: boundaries = None pass - DEBUG() # Check checkpointing method gradient_checkpointing = False @@ -668,7 +641,6 @@ def LlamaModel_fast_forward( if output_attentions is False and hasattr(self, "_offloaded_gradient_checkpointing"): offloaded_gradient_checkpointing = True pass - DEBUG() # Go through every layer! for idx, decoder_layer in enumerate(self.layers): @@ -676,7 +648,6 @@ def LlamaModel_fast_forward( if output_hidden_states: all_hidden_states += (hidden_states,) past_key_value = past_key_values[idx] if past_key_values is not None else None - DEBUG() if offloaded_gradient_checkpointing: hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply( decoder_layer, @@ -688,7 +659,6 @@ def LlamaModel_fast_forward( output_attentions, use_cache, )[0] - DEBUG() elif gradient_checkpointing: def create_custom_forward(module): @@ -696,7 +666,7 @@ def custom_forward(*inputs): return module(*inputs, past_key_value, output_attentions, padding_mask = padding_mask) return custom_forward pass - DEBUG() + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, @@ -707,7 +677,7 @@ def custom_forward(*inputs): preserve_rng_state = False, ) hidden_states = layer_outputs[0] - DEBUG() + else: layer_outputs = decoder_layer( hidden_states, @@ -720,7 +690,6 @@ def custom_forward(*inputs): padding_mask=padding_mask, ) hidden_states = layer_outputs[0] - DEBUG() pass if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) @@ -734,12 +703,10 @@ def custom_forward(*inputs): else: hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA) pass - DEBUG() if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None - DEBUG() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -834,7 +801,6 @@ def _CausalLM_fast_forward( attention_mask = attention_mask, ) else: - DEBUG() causal_mask = xformers.attn_bias.LowerTriangularMask() output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -846,7 +812,6 @@ def _CausalLM_fast_forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, 
dec_attn) self.model._has_no_labels = labels is None - DEBUG() outputs = self.model( input_ids=input_ids, causal_mask=causal_mask, @@ -861,7 +826,6 @@ def _CausalLM_fast_forward( ) pass - DEBUG() hidden_states = outputs[0] bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight @@ -873,8 +837,6 @@ def _CausalLM_fast_forward( pass logits = logits.to(self.config.torch_dtype) - DEBUG() - loss = None if labels is not None: shift_logits = logits @@ -889,7 +851,6 @@ def _CausalLM_fast_forward( labels = shift_labels, ) pass - DEBUG() if not return_dict: output = (logits,) + outputs[1:] @@ -920,7 +881,6 @@ def PeftModelForCausalLM_fast_forward( task_ids=None, **kwargs, ): - DEBUG() return self.base_model( input_ids=input_ids, causal_mask=causal_mask, diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 2973bd02..47327280 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .llama import * +from .mistral import * from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -32,7 +32,7 @@ pass -class FastQwen2Model(FastLlamaModel): +class FastQwen2Model(FastMistralModel): @staticmethod def pre_patch(): @@ -70,7 +70,7 @@ def from_pretrained( trust_remote_code = False, **kwargs, ): - return FastLlamaModel.from_pretrained( + return FastMistralModel.from_pretrained( model_name = model_name, max_seq_length = max_seq_length, dtype = dtype, From 28995abd7a1d131402078e40f389c7cbdf7d3728 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:05:24 +1000 Subject: [PATCH 138/153] Update mistral.py --- unsloth/models/mistral.py | 570 +++++++++++++++++++------------------- 1 file changed, 285 insertions(+), 285 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index ff2e909f..d0af320e 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -287,291 +287,291 @@ def pre_patch(): pass - @staticmethod - def from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, # Mistral does not support RoPE scaling - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, - trust_remote_code = False, - **kwargs, - ): - if token is None and "HF_TOKEN" in os.environ: - token = os.environ["HF_TOKEN"] - - if token is None and "HUGGINGFACE_TOKEN" in os.environ: - token = os.environ["HUGGINGFACE_TOKEN"] - - if model_patcher is None: model_patcher = FastMistralModel - # Mistral does NOT support RoPE Scaling! - if rope_scaling is not None: - logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") - pass - - SUPPORTS_BFLOAT16 = is_bfloat16_supported() - gpu_stats = torch.cuda.get_device_properties(0) - max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) - - statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ - f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ - f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ - f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. 
FA = {HAS_FLASH_ATTENTION}.\n"\ - f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' - print(statistics) - model_patcher.pre_patch() - # get_statistics() - - if dtype is None: - dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 - elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: - logger.warning_once("Device does not support bfloat16. Will change to float16.") - dtype = torch.float16 - - assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - - # Check max sequence length - model_config = AutoConfig.from_pretrained(model_name, token = token) - model_max_seq_length = model_config.max_position_embeddings - - # If max_seq_length is not specified, use maximum fron config - if max_seq_length is None: - max_seq_length = model_max_seq_length - pass - - # Mistral does NOT support RoPE Scaling sadly so we have to error out. - if max_seq_length > model_max_seq_length: - raise RuntimeError( - f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ - f"The maximum sequence length supported is {model_max_seq_length}.", - ) - pass - - bnb_config = None - if load_in_4bit: - bnb_config = BitsAndBytesConfig( - load_in_4bit = True, - bnb_4bit_use_double_quant = True, - bnb_4bit_quant_type = "nf4", - bnb_4bit_compute_dtype = dtype, - ) - - max_position_embeddings = max(max_seq_length, model_max_seq_length) - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - # rope_scaling = rope_scaling, - trust_remote_code = trust_remote_code, - **kwargs, - ) - - # Counteract saved tokenizers - tokenizer_name = model_name if tokenizer_name is None else tokenizer_name - tokenizer = load_correct_tokenizer( - tokenizer_name, - model_max_length = max_position_embeddings, - padding_side = "right", - token = token, - trust_remote_code = trust_remote_code, - ) - - model, tokenizer = patch_tokenizer(model, tokenizer) - model = model_patcher.post_patch(model) - - # Patch up QKV / O and MLP - for idx, layer in enumerate(model.model.layers): - layer.self_attn.apply_qkv = original_apply_qkv - layer.self_attn.apply_o = original_apply_o - pass - - # Patch Trainer - from transformers.trainer import Trainer - try: - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - Trainer._original_training_loop = inner_training_loop - else: - inner_training_loop = Trainer._original_training_loop - except: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - pass - - import transformers.trainer - items_in_trainer = dir(transformers.trainer) - good_items = [] - for item in items_in_trainer: - # TODO: Support Deepspeed - if item.startswith(("deepspeed", "xm", "met", "smp")): continue - if item in inner_training_loop: good_items.append(item) - pass - exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - end = inner_training_loop.find("\n\n", start) - original_debug = inner_training_loop[start:end] - spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - debug_info = """debug_info = \\ - f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ - f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning(debug_info) - import subprocess, re, gc - output = subprocess.check_output( - 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) - output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) - if output > 1: raise RuntimeError( - 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache()""" - - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - debug_info = """n_total_devices = total_train_batch_size // \\ - args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - debug_info =""" - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - "raise RuntimeError('Unsloth: TPUs are not yet supported!')" - ) - inner_training_loop = inner_training_loop.replace( - "self.accelerator.free_memory()", - "self.accelerator.free_memory()\n" + \ - front_spaces + "if self.is_deepspeed_enabled:"\ - "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - ) - - check_batches = """train_dataloader = self.get_train_dataloader() - ga = args.gradient_accumulation_steps - bsz = self._train_batch_size - total_batches = bsz * ga * args.world_size - n_total_devices = total_batches // ga // bsz - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - divisor = n_total_devices / 1 - bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 1: - divisor = n_total_devices / 1 - ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - check_batches = check_batches.split('\n') - check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = self.get_train_dataloader()", - check_batches, 1, - ) - inner_training_loop = inner_training_loop.replace( - "_inner_training_loop", - "_fast_inner_training_loop", 1, - ) - exec(inner_training_loop, globals()) - - Trainer._inner_training_loop = _fast_inner_training_loop - inner_training_loop = inner_training_loop.replace( - "is_torch_tpu_available()", - "False", - ) - if "n_total_devices >" not in inner_training_loop: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - pass - inner_training_loop = inner_training_loop.replace( - "is_sagemaker_mp_enabled()", - "False", - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save max_seq_length - max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) - model.max_seq_length = max_position_embeddings - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_position_embeddings - internal_model = internal_model.model - pass - internal_model.max_seq_length = max_position_embeddings - - # We check the tokenizer first for errors - if fix_tokenizer: - tokenizer = check_tokenizer( - model = model, - tokenizer = tokenizer, - model_name = model_name, - model_max_length = max_position_embeddings, - padding_side = "right", - token = token, - ) - pass - patch_saving_functions(tokenizer) - - # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37 - if False: - name = model.config._name_or_path - if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - name = name[:len(name) - len("-bnb-4bit")] - model.config.update({"_name_or_path" : name}) - pass + # @staticmethod + # def from_pretrained( + # model_name = "unsloth/mistral-7b-bnb-4bit", + # max_seq_length = None, + # dtype = None, + # load_in_4bit = True, + # token = None, + # device_map = "sequential", + # rope_scaling = None, # Mistral does not support RoPE scaling + # fix_tokenizer = True, + # model_patcher = None, + # tokenizer_name = None, + # trust_remote_code = False, + # **kwargs, + # ): + # if token is None and "HF_TOKEN" in os.environ: + # token = os.environ["HF_TOKEN"] + + # if token is None and "HUGGINGFACE_TOKEN" in os.environ: + # token = os.environ["HUGGINGFACE_TOKEN"] + + # if model_patcher is None: model_patcher = FastMistralModel + # # Mistral does NOT support RoPE Scaling! + # if rope_scaling is not None: + # logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") + # pass + + # SUPPORTS_BFLOAT16 = is_bfloat16_supported() + # gpu_stats = torch.cuda.get_device_properties(0) + # max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) + + # statistics = \ + # f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + # f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ + # f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ + # f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ + # f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' + # print(statistics) + # model_patcher.pre_patch() + # # get_statistics() + + # if dtype is None: + # dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 + # elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: + # logger.warning_once("Device does not support bfloat16. 
Will change to float16.") + # dtype = torch.float16 + + # assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) + + # # Check max sequence length + # model_config = AutoConfig.from_pretrained(model_name, token = token) + # model_max_seq_length = model_config.max_position_embeddings + + # # If max_seq_length is not specified, use maximum fron config + # if max_seq_length is None: + # max_seq_length = model_max_seq_length + # pass + + # # Mistral does NOT support RoPE Scaling sadly so we have to error out. + # if max_seq_length > model_max_seq_length: + # raise RuntimeError( + # f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ + # f"The maximum sequence length supported is {model_max_seq_length}.", + # ) + # pass + + # bnb_config = None + # if load_in_4bit: + # bnb_config = BitsAndBytesConfig( + # load_in_4bit = True, + # bnb_4bit_use_double_quant = True, + # bnb_4bit_quant_type = "nf4", + # bnb_4bit_compute_dtype = dtype, + # ) + + # max_position_embeddings = max(max_seq_length, model_max_seq_length) + # model = AutoModelForCausalLM.from_pretrained( + # model_name, + # device_map = device_map, + # torch_dtype = dtype, + # quantization_config = bnb_config, + # token = token, + # # rope_scaling = rope_scaling, + # trust_remote_code = trust_remote_code, + # **kwargs, + # ) + + # # Counteract saved tokenizers + # tokenizer_name = model_name if tokenizer_name is None else tokenizer_name + # tokenizer = load_correct_tokenizer( + # tokenizer_name, + # model_max_length = max_position_embeddings, + # padding_side = "right", + # token = token, + # trust_remote_code = trust_remote_code, + # ) + + # model, tokenizer = patch_tokenizer(model, tokenizer) + # model = model_patcher.post_patch(model) + + # # Patch up QKV / O and MLP + # for idx, layer in enumerate(model.model.layers): + # layer.self_attn.apply_qkv = original_apply_qkv + # layer.self_attn.apply_o = original_apply_o + # pass + + # # Patch Trainer + # from transformers.trainer import Trainer + # try: + # if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": + # inner_training_loop = inspect.getsource(Trainer._inner_training_loop) + # Trainer._original_training_loop = inner_training_loop + # else: + # inner_training_loop = Trainer._original_training_loop + # except: + # raise RuntimeError( + # "Our OSS was designed for people with few GPU resources to level the playing field.\n" + # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + # ) + # pass + + # import transformers.trainer + # items_in_trainer = dir(transformers.trainer) + # good_items = [] + # for item in items_in_trainer: + # # TODO: Support Deepspeed + # if item.startswith(("deepspeed", "xm", "met", "smp")): continue + # if item in inner_training_loop: good_items.append(item) + # pass + # exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) + + # start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] + # end = inner_training_loop.find("\n\n", start) + # original_debug = inner_training_loop[start:end] + # spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] + # front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) + + # debug_info = """debug_info = \\ + # f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ + # f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ + # f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ + # f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ + # f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' + # logger.warning(debug_info) + # import subprocess, re, gc + # output = subprocess.check_output( + # 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + # output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + # output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + # if output > 1: raise RuntimeError( + # 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') + # for _ in range(3): + # gc.collect() + # torch.cuda.empty_cache()""" + + # debug_info = debug_info.split('\n') + # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + # inner_training_loop = inner_training_loop.replace(original_debug, debug_info) + + # debug_info = """n_total_devices = total_train_batch_size // \\ + # args.gradient_accumulation_steps // self._train_batch_size + # if n_total_devices > 1: + # logger.warning_once( + # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + # "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + # ) + # debug_info =""" + # debug_info = debug_info.split('\n') + # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + # inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) + + # front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) + # inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) + # inner_training_loop = inner_training_loop.replace( + # "train_dataloader = tpu_spmd_dataloader(train_dataloader)", + # "raise RuntimeError('Unsloth: TPUs are not yet supported!')" + # ) + # inner_training_loop = inner_training_loop.replace( + # "self.accelerator.free_memory()", + # "self.accelerator.free_memory()\n" + \ + # front_spaces + "if self.is_deepspeed_enabled:"\ + # "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, + # ) + + # check_batches = """train_dataloader = self.get_train_dataloader() + # ga = args.gradient_accumulation_steps + # bsz = self._train_batch_size + # total_batches = bsz * ga * args.world_size + # n_total_devices = total_batches // ga // bsz + # if n_total_devices > 1: + # logger.warning_once( + # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + # "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + # ) + # divisor = n_total_devices / 1 + # bsz = self._train_batch_size = max(int(bsz / divisor), 1) + # if total_batches // ga // bsz > 1: + # divisor = n_total_devices / 1 + # ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" + # check_batches = check_batches.split('\n') + # check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) + # inner_training_loop = inner_training_loop.replace( + # "train_dataloader = self.get_train_dataloader()", + # check_batches, 1, + # ) + # inner_training_loop = inner_training_loop.replace( + # "_inner_training_loop", + # "_fast_inner_training_loop", 1, + # ) + # exec(inner_training_loop, globals()) + + # Trainer._inner_training_loop = _fast_inner_training_loop + # inner_training_loop = inner_training_loop.replace( + # "is_torch_tpu_available()", + # "False", + # ) + # if "n_total_devices >" not in inner_training_loop: + # raise RuntimeError( + # "Our OSS was designed for people with few GPU resources to level the playing field.\n" + # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + # ) + # pass + # inner_training_loop = inner_training_loop.replace( + # "is_sagemaker_mp_enabled()", + # "False", + # ) + # exec(inner_training_loop, globals()) + # Trainer._inner_training_loop = _fast_inner_training_loop + + # # Save max_seq_length + # max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) + # model.max_seq_length = max_position_embeddings + # internal_model = model + # while hasattr(internal_model, "model"): + # internal_model.max_seq_length = max_position_embeddings + # internal_model = internal_model.model + # pass + # internal_model.max_seq_length = max_position_embeddings + + # # We check the tokenizer first for errors + # if fix_tokenizer: + # tokenizer = check_tokenizer( + # model = model, + # tokenizer = tokenizer, + # model_name = model_name, + # model_max_length = max_position_embeddings, + # padding_side = "right", + # token = token, + # ) + # pass + # patch_saving_functions(tokenizer) + + # # Fix up config for transformers uploading PEFT + # # Not necessary anymore since we require transformers>=4.37 + # if False: + # name = model.config._name_or_path + # if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): + # name = name[:len(name) - len("-bnb-4bit")] + # model.config.update({"_name_or_path" : name}) + # pass - # Log Unsloth version for future fastpaths for inference - model.config.update({"unsloth_version" : __version__}) - - # Add save modules - patch_saving_functions(model) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save tokenizer for inference purposes - tokenizer.padding_side = "left" # Force inference - internal_model = model - while hasattr(internal_model, "model"): - internal_model._saved_temp_tokenizer = tokenizer - internal_model = internal_model.model - pass - internal_model._saved_temp_tokenizer = tokenizer + # # Log Unsloth version for future fastpaths for inference + # model.config.update({"unsloth_version" : __version__}) + + # # Add save modules + # patch_saving_functions(model) + # Trainer._inner_training_loop = _fast_inner_training_loop + + # # Save tokenizer for inference purposes + # tokenizer.padding_side = "left" # Force inference + # internal_model = model + # while hasattr(internal_model, "model"): + # internal_model._saved_temp_tokenizer = tokenizer + # internal_model = internal_model.model + # pass + # internal_model._saved_temp_tokenizer = tokenizer - return model, tokenizer - pass + # return model, tokenizer + # pass pass From 515b1ae45d031657bfc930b52e4ee63191f8c6c2 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:09:21 +1000 Subject: [PATCH 139/153] Update mistral.py --- unsloth/models/mistral.py | 596 ++++++++++++++++++++------------------ 1 file changed, 310 insertions(+), 286 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index d0af320e..1b89929d 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -287,291 +287,315 @@ def pre_patch(): pass - # @staticmethod - # def from_pretrained( - # model_name = "unsloth/mistral-7b-bnb-4bit", - # max_seq_length = None, - # dtype = None, - # load_in_4bit = True, - # token = None, - # device_map = "sequential", - # rope_scaling = None, # Mistral does not support RoPE scaling - # fix_tokenizer = True, - # model_patcher = None, - # tokenizer_name = None, - # trust_remote_code = False, - # **kwargs, - # ): - # if token is None and "HF_TOKEN" in os.environ: - # token = os.environ["HF_TOKEN"] - - # if token is None and "HUGGINGFACE_TOKEN" 
in os.environ: - # token = os.environ["HUGGINGFACE_TOKEN"] - - # if model_patcher is None: model_patcher = FastMistralModel - # # Mistral does NOT support RoPE Scaling! - # if rope_scaling is not None: - # logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") - # pass - - # SUPPORTS_BFLOAT16 = is_bfloat16_supported() - # gpu_stats = torch.cuda.get_device_properties(0) - # max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) - - # statistics = \ - # f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ - # f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ - # f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ - # f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ - # f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' - # print(statistics) - # model_patcher.pre_patch() - # # get_statistics() - - # if dtype is None: - # dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 - # elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: - # logger.warning_once("Device does not support bfloat16. Will change to float16.") - # dtype = torch.float16 - - # assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - - # # Check max sequence length - # model_config = AutoConfig.from_pretrained(model_name, token = token) - # model_max_seq_length = model_config.max_position_embeddings - - # # If max_seq_length is not specified, use maximum fron config - # if max_seq_length is None: - # max_seq_length = model_max_seq_length - # pass - - # # Mistral does NOT support RoPE Scaling sadly so we have to error out. 
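The dtype fallback that keeps reappearing in these from_pretrained variants can be isolated into a few lines. A minimal sketch, assuming only torch is installed and substituting torch.cuda.is_bf16_supported() for Unsloth's own is_bfloat16_supported helper (the helper name pick_compute_dtype is illustrative, not part of the codebase):

import torch

def pick_compute_dtype(requested = None):
    # Prefer bfloat16 on GPUs that support it, otherwise fall back to float16,
    # mirroring the dtype selection inside from_pretrained.
    supports_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    if requested is None:
        return torch.bfloat16 if supports_bf16 else torch.float16
    if requested == torch.bfloat16 and not supports_bf16:
        print("Device does not support bfloat16. Will change to float16.")
        return torch.float16
    # Anything other than fp16 / bf16 / fp32 is rejected early, as in the patch.
    assert requested in (torch.float16, torch.bfloat16, torch.float32)
    return requested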
- # if max_seq_length > model_max_seq_length: - # raise RuntimeError( - # f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ - # f"The maximum sequence length supported is {model_max_seq_length}.", - # ) - # pass - - # bnb_config = None - # if load_in_4bit: - # bnb_config = BitsAndBytesConfig( - # load_in_4bit = True, - # bnb_4bit_use_double_quant = True, - # bnb_4bit_quant_type = "nf4", - # bnb_4bit_compute_dtype = dtype, - # ) - - # max_position_embeddings = max(max_seq_length, model_max_seq_length) - # model = AutoModelForCausalLM.from_pretrained( - # model_name, - # device_map = device_map, - # torch_dtype = dtype, - # quantization_config = bnb_config, - # token = token, - # # rope_scaling = rope_scaling, - # trust_remote_code = trust_remote_code, - # **kwargs, - # ) - - # # Counteract saved tokenizers - # tokenizer_name = model_name if tokenizer_name is None else tokenizer_name - # tokenizer = load_correct_tokenizer( - # tokenizer_name, - # model_max_length = max_position_embeddings, - # padding_side = "right", - # token = token, - # trust_remote_code = trust_remote_code, - # ) - - # model, tokenizer = patch_tokenizer(model, tokenizer) - # model = model_patcher.post_patch(model) - - # # Patch up QKV / O and MLP - # for idx, layer in enumerate(model.model.layers): - # layer.self_attn.apply_qkv = original_apply_qkv - # layer.self_attn.apply_o = original_apply_o - # pass - - # # Patch Trainer - # from transformers.trainer import Trainer - # try: - # if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - # inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - # Trainer._original_training_loop = inner_training_loop - # else: - # inner_training_loop = Trainer._original_training_loop - # except: - # raise RuntimeError( - # "Our OSS was designed for people with few GPU resources to level the playing field.\n" - # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - # ) - # pass - - # import transformers.trainer - # items_in_trainer = dir(transformers.trainer) - # good_items = [] - # for item in items_in_trainer: - # # TODO: Support Deepspeed - # if item.startswith(("deepspeed", "xm", "met", "smp")): continue - # if item in inner_training_loop: good_items.append(item) - # pass - # exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - # start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - # end = inner_training_loop.find("\n\n", start) - # original_debug = inner_training_loop[start:end] - # spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - # front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - # debug_info = """debug_info = \\ - # f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - # f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - # f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - # f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ - # f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - # logger.warning(debug_info) - # import subprocess, re, gc - # output = subprocess.check_output( - # 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - # output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) - # output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) - # if output > 1: raise RuntimeError( - # 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') - # for _ in range(3): - # gc.collect() - # torch.cuda.empty_cache()""" - - # debug_info = debug_info.split('\n') - # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - # inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - # debug_info = """n_total_devices = total_train_batch_size // \\ - # args.gradient_accumulation_steps // self._train_batch_size - # if n_total_devices > 1: - # logger.warning_once( - # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - # "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - # ) - # debug_info =""" - # debug_info = debug_info.split('\n') - # debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - # inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - # front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - # inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - # inner_training_loop = inner_training_loop.replace( - # "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - # "raise RuntimeError('Unsloth: TPUs are not yet supported!')" - # ) - # inner_training_loop = inner_training_loop.replace( - # "self.accelerator.free_memory()", - # "self.accelerator.free_memory()\n" + \ - # front_spaces + "if self.is_deepspeed_enabled:"\ - # "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - # ) - - # check_batches = """train_dataloader = self.get_train_dataloader() - # ga = args.gradient_accumulation_steps - # bsz = self._train_batch_size - # total_batches = bsz * ga * args.world_size - # n_total_devices = total_batches // ga // bsz - # if n_total_devices > 1: - # logger.warning_once( - # "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - # "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - # "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - # "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - # ) - # divisor = n_total_devices / 1 - # bsz = self._train_batch_size = max(int(bsz / divisor), 1) - # if total_batches // ga // bsz > 1: - # divisor = n_total_devices / 1 - # ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - # check_batches = check_batches.split('\n') - # check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - # inner_training_loop = inner_training_loop.replace( - # "train_dataloader = self.get_train_dataloader()", - # check_batches, 1, - # ) - # inner_training_loop = inner_training_loop.replace( - # "_inner_training_loop", - # "_fast_inner_training_loop", 1, - # ) - # exec(inner_training_loop, globals()) - - # Trainer._inner_training_loop = _fast_inner_training_loop - # inner_training_loop = inner_training_loop.replace( - # "is_torch_tpu_available()", - # "False", - # ) - # if "n_total_devices >" not in inner_training_loop: - # raise RuntimeError( - # "Our OSS was designed for people with few GPU resources to level the playing field.\n" - # "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - # "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - # "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - # ) - # pass - # inner_training_loop = inner_training_loop.replace( - # "is_sagemaker_mp_enabled()", - # "False", - # ) - # exec(inner_training_loop, globals()) - # Trainer._inner_training_loop = _fast_inner_training_loop - - # # Save max_seq_length - # max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) - # model.max_seq_length = max_position_embeddings - # internal_model = model - # while hasattr(internal_model, "model"): - # internal_model.max_seq_length = max_position_embeddings - # internal_model = internal_model.model - # pass - # internal_model.max_seq_length = max_position_embeddings - - # # We check the tokenizer first for errors - # if fix_tokenizer: - # tokenizer = check_tokenizer( - # model = model, - # tokenizer = tokenizer, - # model_name = model_name, - # model_max_length = max_position_embeddings, - # padding_side = "right", - # token = token, - # ) - # pass - # patch_saving_functions(tokenizer) - - # # Fix up config for transformers uploading PEFT - # # Not necessary anymore since we require transformers>=4.37 - # if False: - # name = model.config._name_or_path - # if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - # name = name[:len(name) - len("-bnb-4bit")] - # model.config.update({"_name_or_path" : name}) - # pass - - # # Log Unsloth version for future fastpaths for inference - # model.config.update({"unsloth_version" : __version__}) - - # # Add save modules - # patch_saving_functions(model) - # Trainer._inner_training_loop = _fast_inner_training_loop - - # # Save tokenizer for inference purposes - # tokenizer.padding_side = "left" # Force inference - # internal_model = model - # while hasattr(internal_model, "model"): - # internal_model._saved_temp_tokenizer = tokenizer - # internal_model = internal_model.model - # pass - # internal_model._saved_temp_tokenizer = tokenizer + @staticmethod + def from_pretrained( + model_name = "unsloth/llama-2-7b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, + trust_remote_code = False, + **kwargs, + ): + if token is None and "HF_TOKEN" in os.environ: + token = os.environ["HF_TOKEN"] + + if token is None and "HUGGINGFACE_TOKEN" in os.environ: + token = os.environ["HUGGINGFACE_TOKEN"] + + if model_patcher is None: model_patcher = FastLlamaModel + SUPPORTS_BFLOAT16 = is_bfloat16_supported() + gpu_stats = torch.cuda.get_device_properties(0) + max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) + + statistics = \ + f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ + f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ + f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ + f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ + f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' + print(statistics) + model_patcher.pre_patch() + # get_statistics() + + if dtype is None: + dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 + elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: + logger.warning_once("Device does not support bfloat16. 
Will change to float16.") + dtype = torch.float16 + + assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) + + # RoPE scaling + model_max_seq_length = \ + AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings + + # If max_seq_length is not specified, use maximum fron config + if max_seq_length is None: + max_seq_length = model_max_seq_length + pass + + if (rope_scaling is None) and (max_seq_length > model_max_seq_length): + rope_scaling = max_seq_length / model_max_seq_length + logger.warning_once( + f"Unsloth: {model_name} can only handle sequence lengths of at most "\ + f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ + f"{round(rope_scaling, 3)}, it can be magically be extended to "\ + f"{max_seq_length}!" + ) + rope_scaling = {"type": "linear", "factor": rope_scaling,} + pass + + bnb_config = None + if load_in_4bit: + bnb_config = BitsAndBytesConfig( + load_in_4bit = True, + bnb_4bit_use_double_quant = True, + bnb_4bit_quant_type = "nf4", + bnb_4bit_compute_dtype = dtype, + ) + pass + + # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 + # RoPE Scaling's max_position_embeddings must be updated + max_position_embeddings = max(max_seq_length, model_max_seq_length) + try: + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + rope_scaling = rope_scaling, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + except Exception as error: + if "rope_scaling" in str(error): + if rope_scaling is not None: + raise TypeError("Unsloth: {model_name} does not support rope_scaling.") + pass + + # Counteract missing rope_scaling + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + else: + raise error + pass + pass + + # Counteract saved tokenizers + tokenizer_name = model_name if tokenizer_name is None else tokenizer_name + tokenizer = load_correct_tokenizer( + tokenizer_name = tokenizer_name, + model_max_length = max_position_embeddings, + padding_side = "right", + token = token, + trust_remote_code = trust_remote_code, + ) + + model, tokenizer = patch_tokenizer(model, tokenizer) + model = model_patcher.post_patch(model) + + # Patch up QKV / O and MLP + for idx, layer in enumerate(model.model.layers): + layer.self_attn.apply_qkv = original_apply_qkv + layer.self_attn.apply_o = original_apply_o + pass + + # Patch Trainer + from transformers.trainer import Trainer + try: + if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": + inner_training_loop = inspect.getsource(Trainer._inner_training_loop) + Trainer._original_training_loop = inner_training_loop + else: + inner_training_loop = Trainer._original_training_loop + except: + raise RuntimeError( + "Our OSS was designed for people with few GPU resources to level the playing field.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + ) + pass + + import transformers.trainer + items_in_trainer = dir(transformers.trainer) + good_items = [] + for item in items_in_trainer: + # TODO: Support Deepspeed + if item.startswith(("deepspeed", "xm", "met", "smp")): continue + if item in inner_training_loop: good_items.append(item) + pass + exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) + + start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] + end = inner_training_loop.find("\n\n", start) + original_debug = inner_training_loop[start:end] + spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] + front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) + + debug_info = """debug_info = \\ + f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ + f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ + f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ + f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ + f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' + logger.warning(debug_info) + import subprocess, re, gc + output = subprocess.check_output( + 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) + output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) + output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) + if output > 1: raise RuntimeError( + 'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.') + for _ in range(3): + gc.collect() + torch.cuda.empty_cache()""" + + debug_info = debug_info.split('\n') + debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + inner_training_loop = inner_training_loop.replace(original_debug, debug_info) + + debug_info = """n_total_devices = total_train_batch_size // \\ + args.gradient_accumulation_steps // self._train_batch_size + if n_total_devices > 1: + logger.warning_once( + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + ) + debug_info =""" + debug_info = debug_info.split('\n') + debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) + inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) + + front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) + inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) + inner_training_loop = inner_training_loop.replace( + "train_dataloader = tpu_spmd_dataloader(train_dataloader)", + "raise RuntimeError('Unsloth: TPUs are not yet supported!')" + ) + inner_training_loop = inner_training_loop.replace( + "self.accelerator.free_memory()", + "self.accelerator.free_memory()\n" + \ + front_spaces + "if self.is_deepspeed_enabled:"\ + "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, + ) + + check_batches = """train_dataloader = self.get_train_dataloader() + ga = args.gradient_accumulation_steps + bsz = self._train_batch_size + total_batches = bsz * ga * args.world_size + n_total_devices = total_batches // ga // bsz + if n_total_devices > 1: + logger.warning_once( + "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" + "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" + "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" + "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", + ) + divisor = n_total_devices / 1 + bsz = self._train_batch_size = max(int(bsz / divisor), 1) + if total_batches // ga // bsz > 1: + divisor = n_total_devices / 1 + ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" + check_batches = check_batches.split('\n') + check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) + inner_training_loop = inner_training_loop.replace( + "train_dataloader = self.get_train_dataloader()", + check_batches, 1, + ) + inner_training_loop = inner_training_loop.replace( + "_inner_training_loop", + "_fast_inner_training_loop", 1, + ) + exec(inner_training_loop, globals()) + + Trainer._inner_training_loop = _fast_inner_training_loop + inner_training_loop = inner_training_loop.replace( + "is_torch_tpu_available()", + "False", + ) + if "n_total_devices >" not in inner_training_loop: + raise RuntimeError( + "Our OSS was designed for people with few GPU resources to level the playing field.\n" + "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" + "We're a 2 person team, so we still have to fund our development costs - thanks!\n" + "If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", + ) + pass + inner_training_loop = inner_training_loop.replace( + "is_sagemaker_mp_enabled()", + "False", + ) + exec(inner_training_loop, globals()) + Trainer._inner_training_loop = _fast_inner_training_loop + + # Save max_seq_length + model.max_seq_length = max_position_embeddings + internal_model = model + while hasattr(internal_model, "model"): + internal_model.max_seq_length = max_position_embeddings + internal_model = internal_model.model + pass + internal_model.max_seq_length = max_position_embeddings + + # We check the tokenizer first for errors + if fix_tokenizer: + tokenizer = check_tokenizer( + model = model, + tokenizer = tokenizer, + model_name = model_name, + model_max_length = max_position_embeddings, + padding_side = "right", + token = token, + ) + pass + patch_saving_functions(tokenizer) + + # Fix up config for transformers uploading PEFT + # Not necessary anymore since we require transformers>=4.37! + if False: + name = model.config._name_or_path + if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): + name = name[:len(name) - len("-bnb-4bit")] + model.config.update({"_name_or_path" : name}) + pass + pass + + # Log Unsloth version for future fastpaths for inference + model.config.update({"unsloth_version" : __version__}) + + # Add save modules + patch_saving_functions(model) + Trainer._inner_training_loop = _fast_inner_training_loop + + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference + internal_model = model + while hasattr(internal_model, "model"): + internal_model._saved_temp_tokenizer = tokenizer + internal_model = internal_model.model + pass + internal_model._saved_temp_tokenizer = tokenizer - # return model, tokenizer - # pass + return model, tokenizer + pass pass From 7f28209ac5263ad997806da01dde2159185526f2 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:29:18 +1000 Subject: [PATCH 140/153] Update mistral.py --- unsloth/models/mistral.py | 84 ++++++++++++++------------------------- 1 file changed, 30 insertions(+), 54 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 1b89929d..ff2e909f 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -289,13 +289,13 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/llama-2-7b-bnb-4bit", + model_name = "unsloth/mistral-7b-bnb-4bit", max_seq_length = None, dtype = None, load_in_4bit = True, token = None, device_map = "sequential", - rope_scaling = None, + rope_scaling = None, # Mistral does not support RoPE scaling fix_tokenizer = True, model_patcher = None, tokenizer_name = None, @@ -308,7 +308,12 @@ def from_pretrained( if token is None and "HUGGINGFACE_TOKEN" in os.environ: token = os.environ["HUGGINGFACE_TOKEN"] - if model_patcher is None: model_patcher = FastLlamaModel + if model_patcher is None: model_patcher = FastMistralModel + # Mistral does NOT support RoPE Scaling! 
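Most of the from_pretrained body reinstated above is spent rewriting Trainer._inner_training_loop: grab its source, edit it as text, exec the result, and bind it back. A stripped-down sketch of that source-patching technique, under the assumption that the target method has no closures and that the replacements do not rename the def itself (patch_method is a hypothetical helper, not Unsloth API):

import inspect
import textwrap

def patch_method(cls, method_name, replacements, new_name = None):
    # inspect.getsource returns the method still indented by the class body,
    # so dedent it before applying plain textual replacements.
    source = textwrap.dedent(inspect.getsource(getattr(cls, method_name)))
    for old, new in replacements.items():
        source = source.replace(old, new)
    # Re-exec the rewritten function against the class's own module globals so
    # every name it refers to still resolves, then bind it back on the class.
    namespace = {}
    exec(source, vars(inspect.getmodule(cls)), namespace)
    patched = namespace[method_name]
    setattr(cls, new_name or method_name, patched)
    return patched

Executing against the class's own module globals avoids re-importing every name the function uses; the block above instead pulls the needed names out of transformers.trainer explicitly, replaces is_torch_tpu_available() and is_sagemaker_mp_enabled() with the literal False, splices extra guards around the dataloader setup, and rebinds the result as _fast_inner_training_loop.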
+ if rope_scaling is not None: + logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") + pass + SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) @@ -331,24 +336,21 @@ def from_pretrained( assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - # RoPE scaling - model_max_seq_length = \ - AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings + # Check max sequence length + model_config = AutoConfig.from_pretrained(model_name, token = token) + model_max_seq_length = model_config.max_position_embeddings # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: max_seq_length = model_max_seq_length pass - if (rope_scaling is None) and (max_seq_length > model_max_seq_length): - rope_scaling = max_seq_length / model_max_seq_length - logger.warning_once( - f"Unsloth: {model_name} can only handle sequence lengths of at most "\ - f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ - f"{round(rope_scaling, 3)}, it can be magically be extended to "\ - f"{max_seq_length}!" + # Mistral does NOT support RoPE Scaling sadly so we have to error out. + if max_seq_length > model_max_seq_length: + raise RuntimeError( + f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ + f"The maximum sequence length supported is {model_max_seq_length}.", ) - rope_scaling = {"type": "linear", "factor": rope_scaling,} pass bnb_config = None @@ -359,49 +361,23 @@ def from_pretrained( bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = dtype, ) - pass - # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 - # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) - try: - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - rope_scaling = rope_scaling, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - except Exception as error: - if "rope_scaling" in str(error): - if rope_scaling is not None: - raise TypeError("Unsloth: {model_name} does not support rope_scaling.") - pass - - # Counteract missing rope_scaling - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - else: - raise error - pass - pass + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + # rope_scaling = rope_scaling, + trust_remote_code = trust_remote_code, + **kwargs, + ) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name tokenizer = load_correct_tokenizer( - tokenizer_name = tokenizer_name, + tokenizer_name, model_max_length = max_position_embeddings, padding_side = "right", token = token, @@ -549,6 +525,7 @@ def from_pretrained( Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length + max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) model.max_seq_length = max_position_embeddings 
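The 4-bit loading path in every variant of from_pretrained boils down to a single BitsAndBytesConfig. A self-contained sketch of that configuration (the checkpoint name is taken from the diff and is only an example; any causal LM on the Hub works):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# NF4 weights, double quantization of the quantization constants, and a
# half-precision compute dtype -- the same settings used in the patch.
bnb_config = BitsAndBytesConfig(
    load_in_4bit              = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type       = "nf4",
    bnb_4bit_compute_dtype    = torch.bfloat16,  # or torch.float16 on older GPUs
)

model = AutoModelForCausalLM.from_pretrained(
    "unsloth/mistral-7b-bnb-4bit",   # example checkpoint from the diff
    device_map          = "sequential",
    torch_dtype         = torch.bfloat16,
    quantization_config = bnb_config,
)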
internal_model = model while hasattr(internal_model, "model"): @@ -571,15 +548,14 @@ def from_pretrained( patch_saving_functions(tokenizer) # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37! + # Not necessary anymore since we require transformers>=4.37 if False: name = model.config._name_or_path if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): name = name[:len(name) - len("-bnb-4bit")] model.config.update({"_name_or_path" : name}) pass - pass - + # Log Unsloth version for future fastpaths for inference model.config.update({"unsloth_version" : __version__}) From 453cc48660967ddcf4e1eeb707d2f29e77dde7b0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:32:58 +1000 Subject: [PATCH 141/153] Update mistral.py --- unsloth/models/mistral.py | 84 +++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index ff2e909f..6bf3fc84 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -289,13 +289,13 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", + model_name = "unsloth/llama-2-7b-bnb-4bit", max_seq_length = None, dtype = None, load_in_4bit = True, token = None, device_map = "sequential", - rope_scaling = None, # Mistral does not support RoPE scaling + rope_scaling = None, fix_tokenizer = True, model_patcher = None, tokenizer_name = None, @@ -308,12 +308,7 @@ def from_pretrained( if token is None and "HUGGINGFACE_TOKEN" in os.environ: token = os.environ["HUGGINGFACE_TOKEN"] - if model_patcher is None: model_patcher = FastMistralModel - # Mistral does NOT support RoPE Scaling! - if rope_scaling is not None: - logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.") - pass - + if model_patcher is None: model_patcher = FastLlamaModel SUPPORTS_BFLOAT16 = is_bfloat16_supported() gpu_stats = torch.cuda.get_device_properties(0) max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) @@ -336,21 +331,24 @@ def from_pretrained( assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - # Check max sequence length - model_config = AutoConfig.from_pretrained(model_name, token = token) - model_max_seq_length = model_config.max_position_embeddings + # RoPE scaling + model_max_seq_length = \ + AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: max_seq_length = model_max_seq_length pass - # Mistral does NOT support RoPE Scaling sadly so we have to error out. - if max_seq_length > model_max_seq_length: - raise RuntimeError( - f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ - f"The maximum sequence length supported is {model_max_seq_length}.", + if (rope_scaling is None) and (max_seq_length > model_max_seq_length): + rope_scaling = max_seq_length / model_max_seq_length + logger.warning_once( + f"Unsloth: {model_name} can only handle sequence lengths of at most "\ + f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ + f"{round(rope_scaling, 3)}, it can be magically be extended to "\ + f"{max_seq_length}!" 
) + rope_scaling = {"type": "linear", "factor": rope_scaling,} pass bnb_config = None @@ -361,23 +359,49 @@ def from_pretrained( bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = dtype, ) + pass + # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 + # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - # rope_scaling = rope_scaling, - trust_remote_code = trust_remote_code, - **kwargs, - ) + try: + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + # rope_scaling = rope_scaling, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + except Exception as error: + if "rope_scaling" in str(error): + if rope_scaling is not None: + raise TypeError("Unsloth: {model_name} does not support rope_scaling.") + pass + + # Counteract missing rope_scaling + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) + else: + raise error + pass + pass # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name tokenizer = load_correct_tokenizer( - tokenizer_name, + tokenizer_name = tokenizer_name, model_max_length = max_position_embeddings, padding_side = "right", token = token, @@ -525,7 +549,6 @@ def from_pretrained( Trainer._inner_training_loop = _fast_inner_training_loop # Save max_seq_length - max_position_embeddings = max(max_seq_length, model.config.max_position_embeddings) model.max_seq_length = max_position_embeddings internal_model = model while hasattr(internal_model, "model"): @@ -548,14 +571,15 @@ def from_pretrained( patch_saving_functions(tokenizer) # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37 + # Not necessary anymore since we require transformers>=4.37! 
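The RoPE-scaling decision that patches 139 through 142 keep moving between llama.py and mistral.py reduces to a ratio between the requested context length and the checkpoint's native max_position_embeddings. A standalone sketch under illustrative assumptions (the model name comes from the diff; the target length of 8192 is made up):

from transformers import AutoConfig, AutoModelForCausalLM

model_name     = "unsloth/llama-2-7b-bnb-4bit"   # example checkpoint from the diff
max_seq_length = 8192                            # assumed target context length

native_len = AutoConfig.from_pretrained(model_name).max_position_embeddings

rope_scaling = None
if max_seq_length > native_len:
    # Linear (kaiokendev-style) RoPE scaling: positions are stretched by a
    # constant factor so the longer context fits the trained position range.
    rope_scaling = {"type": "linear", "factor": max_seq_length / native_len}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    rope_scaling            = rope_scaling,
    # RoPE scaling's max_position_embeddings must be updated as well.
    max_position_embeddings = max(max_seq_length, native_len),
)

The try/except around from_pretrained in patch 141 appears to exist because some architectures reject an explicit rope_scaling keyword; patch 143 replaces that catch-and-retry with an upfront check of the modeling source instead.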
if False: name = model.config._name_or_path if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): name = name[:len(name) - len("-bnb-4bit")] model.config.update({"_name_or_path" : name}) pass - + pass + # Log Unsloth version for future fastpaths for inference model.config.update({"unsloth_version" : __version__}) From 6633d4a9bba8d94af0c6d1387a123edcd3aeb0b0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Fri, 14 Jun 2024 23:38:46 +1000 Subject: [PATCH 142/153] Update mistral.py --- unsloth/models/mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 6bf3fc84..1b89929d 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -371,7 +371,7 @@ def from_pretrained( torch_dtype = dtype, quantization_config = bnb_config, token = token, - # rope_scaling = rope_scaling, + rope_scaling = rope_scaling, max_position_embeddings = max_position_embeddings, trust_remote_code = trust_remote_code, **kwargs, From e5bf125140975f709ca0d69dd5d2b5ca2e0d8e06 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 00:31:38 +1000 Subject: [PATCH 143/153] Auto check rope scaling --- unsloth/models/llama.py | 94 +++++------ unsloth/models/mistral.py | 323 +++----------------------------------- unsloth/models/qwen2.py | 46 +++--- 3 files changed, 91 insertions(+), 372 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4cbbcf0a..9e84c6ee 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -51,6 +51,7 @@ pass from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from transformers import set_seed as transformers_set_seed from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model from peft import PeftModelForCausalLM @@ -1028,16 +1029,16 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/llama-2-7b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, + model_name = "unsloth/llama-3-8b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, trust_remote_code = False, **kwargs, ): @@ -1070,9 +1071,17 @@ def from_pretrained( assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - # RoPE scaling - model_max_seq_length = \ - AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings + # RoPE Scaling + model_config = AutoConfig.from_pretrained(model_name, token = token) + model_max_seq_length = model_config.max_position_embeddings + + # Check if RoPE Scaling is even allowed + model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__] + has_rope_scaling = False + try: + with open(inspect.getfile(model_function), "r") as file: + has_rope_scaling = "self.config.rope_scaling" in file.read() + except: pass # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: @@ -1080,6 +1089,18 @@ def from_pretrained( pass if (rope_scaling is None) and (max_seq_length > model_max_seq_length): + + # Warn RoPE scaling isn't allowed + if not has_rope_scaling: + raise RuntimeError( + f"Unsloth: {model_name} 
can only handle sequence lengths of at most "\ + f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ + f"{round(rope_scaling, 3)}, it should be magically be extended to "\ + f"{max_seq_length}. However, {model_name} doesn't support RoPE Scaling!\n"\ + "Please file a feature request at https://github.com/unslothai/unsloth." + ) + pass + rope_scaling = max_seq_length / model_max_seq_length logger.warning_once( f"Unsloth: {model_name} can only handle sequence lengths of at most "\ @@ -1088,6 +1109,9 @@ def from_pretrained( f"{max_seq_length}!" ) rope_scaling = {"type": "linear", "factor": rope_scaling,} + + # Add to kwargs + kwargs["rope_scaling"] = rope_scaling pass bnb_config = None @@ -1103,39 +1127,16 @@ def from_pretrained( # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 # RoPE Scaling's max_position_embeddings must be updated max_position_embeddings = max(max_seq_length, model_max_seq_length) - try: - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - rope_scaling = rope_scaling, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - except Exception as error: - if "rope_scaling" in str(error): - if rope_scaling is not None: - raise TypeError("Unsloth: {model_name} does not support rope_scaling.") - pass - - # Counteract missing rope_scaling - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - else: - raise error - pass - pass + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map = device_map, + torch_dtype = dtype, + quantization_config = bnb_config, + token = token, + max_position_embeddings = max_position_embeddings, + trust_remote_code = trust_remote_code, + **kwargs, + ) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name @@ -1423,7 +1424,6 @@ def get_peft_model( if loftq_config is None: loftq_config = {} - import inspect signature = str(inspect.signature(LoraConfig)) SUPPORTS_LOFTQ = "loftq_config" in signature SUPPORTS_RSLORA = "use_rslora" in signature diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 1b89929d..291f0aa5 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -289,313 +289,32 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "unsloth/llama-2-7b-bnb-4bit", - max_seq_length = None, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, + model_name = "unsloth/mistral-7b-bnb-4bit", + max_seq_length = None, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # Mistral does not support RoPE scaling + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, trust_remote_code = False, **kwargs, ): - if token is None and "HF_TOKEN" in os.environ: - token = os.environ["HF_TOKEN"] - - if token is None and "HUGGINGFACE_TOKEN" in os.environ: - token = os.environ["HUGGINGFACE_TOKEN"] - - if model_patcher is None: model_patcher = FastLlamaModel - SUPPORTS_BFLOAT16 = is_bfloat16_supported() - gpu_stats = 
torch.cuda.get_device_properties(0) - max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) - - statistics = \ - f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\ - f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ - f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ - f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\ - f' "-____-" Free Apache license: http://github.com/unslothai/unsloth' - print(statistics) - model_patcher.pre_patch() - # get_statistics() - - if dtype is None: - dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 - elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16: - logger.warning_once("Device does not support bfloat16. Will change to float16.") - dtype = torch.float16 - - assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32) - - # RoPE scaling - model_max_seq_length = \ - AutoConfig.from_pretrained(model_name, token = token).max_position_embeddings - - # If max_seq_length is not specified, use maximum fron config - if max_seq_length is None: - max_seq_length = model_max_seq_length - pass - - if (rope_scaling is None) and (max_seq_length > model_max_seq_length): - rope_scaling = max_seq_length / model_max_seq_length - logger.warning_once( - f"Unsloth: {model_name} can only handle sequence lengths of at most "\ - f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ - f"{round(rope_scaling, 3)}, it can be magically be extended to "\ - f"{max_seq_length}!" - ) - rope_scaling = {"type": "linear", "factor": rope_scaling,} - pass - - bnb_config = None - if load_in_4bit: - bnb_config = BitsAndBytesConfig( - load_in_4bit = True, - bnb_4bit_use_double_quant = True, - bnb_4bit_quant_type = "nf4", - bnb_4bit_compute_dtype = dtype, - ) - pass - - # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12 - # RoPE Scaling's max_position_embeddings must be updated - max_position_embeddings = max(max_seq_length, model_max_seq_length) - try: - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - rope_scaling = rope_scaling, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - except Exception as error: - if "rope_scaling" in str(error): - if rope_scaling is not None: - raise TypeError("Unsloth: {model_name} does not support rope_scaling.") - pass - - # Counteract missing rope_scaling - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map = device_map, - torch_dtype = dtype, - quantization_config = bnb_config, - token = token, - max_position_embeddings = max_position_embeddings, - trust_remote_code = trust_remote_code, - **kwargs, - ) - else: - raise error - pass - pass - - # Counteract saved tokenizers - tokenizer_name = model_name if tokenizer_name is None else tokenizer_name - tokenizer = load_correct_tokenizer( - tokenizer_name = tokenizer_name, - model_max_length = max_position_embeddings, - padding_side = "right", + return FastLlamaModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + 
model_patcher = FastMistralModel, + tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, + **kwargs, ) - - model, tokenizer = patch_tokenizer(model, tokenizer) - model = model_patcher.post_patch(model) - - # Patch up QKV / O and MLP - for idx, layer in enumerate(model.model.layers): - layer.self_attn.apply_qkv = original_apply_qkv - layer.self_attn.apply_o = original_apply_o - pass - - # Patch Trainer - from transformers.trainer import Trainer - try: - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - Trainer._original_training_loop = inner_training_loop - else: - inner_training_loop = Trainer._original_training_loop - except: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - pass - - import transformers.trainer - items_in_trainer = dir(transformers.trainer) - good_items = [] - for item in items_in_trainer: - # TODO: Support Deepspeed - if item.startswith(("deepspeed", "xm", "met", "smp")): continue - if item in inner_training_loop: good_items.append(item) - pass - exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - end = inner_training_loop.find("\n\n", start) - original_debug = inner_training_loop[start:end] - spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - debug_info = """debug_info = \\ - f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\ - f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning(debug_info) - import subprocess, re, gc - output = subprocess.check_output( - 'nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output) - output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output) - if output > 1: raise RuntimeError( - 'Error: More than 1 GPUs have a lot of VRAM usage. 
Please obtain a commercial license.') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache()""" - - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - debug_info = """n_total_devices = total_train_batch_size // \\ - args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - debug_info =""" - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - "raise RuntimeError('Unsloth: TPUs are not yet supported!')" - ) - inner_training_loop = inner_training_loop.replace( - "self.accelerator.free_memory()", - "self.accelerator.free_memory()\n" + \ - front_spaces + "if self.is_deepspeed_enabled:"\ - "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - ) - - check_batches = """train_dataloader = self.get_train_dataloader() - ga = args.gradient_accumulation_steps - bsz = self._train_batch_size - total_batches = bsz * ga * args.world_size - n_total_devices = total_batches // ga // bsz - if n_total_devices > 1: - logger.warning_once( - "* Our OSS was designed for people with few GPU resources to level the playing field.\\n" - "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n" - "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n" - "* If you don't, please consider at least sponsoring us through Ko-fi! 
Appreciate it!", - ) - divisor = n_total_devices / 1 - bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 1: - divisor = n_total_devices / 1 - ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - check_batches = check_batches.split('\n') - check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = self.get_train_dataloader()", - check_batches, 1, - ) - inner_training_loop = inner_training_loop.replace( - "_inner_training_loop", - "_fast_inner_training_loop", 1, - ) - exec(inner_training_loop, globals()) - - Trainer._inner_training_loop = _fast_inner_training_loop - inner_training_loop = inner_training_loop.replace( - "is_torch_tpu_available()", - "False", - ) - if "n_total_devices >" not in inner_training_loop: - raise RuntimeError( - "Our OSS was designed for people with few GPU resources to level the playing field.\n" - "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n" - "We're a 2 person team, so we still have to fund our development costs - thanks!\n" - "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!", - ) - pass - inner_training_loop = inner_training_loop.replace( - "is_sagemaker_mp_enabled()", - "False", - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save max_seq_length - model.max_seq_length = max_position_embeddings - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_position_embeddings - internal_model = internal_model.model - pass - internal_model.max_seq_length = max_position_embeddings - - # We check the tokenizer first for errors - if fix_tokenizer: - tokenizer = check_tokenizer( - model = model, - tokenizer = tokenizer, - model_name = model_name, - model_max_length = max_position_embeddings, - padding_side = "right", - token = token, - ) - pass - patch_saving_functions(tokenizer) - - # Fix up config for transformers uploading PEFT - # Not necessary anymore since we require transformers>=4.37! - if False: - name = model.config._name_or_path - if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - name = name[:len(name) - len("-bnb-4bit")] - model.config.update({"_name_or_path" : name}) - pass - pass - - # Log Unsloth version for future fastpaths for inference - model.config.update({"unsloth_version" : __version__}) - - # Add save modules - patch_saving_functions(model) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save tokenizer for inference purposes - tokenizer.padding_side = "left" # Force inference - internal_model = model - while hasattr(internal_model, "model"): - internal_model._saved_temp_tokenizer = tokenizer - internal_model = internal_model.model - pass - internal_model._saved_temp_tokenizer = tokenizer - - return model, tokenizer pass pass diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py index 47327280..984bf7ca 100644 --- a/unsloth/models/qwen2.py +++ b/unsloth/models/qwen2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .mistral import * +from .llama import * from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, @@ -32,7 +32,7 @@ pass -class FastQwen2Model(FastMistralModel): +class FastQwen2Model(FastLlamaModel): @staticmethod def pre_patch(): @@ -57,30 +57,30 @@ def pre_patch(): @staticmethod def from_pretrained( - model_name = "Qwen/Qwen2-7B", - max_seq_length = 4096, - dtype = None, - load_in_4bit = True, - token = None, - device_map = "sequential", - rope_scaling = None, # Qwen2 does not support RoPE scaling - fix_tokenizer = True, - model_patcher = None, - tokenizer_name = None, + model_name = "Qwen/Qwen2-7B", + max_seq_length = 4096, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # Qwen2 does not support RoPE scaling + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, trust_remote_code = False, **kwargs, ): - return FastMistralModel.from_pretrained( - model_name = model_name, - max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, - token = token, - device_map = device_map, - rope_scaling = rope_scaling, - fix_tokenizer = fix_tokenizer, - model_patcher = FastQwen2Model, - tokenizer_name = tokenizer_name, + return FastLlamaModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + model_patcher = FastQwen2Model, + tokenizer_name = tokenizer_name, trust_remote_code = trust_remote_code, **kwargs, ) From 341565bba38753031369890bdfb561f1c6017217 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 00:35:24 +1000 Subject: [PATCH 144/153] Update llama.py --- unsloth/models/llama.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9e84c6ee..7cbdcfbd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1090,24 +1090,23 @@ def from_pretrained( if (rope_scaling is None) and (max_seq_length > model_max_seq_length): - # Warn RoPE scaling isn't allowed - if not has_rope_scaling: - raise RuntimeError( - f"Unsloth: {model_name} can only handle sequence lengths of at most "\ - f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ - f"{round(rope_scaling, 3)}, it should be magically be extended to "\ - f"{max_seq_length}. However, {model_name} doesn't support RoPE Scaling!\n"\ - "Please file a feature request at https://github.com/unslothai/unsloth." - ) - pass - rope_scaling = max_seq_length / model_max_seq_length + logger.warning_once( f"Unsloth: {model_name} can only handle sequence lengths of at most "\ f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\ f"{round(rope_scaling, 3)}, it can be magically be extended to "\ f"{max_seq_length}!" ) + + # Warn RoPE scaling isn't allowed + if not has_rope_scaling: + raise RuntimeError( + "However, {model_name} doesn't support RoPE Scaling!\n"\ + "Please file a feature request at https://github.com/unslothai/unsloth." 
+ ) + pass + rope_scaling = {"type": "linear", "factor": rope_scaling,} # Add to kwargs From dd3c6b1d39dc253c50b71dcd133479c512a1cf35 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 17:44:23 +1000 Subject: [PATCH 145/153] Update llama.py --- unsloth/models/llama.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7cbdcfbd..6702aefb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -51,7 +51,6 @@ pass from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from transformers import set_seed as transformers_set_seed from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model from peft import PeftModelForCausalLM @@ -1076,12 +1075,7 @@ def from_pretrained( model_max_seq_length = model_config.max_position_embeddings # Check if RoPE Scaling is even allowed - model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__] - has_rope_scaling = False - try: - with open(inspect.getfile(model_function), "r") as file: - has_rope_scaling = "self.config.rope_scaling" in file.read() - except: pass + has_rope_scaling = hasattr(model_config, "rope_scaling") # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: From 6d1ae234a4cf95f3dca9fb17967c4ba04a0e3408 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 17:46:45 +1000 Subject: [PATCH 146/153] Update llama.py --- unsloth/models/llama.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6702aefb..7cbdcfbd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -51,6 +51,7 @@ pass from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from transformers import set_seed as transformers_set_seed from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model from peft import PeftModelForCausalLM @@ -1075,7 +1076,12 @@ def from_pretrained( model_max_seq_length = model_config.max_position_embeddings # Check if RoPE Scaling is even allowed - has_rope_scaling = hasattr(model_config, "rope_scaling") + model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__] + has_rope_scaling = False + try: + with open(inspect.getfile(model_function), "r") as file: + has_rope_scaling = "self.config.rope_scaling" in file.read() + except: pass # If max_seq_length is not specified, use maximum fron config if max_seq_length is None: From d855ef9f620b4d1e2c4ce3aa405185f68e59fe19 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 18:14:21 +1000 Subject: [PATCH 147/153] GPU support --- unsloth/models/_utils.py | 7 +++++-- unsloth/models/gemma.py | 11 ++++++++--- unsloth/models/llama.py | 27 +++++++++++++++++---------- unsloth/models/mistral.py | 4 +++- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a6933893..09d59c0a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -372,6 +372,10 @@ def prepare_n_gradient_checkpoints( pass +# Unsloth only works on NVIDIA GPUs for now +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device = f"cuda:{device_ids[:device_ids.find(',')]}" + class 
Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): """ Saves VRAM by smartly offloading to RAM. @@ -393,7 +397,7 @@ def forward(ctx, forward_function, hidden_states, *args): @torch.cuda.amp.custom_bwd def backward(ctx, dY): (hidden_states,) = ctx.saved_tensors - hidden_states = hidden_states.to("cuda", non_blocking = True).detach() + hidden_states = hidden_states.to(device, non_blocking = True).detach() hidden_states.requires_grad = True with torch.enable_grad(): (output,) = ctx.forward_function(hidden_states, *ctx.args) @@ -457,7 +461,6 @@ def _prepare_backend( # Offloading to disk for modules (lm_head, embed_tokens) -import os import pickle def offload_to_disk(W, model, name, temporary_location : str = "_unsloth_temporary_saved_buffers"): diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 5dd2a5ab..02355a42 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -38,6 +38,11 @@ GemmaFlashAttention2 = GemmaAttention pass +# Unsloth currently only works on one GPU +import os +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device = f"cuda:{device_ids[:device_ids.find(',')]}" +# Please obtain a commercial license torch_nn_functional_gelu = torch.nn.functional.gelu def fast_geglu_inference(self, X): @@ -45,7 +50,7 @@ def fast_geglu_inference(self, X): # up = self.up_proj(X) bsz, _, hd = X.shape # mlp_size = self.config.intermediate_size - # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda") + # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = device) gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0]) up = fast_linear_forward(self. up_proj, X)#, out = temp[1]) @@ -72,7 +77,7 @@ def GemmaDecoderLayer_fast_forward( *args, **kwargs, ): if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None: - out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = "cuda") + out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = device) # Self Attention residual = hidden_states @@ -134,7 +139,7 @@ def GemmaModel_fast_forward_inference( position_ids, attention_mask = None, ): - out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = "cuda") + out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = device) input_ids = input_ids[:,:self.max_seq_length] hidden_states = self.model.embed_tokens(input_ids) hidden_states = hidden_states.to(self.config.torch_dtype) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7cbdcfbd..274e6bfd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -74,6 +74,9 @@ def original_apply_o(self, X): return O pass +import os # Unsloth only works on NVIDIA GPUs for now +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device = f"cuda:{device_ids[:device_ids.find(',')]}" from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size @@ -132,15 +135,15 @@ def LlamaAttention_fast_forward_inference( # Prefill phase # if not hasattr(self, "paged_attention"): if do_prefill: - self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda") + self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = device) self.paged_attention_K = self.paged_attention[:,0] self.paged_attention_V = 
self.paged_attention[:,1] self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3) self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3) - self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda") - self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda") - self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda") - self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda") + self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = device) + self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = device) + self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = device) + self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = device) self.scalar = 1.0 / math_sqrt(self.head_dim) self.half_head_dim = head_dim // 2 elif kv_seq_len >= self.paged_attention.shape[0]: @@ -170,7 +173,7 @@ def LlamaAttention_fast_forward_inference( Qn *= cos Qn.addcmul_(RH_Q, sin) - RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda") + RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = device) RH_K[:,:,:,:h] = Kn[:,:,:,h:] RH_K[:,:,:,h:] = Kn[:,:,:,:h] torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h]) @@ -232,7 +235,7 @@ def fast_swiglu_inference(self, X): # up = self.up_proj(X) bsz, _, hd = X.shape # mlp_size = self.config.intermediate_size - # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda") + # temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = device) gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0]) up = fast_linear_forward(self. 
up_proj, X)#, out = temp[1]) @@ -522,7 +525,7 @@ def LlamaModel_fast_forward( position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype = torch.int32, - device = "cuda", + device = device, ) position_ids = position_ids.unsqueeze(0).view(-1, seq_length) elif position_ids is not None: @@ -842,8 +845,10 @@ def _CausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 - self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda") + self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]])) @@ -1822,7 +1827,9 @@ def patch_peft_model( # Patch cross entropy loss labels # Fixes https://github.com/unslothai/unsloth/issues/10 max_seq_length = model.max_seq_length - extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = device) model.model.extra_ignored_labels = extra_ignored_labels internal_model = model while hasattr(internal_model, "model"): diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 291f0aa5..d41de54d 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -239,8 +239,10 @@ def MistralForCausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 - self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda") + self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]])) From 66564461513dee04e897f83dbc3dd16c1cd82550 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 18:21:33 +1000 Subject: [PATCH 148/153] Typo --- unsloth/models/_utils.py | 2 +- unsloth/models/gemma.py | 2 +- unsloth/models/llama.py | 6 +++--- unsloth/models/mistral.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 09d59c0a..1b122fc8 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -373,7 +373,7 @@ def prepare_n_gradient_checkpoints( # Unsloth only works on NVIDIA GPUs for now -device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" class Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 02355a42..98502836 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -40,7 +40,7 @@ # Unsloth currently only works on one GPU import os -device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") 
+ "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Please obtain a commercial license diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 274e6bfd..9327b1bb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -75,7 +75,7 @@ def original_apply_o(self, X): pass import os # Unsloth only works on NVIDIA GPUs for now -device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") +device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" from math import sqrt as math_sqrt @@ -845,7 +845,7 @@ def _CausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): - device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) @@ -1827,7 +1827,7 @@ def patch_peft_model( # Patch cross entropy loss labels # Fixes https://github.com/unslothai/unsloth/issues/10 max_seq_length = model.max_seq_length - device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = device) model.model.extra_ignored_labels = extra_ignored_labels diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index d41de54d..e147f215 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -239,7 +239,7 @@ def MistralForCausalLM_fast_forward( if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): - device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) From 9bd5fad07d5961069b895eca530e09913d15812d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 22:04:08 +1000 Subject: [PATCH 149/153] Update gemma.py --- unsloth/models/gemma.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 98502836..0cc047d2 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -38,11 +38,9 @@ GemmaFlashAttention2 = GemmaAttention pass -# Unsloth currently only works on one GPU import os device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," -device = f"cuda:{device_ids[:device_ids.find(',')]}" -# Please obtain a commercial license +device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now torch_nn_functional_gelu = torch.nn.functional.gelu def fast_geglu_inference(self, X): From a3061b624baebf6fa99eeff1417f0595b8085883 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 15 Jun 2024 22:10:05 +1000 Subject: [PATCH 150/153] gpu --- unsloth/models/_utils.py | 3 ++- unsloth/models/llama.py | 9 ++++++--- unsloth/models/mistral.py | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 1b122fc8..49b8ba39 100644 
--- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -374,7 +374,8 @@ def prepare_n_gradient_checkpoints( # Unsloth only works on NVIDIA GPUs for now device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," -device = f"cuda:{device_ids[:device_ids.find(',')]}" +device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now +device = f"cuda:{device if device.isdigit() else '0'}" class Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function): """ diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 9327b1bb..f2f79de8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -76,7 +76,8 @@ def original_apply_o(self, X): import os # Unsloth only works on NVIDIA GPUs for now device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," -device = f"cuda:{device_ids[:device_ids.find(',')]}" +device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now +device = f"cuda:{device if device.isdigit() else '0'}" from math import sqrt as math_sqrt KV_CACHE_INCREMENT = 256 # KV Cache update size @@ -846,7 +847,8 @@ def _CausalLM_fast_forward( shift_logits = logits if not hasattr(self, "extra_ignored_labels"): device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," - device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now + device = f"cuda:{device if device.isdigit() else '0'}" # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass @@ -1828,7 +1830,8 @@ def patch_peft_model( # Fixes https://github.com/unslothai/unsloth/issues/10 max_seq_length = model.max_seq_length device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," - device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now + device = f"cuda:{device if device.isdigit() else '0'}" extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = device) model.model.extra_ignored_labels = extra_ignored_labels internal_model = model diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index e147f215..832189be 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -240,7 +240,8 @@ def MistralForCausalLM_fast_forward( shift_logits = logits if not hasattr(self, "extra_ignored_labels"): device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + "," - device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now + device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now + device = f"cuda:{device if device.isdigit() else '0'}" # Fixes https://github.com/unslothai/unsloth/issues/10 self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device) pass From 513bd4d28981893327f0c0ff49af0b529522994a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 16 Jun 2024 00:06:05 +1000 Subject: [PATCH 151/153] Multiple GGUF saving --- unsloth/save.py | 224 +++++++++++++++++++++++++++++------------------- 1 file changed, 134 insertions(+), 90 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index cae59cae..17d6962f 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -418,6 +418,11 @@ def unsloth_save_model( print("Unsloth: Saving model...", end = "") if save_method != 
"lora": print(" This might take 10 minutes for Llama-7b...", end = "") + # [TODO] Is this correct? + if save_method == "lora": + save_pretrained_settings["selected_adapters"] = None + pass + model.save_pretrained(**save_pretrained_settings) if push_to_hub and hasattr(model, "config"): @@ -649,8 +654,9 @@ def unsloth_save_model( model.config = new_config # Save! - + # [TODO] --> is this correct? save_pretrained_settings["selected_adapters"] = None + # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username): print(f"Unsloth: Saving to organization with address {new_save_directory}") @@ -834,7 +840,7 @@ def save_to_gguf( model_dtype : str, is_sentencepiece : bool = False, model_directory : str = "unsloth_finetuned_model", - quantization_method : str = "fast_quantized", + quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"] first_conversion : str = None, _run_installer = None, # Non blocking install of llama.cpp ): @@ -846,6 +852,10 @@ def save_to_gguf( assert(model_dtype == "float16" or model_dtype == "bfloat16") model_dtype = "f16" if model_dtype == "float16" else "bf16" + # Convert quantization_method to list + quantization_method = \ + quantization_method if type(quantization_method) is list else list(quantization_method) + # Check if bfloat16 is supported if model_dtype == "bf16" and not torch.cuda.is_bf16_supported(): logger.warning( @@ -860,8 +870,11 @@ def save_to_gguf( first_conversion = model_dtype pass - if quantization_method.startswith("iq2"): - raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") + # Check I quants + for quant_method in quantization_method: + if quant_method.startswith("iq2"): + raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") + pass # Careful convert.py is only for Llama / Mistral based archs use_fast_convert = False @@ -871,25 +884,32 @@ def save_to_gguf( pass logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.") - if quantization_method == "not_quantized": quantization_method = model_dtype - elif quantization_method == "fast_quantized": quantization_method = "q8_0" - elif quantization_method == "quantized": quantization_method = "q4_k_m" - elif quantization_method is None: quantization_method = "q8_0" - pass + # Map quant methods + new_quantization_method = [] + for quant_method in quantization_method: + if quant_method == "not_quantized": quantization_method = model_dtype + elif quant_method == "fast_quantized": quantization_method = "q8_0" + elif quant_method == "quantized": quantization_method = "q4_k_m" + elif quant_method is None: quantization_method = "q8_0" + + # Check if wrong method + if quantization_method not in ALLOWED_QUANTS.keys(): + error = f"Unsloth: Quant method = [{quantization_method}] not supported. Choose from below:\n" + for key, value in ALLOWED_QUANTS.items(): + error += f"[{key}] => {value}\n" + raise RuntimeError(error) + pass - if quantization_method not in ALLOWED_QUANTS.keys(): - error = f"Unsloth: Quant method = [{quantization_method}] not supported. 
Choose from below:\n" - for key, value in ALLOWED_QUANTS.items(): - error += f"[{key}] => {value}\n" - raise RuntimeError(error) + new_quantization_method.append(quant_method) pass + quantization_method = new_quantization_method print_info = \ f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\ f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\ f"O^O/ \_/ \\ [1] Converting HF to GUUF 16bits will take 3 minutes.\n"\ - f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 20 minutes.\n"\ - f' "-____-" In total, you will have to wait around 26 minutes.\n' + f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 10 minutes each.\n"\ + f' "-____-" In total, you will have to wait at least 16 minutes.\n' print(print_info) # Check first_conversion format @@ -928,24 +948,37 @@ def save_to_gguf( install_llama_cpp_old(-10) pass - if quantization_method == "f32": first_conversion = "f32" - elif quantization_method == "f16": first_conversion = "f16" - elif quantization_method == "bf16": first_conversion = "bf16" - elif quantization_method == "q8_0": first_conversion = "q8_0" - else: - # Quantized models must have f16 as the default argument - if first_conversion == "f32" : pass - elif first_conversion == "f16" : pass - elif first_conversion == "bf16" : pass - elif first_conversion == "q8_0": - logger.warning_once( - "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\ - "but saves disk space!" - ) - # first_conversion = "f16" + # Determine maximum first_conversion state + if first_conversion == "f32" : strength = 3 + elif first_conversion == "f16" : strength = 2 + elif first_conversion == "bf16" : strength = 1 + elif first_conversion == "q8_0" : strength = 0 + + for quant_method in quantization_method: + if quant_method == "f32": strength = max(strength, 3) + elif quant_method == "f16": strength = max(strength, 2) + elif quant_method == "bf16": strength = max(strength, 1) + elif quant_method == "q8_0": strength = max(strength, 0) + else: + # Quantized models must have f16 as the default argument + if first_conversion == "f32" : pass + elif first_conversion == "f16" : pass + elif first_conversion == "bf16" : pass + elif first_conversion == "q8_0": + logger.warning_once( + "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\ + "but saves disk space!" + ) + # first_conversion = "f16" + pass pass pass + if strength >= 3: first_conversion = "f32" + elif strength >= 2: first_conversion = "f16" + elif strength >= 1: first_conversion = "bf16" + else: first_conversion = "q8_0" + # Non llama/mistral needs can only use f32 or f16 if not use_fast_convert and \ (first_conversion != "f16" or first_conversion != "bf16" or first_conversion != "f32"): @@ -1033,52 +1066,58 @@ def save_to_gguf( pass print(f"Unsloth: Conversion completed! Output location: {final_location}") - if quantization_method != first_conversion: - old_location = final_location - print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. 
This will take 20 minutes...") - final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf" + full_precision_location = final_location - command = f"./{quantize_location} {old_location} "\ - f"{final_location} {quantization_method} {n_cpus}" - - # quantize uses stderr - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: - for line in sp.stdout: - line = line.decode("utf-8", errors = "replace") - if "undefined reference" in line: - raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!") - print(line, flush = True, end = "") - if sp.returncode is not None and sp.returncode != 0: - raise subprocess.CalledProcessError(sp.returncode, sp.args) - pass + all_saved_locations = [] + # Convert each type! + for quant_method in quantization_method: + if quant_method != first_conversion: + print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This will take 20 minutes...") + final_location = f"./{model_directory}-unsloth.{quant_method.upper()}.gguf" - # Check if quantization succeeded! - if not os.path.isfile(final_location): - if IS_KAGGLE_ENVIRONMENT: - raise RuntimeError( - f"Unsloth: Quantization failed for {final_location}\n"\ - "You are in a Kaggle environment, which might be the reason this is failing.\n"\ - "Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\ - "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\ - "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\ - "I suggest you to save the 16bit model first, then use manual llama.cpp conversion." - ) - else: - raise RuntimeError( - "Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\ - "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ - "You must run this in the same folder as you're saving your model.\n"\ - "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ - "cd llama.cpp && make clean && make all -j\n"\ - "Once that's done, redo the quantization." - ) + command = f"./{quantize_location} {full_precision_location} "\ + f"{final_location} {quant_method} {n_cpus}" + + # quantize uses stderr + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: + for line in sp.stdout: + line = line.decode("utf-8", errors = "replace") + if "undefined reference" in line: + raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!") + print(line, flush = True, end = "") + if sp.returncode is not None and sp.returncode != 0: + raise subprocess.CalledProcessError(sp.returncode, sp.args) pass - pass - print(f"Unsloth: Conversion completed! Output location: {final_location}") + # Check if quantization succeeded! + if not os.path.isfile(final_location): + if IS_KAGGLE_ENVIRONMENT: + raise RuntimeError( + f"Unsloth: Quantization failed for {final_location}\n"\ + "You are in a Kaggle environment, which might be the reason this is failing.\n"\ + "Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\ + "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\ + "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\ + "I suggest you to save the 16bit model first, then use manual llama.cpp conversion." + ) + else: + raise RuntimeError( + "Unsloth: Quantization failed! 
You might have to compile llama.cpp yourself, then run this again.\n"\ + "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ + "You must run this in the same folder as you're saving your model.\n"\ + "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ + "cd llama.cpp && make clean && make all -j\n"\ + "Once that's done, redo the quantization." + ) + pass + pass + + print(f"Unsloth: Conversion completed! Output location: {final_location}") + all_saved_locations.append(final_location) + pass pass - return final_location + return all_saved_locations pass @@ -1453,7 +1492,7 @@ def unsloth_save_pretrained_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, + all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) @@ -1466,14 +1505,17 @@ def unsloth_save_pretrained_gguf( if push_to_hub: print("Unsloth: Uploading GGUF to Huggingface Hub...") - username = upload_to_huggingface( - self, save_directory, token, - "GGUF converted", "gguf", file_location, old_username, private, - ) - link = f"{username}/{new_save_directory.lstrip('/.')}" \ - if username not in new_save_directory else \ - new_save_directory.lstrip('/.') - print(f"Saved GGUF to https://huggingface.co/{link}") + + for file_location in all_file_locations: + username = upload_to_huggingface( + self, save_directory, token, + "GGUF converted", "gguf", file_location, old_username, private, + ) + link = f"{username}/{new_save_directory.lstrip('/.')}" \ + if username not in new_save_directory else \ + new_save_directory.lstrip('/.') + print(f"Saved GGUF to https://huggingface.co/{link}") + pass pass pass @@ -1604,20 +1646,22 @@ def unsloth_push_to_hub_gguf( is_sentencepiece_model = check_if_sentencepiece_model(self) # Save to GGUF - file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, + all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) - print("Unsloth: Uploading GGUF to Huggingface Hub...") - username = upload_to_huggingface( - self, repo_id, token, - "GGUF converted", "gguf", file_location, old_username, private, - ) - link = f"{username}/{new_save_directory.lstrip('/.')}" \ - if username not in new_save_directory else \ - new_save_directory.lstrip('/.') + for file_location in all_file_locations: + print("Unsloth: Uploading GGUF to Huggingface Hub...") + username = upload_to_huggingface( + self, repo_id, token, + "GGUF converted", "gguf", file_location, old_username, private, + ) + link = f"{username}/{new_save_directory.lstrip('/.')}" \ + if username not in new_save_directory else \ + new_save_directory.lstrip('/.') - print(f"Saved GGUF to https://huggingface.co/{link}") + print(f"Saved GGUF to https://huggingface.co/{link}") + pass if fix_bos_token: logger.warning( From fb54fbbc005595468ddc5e1d4f3107395a7d69c6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 16 Jun 2024 01:16:05 +1000 Subject: [PATCH 152/153] Update save.py --- unsloth/save.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 17d6962f..f7efcc44 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -893,8 +893,8 @@ def save_to_gguf( elif quant_method is None: quantization_method = "q8_0" # Check if wrong method - if 
quantization_method not in ALLOWED_QUANTS.keys(): - error = f"Unsloth: Quant method = [{quantization_method}] not supported. Choose from below:\n" + if quant_method not in ALLOWED_QUANTS.keys(): + error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n" for key, value in ALLOWED_QUANTS.items(): error += f"[{key}] => {value}\n" raise RuntimeError(error) From 4cba3e2c6c1c844821cf3225fe9dbeb9a23004ae Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 16 Jun 2024 02:57:33 +1000 Subject: [PATCH 153/153] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index f7efcc44..940feb40 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -655,7 +655,7 @@ def unsloth_save_model( # Save! # [TODO] --> is this correct? - save_pretrained_settings["selected_adapters"] = None + # save_pretrained_settings["selected_adapters"] = None # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username):
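The final patches (151-153) extend GGUF saving so `quantization_method` may be either a single string or a list of quant names, with each entry converted in turn. A minimal sketch of the normalisation step follows; the helper name, the default dtype, and the alias table are assumptions for illustration only, mirroring the mapping used inside `save_to_gguf`.

def normalize_quant_methods(quantization_method, model_dtype="f16"):
    # Hypothetical helper sketching the list handling in save_to_gguf.
    # Wrap a bare string in a list explicitly: list("q8_0") would split it
    # into individual characters, which is not the intent here.
    if not isinstance(quantization_method, (list, tuple)):
        quantization_method = [quantization_method]
    # Map the friendly aliases onto concrete llama.cpp quant names.
    aliases = {
        None             : "q8_0",
        "not_quantized"  : model_dtype,
        "fast_quantized" : "q8_0",
        "quantized"      : "q4_k_m",
    }
    return [aliases.get(method, method) for method in quantization_method]

# Example usage:
# normalize_quant_methods(["fast_quantized", "q5_k_m"])  ->  ["q8_0", "q5_k_m"]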