Nightly #648

Merged 168 commits on Jun 15, 2024

Commits (168)
7df08c4
Update llama.py
danielhanchen May 19, 2024
ba5b6ce
offload
danielhanchen May 19, 2024
a07057e
Update llama.py
danielhanchen May 19, 2024
4be9063
Update llama.py
danielhanchen May 19, 2024
3dc3d3f
Update llama.py
danielhanchen May 19, 2024
f1cc1e8
Update llama.py
danielhanchen May 19, 2024
5cb531a
Update llama.py
danielhanchen May 19, 2024
6bd8e60
Update llama.py
danielhanchen May 19, 2024
d1d57ff
Update llama.py
danielhanchen May 19, 2024
7470f67
continued pretraining trainer
danielhanchen May 20, 2024
da9c1a6
Update trainer.py
danielhanchen May 20, 2024
2c68f56
Update trainer.py
danielhanchen May 20, 2024
217bf9d
Update trainer.py
danielhanchen May 20, 2024
6e85384
Update trainer.py
danielhanchen May 21, 2024
77f9c51
is_bfloat16_supported
danielhanchen May 21, 2024
c0e1d27
Update __init__.py
danielhanchen May 21, 2024
2b23b93
Update README.md
danielhanchen May 21, 2024
902e23a
Update llama.py
danielhanchen May 21, 2024
98f41ce
Merge branch 'main' into nightly
danielhanchen May 22, 2024
3193cac
is_bfloat16_supported
danielhanchen May 22, 2024
dfeaf4b
Update __init__.py
danielhanchen May 22, 2024
1e84090
Mistral v3
danielhanchen May 22, 2024
f63f32b
Merge branch 'main' into nightly
danielhanchen May 23, 2024
57ad8e7
Phi 3 medium
danielhanchen May 23, 2024
2b994b2
Update chat_templates.py
danielhanchen May 23, 2024
ff8171f
Update chat_templates.py
danielhanchen May 23, 2024
5ca8b58
Phi-3
danielhanchen May 23, 2024
98c2e81
Merge branch 'main' into nightly
danielhanchen May 23, 2024
3817660
Merge branch 'main' into nightly
danielhanchen May 23, 2024
f858145
Merge branch 'main' into nightly
danielhanchen May 24, 2024
a1328f6
Update save.py
danielhanchen May 24, 2024
fb29673
Update README.md
shimmyshimmer May 25, 2024
fa85556
Untrained tokens
danielhanchen May 26, 2024
c511aca
Update tokenizer_utils.py
danielhanchen May 26, 2024
35e7355
Update tokenizer_utils.py
danielhanchen May 26, 2024
cc0bf44
Update tokenizer_utils.py
danielhanchen May 26, 2024
674ba66
Update tokenizer_utils.py
danielhanchen May 26, 2024
9823f52
Update tokenizer_utils.py
danielhanchen May 26, 2024
c0c761b
Update tokenizer_utils.py
danielhanchen May 26, 2024
e2850c0
Update tokenizer_utils.py
danielhanchen May 26, 2024
8e12780
Update tokenizer_utils.py
danielhanchen May 26, 2024
6f1855e
Update tokenizer_utils.py
danielhanchen May 26, 2024
d27b173
Update tokenizer_utils.py
danielhanchen May 26, 2024
7bf7399
Update tokenizer_utils.py
danielhanchen May 26, 2024
31ecef9
Update tokenizer_utils.py
danielhanchen May 26, 2024
b67d93f
Update tokenizer_utils.py
danielhanchen May 26, 2024
e874ccd
Update tokenizer_utils.py
danielhanchen May 26, 2024
d7b54ff
Update tokenizer_utils.py
danielhanchen May 27, 2024
5a4a512
Update tokenizer_utils.py
danielhanchen May 27, 2024
82c040e
Update tokenizer_utils.py
danielhanchen May 27, 2024
8e227b2
Update tokenizer_utils.py
danielhanchen May 27, 2024
250d386
Update tokenizer_utils.py
danielhanchen May 27, 2024
e6db3ba
Update llama.py
danielhanchen May 27, 2024
e673fa2
Update tokenizer_utils.py
danielhanchen May 27, 2024
222b835
Update tokenizer_utils.py
danielhanchen May 27, 2024
6404aa5
Update tokenizer_utils.py
danielhanchen May 27, 2024
cfea7b2
Update tokenizer_utils.py
danielhanchen May 27, 2024
083e5ba
Update save.py
danielhanchen May 27, 2024
6f2565c
Update save.py
danielhanchen May 27, 2024
c19b04e
Update save.py
danielhanchen May 27, 2024
64b12a2
checkpoint
danielhanchen May 28, 2024
4cd5a8a
Merge branch 'main' into nightly
danielhanchen May 28, 2024
196faec
Update _utils.py
danielhanchen May 28, 2024
235be40
Update tokenizer_utils.py
danielhanchen May 29, 2024
cf9090a
Update tokenizer_utils.py
danielhanchen May 29, 2024
1fb1110
Update tokenizer_utils.py
danielhanchen May 29, 2024
d1bd60c
Update llama.py
danielhanchen May 30, 2024
732ead0
accelerate
danielhanchen May 30, 2024
359ae5c
Update _utils.py
danielhanchen May 30, 2024
8dcfad3
Update _utils.py
danielhanchen May 30, 2024
2bafc57
Update _utils.py
danielhanchen May 30, 2024
90f6311
Update _utils.py
danielhanchen May 30, 2024
7b84ff7
Update _utils.py
danielhanchen May 30, 2024
60f4b9a
Update _utils.py
danielhanchen May 30, 2024
3ebe5a5
Update _utils.py
danielhanchen May 30, 2024
7bbc8ce
Update tokenizer_utils.py
danielhanchen May 30, 2024
6f5c84c
train_dataloader
danielhanchen May 30, 2024
0d269ca
Update llama.py
danielhanchen May 30, 2024
6b7c142
Update llama.py
danielhanchen May 30, 2024
54f3a74
Update llama.py
danielhanchen May 30, 2024
0bc96c5
use_fast_convert
danielhanchen May 30, 2024
02c91b0
Merge branch 'main' into nightly
danielhanchen May 30, 2024
b384ff0
Merge branch 'main' into nightly
danielhanchen May 30, 2024
a8b5d89
Update save.py
danielhanchen May 30, 2024
872d569
Update save.py
danielhanchen May 30, 2024
3a1f5f2
Update save.py
danielhanchen May 30, 2024
bcadc8c
Update save.py
danielhanchen Jun 2, 2024
1381820
remove_special_tokens
danielhanchen Jun 2, 2024
e01b87d
Ollama
danielhanchen Jun 2, 2024
b3479c7
Update chat_templates.py
danielhanchen Jun 3, 2024
86804dc
Update chat_templates.py
danielhanchen Jun 3, 2024
87fdd3a
Update chat_templates.py
danielhanchen Jun 3, 2024
5c5df69
Merge branch 'main' into nightly
danielhanchen Jun 7, 2024
6386d94
Update llama.py
danielhanchen Jun 7, 2024
b1a9551
Update chat_templates.py
danielhanchen Jun 9, 2024
344a05d
Support bfloat16 GGUF
danielhanchen Jun 9, 2024
6b11e0d
Update save.py
danielhanchen Jun 9, 2024
c6e4b5b
Update llama.py
danielhanchen Jun 9, 2024
57f29ab
fast_forward_inference
danielhanchen Jun 9, 2024
d32e972
Update mapper.py
danielhanchen Jun 9, 2024
e121fa5
Update loader.py
danielhanchen Jun 9, 2024
5eaa10f
Update llama.py
danielhanchen Jun 9, 2024
f57d28d
Update tokenizer_utils.py
danielhanchen Jun 10, 2024
8937507
info
danielhanchen Jun 11, 2024
8982edb
edits
danielhanchen Jun 11, 2024
8904605
Create chat template
danielhanchen Jun 11, 2024
2a374c2
Fix tokenizer
danielhanchen Jun 12, 2024
d704b73
Merge branch 'main' into nightly
danielhanchen Jun 13, 2024
8176155
Update tokenizer_utils.py
danielhanchen Jun 13, 2024
21a99f1
fix case where gguf saving fails due to first_conversion dtype (#630)
chrehall68 Jun 13, 2024
dbf2dcf
Support revision parameter in FastLanguageModel.from_pretrained (#629)
chrehall68 Jun 13, 2024
9016171
clears any selected_adapters before calling internal_model.save_pretr…
neph1 Jun 13, 2024
0428920
Update __init__.py (#602)
xyangk Jun 13, 2024
9fdd847
Fixed unsloth/tokenizer_utils.py for chat training (#604)
Oseltamivir Jun 13, 2024
b5fc6aa
Add GGML saving option to Unsloth for easier Ollama model creation an…
mahiatlinux Jun 13, 2024
3fafbf7
docs: Add LoraConfig parameters documentation (#619)
sebdg Jun 13, 2024
273a871
llama.cpp failing (#371)
bet0x Jun 13, 2024
b312b3f
fix libcuda_dirs import for triton 3.0 (#227)
t-vi Jun 13, 2024
1601dca
Update save.py
danielhanchen Jun 13, 2024
26dc502
Update __init__.py
danielhanchen Jun 13, 2024
6a51657
Update fast_lora.py
danielhanchen Jun 13, 2024
4a8ba90
Update save.py
danielhanchen Jun 13, 2024
0abb5ba
Update save.py
danielhanchen Jun 13, 2024
b24dd05
Update save.py
danielhanchen Jun 13, 2024
48c6d6d
Update loader.py
danielhanchen Jun 13, 2024
e35f608
Update save.py
danielhanchen Jun 13, 2024
4822eae
Update save.py
danielhanchen Jun 13, 2024
7d847ed
quantize now llama-quantize
danielhanchen Jun 13, 2024
82f10cb
Update chat_templates.py
danielhanchen Jun 13, 2024
08424f0
Update loader.py
danielhanchen Jun 13, 2024
eb906d0
Update mapper.py
danielhanchen Jun 13, 2024
0a304ae
Update __init__.py
danielhanchen Jun 13, 2024
71edc42
embedding size
danielhanchen Jun 13, 2024
411b881
Merge branch 'main' into nightly
danielhanchen Jun 13, 2024
b74e321
Update qwen2.py
danielhanchen Jun 13, 2024
9c6d415
Merge branch 'main' into nightly
danielhanchen Jun 14, 2024
b82277f
docs
danielhanchen Jun 14, 2024
d98e45e
Update README.md
danielhanchen Jun 14, 2024
b6f0fdb
Update qwen2.py
danielhanchen Jun 14, 2024
6c031e4
README: Fix minor typo. (#559)
shaper Jun 14, 2024
2401dee
Update mistral.py
danielhanchen Jun 14, 2024
1b93d7e
Update qwen2.py
danielhanchen Jun 14, 2024
3581037
Update qwen2.py
danielhanchen Jun 14, 2024
b56b8b8
Update qwen2.py
danielhanchen Jun 14, 2024
fe8c064
Update llama.py
danielhanchen Jun 14, 2024
d8d332a
Update llama.py
danielhanchen Jun 14, 2024
cdb1dbb
Update llama.py
danielhanchen Jun 14, 2024
e8b3cf0
Update README.md
danielhanchen Jun 14, 2024
7e6f000
FastMistralModel
danielhanchen Jun 14, 2024
28995ab
Update mistral.py
danielhanchen Jun 14, 2024
515b1ae
Update mistral.py
danielhanchen Jun 14, 2024
7f28209
Update mistral.py
danielhanchen Jun 14, 2024
453cc48
Update mistral.py
danielhanchen Jun 14, 2024
6633d4a
Update mistral.py
danielhanchen Jun 14, 2024
e5bf125
Auto check rope scaling
danielhanchen Jun 14, 2024
d4f4bce
Merge branch 'main' into nightly
danielhanchen Jun 14, 2024
341565b
Update llama.py
danielhanchen Jun 14, 2024
dd3c6b1
Update llama.py
danielhanchen Jun 15, 2024
6d1ae23
Update llama.py
danielhanchen Jun 15, 2024
d855ef9
GPU support
danielhanchen Jun 15, 2024
da1fe76
Merge branch 'main' into nightly
danielhanchen Jun 15, 2024
6656446
Typo
danielhanchen Jun 15, 2024
9bd5fad
Update gemma.py
danielhanchen Jun 15, 2024
a3061b6
gpu
danielhanchen Jun 15, 2024
7e5155d
Merge branch 'main' into nightly
danielhanchen Jun 15, 2024
513bd4d
Multiple GGUF saving
danielhanchen Jun 15, 2024
fb54fbb
Update save.py
danielhanchen Jun 15, 2024
4cba3e2
Update save.py
danielhanchen Jun 15, 2024
unsloth/models/_utils.py (3 changes: 2 additions & 1 deletion)
@@ -374,7 +374,8 @@ def prepare_n_gradient_checkpoints(

# Unsloth only works on NVIDIA GPUs for now
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + ","
device = f"cuda:{device_ids[:device_ids.find(',')]}"
device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now
device = f"cuda:{device if device.isdigit() else '0'}"

class Unsloth_Offloaded_Gradient_Checkpointer(torch.autograd.Function):
"""
unsloth/models/gemma.py (4 changes: 1 addition & 3 deletions)
@@ -38,11 +38,9 @@
GemmaFlashAttention2 = GemmaAttention
pass

# Unsloth currently only works on one GPU
import os
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + ","
device = f"cuda:{device_ids[:device_ids.find(',')]}"
# Please obtain a commercial license
device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now

torch_nn_functional_gelu = torch.nn.functional.gelu
def fast_geglu_inference(self, X):
unsloth/models/llama.py (9 changes: 6 additions & 3 deletions)
@@ -76,7 +76,8 @@ def original_apply_o(self, X):

import os # Unsloth only works on NVIDIA GPUs for now
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + ","
device = f"cuda:{device_ids[:device_ids.find(',')]}"
device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now
device = f"cuda:{device if device.isdigit() else '0'}"

from math import sqrt as math_sqrt
KV_CACHE_INCREMENT = 256 # KV Cache update size
@@ -846,7 +847,8 @@ def _CausalLM_fast_forward(
shift_logits = logits
if not hasattr(self, "extra_ignored_labels"):
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + ","
device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now
device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now
device = f"cuda:{device if device.isdigit() else '0'}"
# Fixes https://github.com/unslothai/unsloth/issues/10
self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device)
pass
@@ -1828,7 +1830,8 @@ def patch_peft_model(
# Fixes https://github.com/unslothai/unsloth/issues/10
max_seq_length = model.max_seq_length
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + ","
device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now
device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now
device = f"cuda:{device if device.isdigit() else '0'}"
extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = device)
model.model.extra_ignored_labels = extra_ignored_labels
internal_model = model
unsloth/models/mistral.py (3 changes: 2 additions & 1 deletion)
@@ -240,7 +240,8 @@ def MistralForCausalLM_fast_forward(
shift_logits = logits
if not hasattr(self, "extra_ignored_labels"):
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + ","
device = f"cuda:{device_ids[:device_ids.find(',')]}" # Unsloth only works on NVIDIA GPUs for now
device = device_ids[:device_ids.find(',')] # Unsloth only works on NVIDIA GPUs for now
device = f"cuda:{device if device.isdigit() else '0'}"
# Fixes https://github.com/unslothai/unsloth/issues/10
self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = device)
pass
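The same device-resolution snippet appears in _utils.py, gemma.py, llama.py and mistral.py above. As a minimal standalone sketch of the new behaviour (the helper name is hypothetical and not part of the diff): the first entry of CUDA_VISIBLE_DEVICES is used only if it is a plain integer index, otherwise the code falls back to device 0 instead of producing an invalid string such as "cuda:GPU-...".

import os

def resolve_unsloth_device() -> str:
    # First entry of CUDA_VISIBLE_DEVICES; the old code pasted it straight
    # into the f-string, which breaks when the entry is a GPU UUID.
    device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + ","
    device = device_ids[:device_ids.find(",")]
    # New logic: fall back to index 0 when the entry is not a digit.
    return f"cuda:{device if device.isdigit() else '0'}"

# resolve_unsloth_device() with CUDA_VISIBLE_DEVICES="1,2"      -> "cuda:1"
# resolve_unsloth_device() with CUDA_VISIBLE_DEVICES="GPU-1a2b" -> "cuda:0"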
unsloth/save.py (226 changes: 135 additions & 91 deletions)
@@ -418,6 +418,11 @@ def unsloth_save_model(
print("Unsloth: Saving model...", end = "")
if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")

# [TODO] Is this correct?
if save_method == "lora":
save_pretrained_settings["selected_adapters"] = None
pass

model.save_pretrained(**save_pretrained_settings)

if push_to_hub and hasattr(model, "config"):
@@ -649,8 +654,9 @@ def unsloth_save_model(
model.config = new_config

# Save!

save_pretrained_settings["selected_adapters"] = None
# [TODO] --> is this correct?
# save_pretrained_settings["selected_adapters"] = None

# Check if pushing to an organization
if save_pretrained_settings["push_to_hub"] and (username != actual_username):
print(f"Unsloth: Saving to organization with address {new_save_directory}")
@@ -834,7 +840,7 @@ def save_to_gguf(
model_dtype : str,
is_sentencepiece : bool = False,
model_directory : str = "unsloth_finetuned_model",
quantization_method : str = "fast_quantized",
quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"]
first_conversion : str = None,
_run_installer = None, # Non blocking install of llama.cpp
):
@@ -846,6 +852,10 @@
assert(model_dtype == "float16" or model_dtype == "bfloat16")
model_dtype = "f16" if model_dtype == "float16" else "bf16"

# Convert quantization_method to list
quantization_method = \
quantization_method if type(quantization_method) is list else list(quantization_method)

# Check if bfloat16 is supported
if model_dtype == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
@@ -860,8 +870,11 @@
first_conversion = model_dtype
pass

if quantization_method.startswith("iq2"):
raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!")
# Check I quants
for quant_method in quantization_method:
if quant_method.startswith("iq2"):
raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!")
pass

# Careful convert.py is only for Llama / Mistral based archs
use_fast_convert = False
@@ -871,25 +884,32 @@
pass
logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.")

if quantization_method == "not_quantized": quantization_method = model_dtype
elif quantization_method == "fast_quantized": quantization_method = "q8_0"
elif quantization_method == "quantized": quantization_method = "q4_k_m"
elif quantization_method is None: quantization_method = "q8_0"
pass
# Map quant methods
new_quantization_method = []
for quant_method in quantization_method:
if quant_method == "not_quantized": quantization_method = model_dtype
elif quant_method == "fast_quantized": quantization_method = "q8_0"
elif quant_method == "quantized": quantization_method = "q4_k_m"
elif quant_method is None: quantization_method = "q8_0"

# Check if wrong method
if quant_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
raise RuntimeError(error)
pass

if quantization_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quantization_method}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
raise RuntimeError(error)
new_quantization_method.append(quant_method)
pass
quantization_method = new_quantization_method

print_info = \
f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\
f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\
f"O^O/ \_/ \\ [1] Converting HF to GUUF 16bits will take 3 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 20 minutes.\n"\
f' "-____-" In total, you will have to wait around 26 minutes.\n'
f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 10 minutes each.\n"\
f' "-____-" In total, you will have to wait at least 16 minutes.\n'
print(print_info)

# Check first_conversion format
@@ -928,24 +948,37 @@ def save_to_gguf(
install_llama_cpp_old(-10)
pass

if quantization_method == "f32": first_conversion = "f32"
elif quantization_method == "f16": first_conversion = "f16"
elif quantization_method == "bf16": first_conversion = "bf16"
elif quantization_method == "q8_0": first_conversion = "q8_0"
else:
# Quantized models must have f16 as the default argument
if first_conversion == "f32" : pass
elif first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "q8_0":
logger.warning_once(
"Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\
"but saves disk space!"
)
# first_conversion = "f16"
# Determine maximum first_conversion state
if first_conversion == "f32" : strength = 3
elif first_conversion == "f16" : strength = 2
elif first_conversion == "bf16" : strength = 1
elif first_conversion == "q8_0" : strength = 0

for quant_method in quantization_method:
if quant_method == "f32": strength = max(strength, 3)
elif quant_method == "f16": strength = max(strength, 2)
elif quant_method == "bf16": strength = max(strength, 1)
elif quant_method == "q8_0": strength = max(strength, 0)
else:
# Quantized models must have f16 as the default argument
if first_conversion == "f32" : pass
elif first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "q8_0":
logger.warning_once(
"Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\
"but saves disk space!"
)
# first_conversion = "f16"
pass
pass
pass

if strength >= 3: first_conversion = "f32"
elif strength >= 2: first_conversion = "f16"
elif strength >= 1: first_conversion = "bf16"
else: first_conversion = "q8_0"

# Non llama/mistral needs can only use f32 or f16
if not use_fast_convert and \
(first_conversion != "f16" or first_conversion != "bf16" or first_conversion != "f32"):
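To illustrate the promotion logic in the hunk above with a worked example (an assumed standalone re-implementation, not part of the diff): the intermediate first_conversion dtype is promoted to the strongest precision any requested quantization needs, so a single 16/32-bit GGUF can feed every subsequent quant.

# Sketch of the "strength" selection above; names are illustrative only.
_STRENGTH = {"q8_0": 0, "bf16": 1, "f16": 2, "f32": 3}

def pick_first_conversion(first_conversion, quantization_methods):
    strength = _STRENGTH.get(first_conversion, 0)
    for quant_method in quantization_methods:
        # q4_k_m, q5_k_m, ... map to 0, so they never raise the requirement.
        strength = max(strength, _STRENGTH.get(quant_method, 0))
    return ["q8_0", "bf16", "f16", "f32"][strength]

# pick_first_conversion("f16",  ["q4_k_m", "q8_0"]) -> "f16"
# pick_first_conversion("q8_0", ["f32", "q4_k_m"])  -> "f32"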
@@ -1033,52 +1066,58 @@
pass
print(f"Unsloth: Conversion completed! Output location: {final_location}")

if quantization_method != first_conversion:
old_location = final_location
print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. This will take 20 minutes...")
final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf"
full_precision_location = final_location

command = f"./{quantize_location} {old_location} "\
f"{final_location} {quantization_method} {n_cpus}"

# quantize uses stderr
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
all_saved_locations = []
# Convert each type!
for quant_method in quantization_method:
if quant_method != first_conversion:
print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This will take 20 minutes...")
final_location = f"./{model_directory}-unsloth.{quant_method.upper()}.gguf"

# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
"Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
command = f"./{quantize_location} {full_precision_location} "\
f"{final_location} {quant_method} {n_cpus}"

# quantize uses stderr
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
pass

print(f"Unsloth: Conversion completed! Output location: {final_location}")
# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
"Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
pass
pass

print(f"Unsloth: Conversion completed! Output location: {final_location}")
all_saved_locations.append(final_location)
pass
pass

return final_location
return all_saved_locations
pass


@@ -1453,7 +1492,7 @@ def unsloth_save_pretrained_gguf(
is_sentencepiece_model = check_if_sentencepiece_model(self)

# Save to GGUF
file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)

Expand All @@ -1466,14 +1505,17 @@ def unsloth_save_pretrained_gguf(

if push_to_hub:
print("Unsloth: Uploading GGUF to Huggingface Hub...")
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")

for file_location in all_file_locations:
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")
pass
pass
pass

@@ -1604,20 +1646,22 @@ def unsloth_push_to_hub_gguf(
is_sentencepiece_model = check_if_sentencepiece_model(self)

# Save to GGUF
file_location = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)

print("Unsloth: Uploading GGUF to Huggingface Hub...")
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
for file_location in all_file_locations:
print("Unsloth: Uploading GGUF to Huggingface Hub...")
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')

print(f"Saved GGUF to https://huggingface.co/{link}")
print(f"Saved GGUF to https://huggingface.co/{link}")
pass

if fix_bos_token:
logger.warning(
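With these changes quantization_method accepts either a single string or a list, every requested GGUF is produced from one shared 16/32-bit conversion, and each resulting file is uploaded. A hedged usage sketch (output directory, repo id and token are placeholders; the list form mirrors the comment in the updated save_to_gguf signature):

# Assumes `model` and `tokenizer` come from a finished Unsloth fine-tune.
model.save_pretrained_gguf(
    "my_model",                                          # placeholder directory
    tokenizer,
    quantization_method = ["q4_k_m", "q8_0", "q5_k_m"],  # list supported by this PR
)

model.push_to_hub_gguf(
    "your-username/my_model",                            # placeholder repo id
    tokenizer,
    quantization_method = ["q4_k_m", "q8_0"],
    token = "hf_...",                                     # placeholder token
)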