bump macos to m1 #1725

Merged
47 commits merged on Sep 24, 2024
0bd56c6
bump macos to m1
t-vi Sep 13, 2024
08ba84d
try skip
t-vi Sep 13, 2024
334bd8b
add sys
t-vi Sep 13, 2024
671ba25
experimentally run tests separately
t-vi Sep 13, 2024
4ef2617
try to find segfaulting test
t-vi Sep 13, 2024
367b41c
sprinkle skip
t-vi Sep 13, 2024
87b3043
more sprinkle
t-vi Sep 13, 2024
299f95c
skip some imports
t-vi Sep 13, 2024
452e343
skip all
t-vi Sep 13, 2024
abe05ed
drop external for loop again
t-vi Sep 13, 2024
4ba4848
add back two import
t-vi Sep 13, 2024
aee1ad3
more commenting out modules
t-vi Sep 13, 2024
3aea569
Merge branch 'main' into tom/mac-runners
rasbt Sep 13, 2024
80e8548
test sth
rasbt Sep 16, 2024
6851008
skip out-of-memory issues on macos CI
rasbt Sep 16, 2024
c9afc8b
update
rasbt Sep 16, 2024
998a0fd
update
rasbt Sep 16, 2024
1c68608
update
rasbt Sep 16, 2024
9dc677f
update
rasbt Sep 16, 2024
562cb19
add back api tests
rasbt Sep 16, 2024
1e226e5
truncate test_api.py
rasbt Sep 23, 2024
f036e88
Add back test_api function one at a time to find segfault culprit
rasbt Sep 23, 2024
a11ae86
add more tests back
rasbt Sep 23, 2024
09cd6d7
Update test_api.py
rasbt Sep 23, 2024
dbfcd96
Update test_api.py
rasbt Sep 23, 2024
74eca33
Update test_api.py
rasbt Sep 23, 2024
2b5bbc8
Update test_api.py
rasbt Sep 23, 2024
bd6073a
Update test_api.py
rasbt Sep 23, 2024
5ae3dbf
Update test_api.py
rasbt Sep 23, 2024
baebff2
Update test_api.py
rasbt Sep 23, 2024
47ef4fb
Update test_api.py
rasbt Sep 23, 2024
e8a70a1
Update cpu-tests.yml
rasbt Sep 23, 2024
3448e70
Update test_api.py
rasbt Sep 23, 2024
9190764
Update test_api.py
rasbt Sep 23, 2024
467cc72
Update test_api.py
rasbt Sep 23, 2024
a110fc3
Update test_api.py
rasbt Sep 23, 2024
5945c71
test only on cpu
rasbt Sep 23, 2024
fc9a20f
update
rasbt Sep 23, 2024
8b97873
update
rasbt Sep 23, 2024
5895a20
add tests back
rasbt Sep 23, 2024
c92e564
disable mps in CI
rasbt Sep 23, 2024
8861e46
mock litgpt
rasbt Sep 24, 2024
e92889f
add test matrix back
rasbt Sep 24, 2024
8c1bfea
udpdates
rasbt Sep 24, 2024
4dc3862
updates
rasbt Sep 24, 2024
270a0f5
upgrade to macos 15
rasbt Sep 24, 2024
624607d
revert
rasbt Sep 24, 2024
2 changes: 1 addition & 1 deletion .github/workflows/cpu-tests.yml
@@ -24,7 +24,7 @@ jobs:
fail-fast: false
matrix:
include:
- {os: "macOS-12", python-version: "3.10"}
- {os: "macOS-14", python-version: "3.10"}
- {os: "ubuntu-22.04", python-version: "3.11"}
- {os: "ubuntu-22.04", python-version: "3.10"}
- {os: "ubuntu-22.04", python-version: "3.9"}
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
.ipynb_checkpoints/
__pycache__
.idea
.DS_Store
108 changes: 66 additions & 42 deletions tests/test_api.py
@@ -3,11 +3,12 @@
from collections import OrderedDict
import os
from pathlib import Path
import sys

import pytest
import re
import torch
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch
from tests.conftest import RunIf

from lightning.fabric.accelerators import CUDAAccelerator
@@ -20,6 +21,13 @@
from litgpt.scripts.download import download_from_hub


if sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true":
USE_MPS = False
elif torch.backends.mps.is_available():
USE_MPS = True
else:
USE_MPS = False


@pytest.fixture
def mock_llm():
@@ -83,11 +91,12 @@ def test_llm_load_random_init(tmp_path):
download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path)

torch.manual_seed(123)
llm = LLM.load(
model="pythia-160m",
init="random",
tokenizer_dir=Path(tmp_path/"EleutherAI/pythia-14m")
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="pythia-160m",
init="random",
tokenizer_dir=Path(tmp_path/"EleutherAI/pythia-14m")
)

input_text = "some text text"
output_text = llm.generate(input_text, max_new_tokens=15)
@@ -110,10 +119,11 @@

def test_llm_load_hub_init(tmp_path):
torch.manual_seed(123)
llm = LLM.load(
model="EleutherAI/pythia-14m",
init="pretrained"
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
init="pretrained"
)

text_1 = llm.generate("text", max_new_tokens=10, top_k=1)
assert len(text_1) > 0
@@ -159,9 +169,10 @@ def test_more_than_1_device_for_sequential_gpu(tmp_path):
model_name = "EleutherAI/pythia-14m"
else:
model_name = "EleutherAI/pythia-160m"
llm = LLM.load(
model=model_name,
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model=model_name,
)

with pytest.raises(NotImplementedError, match=f"Support for multiple devices is currently only implemented for generate_strategy='sequential'|'tensor_parallel'."):
llm.distribute(devices=2)
@@ -181,9 +192,10 @@

@RunIf(min_cuda_gpus=2)
def test_more_than_1_device_for_tensor_parallel_gpu(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)

if os.getenv("CI") != "true":
# this crashes the CI, maybe because of process forking; works fine locally though
@@ -193,20 +205,24 @@ def test_more_than_1_device_for_tensor_parallel_gpu(tmp_path):

@RunIf(min_cuda_gpus=1)
def test_sequential_tp_incompatibility_with_random_weights(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
tokenizer_dir="EleutherAI/pythia-14m",
init="random"
)

with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
tokenizer_dir="EleutherAI/pythia-14m",
init="random"
)
for strategy in ("sequential", "tensor_parallel"):
with pytest.raises(NotImplementedError, match=re.escape("The LLM was initialized with init='random' but .distribute() currently only supports pretrained weights.")):
llm.distribute(devices=1, generate_strategy=strategy)


def test_sequential_tp_cpu(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
distribute=None,
)
for strategy in ("sequential", "tensor_parallel"):
with pytest.raises(NotImplementedError, match=f"generate_strategy='{strategy}' is only supported for accelerator='cuda'|'gpu'."):
llm.distribute(
@@ -235,19 +251,21 @@ def test_initialization_for_trainer(tmp_path):

@RunIf(min_cuda_gpus=1)
def test_quantization_is_applied(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
llm.distribute(devices=1, quantize="bnb.nf4", precision="bf16-true")
strtype = str(type(llm.model.lm_head))
assert "NF4Linear" in strtype, strtype


@RunIf(min_cuda_gpus=1)
def test_fixed_kv_cache(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
llm.distribute(devices=1, fixed_kv_cache_size=100)

# Request too many tokens
@@ -258,15 +276,17 @@
def test_invalid_accelerator(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
distribute=None
)
with pytest.raises(ValueError, match="Invalid accelerator"):
llm.distribute(accelerator="invalid")


def test_returned_benchmark_dir(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)

text, bench_d = llm.benchmark(prompt="hello world")
assert isinstance(bench_d["Inference speed in tokens/sec"], list)
@@ -305,6 +325,7 @@ def test_benchmark_dict_to_markdown_table_single_values():

assert benchmark_dict_to_markdown_table(bench_d) == expected_output


def test_benchmark_dict_to_markdown_table_multiple_values():
bench_d_list = {
'Inference speed in tokens/sec': [17.034547562152305, 32.8974175404589, 33.04784205046782, 32.445697744648584,
@@ -335,17 +356,19 @@ def test_benchmark_dict_to_markdown_table_multiple_values():


def test_state_dict(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
assert isinstance(llm.state_dict(), OrderedDict)
assert llm.state_dict()['lm_head.weight'].shape == torch.Size([50304, 128])


def test_save_method(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)

target_dir = "saved_model"
llm.save(target_dir)
@@ -366,9 +389,10 @@ def test_save_method(tmp_path):


def test_forward_method(tmp_path):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
with patch("torch.backends.mps.is_available", return_value=USE_MPS):
llm = LLM.load(
model="EleutherAI/pythia-14m",
)
inputs = torch.ones(6, 128, dtype=torch.int64).to(next(llm.model.parameters()).device)

assert llm(inputs).shape == torch.Size([6, 128, 50304])
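
Note: the tests/test_api.py changes above gate MPS behind a module-level USE_MPS flag and wrap each LLM.load call in patch("torch.backends.mps.is_available", return_value=USE_MPS), so that accelerator selection falls back to CPU on the GitHub macOS runners, which the commit messages describe as running out of memory. The following is a minimal standalone sketch of that pattern; the placeholder test body is illustrative and not part of the PR.

import os
import sys
from unittest.mock import patch

import torch

# Same gating as the diff: never report MPS as available on a macOS GitHub
# Actions runner; otherwise defer to whatever the local hardware supports.
if sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true":
    USE_MPS = False
else:
    USE_MPS = torch.backends.mps.is_available()


def test_accelerator_selection_is_gated():
    # The patch makes any availability check inside the block (presumably the
    # one performed when LLM.load picks an accelerator) see USE_MPS instead of
    # the real hardware.
    with patch("torch.backends.mps.is_available", return_value=USE_MPS):
        assert torch.backends.mps.is_available() is USE_MPS
        # e.g. llm = LLM.load(model="EleutherAI/pythia-14m") would go here
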
11 changes: 11 additions & 0 deletions tests/test_chat.py
@@ -7,6 +7,7 @@
from itertools import repeat
from pathlib import Path
from unittest.mock import ANY, MagicMock, Mock, call, patch
import sys
from typing import Iterable

import pytest
@@ -20,6 +21,12 @@
from litgpt.utils import save_config, auto_download_checkpoint


skip_in_ci_on_macos = pytest.mark.skipif(
sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true",
reason="Skipped on macOS in CI environment because CI machine does not have enough memory to run this test."
)


@pytest.mark.parametrize(
("generated", "stop_tokens", "expected"),
[
@@ -80,6 +87,7 @@ def test_decode():
assert text == decoded, (text, decoded)


@skip_in_ci_on_macos
@patch("litgpt.chat.base.input")
@pytest.mark.parametrize("stop_iteration", [KeyboardInterrupt, ""])
def test_main(mocked_input, stop_iteration, fake_checkpoint_dir, monkeypatch, tensor_like):
@@ -129,6 +137,7 @@ def test_cli():
assert "Chat with a model" in output


@skip_in_ci_on_macos
@patch("litgpt.chat.base.input")
@patch("litgpt.chat.base.merge_lora")
def test_merge_lora_if_needed(mocked_merge_lora, mocked_input, fake_checkpoint_dir, monkeypatch, tensor_like):
@@ -152,6 +161,7 @@ def test_merge_lora_if_needed(mocked_merge_lora, mocked_input, fake_checkpoint_d
mocked_merge_lora.assert_called_once()


@skip_in_ci_on_macos
def test_litgpt_chat_endtoend():
from litgpt.chat.base import main

@@ -172,6 +182,7 @@ def test_litgpt_chat_endtoend():
assert simulated_input.call_count == 2


@skip_in_ci_on_macos
def test_litgpt_generate_endtoend():
from litgpt.generate.base import main

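
Note: tests/test_chat.py, tests/test_generate.py, and tests/test_generate_adapter.py each define the same skip_in_ci_on_macos marker and attach it to the memory-heavy end-to-end tests. A small sketch of how such a conditional skip is defined and applied; the placeholder test below is illustrative and not from the PR.

import os
import sys

import pytest

# The test is still collected everywhere, but skipped when running on a macOS
# GitHub Actions runner.
skip_in_ci_on_macos = pytest.mark.skipif(
    sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true",
    reason="macOS CI runners do not have enough memory for this test.",
)


@skip_in_ci_on_macos
def test_placeholder_endtoend():
    assert True  # the real tests load checkpoints and generate text here
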
4 changes: 2 additions & 2 deletions tests/test_convert_lit_checkpoint.py
@@ -379,7 +379,7 @@ def test_against_original_gemma(model_name, device, dtype):
theirs_state_dict = {}
copy_weights_llama(ours_config, theirs_state_dict, ours_state_dict, untie_weights=True)
theirs_model = GemmaForCausalLM(theirs_config).to(device)
theirs_model.load_state_dict(theirs_state_dict, strict=False)
theirs_model.load_state_dict(theirs_state_dict, strict=False,)

# test end to end
x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device)
@@ -459,7 +459,7 @@ def test_against_original_gemma_2(model_name, device, dtype):
assert x.size(1) == T
ours_y = ours_model(x)
theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float
torch.testing.assert_close(ours_y, theirs_y)
torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)


def test_check_conversion_supported_adapter():
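
Note: the Gemma parity checks in tests/test_convert_lit_checkpoint.py and tests/test_model.py now pass explicit tolerances to torch.testing.assert_close, presumably to absorb slightly different floating-point rounding on the Apple Silicon runners compared with the old macOS-12 images. A toy illustration of what the looser bounds accept; the tensors below are made up for demonstration.

import torch

torch.manual_seed(0)
ours = torch.randn(4, 16)
# Simulate a small numerical discrepancy between two implementations
# (strictly less than 3e-5 per element).
theirs = ours + 2e-5 * torch.rand_like(ours)

# With the default float32 tolerances (roughly rtol=1.3e-6, atol=1e-5) this
# comparison could fail:
# torch.testing.assert_close(ours, theirs)

# The relaxed bounds used in the PR accept the discrepancy.
torch.testing.assert_close(ours, theirs, rtol=3e-5, atol=3e-5)
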
8 changes: 8 additions & 0 deletions tests/test_generate.py
@@ -5,6 +5,7 @@
import sys
from contextlib import redirect_stderr, redirect_stdout
from io import StringIO
import os
from pathlib import Path
from unittest import mock
from unittest.mock import ANY, Mock, call
Expand All @@ -18,6 +19,12 @@
from litgpt.generate.base import sample


skip_in_ci_on_macos = pytest.mark.skipif(
sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true",
reason="Skipped on macOS in CI environment because CI machine does not have enough memory to run this test."
)


@pytest.mark.parametrize(
"max_seq_length", (pytest.param(10, marks=pytest.mark.xfail(raises=NotImplementedError, strict=True)), 20 + 5)
)
@@ -51,6 +58,7 @@ def multinomial(*args, **kwargs):
torch.testing.assert_close(out, expected)


@skip_in_ci_on_macos
def test_main(fake_checkpoint_dir, monkeypatch, tensor_like):
config_path = fake_checkpoint_dir / "model_config.yaml"
config = {"block_size": 128, "vocab_size": 50, "n_layer": 2, "n_head": 4, "n_embd": 8, "rotary_percentage": 1}
8 changes: 8 additions & 0 deletions tests/test_generate_adapter.py
@@ -5,6 +5,7 @@
import sys
from contextlib import redirect_stderr, redirect_stdout
from io import StringIO
import os
from pathlib import Path
from unittest.mock import ANY, Mock, call

@@ -13,6 +14,13 @@
import yaml


skip_in_ci_on_macos = pytest.mark.skipif(
sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true",
reason="Skipped on macOS in CI environment because CI machine does not have enough memory to run this test."
)


@skip_in_ci_on_macos
@pytest.mark.parametrize("version", ("v1", "v2"))
def test_main(fake_checkpoint_dir, monkeypatch, version, tensor_like):
if version == "v1":
2 changes: 1 addition & 1 deletion tests/test_model.py
@@ -659,7 +659,7 @@ def test_against_original_gemma_2(model_name, device, dtype):
assert x.size(1) == T
ours_y = ours_model(x)
theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float
torch.testing.assert_close(ours_y, theirs_y)
torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)


@RunIf(dynamo=True)
Expand Down