
[LoRA] fix: lora loading when using with a device_mapped model. #9449

Merged: 29 commits into `main` from `lora-device-map` on Oct 31, 2024.

Changes shown are from 22 of the 29 commits.

Commits
dc1aee2
fix: lora loading when using with a device_mapped model.
sayakpaul Sep 17, 2024
949a929
better attibutung
sayakpaul Sep 17, 2024
64b3ad1
empty
sayakpaul Sep 17, 2024
6d03c12
Merge branch 'main' into lora-device-map
sayakpaul Sep 22, 2024
d4bd94b
Merge branch 'main' into lora-device-map
sayakpaul Sep 24, 2024
5479198
Apply suggestions from code review
sayakpaul Sep 24, 2024
2846549
Merge branch 'main' into lora-device-map
sayakpaul Sep 27, 2024
1ed0eb0
Merge branch 'main' into lora-device-map
sayakpaul Sep 28, 2024
d2d59c3
Merge branch 'main' into lora-device-map
sayakpaul Oct 2, 2024
5f3cae2
Merge branch 'main' into lora-device-map
sayakpaul Oct 6, 2024
8f670e2
Merge branch 'main' into lora-device-map
sayakpaul Oct 8, 2024
e42ec19
Merge branch 'main' into lora-device-map
sayakpaul Oct 10, 2024
f63b04c
Merge branch 'main' into lora-device-map
sayakpaul Oct 15, 2024
eefda54
Merge branch 'main' into lora-device-map
sayakpaul Oct 19, 2024
ea727a3
minors
sayakpaul Oct 19, 2024
71989e3
better error messages.
sayakpaul Oct 19, 2024
f62afac
fix-copies
sayakpaul Oct 19, 2024
2334f78
add: tests, docs.
sayakpaul Oct 19, 2024
5ea1173
add hardware note.
sayakpaul Oct 19, 2024
f64751e
Merge branch 'main' into lora-device-map
sayakpaul Oct 19, 2024
c0dee87
quality
sayakpaul Oct 19, 2024
4b6124a
Merge branch 'main' into lora-device-map
sayakpaul Oct 22, 2024
fe2cca8
Update docs/source/en/training/distributed_inference.md
sayakpaul Oct 23, 2024
2db5d48
Merge branch 'main' into lora-device-map
sayakpaul Oct 23, 2024
61903c8
Merge branch 'main' into lora-device-map
sayakpaul Oct 31, 2024
03377b7
fixes
sayakpaul Oct 31, 2024
0bd40cb
skip properly.
sayakpaul Oct 31, 2024
a61b754
fixes
sayakpaul Oct 31, 2024
ccd8d2a
resolve conflicts.
sayakpaul Oct 31, 2024
2 changes: 2 additions & 0 deletions docs/source/en/training/distributed_inference.md
@@ -237,3 +237,5 @@ with torch.no_grad():
```

By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs.

This workflow is also compatible with LoRAs loaded via `load_lora_weights()`. Note, however, that only LoRAs that do not involve any text encoder components are supported in this workflow at the moment.
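For illustration, here is a minimal sketch of the LoRA variant of this workflow. It assumes the same `black-forest-labs/FLUX.1-dev` checkpoint used in the Flux test added by this PR; the LoRA repository name below is a hypothetical placeholder.

```python
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel

ckpt_id = "black-forest-labs/FLUX.1-dev"

# Shard the largest model (the transformer) across the available GPUs.
transformer = FluxTransformer2DModel.from_pretrained(
    ckpt_id, subfolder="transformer", device_map="auto", torch_dtype=torch.bfloat16
)

# Transformer-only pipeline, as in the sharding workflow above.
pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
    tokenizer_2=None,
    vae=None,
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)

# Supported as long as the LoRA does not target the text encoders.
pipeline.load_lora_weights("some-user/flux-lora")  # hypothetical LoRA repository
```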
12 changes: 11 additions & 1 deletion src/diffusers/loaders/lora_base.py
@@ -31,6 +31,7 @@
delete_adapter_layers,
deprecate,
is_accelerate_available,
is_accelerate_version,
is_peft_available,
is_transformers_available,
logging,
@@ -214,9 +215,18 @@ def _optionally_disable_offloading(cls, _pipeline):
is_model_cpu_offload = False
is_sequential_cpu_offload = False

def model_has_device_map(model):
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False
return getattr(model, "hf_device_map", None) is not None

if _pipeline is not None and _pipeline.hf_device_map is None:
for _, component in _pipeline.components.items():
if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
if (
isinstance(component, nn.Module)
and hasattr(component, "_hf_hook")
and not model_has_device_map(component)
):
if not is_model_cpu_offload:
is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
if not is_sequential_cpu_offload:
12 changes: 11 additions & 1 deletion src/diffusers/loaders/unet.py
@@ -39,6 +39,7 @@
get_adapter_name,
get_peft_kwargs,
is_accelerate_available,
is_accelerate_version,
is_peft_version,
is_torch_version,
logging,
@@ -398,9 +399,18 @@ def _optionally_disable_offloading(cls, _pipeline):
is_model_cpu_offload = False
is_sequential_cpu_offload = False

def model_has_device_map(model):
sayakpaul (Member, Author) commented: After-effects of `make fix-copies`.

if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False
return getattr(model, "hf_device_map", None) is not None

if _pipeline is not None and _pipeline.hf_device_map is None:
for _, component in _pipeline.components.items():
if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
if (
isinstance(component, nn.Module)
and hasattr(component, "_hf_hook")
and not model_has_device_map(component)
):
if not is_model_cpu_offload:
is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
if not is_sequential_cpu_offload:
47 changes: 47 additions & 0 deletions src/diffusers/pipelines/pipeline_utils.py
@@ -389,6 +389,11 @@ def to(self, *args, **kwargs):

device = device or device_arg

def model_has_device_map(model):
sayakpaul (Member, Author) commented: @DN6 it would make sense to make this a separate utility instead of redefining it three times. WDYT?

A collaborator replied: Yup, you can add it as a util function inside `pipeline_utils`.

if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False
return getattr(model, "hf_device_map", None) is not None

# throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
def module_is_sequentially_offloaded(module):
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
@@ -406,6 +411,16 @@ def module_is_offloaded(module):

return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload)

# device-mapped modules should not go through any device placements.
device_mapped_components = [
key for key, component in self.components.items() if model_has_device_map(component)
]
if device_mapped_components:
raise ValueError(
"The following pipeline components have been found to use a device map: "
f"{device_mapped_components}. This is incompatible with explicitly setting the device using `to()`."
)

# .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer
pipeline_is_sequentially_offloaded = any(
module_is_sequentially_offloaded(module) for _, module in self.components.items()
@@ -1002,6 +1017,22 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
default to "cuda".
"""

def model_has_device_map(model):
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False
return getattr(model, "hf_device_map", None) is not None

# device-mapped modules should not go through any device placements.
device_mapped_components = [
key for key, component in self.components.items() if model_has_device_map(component)
]
if device_mapped_components:
raise ValueError(
"The following pipeline components have been found to use a device map: "
f"{device_mapped_components}. This is incompatible with `enable_model_cpu_offload()`."
)

is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
if is_pipeline_device_mapped:
raise ValueError(
@@ -1104,6 +1135,22 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un
The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
default to "cuda".
"""

def model_has_device_map(model):
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False
return getattr(model, "hf_device_map", None) is not None

# device-mapped modules should not go through any device placements.
device_mapped_components = [
key for key, component in self.components.items() if model_has_device_map(component)
]
if device_mapped_components:
raise ValueError(
"The following pipeline components have been found to use a device map: "
f"{device_mapped_components}. This is incompatible with `enable_sequential_cpu_offload()`."
)

if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
from accelerate import cpu_offload
else:
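Following the suggestion in the review comments above, here is a minimal sketch of what a single shared helper could look like if it were hoisted into `pipeline_utils` (its placement and signature are assumptions; this diff keeps three inline definitions):

```python
# Hypothetical shared utility; mirrors the inline definitions added in this diff.
from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version


def model_has_device_map(model) -> bool:
    """Return True if `model` was loaded by accelerate with a device map."""
    if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
        return False
    return getattr(model, "hf_device_map", None) is not None
```

The call sites in `lora_base.py`, `unet.py`, and the pipeline methods above could then import this single definition instead of redefining it.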
5 changes: 5 additions & 0 deletions tests/pipelines/audioldm2/test_audioldm2.py
@@ -506,9 +506,14 @@ def test_to_dtype(self):
model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))

@unittest.skip("Test currently not supported.")
def test_sequential_cpu_offload_forward_pass(self):
pass

@unittest.skip("Test currently not supported.")
def test_calling_mco_raises_error_device_mapped_components(self):
pass


@nightly
class AudioLDM2PipelineSlowTests(unittest.TestCase):
86 changes: 86 additions & 0 deletions tests/pipelines/flux/test_pipeline_flux.py
@@ -6,9 +6,11 @@
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils.testing_utils import (
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
@@ -249,3 +251,87 @@ def test_flux_inference(self):
max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

assert max_diff < 1e-4

@require_torch_multi_gpu
@torch.no_grad()
def test_flux_component_sharding(self):
"""
internal note: test was run on `audace`.
"""

ckpt_id = "black-forest-labs/FLUX.1-dev"
dtype = torch.bfloat16
prompt = "a photo of a cat with tiger-like look"

pipeline = FluxPipeline.from_pretrained(
ckpt_id,
transformer=None,
vae=None,
device_map="balanced",
max_memory={0: "16GB", 1: "16GB"},
torch_dtype=dtype,
)
prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt(
prompt=prompt, prompt_2=None, max_sequence_length=512
)

del pipeline.text_encoder
del pipeline.text_encoder_2
del pipeline.tokenizer
del pipeline.tokenizer_2
del pipeline

gc.collect()
torch.cuda.empty_cache()

transformer = FluxTransformer2DModel.from_pretrained(
ckpt_id, subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype
)
pipeline = FluxPipeline.from_pretrained(
ckpt_id,
text_encoder=None,
text_encoder_2=None,
tokenizer=None,
tokenizer_2=None,
vae=None,
transformer=transformer,
torch_dtype=dtype,
)

height, width = 768, 1360
# No need to wrap this in `torch.no_grad()` as the pipeline call method
# is already decorated with it.
latents = pipeline(
prompt_embeds=prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
num_inference_steps=10,
guidance_scale=3.5,
height=height,
width=width,
output_type="latent",
generator=torch.manual_seed(0),
).images
latent_slice = latents[0, :3, :3].flatten().float().cpu().numpy()
expected_slice = np.array([-0.377, -0.3008, -0.5117, -0.252, 0.0615, -0.3477, -0.1309, -0.1914, 0.1533])

assert numpy_cosine_similarity_distance(latent_slice, expected_slice) < 1e-4

del pipeline.transformer
del pipeline

gc.collect()
torch.cuda.empty_cache()

vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor

image = vae.decode(latents, return_dict=False)[0]
image = image_processor.postprocess(image, output_type="np")
image_slice = image[0, :3, :3, -1].flatten()
expected_slice = np.array([0.127, 0.1113, 0.1055, 0.1172, 0.1172, 0.1074, 0.1191, 0.1191, 0.1152])

assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4
4 changes: 4 additions & 0 deletions tests/pipelines/musicldm/test_musicldm.py
@@ -404,6 +404,10 @@ def test_to_dtype(self):
model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))

@unittest.skip("Test currently not supported.")
def test_calling_mco_raises_error_device_mapped_components(self):
pass


@nightly
@require_torch_gpu
102 changes: 102 additions & 0 deletions tests/pipelines/test_pipelines_common.py
@@ -30,19 +30,24 @@
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import IPAdapterMixin
from diffusers.models.adapter import MultiAdapter
from diffusers.models.attention_processor import AttnProcessor
from diffusers.models.controlnet_xs import UNetControlNetXSModel
from diffusers.models.unets.unet_3d_condition import UNet3DConditionModel
from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet
from diffusers.models.unets.unet_motion_model import UNetMotionModel
from diffusers.pipelines.controlnet import MultiControlNetModel
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import logging
from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
from diffusers.utils.testing_utils import (
CaptureLogger,
nightly,
require_torch,
require_torch_multi_gpu,
skip_mps,
slow,
torch_device,
)

@@ -59,6 +64,10 @@
from ..others.test_utils import TOKEN, USER, is_staging_test


if is_accelerate_available():
from accelerate.utils import compute_module_sizes


def to_np(tensor):
if isinstance(tensor, torch.Tensor):
tensor = tensor.detach().cpu().numpy()
@@ -1907,6 +1916,99 @@ def test_StableDiffusionMixin_component(self):
)
)

@require_torch_multi_gpu
@slow
@nightly
def test_calling_to_raises_error_device_mapped_components(self):
if "Combined" in self.pipeline_class.__name__:
sayakpaul (Member, Author) commented: Because for connected pipelines, we don't support device mapping in the first place.

return

# TODO (sayakpaul): skip these for now. revisit later.
components = self.get_dummy_components()
if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components):
return

pipe = self.pipeline_class(**components)
max_model_size = max(
compute_module_sizes(module)[""]
for _, module in pipe.components.items()
if isinstance(module, torch.nn.Module)
)
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir)
max_memory = {0: max_model_size, 1: max_model_size}
loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)

with self.assertRaises(ValueError) as err_context:
loaded_pipe.to(torch_device)

self.assertTrue(
"The following pipeline components have been found" in str(err_context.exception)
and "This is incompatible with explicitly setting the device using `to()`" in str(err_context.exception)
)

@require_torch_multi_gpu
@slow
@nightly
def test_calling_mco_raises_error_device_mapped_components(self):
if "Combined" in self.pipeline_class.__name__:
return

# TODO (sayakpaul): skip these for now. revisit later.
components = self.get_dummy_components()
if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components):
return

pipe = self.pipeline_class(**components)
max_model_size = max(
compute_module_sizes(module)[""]
for _, module in pipe.components.items()
if isinstance(module, torch.nn.Module)
)
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir)
max_memory = {0: max_model_size, 1: max_model_size}
loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)

with self.assertRaises(ValueError) as err_context:
loaded_pipe.enable_model_cpu_offload()

self.assertTrue(
"The following pipeline components have been found" in str(err_context.exception)
and "This is incompatible with `enable_model_cpu_offload()`" in str(err_context.exception)
)

@require_torch_multi_gpu
@slow
@nightly
def test_calling_sco_raises_error_device_mapped_components(self):
if "Combined" in self.pipeline_class.__name__:
return

# TODO (sayakpaul): skip these for now. revisit later.
components = self.get_dummy_components()
if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components):
return

pipe = self.pipeline_class(**components)
max_model_size = max(
compute_module_sizes(module)[""]
for _, module in pipe.components.items()
if isinstance(module, torch.nn.Module)
)
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir)
max_memory = {0: max_model_size, 1: max_model_size}
loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)

with self.assertRaises(ValueError) as err_context:
loaded_pipe.enable_sequential_cpu_offload()

self.assertTrue(
"The following pipeline components have been found" in str(err_context.exception)
and "This is incompatible with `enable_sequential_cpu_offload()`" in str(err_context.exception)
)


@is_staging_test
class PipelinePushToHubTester(unittest.TestCase):