Merge branch 'main' into remove-transformeroutput-deprecation
sayakpaul authored Jun 26, 2024
2 parents 013c9b7 + ea6938a commit 7ad3765
Showing 28 changed files with 2,149 additions and 28 deletions.
15 changes: 14 additions & 1 deletion .github/workflows/mirror_community_pipeline.yml
@@ -22,6 +22,9 @@ on:

jobs:
mirror_community_pipeline:
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_COMMUNITY_MIRROR }}

runs-on: ubuntu-latest
steps:
# Checkout to correct ref
@@ -86,4 +89,14 @@ jobs:
run: huggingface-cli upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
env:
PATH_IN_REPO: ${{ env.PATH_IN_REPO }}
HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}

- name: Report success status
if: ${{ success() }}
run: |
pip install requests && python utils/notify_community_pipelines_mirror.py --status=success
- name: Report failure status
if: ${{ failure() }}
run: |
pip install requests && python utils/notify_community_pipelines_mirror.py --status=failure
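
For context, the two reporting steps above only need `requests` and the `SLACK_WEBHOOK_URL` variable exported at the job level. A minimal sketch of what `utils/notify_community_pipelines_mirror.py` might look like — the script itself is not part of this diff, so this is an assumption rather than the actual implementation:

```py
# Hypothetical sketch of utils/notify_community_pipelines_mirror.py -- the real
# script is not shown in this commit.
import argparse
import os

import requests

parser = argparse.ArgumentParser()
parser.add_argument("--status", choices=["success", "failure"], required=True)
args = parser.parse_args()

# SLACK_WEBHOOK_URL is set at the job level from the
# SLACK_WEBHOOK_URL_COMMUNITY_MIRROR secret (see the env block above).
webhook_url = os.environ["SLACK_WEBHOOK_URL"]

# Slack incoming webhooks accept a JSON payload with a `text` field.
payload = {"text": f"Community pipelines mirror: {args.status}"}
requests.post(webhook_url, json=payload, timeout=10).raise_for_status()
```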
1 change: 1 addition & 0 deletions .github/workflows/push_tests.yml
@@ -330,6 +330,7 @@ jobs:
- name: Run example tests on GPU
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
RUN_COMPILE: yes
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
4 changes: 4 additions & 0 deletions docs/source/en/_toctree.yml
@@ -257,6 +257,8 @@
title: PriorTransformer
- local: api/models/controlnet
title: ControlNetModel
- local: api/models/controlnet_hunyuandit
title: HunyuanDiT2DControlNetModel
- local: api/models/controlnet_sd3
title: SD3ControlNetModel
title: Models
@@ -282,6 +284,8 @@
title: Consistency Models
- local: api/pipelines/controlnet
title: ControlNet
- local: api/pipelines/controlnet_hunyuandit
title: ControlNet with Hunyuan-DiT
- local: api/pipelines/controlnet_sd3
title: ControlNet with Stable Diffusion 3
- local: api/pipelines/controlnet_sdxl
37 changes: 37 additions & 0 deletions docs/source/en/api/models/controlnet_hunyuandit.md
@@ -0,0 +1,37 @@
<!--Copyright 2024 The HuggingFace Team and Tencent Hunyuan Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# HunyuanDiT2DControlNetModel

HunyuanDiT2DControlNetModel is an implementation of ControlNet for [Hunyuan-DiT](https://arxiv.org/abs/2405.08748).

ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.

With a ControlNet model, you can provide an additional control image to condition and control Hunyuan-DiT generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.

The abstract from the paper is:

*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*

This code is implemented by the Tencent Hunyuan Team. You can find pre-trained checkpoints for Hunyuan-DiT ControlNets on [Tencent Hunyuan](https://huggingface.co/Tencent-Hunyuan).

## Example for loading HunyuanDiT2DControlNetModel

```py
import torch

from diffusers import HunyuanDiT2DControlNetModel

# Load the pose-conditioned ControlNet checkpoint in half precision
controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
    "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Pose", torch_dtype=torch.float16
)
```
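
The loaded `controlnet` can then be passed to the [`HunyuanDiTControlNetPipeline`](../pipelines/controlnet_hunyuandit) through its `controlnet` argument (this wiring is an assumption based on the pipeline added in this commit; see that page for a full example).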

## HunyuanDiT2DControlNetModel

[[autodoc]] HunyuanDiT2DControlNetModel
36 changes: 36 additions & 0 deletions docs/source/en/api/pipelines/controlnet_hunyuandit.md
@@ -0,0 +1,36 @@
<!--Copyright 2024 The HuggingFace Team and Tencent Hunyuan Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# ControlNet with Hunyuan-DiT

HunyuanDiTControlNetPipeline is an implementation of ControlNet for [Hunyuan-DiT](https://arxiv.org/abs/2405.08748).

ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.

With a ControlNet model, you can provide an additional control image to condition and control Hunyuan-DiT generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.

The abstract from the paper is:

*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*

This code is implemented by the Tencent Hunyuan Team. You can find pre-trained checkpoints for Hunyuan-DiT ControlNets on [Tencent Hunyuan](https://huggingface.co/Tencent-Hunyuan).
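
A minimal text-to-image sketch follows. The ControlNet checkpoint name comes from the Tencent-Hunyuan org mentioned above; the base-model repo id (`Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers`), the `control_image` argument, and the prompt are illustrative assumptions rather than values taken from this page:

```py
import torch

from diffusers import HunyuanDiT2DControlNetModel, HunyuanDiTControlNetPipeline
from diffusers.utils import load_image

# Pose-conditioned ControlNet plus an assumed Diffusers-format Hunyuan-DiT base model.
controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
    "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Pose", torch_dtype=torch.float16
)
pipe = HunyuanDiTControlNetPipeline.from_pretrained(
    "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.to("cuda")

# The control image (here, a pose map) supplies the spatial layout to preserve.
control_image = load_image("pose.png")  # replace with your own conditioning image

image = pipe(
    "a person dancing on the beach, best quality",
    control_image=control_image,
    num_inference_steps=50,
).images[0]
image.save("output.png")
```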

<Tip>

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.

</Tip>

## HunyuanDiTControlNetPipeline
[[autodoc]] HunyuanDiTControlNetPipeline
- all
- __call__
2 changes: 1 addition & 1 deletion docs/source/en/api/pipelines/hunyuandit.md
@@ -1,4 +1,4 @@
- <!--Copyright 2024 The HuggingFace Team. All rights reserved.
+ <!--Copyright 2024 The HuggingFace Team and Tencent Hunyuan Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
58 changes: 57 additions & 1 deletion docs/source/en/using-diffusers/pag.md
@@ -44,6 +44,13 @@ pipeline.enable_model_cpu_offload()
> [!TIP]
> The `pag_applied_layers` argument allows you to specify which layers PAG is applied to. Additionally, you can use `set_pag_applied_layers` method to update these layers after the pipeline has been created. Check out the [pag_applied_layers](#pag_applied_layers) section to learn more about applying PAG to other layers.

If you already have a pipeline created and loaded, you can enable PAG on it using the `from_pipe` API with the `enable_pag` flag. Internally, a PAG pipeline is created based on the pipeline and task you specified. In the example below, since we used `AutoPipelineForText2Image` and passed a `StableDiffusionXLPipeline`, a `StableDiffusionXLPAGPipeline` is created accordingly. Note that this does not require additional memory, and you will have both `StableDiffusionXLPipeline` and `StableDiffusionXLPAGPipeline` loaded and ready to use. You can read more about the `from_pipe` API and how to reuse pipelines in Diffusers [here](https://huggingface.co/docs/diffusers/using-diffusers/loading#reuse-a-pipeline).

```py
pipeline_sdxl = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipeline = AutoPipelineForText2Image.from_pipe(pipeline_sdxl, enable_pag=True)
```

To generate an image, you will also need to pass a `pag_scale`. As `pag_scale` increases, images gain more semantically coherent structures and exhibit fewer artifacts. However, an overly large guidance scale can lead to smoother textures and slight saturation, similar to CFG. `pag_scale=3.0` is used in the official demo and works well in most use cases, but feel free to experiment and select a value according to your needs! PAG is disabled when `pag_scale=0`.

```py
@@ -74,7 +81,7 @@ for pag_scale in [0.0, 3.0]:
</hfoption>
<hfoption id="Image-to-image">

- Similary, you can use PAG with image-to-image pipelines.
+ You can use PAG with image-to-image pipelines.

```py
from diffusers import AutoPipelineForImage2Image
@@ -88,7 +95,32 @@ pipeline = AutoPipelineForImage2Image.from_pretrained(
torch_dtype=torch.float16
)
pipeline.enable_model_cpu_offload()
```

If you already have an image-to-image pipeline and would like to enable PAG on it, you can run this:

```py
pipeline_i2i = AutoPipelineForImage2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_i2i, enable_pag=True)
```

It is also very easy to switch directly from a text-to-image pipeline to a PAG-enabled image-to-image pipeline:

```py
pipeline_t2i = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_t2i, enable_pag=True)
```

If you have a PAG-enabled text-to-image pipeline, you can switch directly to an image-to-image pipeline with PAG still enabled:

```py
pipeline_pag = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", enable_pag=True, torch_dtype=torch.float16)
pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_pag)
```

Now let's generate an image!

```py
pag_scales = 4.0
guidance_scales = 7.0

@@ -120,7 +152,25 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
torch_dtype=torch.float16
)
pipeline.enable_model_cpu_offload()
```

You can enable PAG on an existing inpainting pipeline like this:

```py
pipeline_inpaint = AutoPipelineForInpainting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipeline = AutoPipelineForInpainting.from_pipe(pipeline_inpaint, enable_pag=True)
```

This still works when your pipeline has a different task:

```py
pipeline_t2i = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipeline = AutoPipelineForInpainting.from_pipe(pipeline_t2i, enable_pag=True)
```

Let's generate an image!

```py
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
init_image = load_image(img_url).convert("RGB")
@@ -169,6 +219,12 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
pipeline.enable_model_cpu_offload()
```

<Tip>

If you already have a ControlNet pipeline and want to enable PAG, you can use the `from_pipe` API: `AutoPipelineForText2Image.from_pipe(pipeline_controlnet, enable_pag=True)`

</Tip>
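
For example (a minimal sketch; the SDXL Canny ControlNet checkpoint `diffusers/controlnet-canny-sdxl-1.0` is only an illustrative choice, not taken from this guide):

```py
import torch

from diffusers import AutoPipelineForText2Image, ControlNetModel

# A regular ControlNet pipeline, with PAG then enabled on top of it via from_pipe.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
)
pipeline_controlnet = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
)
pipeline = AutoPipelineForText2Image.from_pipe(pipeline_controlnet, enable_pag=True)
```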

You can use the pipeline in the same way you normally use ControlNet pipelines, with the added option to specify a `pag_scale` parameter. Note that PAG works well for unconditional generation. In this example, we will generate an image without a prompt.

```py
6 changes: 6 additions & 0 deletions src/diffusers/__init__.py
@@ -83,7 +83,9 @@
"ControlNetModel",
"ControlNetXSAdapter",
"DiTTransformer2DModel",
"HunyuanDiT2DControlNetModel",
"HunyuanDiT2DModel",
"HunyuanDiT2DMultiControlNetModel",
"I2VGenXLUNet",
"Kandinsky3UNet",
"ModelMixin",
@@ -234,6 +236,7 @@
"BlipDiffusionPipeline",
"CLIPImageProjection",
"CycleDiffusionPipeline",
"HunyuanDiTControlNetPipeline",
"HunyuanDiTPipeline",
"I2VGenXLPipeline",
"IFImg2ImgPipeline",
@@ -500,7 +503,9 @@
ControlNetModel,
ControlNetXSAdapter,
DiTTransformer2DModel,
HunyuanDiT2DControlNetModel,
HunyuanDiT2DModel,
HunyuanDiT2DMultiControlNetModel,
I2VGenXLUNet,
Kandinsky3UNet,
ModelMixin,
@@ -629,6 +634,7 @@
AudioLDMPipeline,
CLIPImageProjection,
CycleDiffusionPipeline,
HunyuanDiTControlNetPipeline,
HunyuanDiTPipeline,
I2VGenXLPipeline,
IFImg2ImgPipeline,
9 changes: 9 additions & 0 deletions src/diffusers/loaders/unet.py
@@ -457,6 +457,15 @@ def save_attn_procs(
)
if is_custom_diffusion:
state_dict = self._get_custom_diffusion_state_dict()
if save_function is None and safe_serialization:
# safetensors does not support saving dicts with non-tensor values
empty_state_dict = {k: v for k, v in state_dict.items() if not isinstance(v, torch.Tensor)}
if len(empty_state_dict) > 0:
logger.warning(
f"Safetensors does not support saving dicts with non-tensor values. "
f"The following keys will be ignored: {empty_state_dict.keys()}"
)
state_dict = {k: v for k, v in state_dict.items() if isinstance(v, torch.Tensor)}
else:
if not USE_PEFT_BACKEND:
raise ValueError("PEFT backend is required for saving LoRAs using the `save_attn_procs()` method.")
2 changes: 2 additions & 0 deletions src/diffusers/models/__init__.py
@@ -33,6 +33,7 @@
_import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
_import_structure["autoencoders.vq_model"] = ["VQModel"]
_import_structure["controlnet"] = ["ControlNetModel"]
_import_structure["controlnet_hunyuan"] = ["HunyuanDiT2DControlNetModel", "HunyuanDiT2DMultiControlNetModel"]
_import_structure["controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
_import_structure["controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
_import_structure["embeddings"] = ["ImageProjection"]
@@ -75,6 +76,7 @@
VQModel,
)
from .controlnet import ControlNetModel
from .controlnet_hunyuan import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel
from .controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
from .controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel
from .embeddings import ImageProjection