huggingface · sayakpaul · Aug 19, 2024 · Aug 19, 2024 · Aug 20, 2024 · Aug 20, 2024
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -31,6 +31,7 @@
     "loaders": ["FromOriginalModelMixin"],
     "models": [],
     "pipelines": [],
+    "quantizers.quantization_config": ["BitsAndBytesConfig"],
     "schedulers": [],
     "utils": [
         "OptionalDependencyNotAvailable",
@@ -123,7 +124,6 @@
             "VQModel",
         ]
     )
-
     _import_structure["optimization"] = [
         "get_constant_schedule",
         "get_constant_schedule_with_warmup",
@@ -155,6 +155,7 @@
             "StableDiffusionMixin",
         ]
     )
+    _import_structure["quantizers"] = ["DiffusersQuantizer"]
     _import_structure["schedulers"].extend(
         [
             "AmusedScheduler",
@@ -526,6 +527,7 @@
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .configuration_utils import ConfigMixin
+    from .quantizers.quantization_config import BitsAndBytesConfig
 
     try:
         if not is_onnx_available():
@@ -619,6 +621,7 @@
             ScoreSdeVePipeline,
             StableDiffusionMixin,
         )
+        from .quantizers import DiffusersQuantizer
         from .schedulers import (
             AmusedScheduler,
             CMStochasticIterativeScheduler,

diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
@@ -526,7 +526,8 @@ def extract_init_dict(cls, config_dict, **kwargs):
                 init_dict[key] = config_dict.pop(key)
 
         # 4. Give nice warning if unexpected values have been passed
-        if len(config_dict) > 0:
+        only_quant_config_remaining = len(config_dict) == 1 and "quantization_config" in config_dict
+        if len(config_dict) > 0 and not only_quant_config_remaining:
             logger.warning(
                 f"The config attributes {config_dict} were passed to {cls.__name__}, "
                 "but are not expected and will be ignored. Please verify your "
@@ -586,10 +587,20 @@ def to_json_saveable(value):
                 value = value.as_posix()
             return value
 
+        # IFWatermarker, for example, doesn't have a `config`.
+        if hasattr(self, "config") and "quantization_config" in self.config:
+            config_dict["quantization_config"] = (
+                self.config.quantization_config.to_dict()
+                if not isinstance(self.config.quantization_config, dict)
+                else self.config.quantization_config
+            )
+
         config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()}
         # Don't save "_ignore_files" or "_use_default_values"
         config_dict.pop("_ignore_files", None)
         config_dict.pop("_use_default_values", None)
+        # pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
+        _ = config_dict.pop("_pre_quantization_dtype", None)
 
         return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
 

diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py
@@ -25,6 +25,7 @@
 import torch
 from huggingface_hub.utils import EntryNotFoundError
 
+from ..quantizers.quantization_config import QuantizationMethod
 from ..utils import (
     SAFE_WEIGHTS_INDEX_NAME,
     SAFETENSORS_FILE_EXTENSION,
@@ -53,11 +54,36 @@
 
 
 # Adapted from `transformers` (see modeling_utils.py)
-def _determine_device_map(model: torch.nn.Module, device_map, max_memory, torch_dtype):
+def _determine_device_map(
+    model: torch.nn.Module, device_map, max_memory, torch_dtype, keep_in_fp32_modules=[], hf_quantizer=None
+):
     if isinstance(device_map, str):
+        special_dtypes = {}
+        if hf_quantizer is not None:
+            special_dtypes.update(hf_quantizer.get_special_dtypes_update(model, torch_dtype))
+        special_dtypes.update(
+            {
+                name: torch.float32
+                for name, _ in model.named_parameters()
+                if any(m in name for m in keep_in_fp32_modules)
+            }
+        )
+
+        target_dtype = torch_dtype
+        if hf_quantizer is not None:
+            target_dtype = hf_quantizer.adjust_target_dtype(target_dtype)
+
         no_split_modules = model._get_no_split_modules(device_map)
         device_map_kwargs = {"no_split_module_classes": no_split_modules}
 
+        if "special_dtypes" in inspect.signature(infer_auto_device_map).parameters:
+            device_map_kwargs["special_dtypes"] = special_dtypes
+        elif len(special_dtypes) > 0:
+            logger.warning(
+                "This model has some weights that should be kept in higher precision, you need to upgrade "
+                "`accelerate` to properly deal with them (`pip install --upgrade accelerate`)."
+            )
+
         if device_map != "sequential":
             max_memory = get_balanced_memory(
                 model,
@@ -69,8 +95,14 @@ def _determine_device_map(model: torch.nn.Module, device_map, max_memory, torch_
         else:
             max_memory = get_max_memory(max_memory)
 
+        if hf_quantizer is not None:
+            max_memory = hf_quantizer.adjust_max_memory(max_memory)
+
         device_map_kwargs["max_memory"] = max_memory
-        device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs)
+        device_map = infer_auto_device_map(model, dtype=target_dtype, **device_map_kwargs)
+
+        if hf_quantizer is not None:
+            hf_quantizer.validate_environment(device_map=device_map)
 
     return device_map
 
@@ -99,6 +131,8 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
     """
     Reads a checkpoint file, returning properly formatted errors if they arise.
     """
+    if isinstance(checkpoint_file, dict):
+        return checkpoint_file
     try:
         file_extension = os.path.basename(checkpoint_file).split(".")[-1]
         if file_extension == SAFETENSORS_FILE_EXTENSION:
@@ -136,29 +170,57 @@ def load_model_dict_into_meta(
     device: Optional[Union[str, torch.device]] = None,
     dtype: Optional[Union[str, torch.dtype]] = None,
     model_name_or_path: Optional[str] = None,
+    hf_quantizer=None,
+    keep_in_fp32_modules=None,
 ) -> List[str]:
-    device = device or torch.device("cpu")
+    device = device or torch.device("cpu") if hf_quantizer is None else device
     dtype = dtype or torch.float32
+    is_quantized = hf_quantizer is not None
 
     accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
 
-    unexpected_keys = []
     empty_state_dict = model.state_dict()
+    unexpected_keys = [param_name for param_name in state_dict if param_name not in empty_state_dict]
+    is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn")
+
     for param_name, param in state_dict.items():
         if param_name not in empty_state_dict:
-            unexpected_keys.append(param_name)
             continue
 
-        if empty_state_dict[param_name].shape != param.shape:
+        # We convert floating dtypes to the `dtype` passed except for float8_e4m3fn type. We also want to keep the buffers/params
+        # in int/uint/bool and not cast them.
+        is_param_float8_e4m3fn = is_torch_e4m3fn_available and param.dtype == torch.float8_e4m3fn
+        if dtype is not None and torch.is_floating_point(param) and not is_param_float8_e4m3fn:
+            if (
+                keep_in_fp32_modules is not None
+                and any(
+                    module_to_keep_in_fp32 in param_name.split(".") for module_to_keep_in_fp32 in keep_in_fp32_modules
+                )
+                and dtype == torch.float16
+            ):
+                param = param.to(torch.float32)
+            else:
+                param = param.to(dtype)
+
+        is_quant_method_bnb = getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES
+        if not is_quantized and not is_quant_method_bnb and empty_state_dict[param_name].shape != param.shape:
             model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
             raise ValueError(
                 f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
             )
 
-        if accepts_dtype:
-            set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype)
+        if (
+            not is_quantized
+            or (not hf_quantizer.requires_parameters_quantization)
+            or (not hf_quantizer.check_quantized_param(model, param, param_name, state_dict, param_device=device))
+        ):
+            if accepts_dtype:
+                set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype)
+            else:
+                set_module_tensor_to_device(model, param_name, device, value=param)
         else:
-            set_module_tensor_to_device(model, param_name, device, value=param)
+            hf_quantizer.create_quantized_param(model, param, param_name, device, state_dict, unexpected_keys)
+
     return unexpected_keys
 
 
@@ -228,3 +290,32 @@ def _fetch_index_file(
             index_file = None
 
     return index_file
+
+
+# Adapted from
+# https://github.com/bghira/SimpleTuner/blob/cea2457ab063f6dedb9e697830ae68a96be90641/helpers/training/save_hooks.py#L64
+def _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata):
+    weight_map = sharded_metadata.get("weight_map", None)
+    if weight_map is None:
+        raise KeyError("'weight_map' key not found in the shard index file.")
+
+    # Collect the unique shard files referenced by weight_map (safetensors or torch-pickled)
+    files_to_load = set(weight_map.values())
+    is_safetensors = all(f.endswith(".safetensors") for f in files_to_load)
+    merged_state_dict = {}
+
+    # Load tensors from each unique file
+    for file_name in files_to_load:
+        part_file_path = os.path.join(sharded_ckpt_cached_folder, file_name)
+        if not os.path.exists(part_file_path):
+            raise FileNotFoundError(f"Part file {file_name} not found.")
+
+        if is_safetensors:
+            with safetensors.safe_open(part_file_path, framework="pt", device="cpu") as f:
+                for tensor_key in f.keys():
+                    if tensor_key in weight_map:
+                        merged_state_dict[tensor_key] = f.get_tensor(tensor_key)
+        else:
+            merged_state_dict.update(torch.load(part_file_path, weights_only=True, map_location="cpu"))
+
+    return merged_state_dict