From 76ae12eb28b99add5dc229d101a25d7b187ce650 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 15:04:16 +0800 Subject: [PATCH 01/20] fix cannot pickle 'module' object for 8 bit --- gptqmodel/models/base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 306ed57d..465eb329 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -27,7 +27,7 @@ from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, get_checkpoints, get_device, get_module_by_name_prefix, get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, - move_to, nested_move_to, pack_model, simple_dispatch_model) + move_to, nested_move_to, pack_model, simple_dispatch_model, deepcopy_model_with_modules) from ..version import __version__ from ._const import CPU, CUDA_0, SUPPORTED_MODELS @@ -540,8 +540,14 @@ def save_quantized( # internal is always gptq v2 but allow users to pass gptq (v1) via config if format is None and quantize_config.format == FORMAT.GPTQ: - # Model qzeros may be edited in place. - # TODO: avoid inplace modification of the weights + # fix ModelCloud/GPTQModel/issues/47 + # fix gptqmodel_cuda cannot be serialized + # no need to set it back, no calculation below + from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear + for module in model.named_modules(): + if len(module) == 2 and isinstance (module[1], QuantLinear): + module[1].gptqmodel_cuda = None + print(module) model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 25c7eaae9b81242eb2328532b4461b51cda723ea Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 15:09:23 +0800 Subject: [PATCH 02/20] remove unused import --- gptqmodel/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 465eb329..1ce1c879 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -27,7 +27,7 @@ from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, get_checkpoints, get_device, get_module_by_name_prefix, get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, - move_to, nested_move_to, pack_model, simple_dispatch_model, deepcopy_model_with_modules) + move_to, nested_move_to, pack_model, simple_dispatch_model) from ..version import __version__ from ._const import CPU, CUDA_0, SUPPORTED_MODELS From e714b7115e9c415fd0f2833afd3e5f4dded45651 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 15:19:08 +0800 Subject: [PATCH 03/20] remove print --- gptqmodel/models/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 1ce1c879..107a90fa 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -547,7 +547,6 @@ def save_quantized( for module in model.named_modules(): if len(module) == 2 and isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None - print(module) model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 035d8e81a856018a943be391c677ac884ea1c1ea Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 16:44:15 +0800 Subject: [PATCH 04/20] check 
with tuple --- gptqmodel/models/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 107a90fa..778120a3 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -4,7 +4,7 @@ import os import re from os.path import isfile, join -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple import accelerate import torch @@ -545,7 +545,7 @@ def save_quantized( # no need to set it back, no calculation below from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for module in model.named_modules(): - if len(module) == 2 and isinstance (module[1], QuantLinear): + if isinstance (module, Tuple) and isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( From 9394229504e02c6251d1204872b3edcb5c7aaba1 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:11:17 +0800 Subject: [PATCH 05/20] revert to len check --- gptqmodel/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 778120a3..7eb84a11 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -545,7 +545,7 @@ def save_quantized( # no need to set it back, no calculation below from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for module in model.named_modules(): - if isinstance (module, Tuple) and isinstance (module[1], QuantLinear): + if len(module) == 2 and isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( From b27baa6b67174675a79dfdf9c362ed6dfc4e1d54 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:26:16 +0800 Subject: [PATCH 06/20] add test for 8bit --- tests/test_quant_formats.py | 61 ++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index f1b4f6eb..2b3475a4 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -5,14 +5,28 @@ import unittest # noqa: E402 import torch.cuda # noqa: E402 -from gptqmodel import GPTQModel, __version__ # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QuantizeConfig # noqa: E402 -from gptqmodel.quantization.config import META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 +from gptqmodel import __version__ # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QuantizeConfig # noqa: E402 +from gptqmodel.quantization.config import META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL + class TestQuantization(unittest.TestCase): + + def setUp(self): + self.pretrained_model_dir = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" + + self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_dir, use_fast=True) + self.calibration_dataset = [ + self.tokenizer( + "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
+ ), + self.tokenizer("Today I am in Paris and it is a wonderful day."), + ] + @parameterized.expand( [ (False, True, FORMAT.GPTQ_V2), @@ -21,16 +35,6 @@ class TestQuantization(unittest.TestCase): ] ) def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): - pretrained_model_dir = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" - - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) - calibration_dataset = [ - tokenizer( - "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." - ), - tokenizer("Today I am in Paris and it is a wonderful day."), - ] - quantize_config = QuantizeConfig( bits=4, group_size=128, @@ -40,17 +44,15 @@ def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): ) model = GPTQModel.from_pretrained( - pretrained_model_dir, + self.pretrained_model_dir, quantize_config=quantize_config, use_flash_attention_2=False, ) - model.quantize(calibration_dataset) + model.quantize(self.calibration_dataset) with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained( - tmpdirname, - ) + model.save_quantized(tmpdirname) logging.info(f"Saved config mem: {model.quantize_config}") @@ -117,3 +119,26 @@ def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): format=format, ) assert isinstance(model.quantize_config, QuantizeConfig) + + def test_gptq_8bit(self): + quantize_config = QuantizeConfig( + bits=8, + group_size=128, + format=FORMAT.GPTQ, + ) + + model = GPTQModel.from_pretrained( + self.pretrained_model_dir, + quantize_config=quantize_config, + use_flash_attention_2=False, + ) + + model.quantize(self.calibration_dataset) + + with tempfile.TemporaryDirectory() as tmpdirname: + err = None + try: + model.save_quantized(tmpdirname) + except Exception as e: + err = e + self.assertTrue(err is None) From 69ae019b0a107716ade0bc7949aeb76480670268 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:38:39 +0800 Subject: [PATCH 07/20] set same QuantizeConfig --- tests/test_quant_formats.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 2b3475a4..f5665869 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -125,6 +125,8 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, + desc_act=True, + sym=False, ) model = GPTQModel.from_pretrained( @@ -140,5 +142,6 @@ def test_gptq_8bit(self): try: model.save_quantized(tmpdirname) except Exception as e: + print(e) err = e self.assertTrue(err is None) From 09d76aad8182f3a1d43c77511444adcbc0281b9e Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:49:30 +0800 Subject: [PATCH 08/20] check if it's 4 bit --- gptqmodel/models/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 7eb84a11..1c2ec057 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -543,10 +543,11 @@ def save_quantized( # fix ModelCloud/GPTQModel/issues/47 # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below - from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for module in model.named_modules(): - if len(module) == 2 and isinstance (module[1], QuantLinear): - module[1].gptqmodel_cuda = None + if quantize_config.bits != 4 : + from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear + for module in model.named_modules(): + if len(module) == 
2 and isinstance (module[1], QuantLinear): + module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 2548a7a62247b8ad043334691d8d06196af6dfd8 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 18:04:59 +0800 Subject: [PATCH 09/20] fix grammar --- gptqmodel/models/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 1c2ec057..3fd08468 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -540,13 +540,15 @@ def save_quantized( # internal is always gptq v2 but allow users to pass gptq (v1) via config if format is None and quantize_config.format == FORMAT.GPTQ: + # Model qzeros may be edited in place. + # TODO: avoid inplace modification of the weights # fix ModelCloud/GPTQModel/issues/47 # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below if quantize_config.bits != 4 : from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for module in model.named_modules(): - if len(module) == 2 and isinstance (module[1], QuantLinear): + for name, module in model.named_modules(): + if isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( From 931302b9b65728dfda7d5fa109068a8663637e05 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 19:08:08 +0800 Subject: [PATCH 10/20] remove params --- tests/test_quant_formats.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index f5665869..972e3c6d 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -125,8 +125,6 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - desc_act=True, - sym=False, ) model = GPTQModel.from_pretrained( From 922801bc5efd0e3648675be37e0ac60d086d65c7 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 20:56:57 +0800 Subject: [PATCH 11/20] it's not a list --- gptqmodel/models/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 3fd08468..6e01fa76 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -548,8 +548,8 @@ def save_quantized( if quantize_config.bits != 4 : from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for name, module in model.named_modules(): - if isinstance (module[1], QuantLinear): - module[1].gptqmodel_cuda = None + if isinstance (module, QuantLinear): + module.gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 20ba7b5aee192cf6259423e758dd0c0e7cbabd53 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:43:31 +0800 Subject: [PATCH 12/20] set gptqmodel_cuda back --- gptqmodel/models/base.py | 139 ++++++++++++++------------- tests/test_quant_formats.py | 186 ++++++++++++++++++------------------ 2 files changed, 168 insertions(+), 157 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 6e01fa76..d654eade 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -73,12 +73,12 @@ class BaseGPTQModel(nn.Module): info: Dict[str, str] = {} def __init__( - self, - model: PreTrainedModel, - quantized: bool, - 
quantize_config: QuantizeConfig, - is_triton_backend: bool = False, - qlinear_kernel: nn.Module = None, + self, + model: PreTrainedModel, + quantized: bool, + quantize_config: QuantizeConfig, + is_triton_backend: bool = False, + qlinear_kernel: nn.Module = None, ): super().__init__() @@ -102,9 +102,9 @@ def hf_device_map(self): return getattr(self.model, "hf_device_map", None) def _prepare_dataset_for_quantization( - self, - calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], - batch_size: int = 1, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, ): def _convert_tensor_to_list(tensor): if isinstance(tensor, torch.Tensor): @@ -138,7 +138,7 @@ def _convert_tensor_to_list(tensor): pad_token_id = self.config.eos_token_id new_calibration_dataset = [ - collate_data(new_calibration_dataset[start : start + batch_size], pad_token_id) + collate_data(new_calibration_dataset[start: start + batch_size], pad_token_id) for start in range(0, len(new_calibration_dataset), batch_size) ] for new_example in new_calibration_dataset: @@ -148,13 +148,13 @@ def _convert_tensor_to_list(tensor): @torch.inference_mode() def quantize( - self, - calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], - batch_size: int = 1, - use_triton: bool = False, - use_cuda_fp16: bool = True, - autotune_warmup_after_quantized: bool = False, - calibration_enable_gpu_cache: bool = True, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + use_triton: bool = False, + use_cuda_fp16: bool = True, + autotune_warmup_after_quantized: bool = False, + calibration_enable_gpu_cache: bool = True, ): if self.quantized: raise EnvironmentError("quantize() is called a model that is already quantized") @@ -183,7 +183,7 @@ def quantize( if len(calibration_dataset) < MIN_CALIBRATION_DATASET_SIZE: logger.warning(f"Calibration dataset size should be greater than {MIN_CALIBRATION_DATASET_SIZE}. " - f"Current size: {len(calibration_dataset)}.") + f"Current size: {len(calibration_dataset)}.") # Calculate the average length of the average input_ids total_input_ids_length = 0 @@ -194,7 +194,7 @@ def quantize( if avg < MIN_CALIBRATION_DATASET_INPUT_IDS_AVG_LENGTH: logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " - f"{MIN_CALIBRATION_DATASET_INPUT_IDS_AVG_LENGTH}! Current AVG is {avg}.") + f"{MIN_CALIBRATION_DATASET_INPUT_IDS_AVG_LENGTH}! 
Current AVG is {avg}.") device_map = self.hf_device_map if device_map: @@ -240,8 +240,8 @@ def store_input_hook(_, args, kwargs): position_ids.append(move_to(pos_ids, data_device)) one_kwargs = {} for ( - k, - v, + k, + v, ) in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states", "attention_mask", "position_ids"]: one_kwargs[k] = nested_move_to(v, data_device) @@ -469,13 +469,13 @@ def prepare_inputs_for_generation(self, *args, **kwargs): return self.model.prepare_inputs_for_generation(*args, **kwargs) def save_quantized( - self, - save_dir: str, - safetensors_metadata: Optional[Dict[str, str]] = None, - format: Optional[FORMAT] = None, - use_safetensors: bool = True, - max_shard_size: str = "10GB", - model_base_name: Optional[str] = None + self, + save_dir: str, + safetensors_metadata: Optional[Dict[str, str]] = None, + format: Optional[FORMAT] = None, + use_safetensors: bool = True, + max_shard_size: str = "10GB", + model_base_name: Optional[str] = None ): """save quantized model and configs to local disk""" os.makedirs(save_dir, exist_ok=True) @@ -497,8 +497,8 @@ def save_quantized( if model_base_name is None: model_base_name = ( - self.quantize_config.model_file_base_name or - f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" + self.quantize_config.model_file_base_name or + f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" ) state_dict = self.model.state_dict() @@ -545,12 +545,23 @@ def save_quantized( # fix ModelCloud/GPTQModel/issues/47 # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below - if quantize_config.bits != 4 : + if quantize_config.bits != 4: + cuda_name_modules = [] from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for name, module in model.named_modules(): - if isinstance (module, QuantLinear): - module.gptqmodel_cuda = None - model = copy.deepcopy(self.model) + for item in model.named_modules(): + if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + cuda_name_modules.append((item[0], item[1].gptqmodel_cuda)) + item[1].gptqmodel_cuda = None + model = copy.deepcopy(self.model) + + for item in model.named_modules(): + if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + for cuda_name, cuda_module in cuda_name_modules: + if item[0] == cuda_name: + item[1].gptqmodel_cuda = cuda_module + del cuda_name_modules + else: + model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel ) @@ -641,22 +652,22 @@ def save_quantized( quantize_config.save_pretrained(save_dir) def save_pretrained( - self, - save_dir: str, - **kwargs, + self, + save_dir: str, + **kwargs, ): logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") self.save_quantized(save_dir=save_dir, **kwargs) @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: str, - quantize_config: QuantizeConfig, - max_memory: Optional[dict] = None, - trust_remote_code: bool = False, - torch_dtype: [str | torch.dtype] = "auto", - **model_init_kwargs, + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + torch_dtype: [str | torch.dtype] = "auto", + **model_init_kwargs, ): """load un-quantized pretrained model to cpu""" @@ -739,25 +750,25 @@ def skip(*args, **kwargs): 
@classmethod def from_quantized( - cls, - model_name_or_path: Optional[str], - device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, - max_memory: Optional[dict] = None, - device: Optional[Union[str, int]] = None, - use_triton: bool = True, - use_marlin: bool = True, - torch_dtype: [str | torch.dtype] = "auto", - use_cuda_fp16: bool = True, - quantize_config: Optional[QuantizeConfig] = None, - model_basename: Optional[str] = None, - use_safetensors: bool = True, - trust_remote_code: bool = False, - warmup_triton: bool = False, - disable_exllama: bool = False, - disable_exllamav2: bool = False, - format: Optional[FORMAT] = None, - allow_unsafe_loading: bool = False, - **kwargs, + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + use_triton: bool = True, + use_marlin: bool = True, + torch_dtype: [str | torch.dtype] = "auto", + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + disable_exllama: bool = False, + disable_exllamav2: bool = False, + format: Optional[FORMAT] = None, + allow_unsafe_loading: bool = False, + **kwargs, ): """load quantized model from local disk""" # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 972e3c6d..9645725f 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -27,110 +27,110 @@ def setUp(self): self.tokenizer("Today I am in Paris and it is a wonderful day."), ] - @parameterized.expand( - [ - (False, True, FORMAT.GPTQ_V2), - (False, False, FORMAT.GPTQ), - (True, True, FORMAT.MARLIN), - ] - ) - def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): - quantize_config = QuantizeConfig( - bits=4, - group_size=128, - desc_act=False if format == FORMAT.MARLIN else True, - sym=sym, - format=format, - ) - - model = GPTQModel.from_pretrained( - self.pretrained_model_dir, - quantize_config=quantize_config, - use_flash_attention_2=False, - ) - - model.quantize(self.calibration_dataset) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_quantized(tmpdirname) - - logging.info(f"Saved config mem: {model.quantize_config}") - - with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: - file_dict = json.loads(f.read()) - # skip comparison of these two model path specific fields that do not exist in memory - file_dict["model_name_or_path"] = None - file_dict["model_file_base_name"] = None - - # make sure the json dict saved to file matches config in memory - assert model.quantize_config.to_dict() == file_dict - logging.info(f"Saved config file: {file_dict}") - - model = GPTQModel.from_quantized( - tmpdirname, - device="cuda:0", - use_marlin=use_marlin, - ) - - logging.info(f"Loaded config: {model.quantize_config}") - assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( - META_QUANTIZER_GPTQMODEL, - __version__, - ) - del model - torch.cuda.empty_cache() - - # skip compat test with sym=False and v1 since we do meta version safety check - if not sym and format == FORMAT.GPTQ: - return - - # test compat: 1) with simple dict type 2) is_marlin_format - compat_quantize_config = { - "bits": 4, - "group_size": 128, - "sym": sym, - "desc_act": False 
if format == FORMAT.MARLIN else True, - "is_marlin_format": use_marlin, - } - - model = GPTQModel.from_quantized( - tmpdirname, - device="cuda:0", - quantize_config=compat_quantize_config, - ) - assert isinstance(model.quantize_config, QuantizeConfig) - - del model - torch.cuda.empty_cache() - - # test checkpoint_format hint to from_quantized() - os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") - - compat_quantize_config = { - "bits": 4, - "group_size": 128, - "sym": sym, - "desc_act": False if format == FORMAT.MARLIN else True, - } - model = GPTQModel.from_quantized( - tmpdirname, - device="cuda:0", - quantize_config=compat_quantize_config, - format=format, - ) - assert isinstance(model.quantize_config, QuantizeConfig) + # @parameterized.expand( + # [ + # (False, True, FORMAT.GPTQ_V2), + # (False, False, FORMAT.GPTQ), + # (True, True, FORMAT.MARLIN), + # ] + # ) + # def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): + # quantize_config = QuantizeConfig( + # bits=4, + # group_size=128, + # desc_act=False if format == FORMAT.MARLIN else True, + # sym=sym, + # format=format, + # ) + # + # model = GPTQModel.from_pretrained( + # self.pretrained_model_dir, + # quantize_config=quantize_config, + # use_flash_attention_2=False, + # ) + # + # model.quantize(self.calibration_dataset) + # + # with tempfile.TemporaryDirectory() as tmpdirname: + # model.save_quantized(tmpdirname) + # + # logging.info(f"Saved config mem: {model.quantize_config}") + # + # with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: + # file_dict = json.loads(f.read()) + # # skip comparison of these two model path specific fields that do not exist in memory + # file_dict["model_name_or_path"] = None + # file_dict["model_file_base_name"] = None + # + # # make sure the json dict saved to file matches config in memory + # assert model.quantize_config.to_dict() == file_dict + # logging.info(f"Saved config file: {file_dict}") + # + # model = GPTQModel.from_quantized( + # tmpdirname, + # device="cuda:0", + # use_marlin=use_marlin, + # ) + # + # logging.info(f"Loaded config: {model.quantize_config}") + # assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( + # META_QUANTIZER_GPTQMODEL, + # __version__, + # ) + # del model + # torch.cuda.empty_cache() + # + # # skip compat test with sym=False and v1 since we do meta version safety check + # if not sym and format == FORMAT.GPTQ: + # return + # + # # test compat: 1) with simple dict type 2) is_marlin_format + # compat_quantize_config = { + # "bits": 4, + # "group_size": 128, + # "sym": sym, + # "desc_act": False if format == FORMAT.MARLIN else True, + # "is_marlin_format": use_marlin, + # } + # + # model = GPTQModel.from_quantized( + # tmpdirname, + # device="cuda:0", + # quantize_config=compat_quantize_config, + # ) + # assert isinstance(model.quantize_config, QuantizeConfig) + # + # del model + # torch.cuda.empty_cache() + # + # # test checkpoint_format hint to from_quantized() + # os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") + # + # compat_quantize_config = { + # "bits": 4, + # "group_size": 128, + # "sym": sym, + # "desc_act": False if format == FORMAT.MARLIN else True, + # } + # model = GPTQModel.from_quantized( + # tmpdirname, + # device="cuda:0", + # quantize_config=compat_quantize_config, + # format=format, + # ) + # assert isinstance(model.quantize_config, QuantizeConfig) def test_gptq_8bit(self): quantize_config = QuantizeConfig( bits=8, group_size=128, format=FORMAT.GPTQ, + desc_act=True ) model = 
GPTQModel.from_pretrained( self.pretrained_model_dir, quantize_config=quantize_config, - use_flash_attention_2=False, ) model.quantize(self.calibration_dataset) From b8b9f31d9008dba60d85734949ebbeb50695de5b Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:49:04 +0800 Subject: [PATCH 13/20] check is tuple --- gptqmodel/models/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index d654eade..0e0c82c9 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -549,13 +549,13 @@ def save_quantized( cuda_name_modules = [] from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for item in model.named_modules(): - if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): cuda_name_modules.append((item[0], item[1].gptqmodel_cuda)) item[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) for item in model.named_modules(): - if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): for cuda_name, cuda_module in cuda_name_modules: if item[0] == cuda_name: item[1].gptqmodel_cuda = cuda_module From abfd7b972bbc520614cc00b25b3b49d0738deb98 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:53:13 +0800 Subject: [PATCH 14/20] format --- gptqmodel/models/base.py | 2 +- tests/test_quant_formats.py | 194 ++++++++++++++++++------------------ 2 files changed, 97 insertions(+), 99 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 0e0c82c9..049dedc2 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -4,7 +4,7 @@ import os import re from os.path import isfile, join -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Union import accelerate import torch diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 9645725f..bfffa52d 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -5,13 +5,11 @@ import unittest # noqa: E402 import torch.cuda # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - -from gptqmodel import GPTQModel # noqa: E402 -from gptqmodel import __version__ # noqa: E402 +from gptqmodel import GPTQModel, __version__ # noqa: E402 from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QuantizeConfig # noqa: E402 from gptqmodel.quantization.config import META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 class TestQuantization(unittest.TestCase): @@ -27,105 +25,105 @@ def setUp(self): self.tokenizer("Today I am in Paris and it is a wonderful day."), ] - # @parameterized.expand( - # [ - # (False, True, FORMAT.GPTQ_V2), - # (False, False, FORMAT.GPTQ), - # (True, True, FORMAT.MARLIN), - # ] - # ) - # def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): - # quantize_config = QuantizeConfig( - # bits=4, - # group_size=128, - # desc_act=False if format == FORMAT.MARLIN else True, - # sym=sym, - # format=format, - # ) - # - # model = GPTQModel.from_pretrained( - # self.pretrained_model_dir, - # quantize_config=quantize_config, - # 
use_flash_attention_2=False, - # ) - # - # model.quantize(self.calibration_dataset) - # - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_quantized(tmpdirname) - # - # logging.info(f"Saved config mem: {model.quantize_config}") - # - # with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: - # file_dict = json.loads(f.read()) - # # skip comparison of these two model path specific fields that do not exist in memory - # file_dict["model_name_or_path"] = None - # file_dict["model_file_base_name"] = None - # - # # make sure the json dict saved to file matches config in memory - # assert model.quantize_config.to_dict() == file_dict - # logging.info(f"Saved config file: {file_dict}") - # - # model = GPTQModel.from_quantized( - # tmpdirname, - # device="cuda:0", - # use_marlin=use_marlin, - # ) - # - # logging.info(f"Loaded config: {model.quantize_config}") - # assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( - # META_QUANTIZER_GPTQMODEL, - # __version__, - # ) - # del model - # torch.cuda.empty_cache() - # - # # skip compat test with sym=False and v1 since we do meta version safety check - # if not sym and format == FORMAT.GPTQ: - # return - # - # # test compat: 1) with simple dict type 2) is_marlin_format - # compat_quantize_config = { - # "bits": 4, - # "group_size": 128, - # "sym": sym, - # "desc_act": False if format == FORMAT.MARLIN else True, - # "is_marlin_format": use_marlin, - # } - # - # model = GPTQModel.from_quantized( - # tmpdirname, - # device="cuda:0", - # quantize_config=compat_quantize_config, - # ) - # assert isinstance(model.quantize_config, QuantizeConfig) - # - # del model - # torch.cuda.empty_cache() - # - # # test checkpoint_format hint to from_quantized() - # os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") - # - # compat_quantize_config = { - # "bits": 4, - # "group_size": 128, - # "sym": sym, - # "desc_act": False if format == FORMAT.MARLIN else True, - # } - # model = GPTQModel.from_quantized( - # tmpdirname, - # device="cuda:0", - # quantize_config=compat_quantize_config, - # format=format, - # ) - # assert isinstance(model.quantize_config, QuantizeConfig) + @parameterized.expand( + [ + (False, True, FORMAT.GPTQ_V2), + (False, False, FORMAT.GPTQ), + (True, True, FORMAT.MARLIN), + ] + ) + def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): + quantize_config = QuantizeConfig( + bits=4, + group_size=128, + desc_act=False if format == FORMAT.MARLIN else True, + sym=sym, + format=format, + ) + + model = GPTQModel.from_pretrained( + self.pretrained_model_dir, + quantize_config=quantize_config, + use_flash_attention_2=False, + ) + + model.quantize(self.calibration_dataset) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_quantized(tmpdirname) + + logging.info(f"Saved config mem: {model.quantize_config}") + + with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: + file_dict = json.loads(f.read()) + # skip comparison of these two model path specific fields that do not exist in memory + file_dict["model_name_or_path"] = None + file_dict["model_file_base_name"] = None + + # make sure the json dict saved to file matches config in memory + assert model.quantize_config.to_dict() == file_dict + logging.info(f"Saved config file: {file_dict}") + + model = GPTQModel.from_quantized( + tmpdirname, + device="cuda:0", + use_marlin=use_marlin, + ) + + logging.info(f"Loaded config: {model.quantize_config}") + assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( + 
META_QUANTIZER_GPTQMODEL, + __version__, + ) + del model + torch.cuda.empty_cache() + + # skip compat test with sym=False and v1 since we do meta version safety check + if not sym and format == FORMAT.GPTQ: + return + + # test compat: 1) with simple dict type 2) is_marlin_format + compat_quantize_config = { + "bits": 4, + "group_size": 128, + "sym": sym, + "desc_act": False if format == FORMAT.MARLIN else True, + "is_marlin_format": use_marlin, + } + + model = GPTQModel.from_quantized( + tmpdirname, + device="cuda:0", + quantize_config=compat_quantize_config, + ) + assert isinstance(model.quantize_config, QuantizeConfig) + + del model + torch.cuda.empty_cache() + + # test checkpoint_format hint to from_quantized() + os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") + + compat_quantize_config = { + "bits": 4, + "group_size": 128, + "sym": sym, + "desc_act": False if format == FORMAT.MARLIN else True, + } + model = GPTQModel.from_quantized( + tmpdirname, + device="cuda:0", + quantize_config=compat_quantize_config, + format=format, + ) + assert isinstance(model.quantize_config, QuantizeConfig) def test_gptq_8bit(self): quantize_config = QuantizeConfig( bits=8, group_size=128, format=FORMAT.GPTQ, - desc_act=True + # desc_act=True ) model = GPTQModel.from_pretrained( From 9f994283037c874d1f5236fc27850d36fdf64633 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:58:13 +0800 Subject: [PATCH 15/20] set desc_act=True --- tests/test_quant_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index bfffa52d..54ef85d5 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -123,7 +123,7 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - # desc_act=True + desc_act=True ) model = GPTQModel.from_pretrained( From a8d4e3481b3fa33c854633d3647e38af91d93447 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:03:07 +0800 Subject: [PATCH 16/20] set desc_act=True --- tests/test_quant_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 54ef85d5..bfffa52d 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -123,7 +123,7 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - desc_act=True + # desc_act=True ) model = GPTQModel.from_pretrained( From a947e3bef377c554a7cd83ac0de3eb1a89963361 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:04:37 +0800 Subject: [PATCH 17/20] format --- gptqmodel/models/base.py | 92 ++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 049dedc2..3e93de15 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -73,12 +73,12 @@ class BaseGPTQModel(nn.Module): info: Dict[str, str] = {} def __init__( - self, - model: PreTrainedModel, - quantized: bool, - quantize_config: QuantizeConfig, - is_triton_backend: bool = False, - qlinear_kernel: nn.Module = None, + self, + model: PreTrainedModel, + quantized: bool, + quantize_config: QuantizeConfig, + is_triton_backend: bool = False, + qlinear_kernel: nn.Module = None, ): super().__init__() @@ -148,13 +148,13 @@ def _convert_tensor_to_list(tensor): @torch.inference_mode() def quantize( - self, - calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], - batch_size: int = 1, - use_triton: 
bool = False, - use_cuda_fp16: bool = True, - autotune_warmup_after_quantized: bool = False, - calibration_enable_gpu_cache: bool = True, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + use_triton: bool = False, + use_cuda_fp16: bool = True, + autotune_warmup_after_quantized: bool = False, + calibration_enable_gpu_cache: bool = True, ): if self.quantized: raise EnvironmentError("quantize() is called a model that is already quantized") @@ -469,13 +469,13 @@ def prepare_inputs_for_generation(self, *args, **kwargs): return self.model.prepare_inputs_for_generation(*args, **kwargs) def save_quantized( - self, - save_dir: str, - safetensors_metadata: Optional[Dict[str, str]] = None, - format: Optional[FORMAT] = None, - use_safetensors: bool = True, - max_shard_size: str = "10GB", - model_base_name: Optional[str] = None + self, + save_dir: str, + safetensors_metadata: Optional[Dict[str, str]] = None, + format: Optional[FORMAT] = None, + use_safetensors: bool = True, + max_shard_size: str = "10GB", + model_base_name: Optional[str] = None ): """save quantized model and configs to local disk""" os.makedirs(save_dir, exist_ok=True) @@ -661,13 +661,13 @@ def save_pretrained( @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: str, - quantize_config: QuantizeConfig, - max_memory: Optional[dict] = None, - trust_remote_code: bool = False, - torch_dtype: [str | torch.dtype] = "auto", - **model_init_kwargs, + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + torch_dtype: [str | torch.dtype] = "auto", + **model_init_kwargs, ): """load un-quantized pretrained model to cpu""" @@ -750,25 +750,25 @@ def skip(*args, **kwargs): @classmethod def from_quantized( - cls, - model_name_or_path: Optional[str], - device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, - max_memory: Optional[dict] = None, - device: Optional[Union[str, int]] = None, - use_triton: bool = True, - use_marlin: bool = True, - torch_dtype: [str | torch.dtype] = "auto", - use_cuda_fp16: bool = True, - quantize_config: Optional[QuantizeConfig] = None, - model_basename: Optional[str] = None, - use_safetensors: bool = True, - trust_remote_code: bool = False, - warmup_triton: bool = False, - disable_exllama: bool = False, - disable_exllamav2: bool = False, - format: Optional[FORMAT] = None, - allow_unsafe_loading: bool = False, - **kwargs, + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + use_triton: bool = True, + use_marlin: bool = True, + torch_dtype: [str | torch.dtype] = "auto", + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + disable_exllama: bool = False, + disable_exllamav2: bool = False, + format: Optional[FORMAT] = None, + allow_unsafe_loading: bool = False, + **kwargs, ): """load quantized model from local disk""" # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. 
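
Taken together, patch 12 (later refined in patch 19) works around the fact that the compiled gptqmodel_cuda extension handle held by the CUDA QuantLinear modules cannot be deep-copied: the handle is stashed and cleared before copy.deepcopy(self.model) and re-attached afterwards. The sketch below illustrates that stash-and-restore pattern in isolation; it is not code from this repository, and FakeCudaQuantLinear and _NonPicklableExt are hypothetical stand-ins — only the gptqmodel_cuda attribute name is taken from the patches above.

import copy
import torch.nn as nn

class _NonPicklableExt:
    # Stand-in for the compiled gptqmodel_cuda extension handle,
    # which deepcopy/pickle cannot serialize.
    def __reduce__(self):
        raise TypeError("cannot pickle compiled extension")

class FakeCudaQuantLinear(nn.Linear):
    # Illustrative stand-in for a CUDA-backed QuantLinear that keeps a
    # reference to the compiled kernel extension.
    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features)
        self.gptqmodel_cuda = _NonPicklableExt()

def deepcopy_skipping_cuda_ext(model: nn.Module) -> nn.Module:
    # Stash the non-picklable handles, deepcopy, then restore them
    # (here on the original model, so it stays usable afterwards).
    stashed = {}
    for name, module in model.named_modules():
        if hasattr(module, "gptqmodel_cuda"):
            stashed[name] = module.gptqmodel_cuda
            module.gptqmodel_cuda = None
    try:
        model_copy = copy.deepcopy(model)
    finally:
        for name, module in model.named_modules():
            if name in stashed:
                module.gptqmodel_cuda = stashed[name]
    return model_copy

if __name__ == "__main__":
    m = nn.Sequential(FakeCudaQuantLinear(8, 8), nn.ReLU())
    clone = deepcopy_skipping_cuda_ext(m)
    # The copy has the handle cleared; the original keeps its extension.
    assert clone[0].gptqmodel_cuda is None
    assert isinstance(m[0].gptqmodel_cuda, _NonPicklableExt)
    print("deepcopy succeeded without pickling the extension handle")

Patch 19 below replaces the per-module hasattr test with an isinstance check against a new BaseCudaQuantLinear base class, so only the CUDA-backed QuantLinear variants are touched.
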
From 08aebba7e9ab0f2620eed8798031fb724995fd93 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:05:55 +0800 Subject: [PATCH 18/20] format --- gptqmodel/models/base.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 3e93de15..8e71eb37 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -239,10 +239,7 @@ def store_input_hook(_, args, kwargs): if pos_ids is not None: position_ids.append(move_to(pos_ids, data_device)) one_kwargs = {} - for ( - k, - v, - ) in kwargs.items(): # make sure other arguments also be captured + for (k, v) in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states", "attention_mask", "position_ids"]: one_kwargs[k] = nested_move_to(v, data_device) layer_input_kwargs.append(one_kwargs) @@ -652,9 +649,9 @@ def save_quantized( quantize_config.save_pretrained(save_dir) def save_pretrained( - self, - save_dir: str, - **kwargs, + self, + save_dir: str, + **kwargs, ): logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") self.save_quantized(save_dir=save_dir, **kwargs) From 9ffe45cf41f2fe97d6b8fad0daec5310c15c6ab1 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Mon, 24 Jun 2024 14:06:51 +0000 Subject: [PATCH 19/20] Refractor fix --- gptqmodel/models/base.py | 21 +++++++++---------- gptqmodel/nn_modules/qlinear/__init__.py | 5 +++++ gptqmodel/nn_modules/qlinear/qlinear_cuda.py | 4 ++-- .../nn_modules/qlinear/qlinear_cuda_old.py | 4 ++-- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 8e71eb37..92a5f836 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -543,19 +543,18 @@ def save_quantized( # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below if quantize_config.bits != 4: - cuda_name_modules = [] - from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for item in model.named_modules(): - if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): - cuda_name_modules.append((item[0], item[1].gptqmodel_cuda)) - item[1].gptqmodel_cuda = None + cuda_name_modules = {} + from gptqmodel.nn_modules.qlinear.qlinear_cuda import BaseCudaQuantLinear + for name, module in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear): + cuda_name_modules[name] = module.gptqmodel_cuda + module.gptqmodel_cuda = None model = copy.deepcopy(self.model) - for item in model.named_modules(): - if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): - for cuda_name, cuda_module in cuda_name_modules: - if item[0] == cuda_name: - item[1].gptqmodel_cuda = cuda_module + for name, modules in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear) and name in cuda_name_modules: + module.gptqmodel_cuda = cuda_name_modules[name] + del cuda_name_modules else: model = copy.deepcopy(self.model) diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 7c0eefc0..7ef6cb94 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -4,3 +4,8 @@ class BaseQuantLinear(nn.Module): # override me QUANT_TYPE = "base" + + +class BaseCudaQuantLinear(BaseQuantLinear): + # override me + QUANT_TYPE = "base-cuda" diff --git a/gptqmodel/nn_modules/qlinear/qlinear_cuda.py 
b/gptqmodel/nn_modules/qlinear/qlinear_cuda.py index d2919406..dbd55ada 100644 --- a/gptqmodel/nn_modules/qlinear/qlinear_cuda.py +++ b/gptqmodel/nn_modules/qlinear/qlinear_cuda.py @@ -7,12 +7,12 @@ import torch import torch.nn as nn import transformers -from gptqmodel.nn_modules.qlinear import BaseQuantLinear +from gptqmodel.nn_modules.qlinear import BaseCudaQuantLinear logger = getLogger(__name__) -class QuantLinear(BaseQuantLinear): +class QuantLinear(BaseCudaQuantLinear): QUANT_TYPE = "cuda" def __init__( diff --git a/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py b/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py index 7002dbdd..6a95ae9a 100644 --- a/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py +++ b/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py @@ -7,12 +7,12 @@ import torch import torch.nn as nn import transformers -from gptqmodel.nn_modules.qlinear import BaseQuantLinear +from gptqmodel.nn_modules.qlinear import BaseCudaQuantLinear logger = getLogger(__name__) -class QuantLinear(BaseQuantLinear): +class QuantLinear(BaseCudaQuantLinear): QUANT_TYPE = "cuda-old" def __init__( From 020ed8a4cb964c41c60933238130d78fbd77a120 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:08:05 +0800 Subject: [PATCH 20/20] desc_act=True --- tests/test_quant_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index bfffa52d..54ef85d5 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -123,7 +123,7 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - # desc_act=True + desc_act=True ) model = GPTQModel.from_pretrained(