From 76ae12eb28b99add5dc229d101a25d7b187ce650 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 15:04:16 +0800 Subject: [PATCH 01/20] fix cannot pickle 'module' object for 8 bit --- gptqmodel/models/base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 306ed57d..465eb329 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -27,7 +27,7 @@ from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, get_checkpoints, get_device, get_module_by_name_prefix, get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, - move_to, nested_move_to, pack_model, simple_dispatch_model) + move_to, nested_move_to, pack_model, simple_dispatch_model, deepcopy_model_with_modules) from ..version import __version__ from ._const import CPU, CUDA_0, SUPPORTED_MODELS @@ -540,8 +540,14 @@ def save_quantized( # internal is always gptq v2 but allow users to pass gptq (v1) via config if format is None and quantize_config.format == FORMAT.GPTQ: - # Model qzeros may be edited in place. - # TODO: avoid inplace modification of the weights + # fix ModelCloud/GPTQModel/issues/47 + # fix gptqmodel_cuda cannot be serialized + # no need to set it back, no calculation below + from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear + for module in model.named_modules(): + if len(module) == 2 and isinstance (module[1], QuantLinear): + module[1].gptqmodel_cuda = None + print(module) model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 25c7eaae9b81242eb2328532b4461b51cda723ea Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 15:09:23 +0800 Subject: [PATCH 02/20] remove unused import --- gptqmodel/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 465eb329..1ce1c879 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -27,7 +27,7 @@ from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, find_layers, get_checkpoints, get_device, get_module_by_name_prefix, get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, - move_to, nested_move_to, pack_model, simple_dispatch_model, deepcopy_model_with_modules) + move_to, nested_move_to, pack_model, simple_dispatch_model) from ..version import __version__ from ._const import CPU, CUDA_0, SUPPORTED_MODELS From e714b7115e9c415fd0f2833afd3e5f4dded45651 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 15:19:08 +0800 Subject: [PATCH 03/20] remove print --- gptqmodel/models/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 1ce1c879..107a90fa 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -547,7 +547,6 @@ def save_quantized( for module in model.named_modules(): if len(module) == 2 and isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None - print(module) model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 035d8e81a856018a943be391c677ac884ea1c1ea Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 16:44:15 +0800 Subject: [PATCH 04/20] check 
with tuple --- gptqmodel/models/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 107a90fa..778120a3 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -4,7 +4,7 @@ import os import re from os.path import isfile, join -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple import accelerate import torch @@ -545,7 +545,7 @@ def save_quantized( # no need to set it back, no calculation below from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for module in model.named_modules(): - if len(module) == 2 and isinstance (module[1], QuantLinear): + if isinstance (module, Tuple) and isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( From 9394229504e02c6251d1204872b3edcb5c7aaba1 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:11:17 +0800 Subject: [PATCH 05/20] revert to len check --- gptqmodel/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 778120a3..7eb84a11 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -545,7 +545,7 @@ def save_quantized( # no need to set it back, no calculation below from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for module in model.named_modules(): - if isinstance (module, Tuple) and isinstance (module[1], QuantLinear): + if len(module) == 2 and isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( From b27baa6b67174675a79dfdf9c362ed6dfc4e1d54 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:26:16 +0800 Subject: [PATCH 06/20] add test for 8bit --- tests/test_quant_formats.py | 61 ++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index f1b4f6eb..2b3475a4 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -5,14 +5,28 @@ import unittest # noqa: E402 import torch.cuda # noqa: E402 -from gptqmodel import GPTQModel, __version__ # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QuantizeConfig # noqa: E402 -from gptqmodel.quantization.config import META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 +from gptqmodel import __version__ # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QuantizeConfig # noqa: E402 +from gptqmodel.quantization.config import META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL + class TestQuantization(unittest.TestCase): + + def setUp(self): + self.pretrained_model_dir = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" + + self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_dir, use_fast=True) + self.calibration_dataset = [ + self.tokenizer( + "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
+ ), + self.tokenizer("Today I am in Paris and it is a wonderful day."), + ] + @parameterized.expand( [ (False, True, FORMAT.GPTQ_V2), @@ -21,16 +35,6 @@ class TestQuantization(unittest.TestCase): ] ) def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): - pretrained_model_dir = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" - - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) - calibration_dataset = [ - tokenizer( - "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." - ), - tokenizer("Today I am in Paris and it is a wonderful day."), - ] - quantize_config = QuantizeConfig( bits=4, group_size=128, @@ -40,17 +44,15 @@ def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): ) model = GPTQModel.from_pretrained( - pretrained_model_dir, + self.pretrained_model_dir, quantize_config=quantize_config, use_flash_attention_2=False, ) - model.quantize(calibration_dataset) + model.quantize(self.calibration_dataset) with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained( - tmpdirname, - ) + model.save_quantized(tmpdirname) logging.info(f"Saved config mem: {model.quantize_config}") @@ -117,3 +119,26 @@ def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): format=format, ) assert isinstance(model.quantize_config, QuantizeConfig) + + def test_gptq_8bit(self): + quantize_config = QuantizeConfig( + bits=8, + group_size=128, + format=FORMAT.GPTQ, + ) + + model = GPTQModel.from_pretrained( + self.pretrained_model_dir, + quantize_config=quantize_config, + use_flash_attention_2=False, + ) + + model.quantize(self.calibration_dataset) + + with tempfile.TemporaryDirectory() as tmpdirname: + err = None + try: + model.save_quantized(tmpdirname) + except Exception as e: + err = e + self.assertTrue(err is None) From 69ae019b0a107716ade0bc7949aeb76480670268 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:38:39 +0800 Subject: [PATCH 07/20] set same QuantizeConfig --- tests/test_quant_formats.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 2b3475a4..f5665869 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -125,6 +125,8 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, + desc_act=True, + sym=False, ) model = GPTQModel.from_pretrained( @@ -140,5 +142,6 @@ def test_gptq_8bit(self): try: model.save_quantized(tmpdirname) except Exception as e: + print(e) err = e self.assertTrue(err is None) From 09d76aad8182f3a1d43c77511444adcbc0281b9e Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 17:49:30 +0800 Subject: [PATCH 08/20] check if it's 4 bit --- gptqmodel/models/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 7eb84a11..1c2ec057 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -543,10 +543,11 @@ def save_quantized( # fix ModelCloud/GPTQModel/issues/47 # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below - from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for module in model.named_modules(): - if len(module) == 2 and isinstance (module[1], QuantLinear): - module[1].gptqmodel_cuda = None + if quantize_config.bits != 4 : + from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear + for module in model.named_modules(): + if len(module) == 
2 and isinstance (module[1], QuantLinear): + module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 2548a7a62247b8ad043334691d8d06196af6dfd8 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 18:04:59 +0800 Subject: [PATCH 09/20] fix grammar --- gptqmodel/models/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 1c2ec057..3fd08468 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -540,13 +540,15 @@ def save_quantized( # internal is always gptq v2 but allow users to pass gptq (v1) via config if format is None and quantize_config.format == FORMAT.GPTQ: + # Model qzeros may be edited in place. + # TODO: avoid inplace modification of the weights # fix ModelCloud/GPTQModel/issues/47 # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below if quantize_config.bits != 4 : from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for module in model.named_modules(): - if len(module) == 2 and isinstance (module[1], QuantLinear): + for name, module in model.named_modules(): + if isinstance (module[1], QuantLinear): module[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( From 931302b9b65728dfda7d5fa109068a8663637e05 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 19:08:08 +0800 Subject: [PATCH 10/20] remove params --- tests/test_quant_formats.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index f5665869..972e3c6d 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -125,8 +125,6 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - desc_act=True, - sym=False, ) model = GPTQModel.from_pretrained( From 922801bc5efd0e3648675be37e0ac60d086d65c7 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 20:56:57 +0800 Subject: [PATCH 11/20] it's not a list --- gptqmodel/models/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 3fd08468..6e01fa76 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -548,8 +548,8 @@ def save_quantized( if quantize_config.bits != 4 : from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for name, module in model.named_modules(): - if isinstance (module[1], QuantLinear): - module[1].gptqmodel_cuda = None + if isinstance (module, QuantLinear): + module.gptqmodel_cuda = None model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel From 20ba7b5aee192cf6259423e758dd0c0e7cbabd53 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:43:31 +0800 Subject: [PATCH 12/20] set gptqmodel_cuda back --- gptqmodel/models/base.py | 139 ++++++++++++++------------- tests/test_quant_formats.py | 186 ++++++++++++++++++------------------ 2 files changed, 168 insertions(+), 157 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 6e01fa76..d654eade 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -73,12 +73,12 @@ class BaseGPTQModel(nn.Module): info: Dict[str, str] = {} def __init__( - self, - model: PreTrainedModel, - quantized: bool, - 
quantize_config: QuantizeConfig, - is_triton_backend: bool = False, - qlinear_kernel: nn.Module = None, + self, + model: PreTrainedModel, + quantized: bool, + quantize_config: QuantizeConfig, + is_triton_backend: bool = False, + qlinear_kernel: nn.Module = None, ): super().__init__() @@ -102,9 +102,9 @@ def hf_device_map(self): return getattr(self.model, "hf_device_map", None) def _prepare_dataset_for_quantization( - self, - calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], - batch_size: int = 1, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, ): def _convert_tensor_to_list(tensor): if isinstance(tensor, torch.Tensor): @@ -138,7 +138,7 @@ def _convert_tensor_to_list(tensor): pad_token_id = self.config.eos_token_id new_calibration_dataset = [ - collate_data(new_calibration_dataset[start : start + batch_size], pad_token_id) + collate_data(new_calibration_dataset[start: start + batch_size], pad_token_id) for start in range(0, len(new_calibration_dataset), batch_size) ] for new_example in new_calibration_dataset: @@ -148,13 +148,13 @@ def _convert_tensor_to_list(tensor): @torch.inference_mode() def quantize( - self, - calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], - batch_size: int = 1, - use_triton: bool = False, - use_cuda_fp16: bool = True, - autotune_warmup_after_quantized: bool = False, - calibration_enable_gpu_cache: bool = True, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + use_triton: bool = False, + use_cuda_fp16: bool = True, + autotune_warmup_after_quantized: bool = False, + calibration_enable_gpu_cache: bool = True, ): if self.quantized: raise EnvironmentError("quantize() is called a model that is already quantized") @@ -183,7 +183,7 @@ def quantize( if len(calibration_dataset) < MIN_CALIBRATION_DATASET_SIZE: logger.warning(f"Calibration dataset size should be greater than {MIN_CALIBRATION_DATASET_SIZE}. " - f"Current size: {len(calibration_dataset)}.") + f"Current size: {len(calibration_dataset)}.") # Calculate the average length of the average input_ids total_input_ids_length = 0 @@ -194,7 +194,7 @@ def quantize( if avg < MIN_CALIBRATION_DATASET_INPUT_IDS_AVG_LENGTH: logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " - f"{MIN_CALIBRATION_DATASET_INPUT_IDS_AVG_LENGTH}! Current AVG is {avg}.") + f"{MIN_CALIBRATION_DATASET_INPUT_IDS_AVG_LENGTH}! 
Current AVG is {avg}.") device_map = self.hf_device_map if device_map: @@ -240,8 +240,8 @@ def store_input_hook(_, args, kwargs): position_ids.append(move_to(pos_ids, data_device)) one_kwargs = {} for ( - k, - v, + k, + v, ) in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states", "attention_mask", "position_ids"]: one_kwargs[k] = nested_move_to(v, data_device) @@ -469,13 +469,13 @@ def prepare_inputs_for_generation(self, *args, **kwargs): return self.model.prepare_inputs_for_generation(*args, **kwargs) def save_quantized( - self, - save_dir: str, - safetensors_metadata: Optional[Dict[str, str]] = None, - format: Optional[FORMAT] = None, - use_safetensors: bool = True, - max_shard_size: str = "10GB", - model_base_name: Optional[str] = None + self, + save_dir: str, + safetensors_metadata: Optional[Dict[str, str]] = None, + format: Optional[FORMAT] = None, + use_safetensors: bool = True, + max_shard_size: str = "10GB", + model_base_name: Optional[str] = None ): """save quantized model and configs to local disk""" os.makedirs(save_dir, exist_ok=True) @@ -497,8 +497,8 @@ def save_quantized( if model_base_name is None: model_base_name = ( - self.quantize_config.model_file_base_name or - f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" + self.quantize_config.model_file_base_name or + f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" ) state_dict = self.model.state_dict() @@ -545,12 +545,23 @@ def save_quantized( # fix ModelCloud/GPTQModel/issues/47 # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below - if quantize_config.bits != 4 : + if quantize_config.bits != 4: + cuda_name_modules = [] from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for name, module in model.named_modules(): - if isinstance (module, QuantLinear): - module.gptqmodel_cuda = None - model = copy.deepcopy(self.model) + for item in model.named_modules(): + if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + cuda_name_modules.append((item[0], item[1].gptqmodel_cuda)) + item[1].gptqmodel_cuda = None + model = copy.deepcopy(self.model) + + for item in model.named_modules(): + if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + for cuda_name, cuda_module in cuda_name_modules: + if item[0] == cuda_name: + item[1].gptqmodel_cuda = cuda_module + del cuda_name_modules + else: + model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel ) @@ -641,22 +652,22 @@ def save_quantized( quantize_config.save_pretrained(save_dir) def save_pretrained( - self, - save_dir: str, - **kwargs, + self, + save_dir: str, + **kwargs, ): logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") self.save_quantized(save_dir=save_dir, **kwargs) @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: str, - quantize_config: QuantizeConfig, - max_memory: Optional[dict] = None, - trust_remote_code: bool = False, - torch_dtype: [str | torch.dtype] = "auto", - **model_init_kwargs, + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + torch_dtype: [str | torch.dtype] = "auto", + **model_init_kwargs, ): """load un-quantized pretrained model to cpu""" @@ -739,25 +750,25 @@ def skip(*args, **kwargs): 
@classmethod def from_quantized( - cls, - model_name_or_path: Optional[str], - device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, - max_memory: Optional[dict] = None, - device: Optional[Union[str, int]] = None, - use_triton: bool = True, - use_marlin: bool = True, - torch_dtype: [str | torch.dtype] = "auto", - use_cuda_fp16: bool = True, - quantize_config: Optional[QuantizeConfig] = None, - model_basename: Optional[str] = None, - use_safetensors: bool = True, - trust_remote_code: bool = False, - warmup_triton: bool = False, - disable_exllama: bool = False, - disable_exllamav2: bool = False, - format: Optional[FORMAT] = None, - allow_unsafe_loading: bool = False, - **kwargs, + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + use_triton: bool = True, + use_marlin: bool = True, + torch_dtype: [str | torch.dtype] = "auto", + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + disable_exllama: bool = False, + disable_exllamav2: bool = False, + format: Optional[FORMAT] = None, + allow_unsafe_loading: bool = False, + **kwargs, ): """load quantized model from local disk""" # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 972e3c6d..9645725f 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -27,110 +27,110 @@ def setUp(self): self.tokenizer("Today I am in Paris and it is a wonderful day."), ] - @parameterized.expand( - [ - (False, True, FORMAT.GPTQ_V2), - (False, False, FORMAT.GPTQ), - (True, True, FORMAT.MARLIN), - ] - ) - def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): - quantize_config = QuantizeConfig( - bits=4, - group_size=128, - desc_act=False if format == FORMAT.MARLIN else True, - sym=sym, - format=format, - ) - - model = GPTQModel.from_pretrained( - self.pretrained_model_dir, - quantize_config=quantize_config, - use_flash_attention_2=False, - ) - - model.quantize(self.calibration_dataset) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_quantized(tmpdirname) - - logging.info(f"Saved config mem: {model.quantize_config}") - - with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: - file_dict = json.loads(f.read()) - # skip comparison of these two model path specific fields that do not exist in memory - file_dict["model_name_or_path"] = None - file_dict["model_file_base_name"] = None - - # make sure the json dict saved to file matches config in memory - assert model.quantize_config.to_dict() == file_dict - logging.info(f"Saved config file: {file_dict}") - - model = GPTQModel.from_quantized( - tmpdirname, - device="cuda:0", - use_marlin=use_marlin, - ) - - logging.info(f"Loaded config: {model.quantize_config}") - assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( - META_QUANTIZER_GPTQMODEL, - __version__, - ) - del model - torch.cuda.empty_cache() - - # skip compat test with sym=False and v1 since we do meta version safety check - if not sym and format == FORMAT.GPTQ: - return - - # test compat: 1) with simple dict type 2) is_marlin_format - compat_quantize_config = { - "bits": 4, - "group_size": 128, - "sym": sym, - "desc_act": False 
if format == FORMAT.MARLIN else True, - "is_marlin_format": use_marlin, - } - - model = GPTQModel.from_quantized( - tmpdirname, - device="cuda:0", - quantize_config=compat_quantize_config, - ) - assert isinstance(model.quantize_config, QuantizeConfig) - - del model - torch.cuda.empty_cache() - - # test checkpoint_format hint to from_quantized() - os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") - - compat_quantize_config = { - "bits": 4, - "group_size": 128, - "sym": sym, - "desc_act": False if format == FORMAT.MARLIN else True, - } - model = GPTQModel.from_quantized( - tmpdirname, - device="cuda:0", - quantize_config=compat_quantize_config, - format=format, - ) - assert isinstance(model.quantize_config, QuantizeConfig) + # @parameterized.expand( + # [ + # (False, True, FORMAT.GPTQ_V2), + # (False, False, FORMAT.GPTQ), + # (True, True, FORMAT.MARLIN), + # ] + # ) + # def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): + # quantize_config = QuantizeConfig( + # bits=4, + # group_size=128, + # desc_act=False if format == FORMAT.MARLIN else True, + # sym=sym, + # format=format, + # ) + # + # model = GPTQModel.from_pretrained( + # self.pretrained_model_dir, + # quantize_config=quantize_config, + # use_flash_attention_2=False, + # ) + # + # model.quantize(self.calibration_dataset) + # + # with tempfile.TemporaryDirectory() as tmpdirname: + # model.save_quantized(tmpdirname) + # + # logging.info(f"Saved config mem: {model.quantize_config}") + # + # with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: + # file_dict = json.loads(f.read()) + # # skip comparison of these two model path specific fields that do not exist in memory + # file_dict["model_name_or_path"] = None + # file_dict["model_file_base_name"] = None + # + # # make sure the json dict saved to file matches config in memory + # assert model.quantize_config.to_dict() == file_dict + # logging.info(f"Saved config file: {file_dict}") + # + # model = GPTQModel.from_quantized( + # tmpdirname, + # device="cuda:0", + # use_marlin=use_marlin, + # ) + # + # logging.info(f"Loaded config: {model.quantize_config}") + # assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( + # META_QUANTIZER_GPTQMODEL, + # __version__, + # ) + # del model + # torch.cuda.empty_cache() + # + # # skip compat test with sym=False and v1 since we do meta version safety check + # if not sym and format == FORMAT.GPTQ: + # return + # + # # test compat: 1) with simple dict type 2) is_marlin_format + # compat_quantize_config = { + # "bits": 4, + # "group_size": 128, + # "sym": sym, + # "desc_act": False if format == FORMAT.MARLIN else True, + # "is_marlin_format": use_marlin, + # } + # + # model = GPTQModel.from_quantized( + # tmpdirname, + # device="cuda:0", + # quantize_config=compat_quantize_config, + # ) + # assert isinstance(model.quantize_config, QuantizeConfig) + # + # del model + # torch.cuda.empty_cache() + # + # # test checkpoint_format hint to from_quantized() + # os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") + # + # compat_quantize_config = { + # "bits": 4, + # "group_size": 128, + # "sym": sym, + # "desc_act": False if format == FORMAT.MARLIN else True, + # } + # model = GPTQModel.from_quantized( + # tmpdirname, + # device="cuda:0", + # quantize_config=compat_quantize_config, + # format=format, + # ) + # assert isinstance(model.quantize_config, QuantizeConfig) def test_gptq_8bit(self): quantize_config = QuantizeConfig( bits=8, group_size=128, format=FORMAT.GPTQ, + desc_act=True ) model = 
GPTQModel.from_pretrained( self.pretrained_model_dir, quantize_config=quantize_config, - use_flash_attention_2=False, ) model.quantize(self.calibration_dataset) From b8b9f31d9008dba60d85734949ebbeb50695de5b Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:49:04 +0800 Subject: [PATCH 13/20] check is tuple --- gptqmodel/models/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index d654eade..0e0c82c9 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -549,13 +549,13 @@ def save_quantized( cuda_name_modules = [] from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear for item in model.named_modules(): - if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): cuda_name_modules.append((item[0], item[1].gptqmodel_cuda)) item[1].gptqmodel_cuda = None model = copy.deepcopy(self.model) for item in model.named_modules(): - if len(item) > 1 and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): + if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): for cuda_name, cuda_module in cuda_name_modules: if item[0] == cuda_name: item[1].gptqmodel_cuda = cuda_module From abfd7b972bbc520614cc00b25b3b49d0738deb98 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:53:13 +0800 Subject: [PATCH 14/20] format --- gptqmodel/models/base.py | 2 +- tests/test_quant_formats.py | 194 ++++++++++++++++++------------------ 2 files changed, 97 insertions(+), 99 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 0e0c82c9..049dedc2 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -4,7 +4,7 @@ import os import re from os.path import isfile, join -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Union import accelerate import torch diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 9645725f..bfffa52d 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -5,13 +5,11 @@ import unittest # noqa: E402 import torch.cuda # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - -from gptqmodel import GPTQModel # noqa: E402 -from gptqmodel import __version__ # noqa: E402 +from gptqmodel import GPTQModel, __version__ # noqa: E402 from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QuantizeConfig # noqa: E402 from gptqmodel.quantization.config import META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 class TestQuantization(unittest.TestCase): @@ -27,105 +25,105 @@ def setUp(self): self.tokenizer("Today I am in Paris and it is a wonderful day."), ] - # @parameterized.expand( - # [ - # (False, True, FORMAT.GPTQ_V2), - # (False, False, FORMAT.GPTQ), - # (True, True, FORMAT.MARLIN), - # ] - # ) - # def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): - # quantize_config = QuantizeConfig( - # bits=4, - # group_size=128, - # desc_act=False if format == FORMAT.MARLIN else True, - # sym=sym, - # format=format, - # ) - # - # model = GPTQModel.from_pretrained( - # self.pretrained_model_dir, - # quantize_config=quantize_config, - # 
use_flash_attention_2=False, - # ) - # - # model.quantize(self.calibration_dataset) - # - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_quantized(tmpdirname) - # - # logging.info(f"Saved config mem: {model.quantize_config}") - # - # with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: - # file_dict = json.loads(f.read()) - # # skip comparison of these two model path specific fields that do not exist in memory - # file_dict["model_name_or_path"] = None - # file_dict["model_file_base_name"] = None - # - # # make sure the json dict saved to file matches config in memory - # assert model.quantize_config.to_dict() == file_dict - # logging.info(f"Saved config file: {file_dict}") - # - # model = GPTQModel.from_quantized( - # tmpdirname, - # device="cuda:0", - # use_marlin=use_marlin, - # ) - # - # logging.info(f"Loaded config: {model.quantize_config}") - # assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( - # META_QUANTIZER_GPTQMODEL, - # __version__, - # ) - # del model - # torch.cuda.empty_cache() - # - # # skip compat test with sym=False and v1 since we do meta version safety check - # if not sym and format == FORMAT.GPTQ: - # return - # - # # test compat: 1) with simple dict type 2) is_marlin_format - # compat_quantize_config = { - # "bits": 4, - # "group_size": 128, - # "sym": sym, - # "desc_act": False if format == FORMAT.MARLIN else True, - # "is_marlin_format": use_marlin, - # } - # - # model = GPTQModel.from_quantized( - # tmpdirname, - # device="cuda:0", - # quantize_config=compat_quantize_config, - # ) - # assert isinstance(model.quantize_config, QuantizeConfig) - # - # del model - # torch.cuda.empty_cache() - # - # # test checkpoint_format hint to from_quantized() - # os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") - # - # compat_quantize_config = { - # "bits": 4, - # "group_size": 128, - # "sym": sym, - # "desc_act": False if format == FORMAT.MARLIN else True, - # } - # model = GPTQModel.from_quantized( - # tmpdirname, - # device="cuda:0", - # quantize_config=compat_quantize_config, - # format=format, - # ) - # assert isinstance(model.quantize_config, QuantizeConfig) + @parameterized.expand( + [ + (False, True, FORMAT.GPTQ_V2), + (False, False, FORMAT.GPTQ), + (True, True, FORMAT.MARLIN), + ] + ) + def test_quantize(self, use_marlin: bool, sym: bool, format: FORMAT): + quantize_config = QuantizeConfig( + bits=4, + group_size=128, + desc_act=False if format == FORMAT.MARLIN else True, + sym=sym, + format=format, + ) + + model = GPTQModel.from_pretrained( + self.pretrained_model_dir, + quantize_config=quantize_config, + use_flash_attention_2=False, + ) + + model.quantize(self.calibration_dataset) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_quantized(tmpdirname) + + logging.info(f"Saved config mem: {model.quantize_config}") + + with open(tmpdirname + "/" + QUANT_CONFIG_FILENAME, "r") as f: + file_dict = json.loads(f.read()) + # skip comparison of these two model path specific fields that do not exist in memory + file_dict["model_name_or_path"] = None + file_dict["model_file_base_name"] = None + + # make sure the json dict saved to file matches config in memory + assert model.quantize_config.to_dict() == file_dict + logging.info(f"Saved config file: {file_dict}") + + model = GPTQModel.from_quantized( + tmpdirname, + device="cuda:0", + use_marlin=use_marlin, + ) + + logging.info(f"Loaded config: {model.quantize_config}") + assert model.quantize_config.meta_get_versionable(META_FIELD_QUANTIZER) == ( + 
META_QUANTIZER_GPTQMODEL, + __version__, + ) + del model + torch.cuda.empty_cache() + + # skip compat test with sym=False and v1 since we do meta version safety check + if not sym and format == FORMAT.GPTQ: + return + + # test compat: 1) with simple dict type 2) is_marlin_format + compat_quantize_config = { + "bits": 4, + "group_size": 128, + "sym": sym, + "desc_act": False if format == FORMAT.MARLIN else True, + "is_marlin_format": use_marlin, + } + + model = GPTQModel.from_quantized( + tmpdirname, + device="cuda:0", + quantize_config=compat_quantize_config, + ) + assert isinstance(model.quantize_config, QuantizeConfig) + + del model + torch.cuda.empty_cache() + + # test checkpoint_format hint to from_quantized() + os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}") + + compat_quantize_config = { + "bits": 4, + "group_size": 128, + "sym": sym, + "desc_act": False if format == FORMAT.MARLIN else True, + } + model = GPTQModel.from_quantized( + tmpdirname, + device="cuda:0", + quantize_config=compat_quantize_config, + format=format, + ) + assert isinstance(model.quantize_config, QuantizeConfig) def test_gptq_8bit(self): quantize_config = QuantizeConfig( bits=8, group_size=128, format=FORMAT.GPTQ, - desc_act=True + # desc_act=True ) model = GPTQModel.from_pretrained( From 9f994283037c874d1f5236fc27850d36fdf64633 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 21:58:13 +0800 Subject: [PATCH 15/20] set desc_act=True --- tests/test_quant_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index bfffa52d..54ef85d5 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -123,7 +123,7 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - # desc_act=True + desc_act=True ) model = GPTQModel.from_pretrained( From a8d4e3481b3fa33c854633d3647e38af91d93447 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:03:07 +0800 Subject: [PATCH 16/20] set desc_act=True --- tests/test_quant_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 54ef85d5..bfffa52d 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -123,7 +123,7 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - desc_act=True + # desc_act=True ) model = GPTQModel.from_pretrained( From a947e3bef377c554a7cd83ac0de3eb1a89963361 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:04:37 +0800 Subject: [PATCH 17/20] format --- gptqmodel/models/base.py | 92 ++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 049dedc2..3e93de15 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -73,12 +73,12 @@ class BaseGPTQModel(nn.Module): info: Dict[str, str] = {} def __init__( - self, - model: PreTrainedModel, - quantized: bool, - quantize_config: QuantizeConfig, - is_triton_backend: bool = False, - qlinear_kernel: nn.Module = None, + self, + model: PreTrainedModel, + quantized: bool, + quantize_config: QuantizeConfig, + is_triton_backend: bool = False, + qlinear_kernel: nn.Module = None, ): super().__init__() @@ -148,13 +148,13 @@ def _convert_tensor_to_list(tensor): @torch.inference_mode() def quantize( - self, - calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], - batch_size: int = 1, - use_triton: 
bool = False, - use_cuda_fp16: bool = True, - autotune_warmup_after_quantized: bool = False, - calibration_enable_gpu_cache: bool = True, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + use_triton: bool = False, + use_cuda_fp16: bool = True, + autotune_warmup_after_quantized: bool = False, + calibration_enable_gpu_cache: bool = True, ): if self.quantized: raise EnvironmentError("quantize() is called a model that is already quantized") @@ -469,13 +469,13 @@ def prepare_inputs_for_generation(self, *args, **kwargs): return self.model.prepare_inputs_for_generation(*args, **kwargs) def save_quantized( - self, - save_dir: str, - safetensors_metadata: Optional[Dict[str, str]] = None, - format: Optional[FORMAT] = None, - use_safetensors: bool = True, - max_shard_size: str = "10GB", - model_base_name: Optional[str] = None + self, + save_dir: str, + safetensors_metadata: Optional[Dict[str, str]] = None, + format: Optional[FORMAT] = None, + use_safetensors: bool = True, + max_shard_size: str = "10GB", + model_base_name: Optional[str] = None ): """save quantized model and configs to local disk""" os.makedirs(save_dir, exist_ok=True) @@ -661,13 +661,13 @@ def save_pretrained( @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: str, - quantize_config: QuantizeConfig, - max_memory: Optional[dict] = None, - trust_remote_code: bool = False, - torch_dtype: [str | torch.dtype] = "auto", - **model_init_kwargs, + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + torch_dtype: [str | torch.dtype] = "auto", + **model_init_kwargs, ): """load un-quantized pretrained model to cpu""" @@ -750,25 +750,25 @@ def skip(*args, **kwargs): @classmethod def from_quantized( - cls, - model_name_or_path: Optional[str], - device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, - max_memory: Optional[dict] = None, - device: Optional[Union[str, int]] = None, - use_triton: bool = True, - use_marlin: bool = True, - torch_dtype: [str | torch.dtype] = "auto", - use_cuda_fp16: bool = True, - quantize_config: Optional[QuantizeConfig] = None, - model_basename: Optional[str] = None, - use_safetensors: bool = True, - trust_remote_code: bool = False, - warmup_triton: bool = False, - disable_exllama: bool = False, - disable_exllamav2: bool = False, - format: Optional[FORMAT] = None, - allow_unsafe_loading: bool = False, - **kwargs, + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + use_triton: bool = True, + use_marlin: bool = True, + torch_dtype: [str | torch.dtype] = "auto", + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + disable_exllama: bool = False, + disable_exllamav2: bool = False, + format: Optional[FORMAT] = None, + allow_unsafe_loading: bool = False, + **kwargs, ): """load quantized model from local disk""" # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. 
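
Taken together, patch 12 (later refined in patch 19) works around the fact that the compiled gptqmodel_cuda extension handle held by the CUDA QuantLinear modules cannot be deep-copied: the handle is stashed and cleared before copy.deepcopy(self.model) and re-attached afterwards. The sketch below illustrates that stash-and-restore pattern in isolation; it is not code from this repository, and FakeCudaQuantLinear and _NonPicklableExt are hypothetical stand-ins — only the gptqmodel_cuda attribute name is taken from the patches above.

import copy
import torch.nn as nn

class _NonPicklableExt:
    # Stand-in for the compiled gptqmodel_cuda extension handle,
    # which deepcopy/pickle cannot serialize.
    def __reduce__(self):
        raise TypeError("cannot pickle compiled extension")

class FakeCudaQuantLinear(nn.Linear):
    # Illustrative stand-in for a CUDA-backed QuantLinear that keeps a
    # reference to the compiled kernel extension.
    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features)
        self.gptqmodel_cuda = _NonPicklableExt()

def deepcopy_skipping_cuda_ext(model: nn.Module) -> nn.Module:
    # Stash the non-picklable handles, deepcopy, then restore them
    # (here on the original model, so it stays usable afterwards).
    stashed = {}
    for name, module in model.named_modules():
        if hasattr(module, "gptqmodel_cuda"):
            stashed[name] = module.gptqmodel_cuda
            module.gptqmodel_cuda = None
    try:
        model_copy = copy.deepcopy(model)
    finally:
        for name, module in model.named_modules():
            if name in stashed:
                module.gptqmodel_cuda = stashed[name]
    return model_copy

if __name__ == "__main__":
    m = nn.Sequential(FakeCudaQuantLinear(8, 8), nn.ReLU())
    clone = deepcopy_skipping_cuda_ext(m)
    # The copy has the handle cleared; the original keeps its extension.
    assert clone[0].gptqmodel_cuda is None
    assert isinstance(m[0].gptqmodel_cuda, _NonPicklableExt)
    print("deepcopy succeeded without pickling the extension handle")

Patch 19 below replaces the per-module hasattr test with an isinstance check against a new BaseCudaQuantLinear base class, so only the CUDA-backed QuantLinear variants are touched.
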
From 08aebba7e9ab0f2620eed8798031fb724995fd93 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:05:55 +0800 Subject: [PATCH 18/20] format --- gptqmodel/models/base.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 3e93de15..8e71eb37 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -239,10 +239,7 @@ def store_input_hook(_, args, kwargs): if pos_ids is not None: position_ids.append(move_to(pos_ids, data_device)) one_kwargs = {} - for ( - k, - v, - ) in kwargs.items(): # make sure other arguments also be captured + for (k, v) in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states", "attention_mask", "position_ids"]: one_kwargs[k] = nested_move_to(v, data_device) layer_input_kwargs.append(one_kwargs) @@ -652,9 +649,9 @@ def save_quantized( quantize_config.save_pretrained(save_dir) def save_pretrained( - self, - save_dir: str, - **kwargs, + self, + save_dir: str, + **kwargs, ): logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") self.save_quantized(save_dir=save_dir, **kwargs) From 9ffe45cf41f2fe97d6b8fad0daec5310c15c6ab1 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Mon, 24 Jun 2024 14:06:51 +0000 Subject: [PATCH 19/20] Refractor fix --- gptqmodel/models/base.py | 21 +++++++++---------- gptqmodel/nn_modules/qlinear/__init__.py | 5 +++++ gptqmodel/nn_modules/qlinear/qlinear_cuda.py | 4 ++-- .../nn_modules/qlinear/qlinear_cuda_old.py | 4 ++-- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 8e71eb37..92a5f836 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -543,19 +543,18 @@ def save_quantized( # fix gptqmodel_cuda cannot be serialized # no need to set it back, no calculation below if quantize_config.bits != 4: - cuda_name_modules = [] - from gptqmodel.nn_modules.qlinear.qlinear_cuda import QuantLinear - for item in model.named_modules(): - if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): - cuda_name_modules.append((item[0], item[1].gptqmodel_cuda)) - item[1].gptqmodel_cuda = None + cuda_name_modules = {} + from gptqmodel.nn_modules.qlinear.qlinear_cuda import BaseCudaQuantLinear + for name, module in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear): + cuda_name_modules[name] = module.gptqmodel_cuda + module.gptqmodel_cuda = None model = copy.deepcopy(self.model) - for item in model.named_modules(): - if isinstance(item, tuple) and isinstance(item[1], QuantLinear) and hasattr(item[1], "gptqmodel_cuda"): - for cuda_name, cuda_module in cuda_name_modules: - if item[0] == cuda_name: - item[1].gptqmodel_cuda = cuda_module + for name, modules in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear) and name in cuda_name_modules: + module.gptqmodel_cuda = cuda_name_modules[name] + del cuda_name_modules else: model = copy.deepcopy(self.model) diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 7c0eefc0..7ef6cb94 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -4,3 +4,8 @@ class BaseQuantLinear(nn.Module): # override me QUANT_TYPE = "base" + + +class BaseCudaQuantLinear(BaseQuantLinear): + # override me + QUANT_TYPE = "base-cuda" diff --git a/gptqmodel/nn_modules/qlinear/qlinear_cuda.py 
b/gptqmodel/nn_modules/qlinear/qlinear_cuda.py index d2919406..dbd55ada 100644 --- a/gptqmodel/nn_modules/qlinear/qlinear_cuda.py +++ b/gptqmodel/nn_modules/qlinear/qlinear_cuda.py @@ -7,12 +7,12 @@ import torch import torch.nn as nn import transformers -from gptqmodel.nn_modules.qlinear import BaseQuantLinear +from gptqmodel.nn_modules.qlinear import BaseCudaQuantLinear logger = getLogger(__name__) -class QuantLinear(BaseQuantLinear): +class QuantLinear(BaseCudaQuantLinear): QUANT_TYPE = "cuda" def __init__( diff --git a/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py b/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py index 7002dbdd..6a95ae9a 100644 --- a/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py +++ b/gptqmodel/nn_modules/qlinear/qlinear_cuda_old.py @@ -7,12 +7,12 @@ import torch import torch.nn as nn import transformers -from gptqmodel.nn_modules.qlinear import BaseQuantLinear +from gptqmodel.nn_modules.qlinear import BaseCudaQuantLinear logger = getLogger(__name__) -class QuantLinear(BaseQuantLinear): +class QuantLinear(BaseCudaQuantLinear): QUANT_TYPE = "cuda-old" def __init__( From 020ed8a4cb964c41c60933238130d78fbd77a120 Mon Sep 17 00:00:00 2001 From: CSY-ModelCloud Date: Mon, 24 Jun 2024 22:08:05 +0800 Subject: [PATCH 20/20] desc_act=True --- tests/test_quant_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index bfffa52d..54ef85d5 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -123,7 +123,7 @@ def test_gptq_8bit(self): bits=8, group_size=128, format=FORMAT.GPTQ, - # desc_act=True + desc_act=True ) model = GPTQModel.from_pretrained(