Fix CI #55

Closed · wants to merge 1 commit
test/quantization/test_quant_api.py (12 changes: 11 additions, 1 deletion)
@@ -17,13 +17,15 @@
     get_symmetric_quantization_config,
 )
 
+import torchao.quantization.quant_api as quant_api
 from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
 from torchao.quantization.quant_api import apply_dynamic_quant
 from torchao.quantization.quant_api import (
     Quantizer,
     TwoStepQuantizer,
     Int8DynActInt4WeightGPTQQuantizer,
 )
+from torchao.quantization.utils import is_lm_eval_available
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
 from model import Transformer
@@ -130,11 +132,19 @@ def test_dynamic_quant_gpu_unified_api_eager_mode_impl(self):
         compiled = m(*example_inputs)
         torch.testing.assert_close(quantized, compiled, atol=0, rtol=0)
 
+    @unittest.skipIf(not is_lm_eval_available(), "Skipping the test when lm_eval is not available")
     def test_gptq(self):
         # should be similar to TorchCompileDynamicQuantizer
+        # from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer
+        # Int8DynActInt4WeightGPTQQuantizer = quant_api.Int8DynActInt4WeightGPTQQuantizer
+
         precision = torch.bfloat16
         device = "cpu"
-        checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        try:
+            checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        except:
+            print("didn't find model")
+            return
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
         model.load_state_dict(checkpoint, assign=True)
torchao/quantization/GPTQ.py (14 changes: 3 additions, 11 deletions)
@@ -17,12 +17,12 @@
 import torch.nn.functional as F
 # from model import Transformer  # pyre-ignore[21]
 from torch.utils._pytree import tree_flatten, tree_unflatten
+from .utils import is_lm_eval_available
 
 aten = torch.ops.aten
 
 ## generate.py ##
 
 
 def encode_tokens(tokenizer, string, bos=True, device="cuda"):
 
     tokens = tokenizer.encode(string)
@@ -37,14 +37,7 @@ def model_forward(model, x, input_pos):
 
 ## eval.py ##
 
-try:
-    import lm_eval  # pyre-ignore[21]  # noqa: F401
-
-    lm_eval_available = True
-except:
-    lm_eval_available = False
-
-if lm_eval_available:
+if is_lm_eval_available():
     try:  # lm_eval version 0.4
         from lm_eval.evaluator import evaluate  # pyre-ignore[21]
         from lm_eval.models.huggingface import HFLM as eval_wrapper  # pyre-ignore[21]
@@ -56,7 +49,7 @@ def model_forward(model, x, input_pos):
         get_task_dict = tasks.get_task_dict
         evaluate = evaluator.evaluate
 else:
-    print("lm_eval is not installed, GPTQ may not be usable")
+    raise Exception("lm_eval is not installed, can't import GPTQ")
 
 def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
     model: torch.nn.Module,  # pyre-ignore[11]
@@ -93,7 +86,6 @@ def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
     input_pos = torch.arange(0, T, device=device)
 
     # no caches in executorch llama2 7b model?
-    print("setting up cache")
     with torch.device(device):
         model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
 
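For context, a minimal standalone sketch (not part of this diff) of the guard-and-fallback import pattern that the gated block in GPTQ.py relies on. The lm_eval 0.4 paths mirror the hunk above; the 0.3-style fallback imports (from lm_eval import evaluator, tasks) and the use of ImportError instead of a bare except are assumptions, since that part of the hunk is collapsed.

# Sketch only, assuming the 0.3-style module layout for the fallback branch.
from torchao.quantization.utils import is_lm_eval_available

if is_lm_eval_available():
    try:  # lm_eval version 0.4 layout
        from lm_eval.evaluator import evaluate
        from lm_eval.tasks import get_task_dict
    except ImportError:  # older layout (lm_eval 0.3), assumed
        from lm_eval import evaluator, tasks

        evaluate = evaluator.evaluate
        get_task_dict = tasks.get_task_dict
else:
    raise ImportError("lm_eval is not installed, can't import GPTQ")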
torchao/quantization/quant_api.py (9 changes: 5 additions, 4 deletions)
@@ -208,17 +208,18 @@ def replace_conv2d_1x1(conv):
 )
 
 
-from .GPTQ import lm_eval_available
+from .utils import is_lm_eval_available
 
-if lm_eval_available:
+print("lm_eval_available:", is_lm_eval_available())
+if is_lm_eval_available():
     from .GPTQ import (  # pyre-ignore[21]
         evaluate,
         GenericGPTQRunner,
         get_task_dict,
         InputRecorder,
-        lm_eval,
         MultiInput,
     )
+    print("after import")
 
 
 class GPTQQuantizer(Quantizer):
@@ -633,4 +634,4 @@ def _convert_for_runtime(self, model):
             )
             return model
 else:
-    print("lm_eval not available, skip defining GPTQQuantizer")
+    print("lm_eval not available, skip importing GPTQQuantizer")
torchao/quantization/utils.py (13 changes: 13 additions, 0 deletions)
@@ -13,6 +13,7 @@
     "compute_error",
     "_apply_logging_hook",
     "get_model_size_in_bytes",
+    "is_lm_eval_available",
 ]
 
 
@@ -86,3 +87,15 @@ def get_model_size_in_bytes(model):
     for b in model.buffers():
         s += b.nelement() * b.element_size()
     return s
+
+
+def is_lm_eval_available():
+    lm_eval_available = False
+    try:
+        import lm_eval  # pyre-ignore[21]  # noqa: F401
+
+        lm_eval_available = True
+    except:
+        lm_eval_available = False
+    print("func: is lm eval available:", lm_eval_available)
+    return lm_eval_available
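A minimal sketch (not part of this diff) of how an availability helper like the one added above is meant to be consumed when gating optional tests. The importlib.util.find_spec check, the helper name lm_eval_is_installed, and the test class and method names are hypothetical, shown only to illustrate the skip-when-missing pattern without importing the package.

# Sketch only: gate a test on lm_eval availability without importing it eagerly.
import importlib.util
import unittest


def lm_eval_is_installed() -> bool:
    # find_spec returns None when the package cannot be located; nothing is imported.
    return importlib.util.find_spec("lm_eval") is not None


class OptionalLmEvalTests(unittest.TestCase):
    @unittest.skipIf(not lm_eval_is_installed(), "Skipping the test when lm_eval is not available")
    def test_needs_lm_eval(self):
        import lm_eval  # noqa: F401  # only reached when lm_eval is installed

        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()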