huggingface · dacorvo · Mar 24, 2024 · Mar 20, 2024
@@ -24,7 +24,7 @@ def main():
         device = torch.device(args.device)
 
     def get_int_matmul(device):
-        if device.type == ("cuda"):
+        """Return the integer matmul callable to use for `device`.
+
+        `torch._int_mm` is returned for CUDA devices, and for CPU devices only
+        when torch >= 2.4; every other case falls back to `torch.matmul`.
+        """
+        # Gate the CPU path on the torch release, mirroring the version check the
+        # quantized `mm` dispatch applies before it takes its CPU int8 branch.
+        # Only "major.minor" is parsed so dev/local suffixes ("2.4.0a0+git...") are ignored.
+        torch_release = tuple(int(part) for part in torch.__version__.split(".")[:2])
+        if device.type == "cuda" or (device.type == "cpu" and torch_release >= (2, 4)):
             return torch._int_mm
         return torch.matmul
 

@@ -61,6 +61,13 @@ def main():
     parser.add_argument("--max_new_tokens", type=int, default=20, help="The maximum number of tokens to generate.")
     parser.add_argument("--batch_size", type=int, default=32, help="The batch_size for evaluation (and calibration).")
     parser.add_argument("--validation_batch", type=int, default=4, help="The number of batch to use for calibration.")
+    parser.add_argument(
+        "--load_dtype",
+        type=str,
+        default="float16",
+        choices=["float16", "float32", "bfloat16"],
+        help="Precision to load the initial model",
+    )
     parser.add_argument(
         "--weights",
         type=str,
@@ -96,7 +103,12 @@ def main():
     else:
         device = torch.device(args.device)
 
-    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
+    torch_dtype = (
+        torch.float16
+        if args.load_dtype == "float16"
+        else torch.bfloat16 if args.load_dtype == "bfloat16" else torch.float32
+    )
+    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch_dtype, low_cpu_mem_usage=True).to(
         device
     )
     tokenizer = AutoTokenizer.from_pretrained(args.model)

@@ -3,6 +3,7 @@
 from typing import Callable, List
 
 import torch
+from packaging import version
 
 from .core import dtype_info
 from .qtensor import QTensor, qfallback
@@ -181,7 +182,13 @@ def mm(op, input, other):
     n, m = input.shape
     p = other.shape[-1]
     if (
-        input.device.type == "cuda"
+        (
+            input.device.type == "cuda"
+            or (
+                input.device.type == "cpu"
+                and version.parse(torch.__version__).release >= version.parse("2.4.0").release
+            )
+        )
         and input.qtype == qint8
         and other.qtype == qint8
         and n > 16