Add exllama GPTQ CUDA kernel support #553

Closed · wants to merge 27 commits (gptq-cuda-kernels into main)

Commits (27)
ee7ba48  add exllama gptq kernel (fxmarty, Jul 5, 2023)
c858d79  add attribution (fxmarty, Jul 5, 2023)
0ff8219  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 5, 2023)
2272b3a  some more cleanup (fxmarty, Jul 5, 2023)
620ed7d  Merge branch 'gptq-cuda-kernels' of https://github.com/fxmarty/text-g… (fxmarty, Jul 5, 2023)
a6e3874  try-catch to load the cuda extension, quite ugly practice tbh (fxmarty, Jul 5, 2023)
4462854  have a single gptq quantization type (fxmarty, Jul 12, 2023)
67a46b7  move exllama buffer init to the top level (fxmarty, Jul 12, 2023)
67d6876  cleanup (fxmarty, Jul 12, 2023)
f90c61a  support bits different than 4 (fxmarty, Jul 12, 2023)
8645fd3  tests (fxmarty, Jul 12, 2023)
faa5b52  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 12, 2023)
38c2be5  fix test (fxmarty, Jul 12, 2023)
2ae65b4  fix tests (fxmarty, Jul 13, 2023)
0036084  support all, test llama (fxmarty, Jul 13, 2023)
9401e10  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 13, 2023)
74e6d6e  fix the usual merge mess (fxmarty, Jul 13, 2023)
edfbfdf  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 19, 2023)
6bf7090  fix per-column quantization (fxmarty, Jul 19, 2023)
0860394  Refactored a bit. (Narsil, Jul 20, 2023)
8cf7c89  Small polish. (Narsil, Jul 20, 2023)
7faef69  Give escape hatch to not use exllama kernels even if available. (Narsil, Jul 20, 2023)
900ac49  Fixing GTPQ device santacoder. (Narsil, Jul 20, 2023)
12191b7  Fix config. (Narsil, Jul 20, 2023)
c6e702f  Add kernel target. (Narsil, Jul 20, 2023)
3ec3add  Separate build process. (Narsil, Jul 20, 2023)
40be532  Update starcoder_gptq (Narsil, Jul 21, 2023)
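
Two commits above are worth unpacking. Commit a6e3874 wraps the CUDA extension import in a try/except so that a missing or failed kernel build degrades gracefully to the existing GPTQ path, and commit 7faef69 adds an escape hatch to skip the exllama kernels even when they load. A minimal sketch of that pattern follows; the module name is taken from server/exllama_kernels/, while the HAS_EXLLAMA flag and the DISABLE_EXLLAMA variable are assumptions for illustration, not the PR's exact code.

import os

# Commit a6e3874: the compiled extension may be absent (CPU-only image or a
# failed build), so the import is guarded instead of assumed.
try:
    import exllama_kernels  # noqa: F401  (module name from server/exllama_kernels/)

    HAS_EXLLAMA = True
except ImportError:
    HAS_EXLLAMA = False

# Commit 7faef69: an environment-variable escape hatch disables the exllama
# path even when the kernels imported fine (variable name is an assumption).
if os.environ.get("DISABLE_EXLLAMA", "").lower() in ("1", "true"):
    HAS_EXLLAMA = False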
13 changes: 13 additions & 0 deletions Dockerfile
@@ -108,6 +108,17 @@ COPY server/Makefile-flash-att-v2 Makefile
 # Build specific version of flash attention v2
 RUN make build-flash-attention-v2
 
+# Build exllama kernels
+FROM kernel-builder as exllama-kernels-builder
+
+WORKDIR /usr/src
+
+COPY server/exllama_kernels/ .
+
+
+# Build specific version of exllama kernels
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 
@@ -161,6 +172,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
 
 # Copy build artifacts from custom kernels builder
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 
 # Copy build artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
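
The new stage compiles the kernels with TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX", i.e. native code for Ampere (SM 8.0 and 8.6) plus embedded PTX so newer GPUs can JIT-compile it. The setup.py it invokes is not part of this diff; the sketch below shows the standard torch CUDAExtension layout such a script plausibly uses, with source file names assumed for illustration.

# Hypothetical server/exllama_kernels/setup.py; `python setup.py build` in the
# Dockerfile stage above would drive exactly this kind of extension build.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="exllama_kernels",
    ext_modules=[
        CUDAExtension(
            name="exllama_kernels",
            sources=[
                "exllama_kernels/exllama_ext.cpp",  # Python bindings (assumed path)
                "exllama_kernels/cuda_func/q4_matmul.cu",  # 4-bit matmul kernel (assumed path)
            ],
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)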
3 changes: 3 additions & 0 deletions Makefile
@@ -56,3 +56,6 @@ run-bloom:
 
 run-bloom-quantize:
 	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
+
+clean:
+	rm -rf target aml
14 changes: 9 additions & 5 deletions integration-tests/conftest.py
@@ -230,15 +230,19 @@ def local_launcher(
         shard_uds_path,
     ]
 
+    env = os.environ
+
     if num_shard is not None:
         args.extend(["--num-shard", str(num_shard)])
-    if quantize:
+    if quantize is not None:
         args.append("--quantize")
-        args.append("bitsandbytes")
+        args.append(quantize)
+        if quantize == "gptq":
+            env["GPTQ_GROUPSIZE"] = "128"
+            env["GPTQ_BITS"] = "4"
     if trust_remote_code:
         args.append("--trust-remote-code")
 
-    env = os.environ
     env["LOG_LEVEL"] = "info,text_generation_router=debug"
 
     if not use_flash_attention:
@@ -275,9 +279,9 @@ def docker_launcher(
 
     if num_shard is not None:
         args.extend(["--num-shard", str(num_shard)])
-    if quantize:
+    if quantize is not None:
         args.append("--quantize")
-        args.append("bitsandbytes")
+        args.append(quantize)
     if trust_remote_code:
         args.append("--trust-remote-code")
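
For the gptq case the local launcher now exports GPTQ_GROUPSIZE and GPTQ_BITS, which lets the server learn the quantization parameters for checkpoints whose config does not carry them. A minimal sketch of the consuming side, with the helper name assumed for illustration:

import os


def gptq_params_from_env() -> tuple[int, int]:
    # GPTQ_BITS / GPTQ_GROUPSIZE are the names exported by conftest.py above;
    # raising KeyError when they are unset is a simplification for this sketch.
    bits = int(os.environ["GPTQ_BITS"])
    groupsize = int(os.environ["GPTQ_GROUPSIZE"])
    return bits, groupsize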
102 changes: 102 additions & 0 deletions (new integration test snapshot)
@@ -0,0 +1,102 @@
{
"generated_text": ", and I am going to visit the Louvre",
"details": {
"finish_reason": "length",
"generated_tokens": 10,
"seed": null,
"prefill": [
{
"id": 2,
"text": "</s>",
"logprob": null
},
{
"id": 20628,
"text": "Today",
"logprob": -11.2265625
},
{
"id": 306,
"text": "I",
"logprob": -4.1757812
},
{
"id": 626,
"text": "am",
"logprob": -1.9746094
},
{
"id": 297,
"text": "in",
"logprob": -5.4648438
},
{
"id": 3444,
"text": "France",
"logprob": -9.03125
}
],
"tokens": [
{
"id": 29892,
"text": ",",
"logprob": -0.31298828,
"special": false
},
{
"id": 322,
"text": " and",
"logprob": -1.4345703,
"special": false
},
{
"id": 306,
"text": " I",
"logprob": -0.32080078,
"special": false
},
{
"id": 626,
"text": " am",
"logprob": -1.3798828,
"special": false
},
{
"id": 2675,
"text": " going",
"logprob": -1.2304688,
"special": false
},
{
"id": 304,
"text": " to",
"logprob": -0.0014791489,
"special": false
},
{
"id": 6493,
"text": " visit",
"logprob": -1.1503906,
"special": false
},
{
"id": 278,
"text": " the",
"logprob": -0.41259766,
"special": false
},
{
"id": 4562,
"text": " Lou",
"logprob": -1.8134766,
"special": false
},
{
"id": 12675,
"text": "vre",
"logprob": -0.000767231,
"special": false
}
]
}
}
97 changes: 97 additions & 0 deletions (new integration test snapshot)
@@ -0,0 +1,97 @@
{
"generated_text": "The capital city of France isParis.\n The Best Way to Visit",
"details": {
"finish_reason": "length",
"generated_tokens": 10,
"seed": 0,
"prefill": [
{
"id": 2,
"text": "</s>",
"logprob": null
},
{
"id": 4272,
"text": "city",
"logprob": -12.4453125
},
{
"id": 310,
"text": "of",
"logprob": -2.4023438
},
{
"id": 3444,
"text": "France",
"logprob": -12.515625
},
{
"id": 338,
"text": "is",
"logprob": -5.1914062
}
],
"tokens": [
{
"id": 3681,
"text": " Paris",
"logprob": -0.22546387,
"special": false
},
{
"id": 29889,
"text": ".",
"logprob": 0,
"special": false
},
{
"id": 13,
"text": "\n",
"logprob": 0,
"special": false
},
{
"id": 1,
"text": "",
"logprob": 0,
"special": false
},
{
"id": 450,
"text": " The",
"logprob": 0,
"special": false
},
{
"id": 6407,
"text": " Best",
"logprob": -0.5522461,
"special": false
},
{
"id": 5307,
"text": " Way",
"logprob": 0,
"special": false
},
{
"id": 304,
"text": " to",
"logprob": 0,
"special": false
},
{
"id": 5741,
"text": " Vis",
"logprob": -2.3496094,
"special": false
},
{
"id": 277,
"text": "it",
"logprob": 0,
"special": false
}
]
}
}
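
Both JSON documents above are snapshot fixtures: the first records a greedy run (seed null), the second a sampled run (seed 0), each stopping at 10 generated tokens with finish_reason "length". A sketch of how an integration test might assert against them; the fixture names and client API are assumptions modeled on the repo's integration-tests style, not code from this diff.

import pytest


@pytest.mark.asyncio
async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
    # `flash_llama_gptq` (a running client) and `response_snapshot` are
    # assumed fixtures; the prompt matches the prefill in the first snapshot.
    response = await flash_llama_gptq.generate(
        "Today I am in France", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot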