From c39ac59b8926fd153f7d218d9ace948565dab095 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:33:46 +0200 Subject: [PATCH 01/38] Import from awq --- awq/__init__.py | 1 + awq/entry.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 awq/__init__.py diff --git a/awq/__init__.py b/awq/__init__.py new file mode 100644 index 00000000..0ffd9d73 --- /dev/null +++ b/awq/__init__.py @@ -0,0 +1 @@ +from awq.models.auto import AutoAWQForCausalLM \ No newline at end of file diff --git a/awq/entry.py b/awq/entry.py index 4a7e135e..a7db3699 100644 --- a/awq/entry.py +++ b/awq/entry.py @@ -4,7 +4,7 @@ import argparse from lm_eval import evaluator from transformers import AutoTokenizer -from awq.models.auto import AutoAWQForCausalLM +from awq import AutoAWQForCausalLM from awq.quantize.auto_clip import apply_clip from awq.quantize.auto_scale import apply_scale from awq.utils.lm_eval_adaptor import LMEvalAdaptor From 60296077a40d6095f4ca7846d7145d4948672b0b Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:36:02 +0200 Subject: [PATCH 02/38] Add basic quant example --- examples/basic_quant.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 examples/basic_quant.py diff --git a/examples/basic_quant.py b/examples/basic_quant.py new file mode 100644 index 00000000..d6fdc96a --- /dev/null +++ b/examples/basic_quant.py @@ -0,0 +1,19 @@ +from awq import AutoAWQForCausalLM +from transformers import AutoTokenizer + +model_path = 'lmsys/vicuna-7b-v1.5' +quant_path = 'vicuna-7b-v1.5-awq' +quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4 } + +# Load model +model = AutoAWQForCausalLM.from_pretrained(model_path) +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + +# Quantize +model.quantize(tokenizer, quant_config=quant_config) + +# Save quantized model +model.save_quantized(quant_path) +tokenizer.save_pretrained(quant_path) + +print(f'Model is quantized and saved at "{quant_path}"') \ No newline at end of file From ff556eb0a4fd00551792c2d3a2cbff3269ed545c Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:36:18 +0200 Subject: [PATCH 03/38] Create more detailed setup.py --- setup.py | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 16a316af..b5cce475 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,19 @@ import os +import torch from setuptools import setup, find_packages -from torch.utils.cpp_extension import BuildExtension, CUDAExtension +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME + +if CUDA_HOME is None: + raise RuntimeError( + f"Cannot find CUDA_HOME. CUDA must be available to build the package.") + +# Collect the compute capabilities of all available GPUs. 
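+# Illustrative mapping: an A100 reports (8, 0) and is stored as 80, an RTX 4090
+# reports (8, 9) and is stored as 89; pre-Ampere devices (major < 8) are rejected.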
+compute_capabilities = set() +for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") + compute_capabilities.add(major * 10 + minor) # Get environment variables build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' @@ -22,6 +35,11 @@ ext_modules = [] if build_cuda_extension: + n_threads = min(os.cpu_count(), 8) + + cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] + nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + ext_modules.append( CUDAExtension( name="awq_inference_engine", @@ -32,25 +50,36 @@ "awq_cuda/position_embedding/pos_encoding_kernels.cu" ], extra_compile_args={ - "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], - "nvcc": ["-O3", "-std=c++17"] + "cxx": cxx_args, + "nvcc": nvcc_args }, ) ) setup( - name="awq", + name="autoawq", version="0.1.0", - description="An efficient and accurate low-bit weight quantization(INT3/4) method for LLMs.", + author="Casper Hansen", + license="MIT", + description="AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.", long_description=open("README.md", "r").read(), long_description_content_type="text/markdown", python_requires=">=3.8", + url="https://github.com/casper-hansen/AutoAWQ", + keywords=["awq", "autoawq", "quantization", "transformers"], classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: C++", ], install_requires=dependencies, - packages=find_packages(exclude=["results*", "scripts*", "examples*"]), + packages=find_packages(exclude=["examples*"]), ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension} ) From 77ca8337650af4e12963f7478a64a82fc236fa6d Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:36:40 +0200 Subject: [PATCH 04/38] Update install instructions --- README.md | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8396b1f0..0bfc5326 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ AutoAWQ is a package that implements the Activation-aware Weight Quantization (A Roadmap: -- [ ] Publish pip package +- [x] Publish pip package - [ ] Refactor quantization code - [ ] Support more models - [ ] Optimize the speed of models @@ -14,7 +14,18 @@ Roadmap: Requirements: - Compute Capability 8.0 (sm80). Ampere and later architectures are supported. -Clone this repository and install with pip. +Install: +- Use pip to install awq + +``` +pip install awq +``` + +### Build source + +
+ +Build AutoAWQ from scratch ``` git clone https://github.com/casper-hansen/AutoAWQ @@ -22,6 +33,8 @@ cd AutoAWQ pip install -e . ``` +
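If the build succeeded, the CUDA extension that `setup.py` compiles should be importable. A minimal, optional sanity check (assuming a supported GPU is present; `awq_inference_engine` is the extension name declared in `setup.py`):

```python
# Quick post-install check: both imports should succeed after `pip install -e .`
import torch
import awq_inference_engine  # CUDA extension built by setup.py

print(torch.cuda.is_available())
```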
+ ## Supported models The detailed support list: @@ -36,6 +49,7 @@ The detailed support list: | OPT | 125m/1.3B/2.7B/6.7B/13B/30B | | Bloom | 560m/3B/7B/ | | LLaVA-v0 | 13B | +| GPTJ | 6.7B | ## Usage @@ -44,8 +58,8 @@ Below, you will find examples for how to easily quantize a model and run inferen ### Quantization ```python +from awq import AutoAWQForCausalLM from transformers import AutoTokenizer -from awq.models.auto import AutoAWQForCausalLM model_path = 'lmsys/vicuna-7b-v1.5' quant_path = 'vicuna-7b-v1.5-awq' @@ -68,8 +82,8 @@ tokenizer.save_pretrained(quant_path) Run inference on a quantized model from Huggingface: ```python +from awq import AutoAWQForCausalLM from transformers import AutoTokenizer -from awq.models.auto import AutoAWQForCausalLM quant_path = "casperhansen/vicuna-7b-v1.5-awq" quant_file = "awq_model_w4_g128.pt" @@ -101,8 +115,11 @@ Benchmark speeds may vary from server to server and that it also depends on your | MPT-30B | A6000 | OOM | 31.57 | -- | | Falcon-7B | A6000 | 39.44 | 27.34 | 1.44x | +
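A tokens/s figure like the ones above can be estimated by timing `generate` on a fixed prompt. The sketch below is an illustrative helper, not the exact benchmark script behind this table; it reuses the `model` and `tokens` objects from the inference example above:

```python
import time

def tokens_per_second(model, input_ids, n_new=256):
    # Time a single generate call; prefill and decode are averaged together.
    start = time.time()
    output_ids = model.generate(input_ids, max_new_tokens=n_new)
    elapsed = time.time() - start
    generated = output_ids.shape[1] - input_ids.shape[1]
    return generated / elapsed
```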
-For example, here is the difference between a fast and slow CPU on MPT-7B: +Detailed benchmark (CPU vs. GPU) + +Here is the difference between a fast and slow CPU on MPT-7B: RTX 4090 + Intel i9 13900K (2 different VMs): - CUDA 12.0, Driver 525.125.06: 134 tokens/s (7.46 ms/token) @@ -113,6 +130,8 @@ RTX 4090 + AMD EPYC 7-Series (3 different VMs): - CUDA 12.2, Driver 535.54.03: 56 tokens/s (17.71 ms/token) - CUDA 12.0, Driver 525.125.06: 55 tokens/ (18.15 ms/token) +
+ ## Reference If you find AWQ useful or relevant to your research, you can cite their [paper](https://arxiv.org/abs/2306.00978): From 7fee854f417f3d80cb4b6ec191004f9de7cd019b Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:56:31 +0200 Subject: [PATCH 05/38] Add compute capability flags and n_threads as str --- setup.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b5cce475..75246eaf 100644 --- a/setup.py +++ b/setup.py @@ -35,10 +35,19 @@ ext_modules = [] if build_cuda_extension: - n_threads = min(os.cpu_count(), 8) + # figure out compute capability + compute_capabilities = {80, 86, 89, 90} + if torch_is_prebuilt: + compute_capabilities.update({87}) + + capability_flags = ["-gencode", f"arch=compute_{cap},code=sm_{cap}" for cap in compute_capabilities] + # num threads + n_threads = str(min(os.cpu_count(), 8)) + + # final args cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] - nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + capability_flags ext_modules.append( CUDAExtension( From 98ae978aebd2a1133ba6d3d4f950abd1b0a2c97f Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:57:52 +0200 Subject: [PATCH 06/38] Fix capability flags --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 75246eaf..32396338 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,9 @@ if torch_is_prebuilt: compute_capabilities.update({87}) - capability_flags = ["-gencode", f"arch=compute_{cap},code=sm_{cap}" for cap in compute_capabilities] + capability_flags = [] + for cap in compute_capabilities: + capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] # num threads n_threads = str(min(os.cpu_count(), 8)) From 5abd53a2464c3a3a11c407b94b0f354fbb041b60 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 21:56:40 +0200 Subject: [PATCH 07/38] Initial workflow, needs testing --- .github/workflows/build.yaml | 99 +++++++++++++++++++ .../scripts/github_create_release.js | 17 ++++ 2 files changed, 116 insertions(+) create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/scripts/github_create_release.js diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 00000000..636af3f1 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,99 @@ +name: Build + +# run jobs on new release starting with "AutoAWQ v" +on: + push: + tags: + - AutoAWQ v* + +jobs: + # create a github release + release: + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) + + # build AWQ + build: + name: Build AWQ + runs-on: ${{ matrix.os }} + needs: release + strategy: + matrix: + os: [windows-latest, ubuntu-20.04] + python-version: [3.8, 3.9, 3.10, 3.11] + cuda: ["11.8"] + defaults: + run: + shell: pwsh + env: + CUDA_VERSION: ${{ matrix.cuda }} + + steps: + - name: Checkout code + uses: 
actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2.2.0 + with: + activate-environment: "build" + python-version: ${{ matrix.python-version }} + mamba-version: "*" + use-mamba: false + channels: conda-forge,defaults + channel-priority: true + add-pip-as-python-dependency: true + auto-activate-base: false + + - name: Install Dependencies + run: | + conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0" + conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia + python -m pip install --upgrade build setuptools wheel ninja + + - name: Build Wheel + run: | + $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' + $env:PYPI_RELEASE = "1" + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } + + python setup.py sdist bdist_wheel + + $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } + echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV + + - name: Upload Release Asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.release.outputs.upload_url }} + asset_path: ./dist/${{ env.wheel_path }} + asset_content_type: application/* \ No newline at end of file diff --git a/.github/workflows/scripts/github_create_release.js b/.github/workflows/scripts/github_create_release.js new file mode 100644 index 00000000..fe26188b --- /dev/null +++ b/.github/workflows/scripts/github_create_release.js @@ -0,0 +1,17 @@ +module.exports = async (github, context, core) => { + try { + const response = await github.rest.repos.createRelease({ + draft: false, + generate_release_notes: true, + name: process.env.RELEASE_TAG, + owner: context.repo.owner, + prerelease: false, + repo: context.repo.repo, + tag_name: process.env.RELEASE_TAG, + }); + + core.setOutput('upload_url', response.data.upload_url); + } catch (error) { + core.setFailed(error.message); + } +} From 80ccf31ad408eba5f62485a21419362f5dcf52ce Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 22:28:53 +0200 Subject: [PATCH 08/38] run jobs on new release starting with "v" --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 636af3f1..a8ea75b7 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,10 +1,10 @@ name: Build -# run jobs on new release starting with "AutoAWQ v" +# run jobs on new release starting with "v" on: push: tags: - - AutoAWQ v* + - v* jobs: # create a github release From 98d874d1c4a16e7654b6f9ba1da685bb6e617229 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 22:42:11 +0200 Subject: [PATCH 09/38] Remove python from build --- .github/workflows/build.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index a8ea75b7..43841eb8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -52,11 +52,6 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2.2.0 From f741f406fe868703e6cd27d5c6b1661056f19197 Mon Sep 17 00:00:00 2001 From: 
Casper Date: Sun, 27 Aug 2023 22:59:12 +0200 Subject: [PATCH 10/38] Quote on python versions (fix: 3.10 was interpreted as 3.1) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 43841eb8..30c7615d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -41,7 +41,7 @@ jobs: strategy: matrix: os: [windows-latest, ubuntu-20.04] - python-version: [3.8, 3.9, 3.10, 3.11] + python-version: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: run: From b491c2d68c51cbb330f8eb8b2aaadea344dc1c25 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 23:31:51 +0200 Subject: [PATCH 11/38] Replace print with logging, Remove uncommented code --- awq/modules/fused_attn.py | 9 +-------- awq/modules/fused_mlp.py | 1 - awq/modules/fused_norm.py | 2 -- awq/quantize/auto_scale.py | 4 ++-- awq/utils/calib_data.py | 3 ++- awq/utils/lm_eval_adaptor.py | 4 ++-- awq/utils/parallel.py | 3 ++- 7 files changed, 9 insertions(+), 17 deletions(-) diff --git a/awq/modules/fused_attn.py b/awq/modules/fused_attn.py index 2615ce7a..28d55cae 100644 --- a/awq/modules/fused_attn.py +++ b/awq/modules/fused_attn.py @@ -34,8 +34,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): sin = freqs.sin() cache = torch.cat((cos, sin), dim=-1) - # self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - # self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) self.register_buffer("cos_sin_cache", cache.half(), persistent=False) def forward( @@ -46,7 +44,6 @@ def forward( ): # Apply rotary embedding to the query and key before passing them # to the attention op. - # print(positions.shape, query.shape, key.shape, self.cos_sin_cache.shape) query = query.contiguous() key = key.contiguous() awq_inference_engine.rotary_embedding_neox( @@ -146,7 +143,7 @@ def make_quant_attn(model, dev): qweights = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=1) qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=1) scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=1) - # g_idx = torch.cat([q_proj.g_idx, k_proj.g_idx, v_proj.g_idx], dim=0) + g_idx = None bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None @@ -156,8 +153,6 @@ def make_quant_attn(model, dev): qkv_layer.scales = scales qkv_layer.bias = bias - # We're dropping the rotary embedding layer m.rotary_emb here. We don't need it in the triton branch. - attn = QuantLlamaAttention(m.hidden_size, m.num_heads, qkv_layer, m.o_proj, dev) if '.' in name: @@ -169,6 +164,4 @@ def make_quant_attn(model, dev): parent = model child_name = name - #print(f"Replacing {name} with quant_attn; parent: {parent_name}, child's name: {child_name}") - setattr(parent, child_name, attn) diff --git a/awq/modules/fused_mlp.py b/awq/modules/fused_mlp.py index 6957a329..0ca30baf 100644 --- a/awq/modules/fused_mlp.py +++ b/awq/modules/fused_mlp.py @@ -71,7 +71,6 @@ def our_llama_mlp(self, x): def make_fused_mlp(m, parent_name=''): if not hasattr(make_fused_mlp, "called"): - # print("[Warning] Calling a fake MLP fusion. But still faster than Huggingface Implimentation.") make_fused_mlp.called = True """ Replace all LlamaMLP modules with QuantLlamaMLP modules, which fuses many of the operations. 
diff --git a/awq/modules/fused_norm.py b/awq/modules/fused_norm.py index 50f49c3a..9ce8f64b 100644 --- a/awq/modules/fused_norm.py +++ b/awq/modules/fused_norm.py @@ -38,6 +38,4 @@ def make_quant_norm(model): parent = model child_name = name - #print(f"Replacing {name} with quant_attn; parent: {parent_name}, child's name: {child_name}") - setattr(parent, child_name, norm) diff --git a/awq/quantize/auto_scale.py b/awq/quantize/auto_scale.py index a868ed6f..18bc3009 100644 --- a/awq/quantize/auto_scale.py +++ b/awq/quantize/auto_scale.py @@ -1,6 +1,7 @@ import gc import torch import torch.nn as nn +import logging from transformers.models.bloom.modeling_bloom import BloomBlock, BloomGelu from transformers.models.opt.modeling_opt import OPTDecoderLayer @@ -154,9 +155,8 @@ def _search_module_scale(block, linears2scale: list, x, kwargs={}): best_scales = scales block.load_state_dict(org_sd) if best_ratio == -1: - print(history) + logging.debug(history) raise Exception - # print(best_ratio) best_scales = best_scales.view(-1) assert torch.isnan(best_scales).sum() == 0, best_scales diff --git a/awq/utils/calib_data.py b/awq/utils/calib_data.py index 0c6f82be..9320a1e1 100644 --- a/awq/utils/calib_data.py +++ b/awq/utils/calib_data.py @@ -1,4 +1,5 @@ import torch +import logging from datasets import load_dataset def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512): @@ -25,5 +26,5 @@ def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size= # now concatenate all samples and split according to block size cat_samples = torch.cat(samples, dim=1) n_split = cat_samples.shape[1] // block_size - print(f" * Split into {n_split} blocks") + logging.debug(f" * Split into {n_split} blocks") return [cat_samples[:, i*block_size:(i+1)*block_size] for i in range(n_split)] diff --git a/awq/utils/lm_eval_adaptor.py b/awq/utils/lm_eval_adaptor.py index c1c35d05..a38f1c4f 100644 --- a/awq/utils/lm_eval_adaptor.py +++ b/awq/utils/lm_eval_adaptor.py @@ -2,7 +2,7 @@ import torch from lm_eval.base import BaseLM import fnmatch - +import logging class LMEvalAdaptor(BaseLM): @@ -52,7 +52,7 @@ def max_length(self): elif 'falcon' in self.model_name: return 2048 else: - print(self.model.config) + logging.debug(self.model.config) raise NotImplementedError @property diff --git a/awq/utils/parallel.py b/awq/utils/parallel.py index f1ba27b0..eb4389bc 100644 --- a/awq/utils/parallel.py +++ b/awq/utils/parallel.py @@ -1,6 +1,7 @@ import os import torch import gc +import logging def auto_parallel(args): @@ -23,5 +24,5 @@ def auto_parallel(args): cuda_visible_devices = list(range(8)) os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(dev) for dev in cuda_visible_devices[:n_gpu]]) - print("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) + logging.debug("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) return cuda_visible_devices From 9ba3afacd7e9e02b3e49fdd42a02ae6ac70dd328 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Sun, 27 Aug 2023 22:31:07 +0000 Subject: [PATCH 12/38] Add basic generation example --- examples/basic_generate.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 examples/basic_generate.py diff --git a/examples/basic_generate.py b/examples/basic_generate.py new file mode 100644 index 00000000..5a9a678f --- /dev/null +++ b/examples/basic_generate.py @@ -0,0 +1,29 @@ +from awq import AutoAWQForCausalLM +from transformers import AutoTokenizer, TextStreamer + +quant_path = 
"casperhansen/vicuna-7b-v1.5-awq" +quant_file = "awq_model_w4_g128.pt" + +# Load model +model = AutoAWQForCausalLM.from_quantized(quant_path, quant_file, fuse_layers=True) +tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) +streamer = TextStreamer(tokenizer, skip_special_tokens=True) + +# Convert prompt to tokens +prompt_template = """\ +A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. + +USER: {prompt} +ASSISTANT:""" + +tokens = tokenizer( + prompt_template.format(prompt="How are you today?"), + return_tensors='pt' +).input_ids.cuda() + +# Generate output +generation_output = model.generate( + tokens, + streamer=streamer, + max_new_tokens=512 +) From d6bd9db7653a8bf8dc8e5e0e83db5837146588a1 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Sun, 27 Aug 2023 22:31:40 +0000 Subject: [PATCH 13/38] Standardize ASM calls. Add Windows support. --- awq_cuda/quantization/dequantize.cuh | 16 +++---- awq_cuda/quantization/gemm_cuda_gen.cu | 61 ++++++++++++++------------ 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/awq_cuda/quantization/dequantize.cuh b/awq_cuda/quantization/dequantize.cuh index 5d333b35..368cf984 100644 --- a/awq_cuda/quantization/dequantize.cuh +++ b/awq_cuda/quantization/dequantize.cuh @@ -34,19 +34,19 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // immediately before required. const uint32_t top_i4s = i4s >> 8; // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[3]) : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); @@ -66,13 +66,13 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // Finally, we construct the output numbers. 
// Convert elt_01 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_23 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); // Convert elt_45 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_67 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); return result; } diff --git a/awq_cuda/quantization/gemm_cuda_gen.cu b/awq_cuda/quantization/gemm_cuda_gen.cu index 1632d8be..a44b195a 100644 --- a/awq_cuda/quantization/gemm_cuda_gen.cu +++ b/awq_cuda/quantization/gemm_cuda_gen.cu @@ -9,6 +9,11 @@ */ +#ifdef _MSC_VER +#define ASM __asm +#else +#define ASM asm volatile +#endif #include #include "gemm_cuda.h" @@ -132,14 +137,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", 
B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -154,14 +159,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -172,12 +177,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 2176) + (((int)threadIdx.y) * 64)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 136) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -187,7 +192,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) { { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -195,7 +200,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -326,14 +331,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -349,12 +354,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -367,12 +372,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 1152) + (((int)threadIdx.y) * 32)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 72) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -385,7 +390,7 @@ __global__ void __launch_bounds__(64) 
gemm_forward_4bit_cuda_m16n64k32(int G, in { { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -393,7 +398,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) From acdf7d9a93af6b137844f096a5941c67d22947f9 Mon Sep 17 00:00:00 2001 From: Casper Date: Mon, 28 Aug 2023 10:41:42 +0200 Subject: [PATCH 14/38] Define HALF_FLT_MAX --- awq_cuda/layernorm/reduction.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awq_cuda/layernorm/reduction.cuh b/awq_cuda/layernorm/reduction.cuh index 678160e8..f670d185 100644 --- a/awq_cuda/layernorm/reduction.cuh +++ b/awq_cuda/layernorm/reduction.cuh @@ -16,7 +16,7 @@ https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kern #include #include -static const float HALF_FLT_MAX = 65504.F; +#define HALF_FLT_MAX 65504.F #define FINAL_MASK 0xffffffff From 4c39a76a17d08f6d6eea95abe9f6c7ff7107ab24 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Mon, 28 Aug 2023 10:36:59 +0000 Subject: [PATCH 15/38] NVCC: Use asm volatile --- awq_cuda/quantization/dequantize.cuh | 16 +++---- awq_cuda/quantization/gemm_cuda_gen.cu | 65 +++++++++++--------------- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/awq_cuda/quantization/dequantize.cuh b/awq_cuda/quantization/dequantize.cuh index 368cf984..5d333b35 100644 --- a/awq_cuda/quantization/dequantize.cuh +++ b/awq_cuda/quantization/dequantize.cuh @@ -34,19 +34,19 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // immediately before required. const uint32_t top_i4s = i4s >> 8; // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[3]) : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); @@ -66,13 +66,13 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // Finally, we construct the output numbers. 
// Convert elt_01 - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_23 - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); // Convert elt_45 - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_67 - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); return result; } diff --git a/awq_cuda/quantization/gemm_cuda_gen.cu b/awq_cuda/quantization/gemm_cuda_gen.cu index a44b195a..067a9c0f 100644 --- a/awq_cuda/quantization/gemm_cuda_gen.cu +++ b/awq_cuda/quantization/gemm_cuda_gen.cu @@ -9,12 +9,6 @@ */ -#ifdef _MSC_VER -#define ASM __asm -#else -#define ASM asm volatile -#endif - #include #include "gemm_cuda.h" #include "dequantize.cuh" @@ -36,9 +30,6 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i float C_warp[32]; __shared__ half A_shared[16 * (32 + 8)]; __shared__ half B_shared[32 * (128 + 8)]; - - __shared__ half scaling_factors_shared[128]; - __shared__ half zeros_shared[128]; int j_factors1 = ((OC + 128 - 1) / 128); int blockIdx_x = 0; @@ -137,14 +128,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
- ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -159,14 +150,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -177,12 +168,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 2176) + (((int)threadIdx.y) * 64)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 136) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -192,7 +183,7 @@ __global__ 
void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) { { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -200,7 +191,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -331,14 +322,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -354,12 +345,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void 
*)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -372,12 +363,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 1152) + (((int)threadIdx.y) * 32)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 72) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -390,7 +381,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -398,7 +389,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) From a20d30dd36daa77d8630fa45aded79be80db8dbc Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Mon, 28 Aug 2023 10:37:30 +0000 Subject: [PATCH 16/38] Remove windows build --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 30c7615d..4513d20f 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -40,7 +40,7 @@ jobs: needs: release strategy: matrix: - os: [windows-latest, ubuntu-20.04] + os: [ubuntu-20.04] python-version: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: From 6aa94f548eca4d127ddac5fcbb51b59211e7ca4e Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Mon, 28 Aug 2023 11:36:13 +0000 Subject: [PATCH 17/38] Add Python checkout --- .github/workflows/build.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 4513d20f..1dedb0c8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -53,6 +53,11 @@ jobs: - name: Checkout code uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2.2.0 with: From 5bc0916b79a0a13977deb8de8eed7d222d36f9c9 Mon Sep 17 00:00:00 2001 From: Casper Date: Mon, 28 Aug 2023 14:39:30 +0200 Subject: [PATCH 18/38] Separate PyTorch install --- .github/workflows/build.yaml | 61 ++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 
24 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1dedb0c8..0b71d1f3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -3,8 +3,8 @@ name: Build # run jobs on new release starting with "v" on: push: - tags: - - v* + branches: + - release_package jobs: # create a github release @@ -22,16 +22,16 @@ jobs: run: | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) + # - name: Create Release + # id: create_release + # uses: "actions/github-script@v6" + # env: + # RELEASE_TAG: ${{ env.release_tag }} + # with: + # github-token: "${{ secrets.GITHUB_TOKEN }}" + # script: | + # const script = require('.github/workflows/scripts/github_create_release.js') + # await script(github, context, core) # build AWQ build: @@ -42,21 +42,19 @@ jobs: matrix: os: [ubuntu-20.04] python-version: ["3.8", "3.9", "3.10", "3.11"] - cuda: ["11.8"] + cuda-version: ["11.8"] defaults: run: shell: pwsh - env: - CUDA_VERSION: ${{ matrix.cuda }} steps: - name: Checkout code uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: - python-version: ${{ matrix.pyver }} + python-version: ${{ matrix.python-version }} - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2.2.0 @@ -70,19 +68,34 @@ jobs: add-pip-as-python-dependency: true auto-activate-base: false - - name: Install Dependencies + - name: Install CUDA + run: | + conda install cuda-toolkit -c "nvidia/label/cuda-${{ matrix.cuda-version }}.0" + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + + echo "$env:CUDA_PATH" + echo "$env:CUDA_HOME" + + - name: Install PyTorch-cu${{ matrix.cuda-version }} run: | - conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0" - conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia + $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' + if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } + + # Install torch + $env:CUDA_VERSION = ${{ matrix.cuda-version }} -replace '\.', '' + pip install --upgrade --no-cache-dir torch==2.0.1+cu$env:CUDA_VERSION --index-url https://download.pytorch.org/whl/cu$env:CUDA_VERSION python -m pip install --upgrade build setuptools wheel ninja + + # Print version information + python --version + python -c "import torch; print('PyTorch:', torch.__version__)" + python -c "import torch; print('CUDA:', torch.version.cuda)" + python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" - name: Build Wheel run: | - $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' $env:PYPI_RELEASE = "1" - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } python setup.py sdist bdist_wheel From 7e361d16b5dd4102a03933255c02dd8411b4a8b4 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 12:13:24 +0200 Subject: [PATCH 19/38] Add cuda_runtime in include_dirs --- setup.py | 61 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 32396338..98684d9a 100644 --- 
a/setup.py +++ b/setup.py @@ -1,23 +1,31 @@ import os import torch from setuptools import setup, find_packages +from distutils.sysconfig import get_python_lib from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME -if CUDA_HOME is None: - raise RuntimeError( - f"Cannot find CUDA_HOME. CUDA must be available to build the package.") +def check_dependencies(): + if CUDA_HOME is None: + raise RuntimeError( + f"Cannot find CUDA_HOME. CUDA must be available to build the package.") -# Collect the compute capabilities of all available GPUs. -compute_capabilities = set() -for i in range(torch.cuda.device_count()): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") - compute_capabilities.add(major * 10 + minor) - -# Get environment variables -build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' -torch_is_prebuilt = os.environ.get('TORCH_IS_PREBUILT', '0') == '1' +def get_compute_capabilities(): + # Collect the compute capabilities of all available GPUs. + compute_capabilities = set() + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") + compute_capabilities.add(major * 10 + minor) + + # figure out compute capability + compute_capabilities = {80, 86, 89, 90} + + capability_flags = [] + for cap in compute_capabilities: + capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] + + return capability_flags # Define dependencies dependencies = [ @@ -25,29 +33,22 @@ "transformers>=4.32.0", "lm_eval", "texttable", "toml", "attributedict", - "protobuf" + "protobuf", + "torch>=2.0.0", "torchvision" ] -if not torch_is_prebuilt: - dependencies.extend(["torch>=2.0.0", "torchvision"]) +# Get environment variables +build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' # Setup CUDA extension ext_modules = [] if build_cuda_extension: - # figure out compute capability - compute_capabilities = {80, 86, 89, 90} - if torch_is_prebuilt: - compute_capabilities.update({87}) - - capability_flags = [] - for cap in compute_capabilities: - capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] - # num threads n_threads = str(min(os.cpu_count(), 8)) # final args + capability_flags = get_compute_capabilities() cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + capability_flags @@ -67,6 +68,13 @@ ) ) +# Find directories to be included in setup +include_dirs = [] +conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include") + +if os.path.isdir(conda_cuda_include_dir): + include_dirs.append(conda_cuda_include_dir) + setup( name="autoawq", version="0.1.0", @@ -90,6 +98,7 @@ "Programming Language :: C++", ], install_requires=dependencies, + include_dirs=include_dirs, packages=find_packages(exclude=["examples*"]), ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension} From b574767f4a3dfa18481ed7e1745bb7e4c9646da7 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 20:28:06 +0200 Subject: [PATCH 20/38] Improved setup.py structure and build instructions --- .github/workflows/build.yaml | 132 ++++++++++++++---------------- setup.py | 153 +++++++++++++++-------------------- 2 files changed, 126 insertions(+), 159 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml 
index 0b71d1f3..51922f62 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,91 +22,81 @@ jobs: run: | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - # - name: Create Release - # id: create_release - # uses: "actions/github-script@v6" - # env: - # RELEASE_TAG: ${{ env.release_tag }} - # with: - # github-token: "${{ secrets.GITHUB_TOKEN }}" - # script: | - # const script = require('.github/workflows/scripts/github_create_release.js') - # await script(github, context, core) + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) # build AWQ - build: + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} needs: release strategy: matrix: os: [ubuntu-20.04] - python-version: ["3.8", "3.9", "3.10", "3.11"] - cuda-version: ["11.8"] + pyver: ["3.8", "3.9", "3.10", "3.11"] + cuda: ["11.8"] defaults: run: shell: pwsh + env: + CUDA_VERSION: ${{ matrix.cuda }} steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Miniconda - uses: conda-incubator/setup-miniconda@v2.2.0 - with: - activate-environment: "build" - python-version: ${{ matrix.python-version }} - mamba-version: "*" - use-mamba: false - channels: conda-forge,defaults - channel-priority: true - add-pip-as-python-dependency: true - auto-activate-base: false - - - name: Install CUDA - run: | - conda install cuda-toolkit -c "nvidia/label/cuda-${{ matrix.cuda-version }}.0" - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - - echo "$env:CUDA_PATH" - echo "$env:CUDA_HOME" - - - name: Install PyTorch-cu${{ matrix.cuda-version }} - run: | - $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' - if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } - - # Install torch - $env:CUDA_VERSION = ${{ matrix.cuda-version }} -replace '\.', '' - pip install --upgrade --no-cache-dir torch==2.0.1+cu$env:CUDA_VERSION --index-url https://download.pytorch.org/whl/cu$env:CUDA_VERSION - python -m pip install --upgrade build setuptools wheel ninja + - uses: actions/checkout@v3 - # Print version information - python --version - python -c "import torch; print('PyTorch:', torch.__version__)" - python -c "import torch; print('CUDA:', torch.version.cuda)" - python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" - - - name: Build Wheel - run: | - $env:PYPI_RELEASE = "1" + - uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2.2.0 + with: + activate-environment: "build" + python-version: ${{ matrix.pyver }} + mamba-version: "*" + use-mamba: false + channels: conda-forge,defaults + channel-priority: true + add-pip-as-python-dependency: true + auto-activate-base: false + + - name: Install Dependencies + run: | + conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0" + conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia + python -m pip install --upgrade build setuptools wheel ninja + + # Print version information + python --version + python -c "import torch; print('PyTorch:', torch.__version__)" + python -c "import torch; 
print('CUDA:', torch.version.cuda)" + python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" + + - name: Build Wheel + run: | + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} + $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' - python setup.py sdist bdist_wheel + python setup.py sdist bdist_wheel - $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } - echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV + $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } + echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - name: Upload Release Asset - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.release.outputs.upload_url }} - asset_path: ./dist/${{ env.wheel_path }} - asset_content_type: application/* \ No newline at end of file + - name: Upload Release Asset + uses: shogo82148/actions-upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.release.outputs.upload_url }} + asset_path: ./dist/${{ env.wheel_path }} + asset_content_type: application/* \ No newline at end of file diff --git a/setup.py b/setup.py index 98684d9a..bbad0e59 100644 --- a/setup.py +++ b/setup.py @@ -1,105 +1,82 @@ import os -import torch +from pathlib import Path +from torch.utils import cpp_extension from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME -def check_dependencies(): - if CUDA_HOME is None: - raise RuntimeError( - f"Cannot find CUDA_HOME. CUDA must be available to build the package.") +os.environ["CC"] = "g++" +os.environ["CXX"] = "g++" -def get_compute_capabilities(): - # Collect the compute capabilities of all available GPUs. 
- compute_capabilities = set() - for i in range(torch.cuda.device_count()): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") - compute_capabilities.add(major * 10 + minor) - - # figure out compute capability - compute_capabilities = {80, 86, 89, 90} - - capability_flags = [] - for cap in compute_capabilities: - capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] - - return capability_flags +common_setup_kwargs = { + "version": "0.0.1", + "name": "autoawq", + "author": "Casper Hansen", + "license": "MIT", + "python_requires": ">=3.8.0", + "description": "AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.", + "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"), + "long_description_content_type": "text/markdown", + "url": "https://github.com/casper-hansen/AutoAWQ", + "keywords": ["awq", "autoawq", "quantization", "transformers"], + "platforms": ["linux"], + "classifiers": [ + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: C++", + ] +} -# Define dependencies -dependencies = [ - "accelerate", "sentencepiece", "tokenizers>=0.12.1", - "transformers>=4.32.0", - "lm_eval", "texttable", - "toml", "attributedict", +requirements = [ + "torch>=2.0.0", + "transformers>=4.32.0", + "tokenizers>=0.12.1", + "accelerate", + "sentencepiece", + "lm_eval", + "texttable", + "toml", + "attributedict", "protobuf", - "torch>=2.0.0", "torchvision" + "torchvision" ] -# Get environment variables -build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' - -# Setup CUDA extension -ext_modules = [] - -if build_cuda_extension: - # num threads - n_threads = str(min(os.cpu_count(), 8)) +include_dirs = [] - # final args - capability_flags = get_compute_capabilities() - cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] - nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + capability_flags +conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include") +if os.path.isdir(conda_cuda_include_dir): + include_dirs.append(conda_cuda_include_dir) - ext_modules.append( - CUDAExtension( - name="awq_inference_engine", - sources=[ - "awq_cuda/pybind.cpp", - "awq_cuda/quantization/gemm_cuda_gen.cu", - "awq_cuda/layernorm/layernorm.cu", - "awq_cuda/position_embedding/pos_encoding_kernels.cu" - ], - extra_compile_args={ - "cxx": cxx_args, - "nvcc": nvcc_args - }, - ) +extensions = [ + cpp_extension.CppExtension( + "awq_inference_engine", + [ + "awq_cuda/pybind.cpp", + "awq_cuda/quantization/gemm_cuda_gen.cu", + "awq_cuda/layernorm/layernorm.cu", + "awq_cuda/position_embedding/pos_encoding_kernels.cu" + ], extra_compile_args={ + "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], + "nvcc": ["-O3", "-std=c++17"] + } ) +] -# Find directories to be included in setup -include_dirs = [] -conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include") +additional_setup_kwargs = { + "ext_modules": extensions, + "cmdclass": {'build_ext': cpp_extension.BuildExtension} +} -if os.path.isdir(conda_cuda_include_dir): - 
include_dirs.append(conda_cuda_include_dir) +common_setup_kwargs.update(additional_setup_kwargs) setup( - name="autoawq", - version="0.1.0", - author="Casper Hansen", - license="MIT", - description="AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.", - long_description=open("README.md", "r").read(), - long_description_content_type="text/markdown", - python_requires=">=3.8", - url="https://github.com/casper-hansen/AutoAWQ", - keywords=["awq", "autoawq", "quantization", "transformers"], - classifiers=[ - "Environment :: GPU :: NVIDIA CUDA :: 11.8", - "Environment :: GPU :: NVIDIA CUDA :: 12", - "License :: OSI Approved :: MIT License", - "Natural Language :: English", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: C++", - ], - install_requires=dependencies, + packages=find_packages(), + install_requires=requirements, include_dirs=include_dirs, - packages=find_packages(exclude=["examples*"]), - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension} -) + **common_setup_kwargs +) \ No newline at end of file From 2d70f22af7e039c6f03d17f0325f2c7b643e4800 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:00:25 +0200 Subject: [PATCH 21/38] Reduce build file --- .github/workflows/build.yaml | 52 +++++++++--------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 51922f62..08ce060e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,47 +1,18 @@ -name: Build +name: Build AutoGPTQ Wheels with CUDA -# run jobs on new release starting with "v" on: push: branches: - release_package jobs: - # create a github release - release: - name: Create Release - runs-on: ubuntu-latest - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - - # build AWQ build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} - needs: release strategy: matrix: os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10", "3.11"] + pyver: ["3.8", "3.9", "3.10"] cuda: ["11.8"] defaults: run: @@ -91,12 +62,15 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - - name: Upload Release Asset - uses: shogo82148/actions-upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - uses: actions/upload-artifact@v3 + if: runner.os == 'Linux' + with: + name: 'linux-cuda-wheels' + path: ./dist/*.whl + + - uses: actions/upload-artifact@v3 + if: runner.os == 'Windows' with: - upload_url: ${{ needs.release.outputs.upload_url }} - asset_path: ./dist/${{ env.wheel_path }} - asset_content_type: application/* \ No newline at end of file + name: 'windows-cuda-wheels' + path: ./dist/*.whl \ No newline at end of file From 04b164c42792f95f3fec62980259c0e0b84b0a84 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:02:43 +0200 
Subject: [PATCH 22/38] Update build.yaml --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 08ce060e..49ac324e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,4 +1,4 @@ -name: Build AutoGPTQ Wheels with CUDA +name: Build AutoAWQ Wheels with CUDA on: push: From 8149d362674a5563517f9b55a445f3c14011288c Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:52:30 +0200 Subject: [PATCH 23/38] Create release after build upon new tag --- .github/workflows/build.yaml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 49ac324e..3276c485 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -2,8 +2,8 @@ name: Build AutoAWQ Wheels with CUDA on: push: - branches: - - release_package + tags: + - "v*" jobs: build_wheels: @@ -62,15 +62,13 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - - uses: actions/upload-artifact@v3 - if: runner.os == 'Linux' - with: - name: 'linux-cuda-wheels' - path: ./dist/*.whl - - - uses: actions/upload-artifact@v3 - if: runner.os == 'Windows' + + - name: Create a Release + id: create_release + uses: shogo82148/actions-create-release@v1 + + - name: Upload Assets + uses: shogo82148/actions-upload-release-asset@v1 with: - name: 'windows-cuda-wheels' - path: ./dist/*.whl \ No newline at end of file + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ./dist/*.whl \ No newline at end of file From 0fab60af011dc516afec1c0fb5b334f2d6ad4e31 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:59:00 +0200 Subject: [PATCH 24/38] Add CUDA 11.8 to requirements list --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0bfc5326..e77e2068 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Roadmap: Requirements: - Compute Capability 8.0 (sm80). Ampere and later architectures are supported. +- CUDA Toolkit 11.8 and later. 
Install: - Use pip to install awq From 51f89b6c8b850091771514656c7f263bba012603 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 22:38:25 +0200 Subject: [PATCH 25/38] Replace creating release with script --- .github/workflows/build.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3276c485..efda539c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -63,9 +63,16 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - name: Create a Release + - name: Create Release id: create_release - uses: shogo82148/actions-create-release@v1 + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/create_release.js') + await script(github, context, core) - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 From 248f6418095b2be25539dce3cc383f011a107062 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 23:18:20 +0200 Subject: [PATCH 26/38] Move creating release to start of build --- .github/workflows/build.yaml | 41 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index efda539c..54366688 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,9 +6,37 @@ on: - "v*" jobs: + release: + # Retrieve tag and create release + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/create_release.js') + await script(github, context, core) + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} + needs: release + strategy: matrix: os: [ubuntu-20.04] @@ -63,19 +91,8 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/create_release.js') - await script(github, context, core) - - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ steps.create_release.outputs.upload_url }} + upload_url: ${{ needs.release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From e2768c29560551123fd9bd91f9703254cd30cf0e Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 23:23:29 +0200 Subject: [PATCH 27/38] Fixed path to script --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 54366688..c73108a0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -29,7 +29,7 @@ jobs: with: github-token: "${{ secrets.GITHUB_TOKEN }}" script: | - 
const script = require('.github/workflows/scripts/create_release.js') + const script = require('.github/workflows/scripts/github_create_release.js') await script(github, context, core) build_wheels: From bfa5ba70addb884091043290b80f79de9958d7ce Mon Sep 17 00:00:00 2001 From: Casper Date: Wed, 30 Aug 2023 00:08:13 +0200 Subject: [PATCH 28/38] Build 3.11 --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c73108a0..dc1272a5 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -40,7 +40,7 @@ jobs: strategy: matrix: os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10"] + pyver: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: run: From 83c2f09d79d97306c05f26723913ef0e43d31b04 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 16:45:42 +0200 Subject: [PATCH 29/38] Add compute capability --- setup.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index bbad0e59..4e79a9a6 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,9 @@ import os +import torch from pathlib import Path -from torch.utils import cpp_extension from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib +from torch.utils import cpp_extension, CUDA_HOME os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -52,6 +53,32 @@ if os.path.isdir(conda_cuda_include_dir): include_dirs.append(conda_cuda_include_dir) +def check_dependencies(): + if CUDA_HOME is None: + raise RuntimeError( + f"Cannot find CUDA_HOME. CUDA must be available to build the package.") + +def get_compute_capabilities(): + # Collect the compute capabilities of all available GPUs. 
+ compute_capabilities = set() + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") + compute_capabilities.add(major * 10 + minor) + + # figure out compute capability + compute_capabilities = {80, 86, 89, 90} + + capability_flags = [] + for cap in compute_capabilities: + capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] + + return capability_flags + +check_dependencies() +arch_flags = get_compute_capabilities() + extensions = [ cpp_extension.CppExtension( "awq_inference_engine", @@ -62,7 +89,7 @@ "awq_cuda/position_embedding/pos_encoding_kernels.cu" ], extra_compile_args={ "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], - "nvcc": ["-O3", "-std=c++17"] + "nvcc": ["-O3", "-std=c++17"] + arch_flags } ) ] From ac36d82e3976e6c89dd5c73fae3875a4b8618def Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 17:19:59 +0200 Subject: [PATCH 30/38] Fix torch import --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4e79a9a6..1986250d 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pathlib import Path from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils import cpp_extension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -80,7 +80,7 @@ def get_compute_capabilities(): arch_flags = get_compute_capabilities() extensions = [ - cpp_extension.CppExtension( + CppExtension( "awq_inference_engine", [ "awq_cuda/pybind.cpp", @@ -96,7 +96,7 @@ def get_compute_capabilities(): additional_setup_kwargs = { "ext_modules": extensions, - "cmdclass": {'build_ext': cpp_extension.BuildExtension} + "cmdclass": {'build_ext': BuildExtension} } common_setup_kwargs.update(additional_setup_kwargs) From 87c59401272a343837ea5e3513bc47bb449ec323 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 17:25:38 +0200 Subject: [PATCH 31/38] Move release to end --- .github/workflows/build.yaml | 46 +++++++++++++----------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index dc1272a5..8c078878 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,36 +6,9 @@ on: - "v*" jobs: - release: - # Retrieve tag and create release - name: Create Release - runs-on: ubuntu-latest - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} - needs: release strategy: matrix: @@ -87,12 +60,25 @@ jobs: $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' python setup.py sdist bdist_wheel + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } - echo "wheel_path=$wheel_path" 
>> $env:GITHUB_ENV + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ needs.release.outputs.upload_url }} + upload_url: ${{ steps.create_release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From 7e69373f623a81e22f7db5314604bec7c7e7dd9f Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 18:34:56 +0200 Subject: [PATCH 32/38] Rollback build --- .github/workflows/build.yaml | 47 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8c078878..fec17e84 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,14 +6,41 @@ on: - "v*" jobs: + release: + # Retrieve tag and create release + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + # - name: Create Release + # id: create_release + # uses: "actions/github-script@v6" + # env: + # RELEASE_TAG: ${{ env.release_tag }} + # with: + # github-token: "${{ secrets.GITHUB_TOKEN }}" + # script: | + # const script = require('.github/workflows/scripts/github_create_release.js') + # await script(github, context, core) + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} + needs: release strategy: matrix: os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10", "3.11"] + pyver: ["3.8", "3.9", "3.10"] cuda: ["11.8"] defaults: run: @@ -61,24 +88,8 @@ jobs: python setup.py sdist bdist_wheel - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ steps.create_release.outputs.upload_url }} + upload_url: ${{ needs.release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From 3af180279a8768b33aa0ddecd4d76529c5281bc1 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 19:45:21 +0200 Subject: [PATCH 33/38] Move release to end --- .github/workflows/build.yaml | 45 ++++++++++++++---------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fec17e84..e7cdb4e4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,36 +6,9 @@ on: - "v*" jobs: - release: - # Retrieve tag and create release - name: Create Release - runs-on: ubuntu-latest - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - # - name: Create Release - # id: 
create_release - # uses: "actions/github-script@v6" - # env: - # RELEASE_TAG: ${{ env.release_tag }} - # with: - # github-token: "${{ secrets.GITHUB_TOKEN }}" - # script: | - # const script = require('.github/workflows/scripts/github_create_release.js') - # await script(github, context, core) - build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} - needs: release strategy: matrix: @@ -88,8 +61,24 @@ jobs: python setup.py sdist bdist_wheel + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) + - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ needs.release.outputs.upload_url }} + upload_url: ${{ steps.create_release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From ce2dc060e03a6df2f92c7071503f2cfcedc41376 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 20:22:17 +0200 Subject: [PATCH 34/38] Move release back up --- .github/workflows/build.yaml | 45 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e7cdb4e4..261b094a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,9 +6,36 @@ on: - "v*" jobs: + release: + # Retrieve tag and create release + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} + needs: release strategy: matrix: @@ -61,24 +88,8 @@ jobs: python setup.py sdist bdist_wheel - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ steps.create_release.outputs.upload_url }} + upload_url: ${{ needs.release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From bdde6c9673324bba6f7955bbf077e09eeb1ac7a7 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Thu, 31 Aug 2023 21:53:19 +0000 Subject: [PATCH 35/38] Python 3.8 compatibility --- awq/entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awq/entry.py b/awq/entry.py index a7db3699..3886cdfb 100644 --- a/awq/entry.py +++ b/awq/entry.py @@ -152,7 +152,7 @@ def _warmup(device:str): parser.add_argument('--tasks', type=str, default='wikitext', 
help='Tasks to evaluate. ' 'Separate tasks by comma for multiple tasks.' 'https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md') - parser.add_argument("--task_use_pretrained", default=False, action=argparse.BooleanOptionalAction, + parser.add_argument("--task_use_pretrained", default=False, action='store_true', help="Pass '--task_use_pretrained' to use a pretrained model running FP16") parser.add_argument('--task_batch_size', type=int, default=1) parser.add_argument('--task_n_shot', type=int, default=0) From 87c2e01ba40fa958c7891095b2167ba68a2d1adb Mon Sep 17 00:00:00 2001 From: qwopqwop200 Date: Fri, 1 Sep 2023 15:10:03 +0900 Subject: [PATCH 36/38] windows support --- setup.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 1986250d..c7613ada 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pathlib import Path from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -19,7 +19,7 @@ "long_description_content_type": "text/markdown", "url": "https://github.com/casper-hansen/AutoAWQ", "keywords": ["awq", "autoawq", "quantization", "transformers"], - "platforms": ["linux"], + "platforms": ["windows", "linux"], "classifiers": [ "Environment :: GPU :: NVIDIA CUDA :: 11.8", "Environment :: GPU :: NVIDIA CUDA :: 12", @@ -80,17 +80,14 @@ def get_compute_capabilities(): arch_flags = get_compute_capabilities() extensions = [ - CppExtension( + CUDAExtension( "awq_inference_engine", [ "awq_cuda/pybind.cpp", "awq_cuda/quantization/gemm_cuda_gen.cu", "awq_cuda/layernorm/layernorm.cu", "awq_cuda/position_embedding/pos_encoding_kernels.cu" - ], extra_compile_args={ - "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], - "nvcc": ["-O3", "-std=c++17"] + arch_flags - } + ] ) ] @@ -106,4 +103,4 @@ def get_compute_capabilities(): install_requires=requirements, include_dirs=include_dirs, **common_setup_kwargs -) \ No newline at end of file +) From 1a3acf02e2580e0bbdca0d8fba83a5b59fb80596 Mon Sep 17 00:00:00 2001 From: Casper Date: Fri, 1 Sep 2023 16:49:02 +0200 Subject: [PATCH 37/38] Generalize to Linux and Windows --- setup.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index c7613ada..ad1295d4 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pathlib import Path from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -19,7 +19,7 @@ "long_description_content_type": "text/markdown", "url": "https://github.com/casper-hansen/AutoAWQ", "keywords": ["awq", "autoawq", "quantization", "transformers"], - "platforms": ["windows", "linux"], + "platforms": ["linux", "windows"], "classifiers": [ "Environment :: GPU :: NVIDIA CUDA :: 11.8", "Environment :: GPU :: NVIDIA CUDA :: 12", @@ -79,6 +79,17 @@ def get_compute_capabilities(): check_dependencies() arch_flags = get_compute_capabilities() +if os.name == "nt": + # Relaxed args on Windows + extra_compile_args={ + "nvcc": arch_flags + } +else: + extra_compile_args={ + "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", 
"-std=c++17"], + "nvcc": ["-O3", "-std=c++17"] + arch_flags + } + extensions = [ CUDAExtension( "awq_inference_engine", @@ -87,7 +98,7 @@ def get_compute_capabilities(): "awq_cuda/quantization/gemm_cuda_gen.cu", "awq_cuda/layernorm/layernorm.cu", "awq_cuda/position_embedding/pos_encoding_kernels.cu" - ] + ], extra_compile_args=extra_compile_args ) ] @@ -103,4 +114,4 @@ def get_compute_capabilities(): install_requires=requirements, include_dirs=include_dirs, **common_setup_kwargs -) +) \ No newline at end of file From afcce1a11d5f9a2fa19858da6c43d8990837bd5b Mon Sep 17 00:00:00 2001 From: Casper Date: Fri, 1 Sep 2023 16:49:42 +0200 Subject: [PATCH 38/38] Add Windows build to workflow --- .github/workflows/build.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 261b094a..33096a85 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -39,8 +39,8 @@ jobs: strategy: matrix: - os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10"] + os: [ubuntu-20.04, windows-latest] + pyver: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: run: @@ -73,6 +73,11 @@ jobs: conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia python -m pip install --upgrade build setuptools wheel ninja + # Environment variables + Add-Content $env:GITHUB_ENV "CUDA_PATH=$env:CONDA_PREFIX" + Add-Content $env:GITHUB_ENV "CUDA_HOME=$env:CONDA_PREFIX" + if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} + # Print version information python --version python -c "import torch; print('PyTorch:', torch.__version__)" @@ -81,11 +86,6 @@ jobs: - name: Build Wheel run: | - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} - $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' - python setup.py sdist bdist_wheel - name: Upload Assets