From c39ac59b8926fd153f7d218d9ace948565dab095 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:33:46 +0200 Subject: [PATCH 01/38] Import from awq --- awq/__init__.py | 1 + awq/entry.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 awq/__init__.py diff --git a/awq/__init__.py b/awq/__init__.py new file mode 100644 index 00000000..0ffd9d73 --- /dev/null +++ b/awq/__init__.py @@ -0,0 +1 @@ +from awq.models.auto import AutoAWQForCausalLM \ No newline at end of file diff --git a/awq/entry.py b/awq/entry.py index 4a7e135e..a7db3699 100644 --- a/awq/entry.py +++ b/awq/entry.py @@ -4,7 +4,7 @@ import argparse from lm_eval import evaluator from transformers import AutoTokenizer -from awq.models.auto import AutoAWQForCausalLM +from awq import AutoAWQForCausalLM from awq.quantize.auto_clip import apply_clip from awq.quantize.auto_scale import apply_scale from awq.utils.lm_eval_adaptor import LMEvalAdaptor From 60296077a40d6095f4ca7846d7145d4948672b0b Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:36:02 +0200 Subject: [PATCH 02/38] Add basic quant example --- examples/basic_quant.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 examples/basic_quant.py diff --git a/examples/basic_quant.py b/examples/basic_quant.py new file mode 100644 index 00000000..d6fdc96a --- /dev/null +++ b/examples/basic_quant.py @@ -0,0 +1,19 @@ +from awq import AutoAWQForCausalLM +from transformers import AutoTokenizer + +model_path = 'lmsys/vicuna-7b-v1.5' +quant_path = 'vicuna-7b-v1.5-awq' +quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4 } + +# Load model +model = AutoAWQForCausalLM.from_pretrained(model_path) +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + +# Quantize +model.quantize(tokenizer, quant_config=quant_config) + +# Save quantized model +model.save_quantized(quant_path) +tokenizer.save_pretrained(quant_path) + +print(f'Model is quantized and saved at "{quant_path}"') \ No newline at end of file From ff556eb0a4fd00551792c2d3a2cbff3269ed545c Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:36:18 +0200 Subject: [PATCH 03/38] Create more detailed setup.py --- setup.py | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 16a316af..b5cce475 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,19 @@ import os +import torch from setuptools import setup, find_packages -from torch.utils.cpp_extension import BuildExtension, CUDAExtension +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME + +if CUDA_HOME is None: + raise RuntimeError( + f"Cannot find CUDA_HOME. CUDA must be available to build the package.") + +# Collect the compute capabilities of all available GPUs. 
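+# Illustrative mapping: an A100 reports (8, 0) and is stored as 80, an RTX 4090
+# reports (8, 9) and is stored as 89; pre-Ampere devices (major < 8) are rejected.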
+compute_capabilities = set() +for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") + compute_capabilities.add(major * 10 + minor) # Get environment variables build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' @@ -22,6 +35,11 @@ ext_modules = [] if build_cuda_extension: + n_threads = min(os.cpu_count(), 8) + + cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] + nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + ext_modules.append( CUDAExtension( name="awq_inference_engine", @@ -32,25 +50,36 @@ "awq_cuda/position_embedding/pos_encoding_kernels.cu" ], extra_compile_args={ - "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], - "nvcc": ["-O3", "-std=c++17"] + "cxx": cxx_args, + "nvcc": nvcc_args }, ) ) setup( - name="awq", + name="autoawq", version="0.1.0", - description="An efficient and accurate low-bit weight quantization(INT3/4) method for LLMs.", + author="Casper Hansen", + license="MIT", + description="AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.", long_description=open("README.md", "r").read(), long_description_content_type="text/markdown", python_requires=">=3.8", + url="https://github.com/casper-hansen/AutoAWQ", + keywords=["awq", "autoawq", "quantization", "transformers"], classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: C++", ], install_requires=dependencies, - packages=find_packages(exclude=["results*", "scripts*", "examples*"]), + packages=find_packages(exclude=["examples*"]), ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension} ) From 77ca8337650af4e12963f7478a64a82fc236fa6d Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:36:40 +0200 Subject: [PATCH 04/38] Update install instructions --- README.md | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8396b1f0..0bfc5326 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ AutoAWQ is a package that implements the Activation-aware Weight Quantization (A Roadmap: -- [ ] Publish pip package +- [x] Publish pip package - [ ] Refactor quantization code - [ ] Support more models - [ ] Optimize the speed of models @@ -14,7 +14,18 @@ Roadmap: Requirements: - Compute Capability 8.0 (sm80). Ampere and later architectures are supported. -Clone this repository and install with pip. +Install: +- Use pip to install awq + +``` +pip install awq +``` + +### Build source + +
+ +Build AutoAWQ from scratch ``` git clone https://github.com/casper-hansen/AutoAWQ @@ -22,6 +33,8 @@ cd AutoAWQ pip install -e . ``` +
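If the build succeeded, the CUDA extension that `setup.py` compiles should be importable. A minimal, optional sanity check (assuming a supported GPU is present; `awq_inference_engine` is the extension name declared in `setup.py`):

```python
# Quick post-install check: both imports should succeed after `pip install -e .`
import torch
import awq_inference_engine  # CUDA extension built by setup.py

print(torch.cuda.is_available())
```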
+ ## Supported models The detailed support list: @@ -36,6 +49,7 @@ The detailed support list: | OPT | 125m/1.3B/2.7B/6.7B/13B/30B | | Bloom | 560m/3B/7B/ | | LLaVA-v0 | 13B | +| GPTJ | 6.7B | ## Usage @@ -44,8 +58,8 @@ Below, you will find examples for how to easily quantize a model and run inferen ### Quantization ```python +from awq import AutoAWQForCausalLM from transformers import AutoTokenizer -from awq.models.auto import AutoAWQForCausalLM model_path = 'lmsys/vicuna-7b-v1.5' quant_path = 'vicuna-7b-v1.5-awq' @@ -68,8 +82,8 @@ tokenizer.save_pretrained(quant_path) Run inference on a quantized model from Huggingface: ```python +from awq import AutoAWQForCausalLM from transformers import AutoTokenizer -from awq.models.auto import AutoAWQForCausalLM quant_path = "casperhansen/vicuna-7b-v1.5-awq" quant_file = "awq_model_w4_g128.pt" @@ -101,8 +115,11 @@ Benchmark speeds may vary from server to server and that it also depends on your | MPT-30B | A6000 | OOM | 31.57 | -- | | Falcon-7B | A6000 | 39.44 | 27.34 | 1.44x | +
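A tokens/s figure like the ones above can be estimated by timing `generate` on a fixed prompt. The sketch below is an illustrative helper, not the exact benchmark script behind this table; it reuses the `model` and `tokens` objects from the inference example above:

```python
import time

def tokens_per_second(model, input_ids, n_new=256):
    # Time a single generate call; prefill and decode are averaged together.
    start = time.time()
    output_ids = model.generate(input_ids, max_new_tokens=n_new)
    elapsed = time.time() - start
    generated = output_ids.shape[1] - input_ids.shape[1]
    return generated / elapsed
```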
-For example, here is the difference between a fast and slow CPU on MPT-7B: +Detailed benchmark (CPU vs. GPU) + +Here is the difference between a fast and slow CPU on MPT-7B: RTX 4090 + Intel i9 13900K (2 different VMs): - CUDA 12.0, Driver 525.125.06: 134 tokens/s (7.46 ms/token) @@ -113,6 +130,8 @@ RTX 4090 + AMD EPYC 7-Series (3 different VMs): - CUDA 12.2, Driver 535.54.03: 56 tokens/s (17.71 ms/token) - CUDA 12.0, Driver 525.125.06: 55 tokens/ (18.15 ms/token) +
+ ## Reference If you find AWQ useful or relevant to your research, you can cite their [paper](https://arxiv.org/abs/2306.00978): From 7fee854f417f3d80cb4b6ec191004f9de7cd019b Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:56:31 +0200 Subject: [PATCH 05/38] Add compute capability flags and n_threads as str --- setup.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b5cce475..75246eaf 100644 --- a/setup.py +++ b/setup.py @@ -35,10 +35,19 @@ ext_modules = [] if build_cuda_extension: - n_threads = min(os.cpu_count(), 8) + # figure out compute capability + compute_capabilities = {80, 86, 89, 90} + if torch_is_prebuilt: + compute_capabilities.update({87}) + + capability_flags = ["-gencode", f"arch=compute_{cap},code=sm_{cap}" for cap in compute_capabilities] + # num threads + n_threads = str(min(os.cpu_count(), 8)) + + # final args cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] - nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + capability_flags ext_modules.append( CUDAExtension( From 98ae978aebd2a1133ba6d3d4f950abd1b0a2c97f Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 19:57:52 +0200 Subject: [PATCH 06/38] Fix capability flags --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 75246eaf..32396338 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,9 @@ if torch_is_prebuilt: compute_capabilities.update({87}) - capability_flags = ["-gencode", f"arch=compute_{cap},code=sm_{cap}" for cap in compute_capabilities] + capability_flags = [] + for cap in compute_capabilities: + capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] # num threads n_threads = str(min(os.cpu_count(), 8)) From 5abd53a2464c3a3a11c407b94b0f354fbb041b60 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 21:56:40 +0200 Subject: [PATCH 07/38] Initial workflow, needs testing --- .github/workflows/build.yaml | 99 +++++++++++++++++++ .../scripts/github_create_release.js | 17 ++++ 2 files changed, 116 insertions(+) create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/scripts/github_create_release.js diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 00000000..636af3f1 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,99 @@ +name: Build + +# run jobs on new release starting with "AutoAWQ v" +on: + push: + tags: + - AutoAWQ v* + +jobs: + # create a github release + release: + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) + + # build AWQ + build: + name: Build AWQ + runs-on: ${{ matrix.os }} + needs: release + strategy: + matrix: + os: [windows-latest, ubuntu-20.04] + python-version: [3.8, 3.9, 3.10, 3.11] + cuda: ["11.8"] + defaults: + run: + shell: pwsh + env: + CUDA_VERSION: ${{ matrix.cuda }} + + steps: + - name: Checkout code + uses: 
actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2.2.0 + with: + activate-environment: "build" + python-version: ${{ matrix.python-version }} + mamba-version: "*" + use-mamba: false + channels: conda-forge,defaults + channel-priority: true + add-pip-as-python-dependency: true + auto-activate-base: false + + - name: Install Dependencies + run: | + conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0" + conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia + python -m pip install --upgrade build setuptools wheel ninja + + - name: Build Wheel + run: | + $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' + $env:PYPI_RELEASE = "1" + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } + + python setup.py sdist bdist_wheel + + $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } + echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV + + - name: Upload Release Asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.release.outputs.upload_url }} + asset_path: ./dist/${{ env.wheel_path }} + asset_content_type: application/* \ No newline at end of file diff --git a/.github/workflows/scripts/github_create_release.js b/.github/workflows/scripts/github_create_release.js new file mode 100644 index 00000000..fe26188b --- /dev/null +++ b/.github/workflows/scripts/github_create_release.js @@ -0,0 +1,17 @@ +module.exports = async (github, context, core) => { + try { + const response = await github.rest.repos.createRelease({ + draft: false, + generate_release_notes: true, + name: process.env.RELEASE_TAG, + owner: context.repo.owner, + prerelease: false, + repo: context.repo.repo, + tag_name: process.env.RELEASE_TAG, + }); + + core.setOutput('upload_url', response.data.upload_url); + } catch (error) { + core.setFailed(error.message); + } +} From 80ccf31ad408eba5f62485a21419362f5dcf52ce Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 22:28:53 +0200 Subject: [PATCH 08/38] run jobs on new release starting with "v" --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 636af3f1..a8ea75b7 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,10 +1,10 @@ name: Build -# run jobs on new release starting with "AutoAWQ v" +# run jobs on new release starting with "v" on: push: tags: - - AutoAWQ v* + - v* jobs: # create a github release From 98d874d1c4a16e7654b6f9ba1da685bb6e617229 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 22:42:11 +0200 Subject: [PATCH 09/38] Remove python from build --- .github/workflows/build.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index a8ea75b7..43841eb8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -52,11 +52,6 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2.2.0 From f741f406fe868703e6cd27d5c6b1661056f19197 Mon Sep 17 00:00:00 2001 From: 
Casper Date: Sun, 27 Aug 2023 22:59:12 +0200 Subject: [PATCH 10/38] Quote on python versions (fix: 3.10 was interpreted as 3.1) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 43841eb8..30c7615d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -41,7 +41,7 @@ jobs: strategy: matrix: os: [windows-latest, ubuntu-20.04] - python-version: [3.8, 3.9, 3.10, 3.11] + python-version: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: run: From b491c2d68c51cbb330f8eb8b2aaadea344dc1c25 Mon Sep 17 00:00:00 2001 From: Casper Date: Sun, 27 Aug 2023 23:31:51 +0200 Subject: [PATCH 11/38] Replace print with logging, Remove uncommented code --- awq/modules/fused_attn.py | 9 +-------- awq/modules/fused_mlp.py | 1 - awq/modules/fused_norm.py | 2 -- awq/quantize/auto_scale.py | 4 ++-- awq/utils/calib_data.py | 3 ++- awq/utils/lm_eval_adaptor.py | 4 ++-- awq/utils/parallel.py | 3 ++- 7 files changed, 9 insertions(+), 17 deletions(-) diff --git a/awq/modules/fused_attn.py b/awq/modules/fused_attn.py index 2615ce7a..28d55cae 100644 --- a/awq/modules/fused_attn.py +++ b/awq/modules/fused_attn.py @@ -34,8 +34,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): sin = freqs.sin() cache = torch.cat((cos, sin), dim=-1) - # self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - # self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) self.register_buffer("cos_sin_cache", cache.half(), persistent=False) def forward( @@ -46,7 +44,6 @@ def forward( ): # Apply rotary embedding to the query and key before passing them # to the attention op. - # print(positions.shape, query.shape, key.shape, self.cos_sin_cache.shape) query = query.contiguous() key = key.contiguous() awq_inference_engine.rotary_embedding_neox( @@ -146,7 +143,7 @@ def make_quant_attn(model, dev): qweights = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=1) qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=1) scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=1) - # g_idx = torch.cat([q_proj.g_idx, k_proj.g_idx, v_proj.g_idx], dim=0) + g_idx = None bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None @@ -156,8 +153,6 @@ def make_quant_attn(model, dev): qkv_layer.scales = scales qkv_layer.bias = bias - # We're dropping the rotary embedding layer m.rotary_emb here. We don't need it in the triton branch. - attn = QuantLlamaAttention(m.hidden_size, m.num_heads, qkv_layer, m.o_proj, dev) if '.' in name: @@ -169,6 +164,4 @@ def make_quant_attn(model, dev): parent = model child_name = name - #print(f"Replacing {name} with quant_attn; parent: {parent_name}, child's name: {child_name}") - setattr(parent, child_name, attn) diff --git a/awq/modules/fused_mlp.py b/awq/modules/fused_mlp.py index 6957a329..0ca30baf 100644 --- a/awq/modules/fused_mlp.py +++ b/awq/modules/fused_mlp.py @@ -71,7 +71,6 @@ def our_llama_mlp(self, x): def make_fused_mlp(m, parent_name=''): if not hasattr(make_fused_mlp, "called"): - # print("[Warning] Calling a fake MLP fusion. But still faster than Huggingface Implimentation.") make_fused_mlp.called = True """ Replace all LlamaMLP modules with QuantLlamaMLP modules, which fuses many of the operations. 
diff --git a/awq/modules/fused_norm.py b/awq/modules/fused_norm.py index 50f49c3a..9ce8f64b 100644 --- a/awq/modules/fused_norm.py +++ b/awq/modules/fused_norm.py @@ -38,6 +38,4 @@ def make_quant_norm(model): parent = model child_name = name - #print(f"Replacing {name} with quant_attn; parent: {parent_name}, child's name: {child_name}") - setattr(parent, child_name, norm) diff --git a/awq/quantize/auto_scale.py b/awq/quantize/auto_scale.py index a868ed6f..18bc3009 100644 --- a/awq/quantize/auto_scale.py +++ b/awq/quantize/auto_scale.py @@ -1,6 +1,7 @@ import gc import torch import torch.nn as nn +import logging from transformers.models.bloom.modeling_bloom import BloomBlock, BloomGelu from transformers.models.opt.modeling_opt import OPTDecoderLayer @@ -154,9 +155,8 @@ def _search_module_scale(block, linears2scale: list, x, kwargs={}): best_scales = scales block.load_state_dict(org_sd) if best_ratio == -1: - print(history) + logging.debug(history) raise Exception - # print(best_ratio) best_scales = best_scales.view(-1) assert torch.isnan(best_scales).sum() == 0, best_scales diff --git a/awq/utils/calib_data.py b/awq/utils/calib_data.py index 0c6f82be..9320a1e1 100644 --- a/awq/utils/calib_data.py +++ b/awq/utils/calib_data.py @@ -1,4 +1,5 @@ import torch +import logging from datasets import load_dataset def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512): @@ -25,5 +26,5 @@ def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size= # now concatenate all samples and split according to block size cat_samples = torch.cat(samples, dim=1) n_split = cat_samples.shape[1] // block_size - print(f" * Split into {n_split} blocks") + logging.debug(f" * Split into {n_split} blocks") return [cat_samples[:, i*block_size:(i+1)*block_size] for i in range(n_split)] diff --git a/awq/utils/lm_eval_adaptor.py b/awq/utils/lm_eval_adaptor.py index c1c35d05..a38f1c4f 100644 --- a/awq/utils/lm_eval_adaptor.py +++ b/awq/utils/lm_eval_adaptor.py @@ -2,7 +2,7 @@ import torch from lm_eval.base import BaseLM import fnmatch - +import logging class LMEvalAdaptor(BaseLM): @@ -52,7 +52,7 @@ def max_length(self): elif 'falcon' in self.model_name: return 2048 else: - print(self.model.config) + logging.debug(self.model.config) raise NotImplementedError @property diff --git a/awq/utils/parallel.py b/awq/utils/parallel.py index f1ba27b0..eb4389bc 100644 --- a/awq/utils/parallel.py +++ b/awq/utils/parallel.py @@ -1,6 +1,7 @@ import os import torch import gc +import logging def auto_parallel(args): @@ -23,5 +24,5 @@ def auto_parallel(args): cuda_visible_devices = list(range(8)) os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(dev) for dev in cuda_visible_devices[:n_gpu]]) - print("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) + logging.debug("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) return cuda_visible_devices From 9ba3afacd7e9e02b3e49fdd42a02ae6ac70dd328 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Sun, 27 Aug 2023 22:31:07 +0000 Subject: [PATCH 12/38] Add basic generation example --- examples/basic_generate.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 examples/basic_generate.py diff --git a/examples/basic_generate.py b/examples/basic_generate.py new file mode 100644 index 00000000..5a9a678f --- /dev/null +++ b/examples/basic_generate.py @@ -0,0 +1,29 @@ +from awq import AutoAWQForCausalLM +from transformers import AutoTokenizer, TextStreamer + +quant_path = 
"casperhansen/vicuna-7b-v1.5-awq" +quant_file = "awq_model_w4_g128.pt" + +# Load model +model = AutoAWQForCausalLM.from_quantized(quant_path, quant_file, fuse_layers=True) +tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) +streamer = TextStreamer(tokenizer, skip_special_tokens=True) + +# Convert prompt to tokens +prompt_template = """\ +A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. + +USER: {prompt} +ASSISTANT:""" + +tokens = tokenizer( + prompt_template.format(prompt="How are you today?"), + return_tensors='pt' +).input_ids.cuda() + +# Generate output +generation_output = model.generate( + tokens, + streamer=streamer, + max_new_tokens=512 +) From d6bd9db7653a8bf8dc8e5e0e83db5837146588a1 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Sun, 27 Aug 2023 22:31:40 +0000 Subject: [PATCH 13/38] Standardize ASM calls. Add Windows support. --- awq_cuda/quantization/dequantize.cuh | 16 +++---- awq_cuda/quantization/gemm_cuda_gen.cu | 61 ++++++++++++++------------ 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/awq_cuda/quantization/dequantize.cuh b/awq_cuda/quantization/dequantize.cuh index 5d333b35..368cf984 100644 --- a/awq_cuda/quantization/dequantize.cuh +++ b/awq_cuda/quantization/dequantize.cuh @@ -34,19 +34,19 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // immediately before required. const uint32_t top_i4s = i4s >> 8; // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + ASM("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[3]) : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); @@ -66,13 +66,13 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // Finally, we construct the output numbers. 
// Convert elt_01 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_23 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); // Convert elt_45 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_67 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); return result; } diff --git a/awq_cuda/quantization/gemm_cuda_gen.cu b/awq_cuda/quantization/gemm_cuda_gen.cu index 1632d8be..a44b195a 100644 --- a/awq_cuda/quantization/gemm_cuda_gen.cu +++ b/awq_cuda/quantization/gemm_cuda_gen.cu @@ -9,6 +9,11 @@ */ +#ifdef _MSC_VER +#define ASM __asm +#else +#define ASM asm volatile +#endif #include #include "gemm_cuda.h" @@ -132,14 +137,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", 
B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -154,14 +159,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -172,12 +177,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 2176) + (((int)threadIdx.y) * 64)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 136) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -187,7 +192,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) { { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -195,7 +200,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -326,14 +331,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -349,12 +354,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -367,12 +372,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - __asm__ __volatile__( + ASM( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 1152) + (((int)threadIdx.y) * 32)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 72) + ((((int)threadIdx.x) >> 4) * 8)))) ); - __asm__ __volatile__( + ASM( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -385,7 +390,7 @@ __global__ void __launch_bounds__(64) 
gemm_forward_4bit_cuda_m16n64k32(int G, in { { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -393,7 +398,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - __asm__ __volatile__( + ASM( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) From acdf7d9a93af6b137844f096a5941c67d22947f9 Mon Sep 17 00:00:00 2001 From: Casper Date: Mon, 28 Aug 2023 10:41:42 +0200 Subject: [PATCH 14/38] Define HALF_FLT_MAX --- awq_cuda/layernorm/reduction.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awq_cuda/layernorm/reduction.cuh b/awq_cuda/layernorm/reduction.cuh index 678160e8..f670d185 100644 --- a/awq_cuda/layernorm/reduction.cuh +++ b/awq_cuda/layernorm/reduction.cuh @@ -16,7 +16,7 @@ https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kern #include #include -static const float HALF_FLT_MAX = 65504.F; +#define HALF_FLT_MAX 65504.F #define FINAL_MASK 0xffffffff From 4c39a76a17d08f6d6eea95abe9f6c7ff7107ab24 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Mon, 28 Aug 2023 10:36:59 +0000 Subject: [PATCH 15/38] NVCC: Use asm volatile --- awq_cuda/quantization/dequantize.cuh | 16 +++---- awq_cuda/quantization/gemm_cuda_gen.cu | 65 +++++++++++--------------- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/awq_cuda/quantization/dequantize.cuh b/awq_cuda/quantization/dequantize.cuh index 368cf984..5d333b35 100644 --- a/awq_cuda/quantization/dequantize.cuh +++ b/awq_cuda/quantization/dequantize.cuh @@ -34,19 +34,19 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // immediately before required. const uint32_t top_i4s = i4s >> 8; // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 - ASM("lop3.b32 %0, %1, %2, %3, %4;\n" + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[3]) : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); @@ -66,13 +66,13 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) // Finally, we construct the output numbers. 
// Convert elt_01 - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_23 - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); // Convert elt_45 - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); // Convert elt_67 - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); return result; } diff --git a/awq_cuda/quantization/gemm_cuda_gen.cu b/awq_cuda/quantization/gemm_cuda_gen.cu index a44b195a..067a9c0f 100644 --- a/awq_cuda/quantization/gemm_cuda_gen.cu +++ b/awq_cuda/quantization/gemm_cuda_gen.cu @@ -9,12 +9,6 @@ */ -#ifdef _MSC_VER -#define ASM __asm -#else -#define ASM asm volatile -#endif - #include #include "gemm_cuda.h" #include "dequantize.cuh" @@ -36,9 +30,6 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i float C_warp[32]; __shared__ half A_shared[16 * (32 + 8)]; __shared__ half B_shared[32 * (128 + 8)]; - - __shared__ half scaling_factors_shared[128]; - __shared__ half zeros_shared[128]; int j_factors1 = ((OC + 128 - 1) / 128); int blockIdx_x = 0; @@ -137,14 +128,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
- ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -159,14 +150,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -177,12 +168,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 2176) + (((int)threadIdx.y) * 64)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 136) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -192,7 +183,7 @@ __global__ 
void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) { { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -200,7 +191,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -331,14 +322,14 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); // - zero and * scale // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - ASM("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - ASM("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); /* if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); @@ -354,12 +345,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void 
*)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) @@ -372,12 +363,12 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { unsigned int addr; - ASM( + asm volatile( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) : "l"((void *)((&(B_shared[(((k_0_1 * 1152) + (((int)threadIdx.y) * 32)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 72) + ((((int)threadIdx.x) >> 4) * 8)))) ); - ASM( + asm volatile( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" "{%0, %1, %2, %3}, [%4];\n" : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) @@ -390,7 +381,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -398,7 +389,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - ASM( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) From a20d30dd36daa77d8630fa45aded79be80db8dbc Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Mon, 28 Aug 2023 10:37:30 +0000 Subject: [PATCH 16/38] Remove windows build --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 30c7615d..4513d20f 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -40,7 +40,7 @@ jobs: needs: release strategy: matrix: - os: [windows-latest, ubuntu-20.04] + os: [ubuntu-20.04] python-version: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: From 6aa94f548eca4d127ddac5fcbb51b59211e7ca4e Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Mon, 28 Aug 2023 11:36:13 +0000 Subject: [PATCH 17/38] Add Python checkout --- .github/workflows/build.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 4513d20f..1dedb0c8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -53,6 +53,11 @@ jobs: - name: Checkout code uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2.2.0 with: From 5bc0916b79a0a13977deb8de8eed7d222d36f9c9 Mon Sep 17 00:00:00 2001 From: Casper Date: Mon, 28 Aug 2023 14:39:30 +0200 Subject: [PATCH 18/38] Separate PyTorch install --- .github/workflows/build.yaml | 61 ++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 
24 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1dedb0c8..0b71d1f3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -3,8 +3,8 @@ name: Build # run jobs on new release starting with "v" on: push: - tags: - - v* + branches: + - release_package jobs: # create a github release @@ -22,16 +22,16 @@ jobs: run: | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) + # - name: Create Release + # id: create_release + # uses: "actions/github-script@v6" + # env: + # RELEASE_TAG: ${{ env.release_tag }} + # with: + # github-token: "${{ secrets.GITHUB_TOKEN }}" + # script: | + # const script = require('.github/workflows/scripts/github_create_release.js') + # await script(github, context, core) # build AWQ build: @@ -42,21 +42,19 @@ jobs: matrix: os: [ubuntu-20.04] python-version: ["3.8", "3.9", "3.10", "3.11"] - cuda: ["11.8"] + cuda-version: ["11.8"] defaults: run: shell: pwsh - env: - CUDA_VERSION: ${{ matrix.cuda }} steps: - name: Checkout code uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: - python-version: ${{ matrix.pyver }} + python-version: ${{ matrix.python-version }} - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2.2.0 @@ -70,19 +68,34 @@ jobs: add-pip-as-python-dependency: true auto-activate-base: false - - name: Install Dependencies + - name: Install CUDA + run: | + conda install cuda-toolkit -c "nvidia/label/cuda-${{ matrix.cuda-version }}.0" + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + + echo "$env:CUDA_PATH" + echo "$env:CUDA_HOME" + + - name: Install PyTorch-cu${{ matrix.cuda-version }} run: | - conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0" - conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia + $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' + if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } + + # Install torch + $env:CUDA_VERSION = ${{ matrix.cuda-version }} -replace '\.', '' + pip install --upgrade --no-cache-dir torch==2.0.1+cu$env:CUDA_VERSION --index-url https://download.pytorch.org/whl/cu$env:CUDA_VERSION python -m pip install --upgrade build setuptools wheel ninja + + # Print version information + python --version + python -c "import torch; print('PyTorch:', torch.__version__)" + python -c "import torch; print('CUDA:', torch.version.cuda)" + python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" - name: Build Wheel run: | - $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' $env:PYPI_RELEASE = "1" - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } python setup.py sdist bdist_wheel From 7e361d16b5dd4102a03933255c02dd8411b4a8b4 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 12:13:24 +0200 Subject: [PATCH 19/38] Add cuda_runtime in include_dirs --- setup.py | 61 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 32396338..98684d9a 100644 --- 
a/setup.py +++ b/setup.py @@ -1,23 +1,31 @@ import os import torch from setuptools import setup, find_packages +from distutils.sysconfig import get_python_lib from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME -if CUDA_HOME is None: - raise RuntimeError( - f"Cannot find CUDA_HOME. CUDA must be available to build the package.") +def check_dependencies(): + if CUDA_HOME is None: + raise RuntimeError( + f"Cannot find CUDA_HOME. CUDA must be available to build the package.") -# Collect the compute capabilities of all available GPUs. -compute_capabilities = set() -for i in range(torch.cuda.device_count()): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") - compute_capabilities.add(major * 10 + minor) - -# Get environment variables -build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' -torch_is_prebuilt = os.environ.get('TORCH_IS_PREBUILT', '0') == '1' +def get_compute_capabilities(): + # Collect the compute capabilities of all available GPUs. + compute_capabilities = set() + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") + compute_capabilities.add(major * 10 + minor) + + # figure out compute capability + compute_capabilities = {80, 86, 89, 90} + + capability_flags = [] + for cap in compute_capabilities: + capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] + + return capability_flags # Define dependencies dependencies = [ @@ -25,29 +33,22 @@ "transformers>=4.32.0", "lm_eval", "texttable", "toml", "attributedict", - "protobuf" + "protobuf", + "torch>=2.0.0", "torchvision" ] -if not torch_is_prebuilt: - dependencies.extend(["torch>=2.0.0", "torchvision"]) +# Get environment variables +build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' # Setup CUDA extension ext_modules = [] if build_cuda_extension: - # figure out compute capability - compute_capabilities = {80, 86, 89, 90} - if torch_is_prebuilt: - compute_capabilities.update({87}) - - capability_flags = [] - for cap in compute_capabilities: - capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] - # num threads n_threads = str(min(os.cpu_count(), 8)) # final args + capability_flags = get_compute_capabilities() cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + capability_flags @@ -67,6 +68,13 @@ ) ) +# Find directories to be included in setup +include_dirs = [] +conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include") + +if os.path.isdir(conda_cuda_include_dir): + include_dirs.append(conda_cuda_include_dir) + setup( name="autoawq", version="0.1.0", @@ -90,6 +98,7 @@ "Programming Language :: C++", ], install_requires=dependencies, + include_dirs=include_dirs, packages=find_packages(exclude=["examples*"]), ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension} From b574767f4a3dfa18481ed7e1745bb7e4c9646da7 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 20:28:06 +0200 Subject: [PATCH 20/38] Improved setup.py structure and build instructions --- .github/workflows/build.yaml | 132 ++++++++++++++---------------- setup.py | 153 +++++++++++++++-------------------- 2 files changed, 126 insertions(+), 159 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml 
index 0b71d1f3..51922f62 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,91 +22,81 @@ jobs: run: | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - # - name: Create Release - # id: create_release - # uses: "actions/github-script@v6" - # env: - # RELEASE_TAG: ${{ env.release_tag }} - # with: - # github-token: "${{ secrets.GITHUB_TOKEN }}" - # script: | - # const script = require('.github/workflows/scripts/github_create_release.js') - # await script(github, context, core) + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) # build AWQ - build: + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} needs: release strategy: matrix: os: [ubuntu-20.04] - python-version: ["3.8", "3.9", "3.10", "3.11"] - cuda-version: ["11.8"] + pyver: ["3.8", "3.9", "3.10", "3.11"] + cuda: ["11.8"] defaults: run: shell: pwsh + env: + CUDA_VERSION: ${{ matrix.cuda }} steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Miniconda - uses: conda-incubator/setup-miniconda@v2.2.0 - with: - activate-environment: "build" - python-version: ${{ matrix.python-version }} - mamba-version: "*" - use-mamba: false - channels: conda-forge,defaults - channel-priority: true - add-pip-as-python-dependency: true - auto-activate-base: false - - - name: Install CUDA - run: | - conda install cuda-toolkit -c "nvidia/label/cuda-${{ matrix.cuda-version }}.0" - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - - echo "$env:CUDA_PATH" - echo "$env:CUDA_HOME" - - - name: Install PyTorch-cu${{ matrix.cuda-version }} - run: | - $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' - if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } - - # Install torch - $env:CUDA_VERSION = ${{ matrix.cuda-version }} -replace '\.', '' - pip install --upgrade --no-cache-dir torch==2.0.1+cu$env:CUDA_VERSION --index-url https://download.pytorch.org/whl/cu$env:CUDA_VERSION - python -m pip install --upgrade build setuptools wheel ninja + - uses: actions/checkout@v3 - # Print version information - python --version - python -c "import torch; print('PyTorch:', torch.__version__)" - python -c "import torch; print('CUDA:', torch.version.cuda)" - python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" - - - name: Build Wheel - run: | - $env:PYPI_RELEASE = "1" + - uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.pyver }} + + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2.2.0 + with: + activate-environment: "build" + python-version: ${{ matrix.pyver }} + mamba-version: "*" + use-mamba: false + channels: conda-forge,defaults + channel-priority: true + add-pip-as-python-dependency: true + auto-activate-base: false + + - name: Install Dependencies + run: | + conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0" + conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia + python -m pip install --upgrade build setuptools wheel ninja + + # Print version information + python --version + python -c "import torch; print('PyTorch:', torch.__version__)" + python -c "import torch; 
print('CUDA:', torch.version.cuda)" + python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" + + - name: Build Wheel + run: | + $env:CUDA_PATH = $env:CONDA_PREFIX + $env:CUDA_HOME = $env:CONDA_PREFIX + if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} + $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' - python setup.py sdist bdist_wheel + python setup.py sdist bdist_wheel - $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } - echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV + $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } + echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - name: Upload Release Asset - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.release.outputs.upload_url }} - asset_path: ./dist/${{ env.wheel_path }} - asset_content_type: application/* \ No newline at end of file + - name: Upload Release Asset + uses: shogo82148/actions-upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.release.outputs.upload_url }} + asset_path: ./dist/${{ env.wheel_path }} + asset_content_type: application/* \ No newline at end of file diff --git a/setup.py b/setup.py index 98684d9a..bbad0e59 100644 --- a/setup.py +++ b/setup.py @@ -1,105 +1,82 @@ import os -import torch +from pathlib import Path +from torch.utils import cpp_extension from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME -def check_dependencies(): - if CUDA_HOME is None: - raise RuntimeError( - f"Cannot find CUDA_HOME. CUDA must be available to build the package.") +os.environ["CC"] = "g++" +os.environ["CXX"] = "g++" -def get_compute_capabilities(): - # Collect the compute capabilities of all available GPUs. 
- compute_capabilities = set() - for i in range(torch.cuda.device_count()): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") - compute_capabilities.add(major * 10 + minor) - - # figure out compute capability - compute_capabilities = {80, 86, 89, 90} - - capability_flags = [] - for cap in compute_capabilities: - capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] - - return capability_flags +common_setup_kwargs = { + "version": "0.0.1", + "name": "autoawq", + "author": "Casper Hansen", + "license": "MIT", + "python_requires": ">=3.8.0", + "description": "AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.", + "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"), + "long_description_content_type": "text/markdown", + "url": "https://github.com/casper-hansen/AutoAWQ", + "keywords": ["awq", "autoawq", "quantization", "transformers"], + "platforms": ["linux"], + "classifiers": [ + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Environment :: GPU :: NVIDIA CUDA :: 12", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: C++", + ] +} -# Define dependencies -dependencies = [ - "accelerate", "sentencepiece", "tokenizers>=0.12.1", - "transformers>=4.32.0", - "lm_eval", "texttable", - "toml", "attributedict", +requirements = [ + "torch>=2.0.0", + "transformers>=4.32.0", + "tokenizers>=0.12.1", + "accelerate", + "sentencepiece", + "lm_eval", + "texttable", + "toml", + "attributedict", "protobuf", - "torch>=2.0.0", "torchvision" + "torchvision" ] -# Get environment variables -build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1' - -# Setup CUDA extension -ext_modules = [] - -if build_cuda_extension: - # num threads - n_threads = str(min(os.cpu_count(), 8)) +include_dirs = [] - # final args - capability_flags = get_compute_capabilities() - cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"] - nvcc_args = ["-O3", "-std=c++17", "--threads", n_threads] + capability_flags +conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include") +if os.path.isdir(conda_cuda_include_dir): + include_dirs.append(conda_cuda_include_dir) - ext_modules.append( - CUDAExtension( - name="awq_inference_engine", - sources=[ - "awq_cuda/pybind.cpp", - "awq_cuda/quantization/gemm_cuda_gen.cu", - "awq_cuda/layernorm/layernorm.cu", - "awq_cuda/position_embedding/pos_encoding_kernels.cu" - ], - extra_compile_args={ - "cxx": cxx_args, - "nvcc": nvcc_args - }, - ) +extensions = [ + cpp_extension.CppExtension( + "awq_inference_engine", + [ + "awq_cuda/pybind.cpp", + "awq_cuda/quantization/gemm_cuda_gen.cu", + "awq_cuda/layernorm/layernorm.cu", + "awq_cuda/position_embedding/pos_encoding_kernels.cu" + ], extra_compile_args={ + "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], + "nvcc": ["-O3", "-std=c++17"] + } ) +] -# Find directories to be included in setup -include_dirs = [] -conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include") +additional_setup_kwargs = { + "ext_modules": extensions, + "cmdclass": {'build_ext': cpp_extension.BuildExtension} +} -if os.path.isdir(conda_cuda_include_dir): - 
include_dirs.append(conda_cuda_include_dir) +common_setup_kwargs.update(additional_setup_kwargs) setup( - name="autoawq", - version="0.1.0", - author="Casper Hansen", - license="MIT", - description="AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.", - long_description=open("README.md", "r").read(), - long_description_content_type="text/markdown", - python_requires=">=3.8", - url="https://github.com/casper-hansen/AutoAWQ", - keywords=["awq", "autoawq", "quantization", "transformers"], - classifiers=[ - "Environment :: GPU :: NVIDIA CUDA :: 11.8", - "Environment :: GPU :: NVIDIA CUDA :: 12", - "License :: OSI Approved :: MIT License", - "Natural Language :: English", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: C++", - ], - install_requires=dependencies, + packages=find_packages(), + install_requires=requirements, include_dirs=include_dirs, - packages=find_packages(exclude=["examples*"]), - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension} -) + **common_setup_kwargs +) \ No newline at end of file From 2d70f22af7e039c6f03d17f0325f2c7b643e4800 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:00:25 +0200 Subject: [PATCH 21/38] Reduce build file --- .github/workflows/build.yaml | 52 +++++++++--------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 51922f62..08ce060e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,47 +1,18 @@ -name: Build +name: Build AutoGPTQ Wheels with CUDA -# run jobs on new release starting with "v" on: push: branches: - release_package jobs: - # create a github release - release: - name: Create Release - runs-on: ubuntu-latest - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - - # build AWQ build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} - needs: release strategy: matrix: os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10", "3.11"] + pyver: ["3.8", "3.9", "3.10"] cuda: ["11.8"] defaults: run: @@ -91,12 +62,15 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - - name: Upload Release Asset - uses: shogo82148/actions-upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - uses: actions/upload-artifact@v3 + if: runner.os == 'Linux' + with: + name: 'linux-cuda-wheels' + path: ./dist/*.whl + + - uses: actions/upload-artifact@v3 + if: runner.os == 'Windows' with: - upload_url: ${{ needs.release.outputs.upload_url }} - asset_path: ./dist/${{ env.wheel_path }} - asset_content_type: application/* \ No newline at end of file + name: 'windows-cuda-wheels' + path: ./dist/*.whl \ No newline at end of file From 04b164c42792f95f3fec62980259c0e0b84b0a84 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:02:43 +0200 
Subject: [PATCH 22/38] Update build.yaml --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 08ce060e..49ac324e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,4 +1,4 @@ -name: Build AutoGPTQ Wheels with CUDA +name: Build AutoAWQ Wheels with CUDA on: push: From 8149d362674a5563517f9b55a445f3c14011288c Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:52:30 +0200 Subject: [PATCH 23/38] Create release after build upon new tag --- .github/workflows/build.yaml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 49ac324e..3276c485 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -2,8 +2,8 @@ name: Build AutoAWQ Wheels with CUDA on: push: - branches: - - release_package + tags: + - "v*" jobs: build_wheels: @@ -62,15 +62,13 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - - uses: actions/upload-artifact@v3 - if: runner.os == 'Linux' - with: - name: 'linux-cuda-wheels' - path: ./dist/*.whl - - - uses: actions/upload-artifact@v3 - if: runner.os == 'Windows' + + - name: Create a Release + id: create_release + uses: shogo82148/actions-create-release@v1 + + - name: Upload Assets + uses: shogo82148/actions-upload-release-asset@v1 with: - name: 'windows-cuda-wheels' - path: ./dist/*.whl \ No newline at end of file + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ./dist/*.whl \ No newline at end of file From 0fab60af011dc516afec1c0fb5b334f2d6ad4e31 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 21:59:00 +0200 Subject: [PATCH 24/38] Add CUDA 11.8 to requirements list --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0bfc5326..e77e2068 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Roadmap: Requirements: - Compute Capability 8.0 (sm80). Ampere and later architectures are supported. +- CUDA Toolkit 11.8 and later. 
Install: - Use pip to install awq From 51f89b6c8b850091771514656c7f263bba012603 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 22:38:25 +0200 Subject: [PATCH 25/38] Replace creating release with script --- .github/workflows/build.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3276c485..efda539c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -63,9 +63,16 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - name: Create a Release + - name: Create Release id: create_release - uses: shogo82148/actions-create-release@v1 + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/create_release.js') + await script(github, context, core) - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 From 248f6418095b2be25539dce3cc383f011a107062 Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 23:18:20 +0200 Subject: [PATCH 26/38] Move creating release to start of build --- .github/workflows/build.yaml | 41 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index efda539c..54366688 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,9 +6,37 @@ on: - "v*" jobs: + release: + # Retrieve tag and create release + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/create_release.js') + await script(github, context, core) + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} + needs: release + strategy: matrix: os: [ubuntu-20.04] @@ -63,19 +91,8 @@ jobs: $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/create_release.js') - await script(github, context, core) - - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ steps.create_release.outputs.upload_url }} + upload_url: ${{ needs.release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From e2768c29560551123fd9bd91f9703254cd30cf0e Mon Sep 17 00:00:00 2001 From: Casper Date: Tue, 29 Aug 2023 23:23:29 +0200 Subject: [PATCH 27/38] Fixed path to script --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 54366688..c73108a0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -29,7 +29,7 @@ jobs: with: github-token: "${{ secrets.GITHUB_TOKEN }}" script: | - 
const script = require('.github/workflows/scripts/create_release.js') + const script = require('.github/workflows/scripts/github_create_release.js') await script(github, context, core) build_wheels: From bfa5ba70addb884091043290b80f79de9958d7ce Mon Sep 17 00:00:00 2001 From: Casper Date: Wed, 30 Aug 2023 00:08:13 +0200 Subject: [PATCH 28/38] Build 3.11 --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c73108a0..dc1272a5 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -40,7 +40,7 @@ jobs: strategy: matrix: os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10"] + pyver: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: run: From 83c2f09d79d97306c05f26723913ef0e43d31b04 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 16:45:42 +0200 Subject: [PATCH 29/38] Add compute capability --- setup.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index bbad0e59..4e79a9a6 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,9 @@ import os +import torch from pathlib import Path -from torch.utils import cpp_extension from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib +from torch.utils import cpp_extension, CUDA_HOME os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -52,6 +53,32 @@ if os.path.isdir(conda_cuda_include_dir): include_dirs.append(conda_cuda_include_dir) +def check_dependencies(): + if CUDA_HOME is None: + raise RuntimeError( + f"Cannot find CUDA_HOME. CUDA must be available to build the package.") + +def get_compute_capabilities(): + # Collect the compute capabilities of all available GPUs. 
+ compute_capabilities = set() + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.") + compute_capabilities.add(major * 10 + minor) + + # figure out compute capability + compute_capabilities = {80, 86, 89, 90} + + capability_flags = [] + for cap in compute_capabilities: + capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] + + return capability_flags + +check_dependencies() +arch_flags = get_compute_capabilities() + extensions = [ cpp_extension.CppExtension( "awq_inference_engine", @@ -62,7 +89,7 @@ "awq_cuda/position_embedding/pos_encoding_kernels.cu" ], extra_compile_args={ "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], - "nvcc": ["-O3", "-std=c++17"] + "nvcc": ["-O3", "-std=c++17"] + arch_flags } ) ] From ac36d82e3976e6c89dd5c73fae3875a4b8618def Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 17:19:59 +0200 Subject: [PATCH 30/38] Fix torch import --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4e79a9a6..1986250d 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pathlib import Path from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils import cpp_extension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -80,7 +80,7 @@ def get_compute_capabilities(): arch_flags = get_compute_capabilities() extensions = [ - cpp_extension.CppExtension( + CppExtension( "awq_inference_engine", [ "awq_cuda/pybind.cpp", @@ -96,7 +96,7 @@ def get_compute_capabilities(): additional_setup_kwargs = { "ext_modules": extensions, - "cmdclass": {'build_ext': cpp_extension.BuildExtension} + "cmdclass": {'build_ext': BuildExtension} } common_setup_kwargs.update(additional_setup_kwargs) From 87c59401272a343837ea5e3513bc47bb449ec323 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 17:25:38 +0200 Subject: [PATCH 31/38] Move release to end --- .github/workflows/build.yaml | 46 +++++++++++++----------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index dc1272a5..8c078878 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,36 +6,9 @@ on: - "v*" jobs: - release: - # Retrieve tag and create release - name: Create Release - runs-on: ubuntu-latest - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} - needs: release strategy: matrix: @@ -87,12 +60,25 @@ jobs: $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' python setup.py sdist bdist_wheel + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - $wheel_path = Get-ChildItem dist\*.whl | ForEach-Object { $_.Name } - echo "wheel_path=$wheel_path" 
>> $env:GITHUB_ENV + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ needs.release.outputs.upload_url }} + upload_url: ${{ steps.create_release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From 7e69373f623a81e22f7db5314604bec7c7e7dd9f Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 18:34:56 +0200 Subject: [PATCH 32/38] Rollback build --- .github/workflows/build.yaml | 47 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8c078878..fec17e84 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,14 +6,41 @@ on: - "v*" jobs: + release: + # Retrieve tag and create release + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + # - name: Create Release + # id: create_release + # uses: "actions/github-script@v6" + # env: + # RELEASE_TAG: ${{ env.release_tag }} + # with: + # github-token: "${{ secrets.GITHUB_TOKEN }}" + # script: | + # const script = require('.github/workflows/scripts/github_create_release.js') + # await script(github, context, core) + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} + needs: release strategy: matrix: os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10", "3.11"] + pyver: ["3.8", "3.9", "3.10"] cuda: ["11.8"] defaults: run: @@ -61,24 +88,8 @@ jobs: python setup.py sdist bdist_wheel - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ steps.create_release.outputs.upload_url }} + upload_url: ${{ needs.release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From 3af180279a8768b33aa0ddecd4d76529c5281bc1 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 19:45:21 +0200 Subject: [PATCH 33/38] Move release to end --- .github/workflows/build.yaml | 45 ++++++++++++++---------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fec17e84..e7cdb4e4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,36 +6,9 @@ on: - "v*" jobs: - release: - # Retrieve tag and create release - name: Create Release - runs-on: ubuntu-latest - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - # - name: Create Release - # id: 
create_release - # uses: "actions/github-script@v6" - # env: - # RELEASE_TAG: ${{ env.release_tag }} - # with: - # github-token: "${{ secrets.GITHUB_TOKEN }}" - # script: | - # const script = require('.github/workflows/scripts/github_create_release.js') - # await script(github, context, core) - build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} - needs: release strategy: matrix: @@ -88,8 +61,24 @@ jobs: python setup.py sdist bdist_wheel + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) + - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ needs.release.outputs.upload_url }} + upload_url: ${{ steps.create_release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From ce2dc060e03a6df2f92c7071503f2cfcedc41376 Mon Sep 17 00:00:00 2001 From: Casper Date: Thu, 31 Aug 2023 20:22:17 +0200 Subject: [PATCH 34/38] Move release back up --- .github/workflows/build.yaml | 45 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e7cdb4e4..261b094a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,9 +6,36 @@ on: - "v*" jobs: + release: + # Retrieve tag and create release + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + + - name: Create Release + id: create_release + uses: "actions/github-script@v6" + env: + RELEASE_TAG: ${{ env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/github_create_release.js') + await script(github, context, core) + build_wheels: name: Build AWQ runs-on: ${{ matrix.os }} + needs: release strategy: matrix: @@ -61,24 +88,8 @@ jobs: python setup.py sdist bdist_wheel - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: "actions/github-script@v6" - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/github_create_release.js') - await script(github, context, core) - - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 with: - upload_url: ${{ steps.create_release.outputs.upload_url }} + upload_url: ${{ needs.release.outputs.upload_url }} asset_path: ./dist/*.whl \ No newline at end of file From bdde6c9673324bba6f7955bbf077e09eeb1ac7a7 Mon Sep 17 00:00:00 2001 From: Casper Hansen Date: Thu, 31 Aug 2023 21:53:19 +0000 Subject: [PATCH 35/38] Python 3.8 compatibility --- awq/entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awq/entry.py b/awq/entry.py index a7db3699..3886cdfb 100644 --- a/awq/entry.py +++ b/awq/entry.py @@ -152,7 +152,7 @@ def _warmup(device:str): parser.add_argument('--tasks', type=str, default='wikitext', 
help='Tasks to evaluate. ' 'Separate tasks by comma for multiple tasks.' 'https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md') - parser.add_argument("--task_use_pretrained", default=False, action=argparse.BooleanOptionalAction, + parser.add_argument("--task_use_pretrained", default=False, action='store_true', help="Pass '--task_use_pretrained' to use a pretrained model running FP16") parser.add_argument('--task_batch_size', type=int, default=1) parser.add_argument('--task_n_shot', type=int, default=0) From 87c2e01ba40fa958c7891095b2167ba68a2d1adb Mon Sep 17 00:00:00 2001 From: qwopqwop200 Date: Fri, 1 Sep 2023 15:10:03 +0900 Subject: [PATCH 36/38] windows support --- setup.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 1986250d..c7613ada 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pathlib import Path from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -19,7 +19,7 @@ "long_description_content_type": "text/markdown", "url": "https://github.com/casper-hansen/AutoAWQ", "keywords": ["awq", "autoawq", "quantization", "transformers"], - "platforms": ["linux"], + "platforms": ["windows", "linux"], "classifiers": [ "Environment :: GPU :: NVIDIA CUDA :: 11.8", "Environment :: GPU :: NVIDIA CUDA :: 12", @@ -80,17 +80,14 @@ def get_compute_capabilities(): arch_flags = get_compute_capabilities() extensions = [ - CppExtension( + CUDAExtension( "awq_inference_engine", [ "awq_cuda/pybind.cpp", "awq_cuda/quantization/gemm_cuda_gen.cu", "awq_cuda/layernorm/layernorm.cu", "awq_cuda/position_embedding/pos_encoding_kernels.cu" - ], extra_compile_args={ - "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"], - "nvcc": ["-O3", "-std=c++17"] + arch_flags - } + ] ) ] @@ -106,4 +103,4 @@ def get_compute_capabilities(): install_requires=requirements, include_dirs=include_dirs, **common_setup_kwargs -) \ No newline at end of file +) From 1a3acf02e2580e0bbdca0d8fba83a5b59fb80596 Mon Sep 17 00:00:00 2001 From: Casper Date: Fri, 1 Sep 2023 16:49:02 +0200 Subject: [PATCH 37/38] Generalize to Linux and Windows --- setup.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index c7613ada..ad1295d4 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pathlib import Path from setuptools import setup, find_packages from distutils.sysconfig import get_python_lib -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -19,7 +19,7 @@ "long_description_content_type": "text/markdown", "url": "https://github.com/casper-hansen/AutoAWQ", "keywords": ["awq", "autoawq", "quantization", "transformers"], - "platforms": ["windows", "linux"], + "platforms": ["linux", "windows"], "classifiers": [ "Environment :: GPU :: NVIDIA CUDA :: 11.8", "Environment :: GPU :: NVIDIA CUDA :: 12", @@ -79,6 +79,17 @@ def get_compute_capabilities(): check_dependencies() arch_flags = get_compute_capabilities() +if os.name == "nt": + # Relaxed args on Windows + extra_compile_args={ + "nvcc": arch_flags + } +else: + extra_compile_args={ + "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", 
"-std=c++17"], + "nvcc": ["-O3", "-std=c++17"] + arch_flags + } + extensions = [ CUDAExtension( "awq_inference_engine", @@ -87,7 +98,7 @@ def get_compute_capabilities(): "awq_cuda/quantization/gemm_cuda_gen.cu", "awq_cuda/layernorm/layernorm.cu", "awq_cuda/position_embedding/pos_encoding_kernels.cu" - ] + ], extra_compile_args=extra_compile_args ) ] @@ -103,4 +114,4 @@ def get_compute_capabilities(): install_requires=requirements, include_dirs=include_dirs, **common_setup_kwargs -) +) \ No newline at end of file From afcce1a11d5f9a2fa19858da6c43d8990837bd5b Mon Sep 17 00:00:00 2001 From: Casper Date: Fri, 1 Sep 2023 16:49:42 +0200 Subject: [PATCH 38/38] Add Windows build to workflow --- .github/workflows/build.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 261b094a..33096a85 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -39,8 +39,8 @@ jobs: strategy: matrix: - os: [ubuntu-20.04] - pyver: ["3.8", "3.9", "3.10"] + os: [ubuntu-20.04, windows-latest] + pyver: ["3.8", "3.9", "3.10", "3.11"] cuda: ["11.8"] defaults: run: @@ -73,6 +73,11 @@ jobs: conda install pytorch "pytorch-cuda=${env:CUDA_VERSION}" -c pytorch -c nvidia python -m pip install --upgrade build setuptools wheel ninja + # Environment variables + Add-Content $env:GITHUB_ENV "CUDA_PATH=$env:CONDA_PREFIX" + Add-Content $env:GITHUB_ENV "CUDA_HOME=$env:CONDA_PREFIX" + if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} + # Print version information python --version python -c "import torch; print('PyTorch:', torch.__version__)" @@ -81,11 +86,6 @@ jobs: - name: Build Wheel run: | - $env:CUDA_PATH = $env:CONDA_PREFIX - $env:CUDA_HOME = $env:CONDA_PREFIX - if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} - $env:TORCH_CUDA_ARCH_LIST = '8.0 8.6 8.9 9.0+PTX' - python setup.py sdist bdist_wheel - name: Upload Assets