Add exllama GPTQ CUDA kernel support #553

Closed · wants to merge 27 commits (gptq-cuda-kernels into main)

Commits (27)
ee7ba48  add exllama gptq kernel (fxmarty, Jul 5, 2023)
c858d79  add attribution (fxmarty, Jul 5, 2023)
0ff8219  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 5, 2023)
2272b3a  some more cleanup (fxmarty, Jul 5, 2023)
620ed7d  Merge branch 'gptq-cuda-kernels' of https://github.com/fxmarty/text-g… (fxmarty, Jul 5, 2023)
a6e3874  try-catch to load the cuda extension, quite ugly practice tbh (fxmarty, Jul 5, 2023)
4462854  have a single gptq quantization type (fxmarty, Jul 12, 2023)
67a46b7  move exllama buffer init to the top level (fxmarty, Jul 12, 2023)
67d6876  cleanup (fxmarty, Jul 12, 2023)
f90c61a  support bits different than 4 (fxmarty, Jul 12, 2023)
8645fd3  tests (fxmarty, Jul 12, 2023)
faa5b52  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 12, 2023)
38c2be5  fix test (fxmarty, Jul 12, 2023)
2ae65b4  fix tests (fxmarty, Jul 13, 2023)
0036084  support all, test llama (fxmarty, Jul 13, 2023)
9401e10  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 13, 2023)
74e6d6e  fix the usual merge mess (fxmarty, Jul 13, 2023)
edfbfdf  Merge branch 'main' into gptq-cuda-kernels (fxmarty, Jul 19, 2023)
6bf7090  fix per-column quantization (fxmarty, Jul 19, 2023)
0860394  Refactored a bit. (Narsil, Jul 20, 2023)
8cf7c89  Small polish. (Narsil, Jul 20, 2023)
7faef69  Give escape hatch to not use exllama kernels even if available. (Narsil, Jul 20, 2023)
900ac49  Fixing GTPQ device santacoder. (Narsil, Jul 20, 2023)
12191b7  Fix config. (Narsil, Jul 20, 2023)
c6e702f  Add kernel target. (Narsil, Jul 20, 2023)
3ec3add  Separate build process. (Narsil, Jul 20, 2023)
40be532  Update starcoder_gptq (Narsil, Jul 21, 2023)
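
Two commits above are worth unpacking. Commit a6e3874 wraps the CUDA extension import in a try/except so that a missing or failed kernel build degrades gracefully to the existing GPTQ path, and commit 7faef69 adds an escape hatch to skip the exllama kernels even when they load. A minimal sketch of that pattern follows; the module name is taken from server/exllama_kernels/, while the HAS_EXLLAMA flag and the DISABLE_EXLLAMA variable are assumptions for illustration, not the PR's exact code.

import os

# Commit a6e3874: the compiled extension may be absent (CPU-only image or a
# failed build), so the import is guarded instead of assumed.
try:
    import exllama_kernels  # noqa: F401  (module name from server/exllama_kernels/)

    HAS_EXLLAMA = True
except ImportError:
    HAS_EXLLAMA = False

# Commit 7faef69: an environment-variable escape hatch disables the exllama
# path even when the kernels imported fine (variable name is an assumption).
if os.environ.get("DISABLE_EXLLAMA", "").lower() in ("1", "true"):
    HAS_EXLLAMA = False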
13 changes: 13 additions & 0 deletions Dockerfile
@@ -108,6 +108,17 @@ COPY server/Makefile-flash-att-v2 Makefile
 # Build specific version of flash attention v2
 RUN make build-flash-attention-v2
 
+# Build exllama kernels
+FROM kernel-builder as exllama-kernels-builder
+
+WORKDIR /usr/src
+
+COPY server/exllama_kernels/ .
+
+
+# Build specific version of exllama kernels
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 
@@ -161,6 +172,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
 
 # Copy build artifacts from custom kernels builder
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 
 # Copy build artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
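
The new stage compiles the kernels with TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX", i.e. native code for Ampere (SM 8.0 and 8.6) plus embedded PTX so newer GPUs can JIT-compile it. The setup.py it invokes is not part of this diff; the sketch below shows the standard torch CUDAExtension layout such a script plausibly uses, with source file names assumed for illustration.

# Hypothetical server/exllama_kernels/setup.py; `python setup.py build` in the
# Dockerfile stage above would drive exactly this kind of extension build.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="exllama_kernels",
    ext_modules=[
        CUDAExtension(
            name="exllama_kernels",
            sources=[
                "exllama_kernels/exllama_ext.cpp",  # Python bindings (assumed path)
                "exllama_kernels/cuda_func/q4_matmul.cu",  # 4-bit matmul kernel (assumed path)
            ],
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)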
3 changes: 3 additions & 0 deletions Makefile
@@ -56,3 +56,6 @@ run-bloom:
 
 run-bloom-quantize:
 	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
+
+clean:
+	rm -rf target aml
14 changes: 9 additions & 5 deletions integration-tests/conftest.py
@@ -230,15 +230,19 @@ def local_launcher(
         shard_uds_path,
     ]
 
+    env = os.environ
+
     if num_shard is not None:
         args.extend(["--num-shard", str(num_shard)])
-    if quantize:
+    if quantize is not None:
         args.append("--quantize")
-        args.append("bitsandbytes")
+        args.append(quantize)
+        if quantize == "gptq":
+            env["GPTQ_GROUPSIZE"] = "128"
+            env["GPTQ_BITS"] = "4"
     if trust_remote_code:
         args.append("--trust-remote-code")
 
-    env = os.environ
     env["LOG_LEVEL"] = "info,text_generation_router=debug"
 
     if not use_flash_attention:
@@ -275,9 +279,9 @@ def docker_launcher(
 
     if num_shard is not None:
         args.extend(["--num-shard", str(num_shard)])
-    if quantize:
+    if quantize is not None:
         args.append("--quantize")
-        args.append("bitsandbytes")
+        args.append(quantize)
     if trust_remote_code:
         args.append("--trust-remote-code")
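
For the gptq case the local launcher now exports GPTQ_GROUPSIZE and GPTQ_BITS, which lets the server learn the quantization parameters for checkpoints whose config does not carry them. A minimal sketch of the consuming side, with the helper name assumed for illustration:

import os


def gptq_params_from_env() -> tuple[int, int]:
    # GPTQ_BITS / GPTQ_GROUPSIZE are the names exported by conftest.py above;
    # raising KeyError when they are unset is a simplification for this sketch.
    bits = int(os.environ["GPTQ_BITS"])
    groupsize = int(os.environ["GPTQ_GROUPSIZE"])
    return bits, groupsize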
102 changes: 102 additions & 0 deletions (new integration test snapshot)
@@ -0,0 +1,102 @@
{
"generated_text": ", and I am going to visit the Louvre",
"details": {
"finish_reason": "length",
"generated_tokens": 10,
"seed": null,
"prefill": [
{
"id": 2,
"text": "</s>",
"logprob": null
},
{
"id": 20628,
"text": "Today",
"logprob": -11.2265625
},
{
"id": 306,
"text": "I",
"logprob": -4.1757812
},
{
"id": 626,
"text": "am",
"logprob": -1.9746094
},
{
"id": 297,
"text": "in",
"logprob": -5.4648438
},
{
"id": 3444,
"text": "France",
"logprob": -9.03125
}
],
"tokens": [
{
"id": 29892,
"text": ",",
"logprob": -0.31298828,
"special": false
},
{
"id": 322,
"text": " and",
"logprob": -1.4345703,
"special": false
},
{
"id": 306,
"text": " I",
"logprob": -0.32080078,
"special": false
},
{
"id": 626,
"text": " am",
"logprob": -1.3798828,
"special": false
},
{
"id": 2675,
"text": " going",
"logprob": -1.2304688,
"special": false
},
{
"id": 304,
"text": " to",
"logprob": -0.0014791489,
"special": false
},
{
"id": 6493,
"text": " visit",
"logprob": -1.1503906,
"special": false
},
{
"id": 278,
"text": " the",
"logprob": -0.41259766,
"special": false
},
{
"id": 4562,
"text": " Lou",
"logprob": -1.8134766,
"special": false
},
{
"id": 12675,
"text": "vre",
"logprob": -0.000767231,
"special": false
}
]
}
}
97 changes: 97 additions & 0 deletions (new integration test snapshot)
@@ -0,0 +1,97 @@
{
"generated_text": "The capital city of France isParis.\n The Best Way to Visit",
"details": {
"finish_reason": "length",
"generated_tokens": 10,
"seed": 0,
"prefill": [
{
"id": 2,
"text": "</s>",
"logprob": null
},
{
"id": 4272,
"text": "city",
"logprob": -12.4453125
},
{
"id": 310,
"text": "of",
"logprob": -2.4023438
},
{
"id": 3444,
"text": "France",
"logprob": -12.515625
},
{
"id": 338,
"text": "is",
"logprob": -5.1914062
}
],
"tokens": [
{
"id": 3681,
"text": " Paris",
"logprob": -0.22546387,
"special": false
},
{
"id": 29889,
"text": ".",
"logprob": 0,
"special": false
},
{
"id": 13,
"text": "\n",
"logprob": 0,
"special": false
},
{
"id": 1,
"text": "",
"logprob": 0,
"special": false
},
{
"id": 450,
"text": " The",
"logprob": 0,
"special": false
},
{
"id": 6407,
"text": " Best",
"logprob": -0.5522461,
"special": false
},
{
"id": 5307,
"text": " Way",
"logprob": 0,
"special": false
},
{
"id": 304,
"text": " to",
"logprob": 0,
"special": false
},
{
"id": 5741,
"text": " Vis",
"logprob": -2.3496094,
"special": false
},
{
"id": 277,
"text": "it",
"logprob": 0,
"special": false
}
]
}
}
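
Both JSON documents above are snapshot fixtures: the first records a greedy run (seed null), the second a sampled run (seed 0), each stopping at 10 generated tokens with finish_reason "length". A sketch of how an integration test might assert against them; the fixture names and client API are assumptions modeled on the repo's integration-tests style, not code from this diff.

import pytest


@pytest.mark.asyncio
async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
    # `flash_llama_gptq` (a running client) and `response_snapshot` are
    # assumed fixtures; the prompt matches the prefill in the first snapshot.
    response = await flash_llama_gptq.generate(
        "Today I am in France", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot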