forked from vllm-project/vllm
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Kernel] W8A16 Int8 inside FusedMoE (vllm-project#7415)
- Loading branch information
1 parent
866b18f
commit 35e8f72
Showing
15 changed files
with
412 additions
and
136 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# flake8: noqa
"""Tests experts_int8 quantization startup and generation,
doesn't test correctness
"""
import pytest

from tests.quantization.utils import is_quant_method_supported

# Tiny random Jamba checkpoint: contains MoE layers, so it exercises the
# ExpertsInt8 FusedMoE path while remaining fast to download and load.
MODELS = ["ai21labs/Jamba-tiny-random"]


@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
                    reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_model_experts_int8_startup(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Smoke-test startup and greedy generation under experts_int8.

    Only verifies that engine construction and generation complete
    without raising; output correctness is not checked.

    Note: the original version also requested the ``hf_runner`` fixture
    but never used it, which forced needless HF-model setup per test;
    that unused fixture has been removed.
    """
    with vllm_runner(model, dtype=dtype,
                     quantization="experts_int8") as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Oops, something went wrong.