From a4b502bb2f972e957f20c73c7d4bb50c87a16590 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 12 Jun 2024 16:25:14 +0800 Subject: [PATCH 01/11] add hpu support for vllm example Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 6 +++++- doc/source/serve/tutorials/vllm-example.md | 13 ++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index d2ecc7ba4120..9c29d017cfa3 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -18,6 +18,7 @@ ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_engine import LoRAModulePath +import importlib logger = logging.getLogger("ray.serve") @@ -108,8 +109,11 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: logger.info(f"Tensor parallelism = {tp}") pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica + + is_hpu = importlib.util.find_spec('habana_frameworks') is not None + device = "HPU" if is_hpu else "GPU" for i in range(tp): - pg_resources.append({"CPU": 1, "GPU": 1}) # for the vLLM actors + pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors # We use the "STRICT_PACK" strategy below to ensure all vLLM actors are placed on # the same Ray node. diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index ef93bab5336e..b03b7055a257 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -5,12 +5,19 @@ orphan: true (serve-vllm-tutorial)= # Serve a Large Language Model with vLLM -This example runs a large language model with Ray Serve using [vLLM](https://docs.vllm.ai/en/latest/), a popular open-source library for serving LLMs. It uses the [OpenAI Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api), which easily integrates with other LLM tools. The example also sets up multi-GPU serving with Ray Serve using placement groups. For more advanced features like multi-lora support with serve multiplexing, JSON mode function calling and further performance improvements, try LLM deployment solutions on [Anyscale](https://www.anyscale.com/). +This example runs a large language model with Ray Serve using [vLLM](https://docs.vllm.ai/en/latest/), a popular open-source library for serving LLMs. It uses the [OpenAI Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api), which easily integrates with other LLM tools. The example also sets up multi-GPU or multi-HPU serving with Ray Serve using placement groups. For more advanced features like multi-lora support with serve multiplexing, JSON mode function calling and further performance improvements, try LLM deployment solutions on [Anyscale](https://www.anyscale.com/). To run this example, install the following: ```bash -pip install "ray[serve]" requests vllm +pip install "ray[serve]" requests +``` +vllm needs to be installed according to the device: +```bash +# on GPU +pip install vllm +# on HPU +pip install -v git+https://github.com/HabanaAI/vllm-fork.git@habana_main ``` This example uses the [NousResearch/Meta-Llama-3-8B-Instruct](https://huggingface.co/NousResearch/Meta-Llama-3-8B-Instruct) model. Save the following code to a file named `llm.py`. 
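For context on what this first patch wires together, the accelerator probe and the placement-group bundle list it adds reduce to the short sketch below. This is an illustrative reduction, not part of `llm.py`: the `VLLMDeployment.options(...)` usage is assumed from the surrounding example (which binds the bundles with the `STRICT_PACK` strategy) rather than shown in these hunks.

```python
import importlib.util


def build_placement_bundles(tensor_parallel_size: int):
    # Patch 01 probes for the Habana stack to pick the resource label;
    # later patches in this series replace the probe with an explicit
    # `accelerator` CLI argument.
    is_hpu = importlib.util.find_spec("habana_frameworks") is not None
    device = "HPU" if is_hpu else "GPU"
    bundles = [{"CPU": 1}]  # first bundle hosts the Serve deployment replica
    for _ in range(tensor_parallel_size):
        bundles.append({"CPU": 1, device: 1})  # one bundle per vLLM worker actor
    return bundles


# Assumed usage, mirroring the example's strategy of packing all vLLM workers
# onto the same Ray node as the deployment replica:
# VLLMDeployment.options(
#     placement_group_bundles=build_placement_bundles(2),
#     placement_group_strategy="STRICT_PACK",
# ).bind(engine_args, ...)
```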
@@ -26,7 +33,7 @@ The Serve code is as follows: Use `serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2` to start the Serve app. :::{note} -This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs using placement groups. +This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs(based on the device) using placement groups. ::: From 2e806ce09105f85ac4e55318a88ac51e5c624f71 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 12 Jun 2024 16:30:59 +0800 Subject: [PATCH 02/11] upd doc Signed-off-by: KepingYan --- doc/source/serve/tutorials/vllm-example.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index b03b7055a257..caa0508bb222 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -33,7 +33,7 @@ The Serve code is as follows: Use `serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2` to start the Serve app. :::{note} -This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs(based on the device) using placement groups. +This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the device type) using placement groups. ::: From 2ac6fb81ce5b777fb8635129de86e19ac22e3dd0 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Thu, 13 Jun 2024 09:38:32 +0800 Subject: [PATCH 03/11] fix lint check Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 9c29d017cfa3..6c240cbfaa7f 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -110,7 +110,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica - is_hpu = importlib.util.find_spec('habana_frameworks') is not None + is_hpu = importlib.util.find_spec("habana_frameworks") is not None device = "HPU" if is_hpu else "GPU" for i in range(tp): pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors From 37bfddb644b059cfac288d87a7da785f253b5ed6 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Fri, 14 Jun 2024 16:00:07 +0800 Subject: [PATCH 04/11] add device param Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 15 +++++++++++---- doc/source/serve/tutorials/vllm-example.md | 8 +++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 6c240cbfaa7f..20271a911dc7 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -18,7 +18,7 @@ ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_engine import LoRAModulePath -import importlib +import torch logger = logging.getLogger("ray.serve") @@ -101,6 +101,16 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: Supported engine arguments: https://docs.vllm.ai/en/latest/models/engine_args.html. 
""" # noqa: E501 + if "device" in cli_args.keys(): + device = cli_args.pop("device") + else: + try: + from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + initialize_distributed_hpu() + torch.zeros(1).to("hpu") + device = "HPU" + except: + device = "GPU" parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) engine_args.worker_use_ray = True @@ -109,9 +119,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: logger.info(f"Tensor parallelism = {tp}") pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica - - is_hpu = importlib.util.find_spec("habana_frameworks") is not None - device = "HPU" if is_hpu else "GPU" for i in range(tp): pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index caa0508bb222..746415921738 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -30,7 +30,13 @@ The Serve code is as follows: :end-before: __serve_example_end__ ``` -Use `serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2` to start the Serve app. +Use the following code to start the Serve app: +```bash +# on GPU +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="GPU" +# on HPU +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="HPU" +``` :::{note} This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the device type) using placement groups. From b3abb07ccf9497281d6cf7807b22ff178f4678b3 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Fri, 14 Jun 2024 17:22:56 +0800 Subject: [PATCH 05/11] fix ci Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 20271a911dc7..92d13eef4f16 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -105,11 +105,14 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: device = cli_args.pop("device") else: try: - from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + from habana_frameworks.torch.distributed.hccl import ( + initialize_distributed_hpu, + ) + initialize_distributed_hpu() torch.zeros(1).to("hpu") device = "HPU" - except: + except Exception: device = "GPU" parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) From 2f5bfa5cc9e85d119cd61f2d02820ff05988fe18 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Thu, 4 Jul 2024 17:18:11 +0800 Subject: [PATCH 06/11] address comments Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 92d13eef4f16..87b7be1a7b38 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -101,9 +101,11 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: Supported engine arguments: https://docs.vllm.ai/en/latest/models/engine_args.html. 
""" # noqa: E501 - if "device" in cli_args.keys(): - device = cli_args.pop("device") + if "accelerator" in cli_args.keys(): + accelerator = cli_args.pop("accelerator") else: + accelerator = "GPU" + if accelerator == "HPU": try: from habana_frameworks.torch.distributed.hccl import ( initialize_distributed_hpu, @@ -111,9 +113,8 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: initialize_distributed_hpu() torch.zeros(1).to("hpu") - device = "HPU" except Exception: - device = "GPU" + raise Exception("Please check the environment: HPU devices not available.") parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) engine_args.worker_use_ray = True @@ -123,7 +124,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica for i in range(tp): - pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors + pg_resources.append({"CPU": 1, accelerator: 1}) # for the vLLM actors # We use the "STRICT_PACK" strategy below to ensure all vLLM actors are placed on # the same Ray node. @@ -145,6 +146,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: { "model": "NousResearch/Meta-Llama-3-8B-Instruct", "tensor-parallel-size": "1", + "accelerator": "HPU" } ) ) From 48f2a321b09727ea663001f189ea37bda49020ee Mon Sep 17 00:00:00 2001 From: KepingYan Date: Fri, 5 Jul 2024 00:00:05 +0800 Subject: [PATCH 07/11] fix ci Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 4d60d5f03b2a..2449e2198271 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -157,7 +157,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: { "model": "NousResearch/Meta-Llama-3-8B-Instruct", "tensor-parallel-size": "1", - "accelerator": "HPU" } ) ) From cd391ed19c75653bd5dc02c014a5a9b33e160039 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Tue, 13 Aug 2024 18:49:46 +0800 Subject: [PATCH 08/11] compatible with the latest vLLM version Signed-off-by: KepingYan --- .../serve/doc_code/vllm_openai_example.py | 17 ++++++++++++++--- doc/source/serve/tutorials/vllm-example.md | 6 +++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index d8b45e432295..e0659f47fb02 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -17,8 +17,10 @@ ErrorResponse, ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, + PromptAdapterPath) from vllm.utils import FlexibleArgumentParser +from vllm.entrypoints.logger import RequestLogger import torch logger = logging.getLogger("ray.serve") @@ -41,6 +43,8 @@ def __init__( engine_args: AsyncEngineArgs, response_role: str, lora_modules: Optional[List[LoRAModulePath]] = None, + prompt_adapters: Optional[List[PromptAdapterPath]] = None, + request_logger: Optional[RequestLogger] = None, chat_template: Optional[str] = None, ): logger.info(f"Starting with engine args: {engine_args}") @@ -48,6 +52,8 @@ def __init__( self.engine_args = engine_args self.response_role = response_role 
self.lora_modules = lora_modules + self.prompt_adapters = prompt_adapters + self.request_logger = request_logger self.chat_template = chat_template self.engine = AsyncLLMEngine.from_engine_args(engine_args) @@ -72,8 +78,10 @@ async def create_chat_completion( model_config, served_model_names, self.response_role, - self.lora_modules, - self.chat_template, + lora_modules=self.lora_modules, + prompt_adapters=self.prompt_adapters, + request_logger=self.request_logger, + chat_template=self.chat_template, ) logger.info(f"Request: {request}") generator = await self.openai_serving_chat.create_chat_completion( @@ -150,6 +158,8 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: engine_args, parsed_args.response_role, parsed_args.lora_modules, + parsed_args.prompt_adapters, + cli_args.get("request_logger"), parsed_args.chat_template, ) @@ -186,6 +196,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: ], temperature=0.01, stream=True, + max_tokens=100, ) for chat in chat_completion: diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index 746415921738..2dd503952bb7 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -33,13 +33,13 @@ The Serve code is as follows: Use the following code to start the Serve app: ```bash # on GPU -serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="GPU" +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 accelerator="GPU" # on HPU -serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="HPU" +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 accelerator="HPU" ``` :::{note} -This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the device type) using placement groups. +This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the accelerator type) using placement groups. 
::: From 7395f9e9ace15538e49d440f6f7bf3cb8b4b17dd Mon Sep 17 00:00:00 2001 From: KepingYan Date: Tue, 13 Aug 2024 20:30:43 +0800 Subject: [PATCH 09/11] fix lint Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index e0659f47fb02..4f5aa7ff5851 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -17,8 +17,7 @@ ErrorResponse, ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath from vllm.utils import FlexibleArgumentParser from vllm.entrypoints.logger import RequestLogger import torch From 4af9ca1276924837471a7b7d7a3435a647b87bb1 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 14 Aug 2024 09:50:19 +0800 Subject: [PATCH 10/11] remove hpu env check Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 4f5aa7ff5851..da026af80de6 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -128,16 +128,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: accelerator = cli_args.pop("accelerator") else: accelerator = "GPU" - if accelerator == "HPU": - try: - from habana_frameworks.torch.distributed.hccl import ( - initialize_distributed_hpu, - ) - - initialize_distributed_hpu() - torch.zeros(1).to("hpu") - except Exception: - raise Exception("Please check the environment: HPU devices not available.") parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) engine_args.worker_use_ray = True From de9eefdb912e8997ba34085dcb79f5765cb184b0 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 14 Aug 2024 09:58:12 +0800 Subject: [PATCH 11/11] remove unused package Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index da026af80de6..4e5ba0c463c3 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -20,7 +20,6 @@ from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath from vllm.utils import FlexibleArgumentParser from vllm.entrypoints.logger import RequestLogger -import torch logger = logging.getLogger("ray.serve")
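With the full series applied, starting the app (`serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 accelerator="HPU"`) exposes an OpenAI-compatible endpoint, and the streaming query that patch 08 adjusts (`temperature=0.01`, `stream=True`, the new `max_tokens=100`) looks roughly like the sketch below. The `base_url`, `api_key`, and prompt text are illustrative assumptions, not values taken from these hunks.

```python
from openai import OpenAI

# Assumed endpoint: Ray Serve's default HTTP address; adjust if yours differs.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="NOT A REAL KEY")

chat_completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What can Ray Serve do?"},
    ],
    temperature=0.01,
    stream=True,
    max_tokens=100,  # cap added by patch 08
)

# Print the streamed deltas as they arrive.
for chat in chat_completion:
    content = chat.choices[0].delta.content
    if content is not None:
        print(content, end="", flush=True)
```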