[HPU] [Serve] [experimental] Add vllm HPU support in vllm example #45893

Merged · 19 commits · Aug 19, 2024
Changes from 6 commits
16 changes: 15 additions & 1 deletion doc/source/serve/doc_code/vllm_openai_example.py
@@ -18,6 +18,7 @@
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
import torch

logger = logging.getLogger("ray.serve")

@@ -100,6 +101,19 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application:

Supported engine arguments: https://docs.vllm.ai/en/latest/models/engine_args.html.
""" # noqa: E501
if "device" in cli_args.keys():
KepingYan marked this conversation as resolved.
Show resolved Hide resolved
device = cli_args.pop("device")
else:
try:
KepingYan marked this conversation as resolved.
Show resolved Hide resolved
from habana_frameworks.torch.distributed.hccl import (
initialize_distributed_hpu,
)

initialize_distributed_hpu()
KepingYan marked this conversation as resolved.
Show resolved Hide resolved
torch.zeros(1).to("hpu")
device = "HPU"
except Exception:
device = "GPU"
parsed_args = parse_vllm_args(cli_args)
engine_args = AsyncEngineArgs.from_cli_args(parsed_args)
engine_args.worker_use_ray = True
@@ -109,7 +123,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application:
pg_resources = []
pg_resources.append({"CPU": 1}) # for the deployment replica
for i in range(tp):
pg_resources.append({"CPU": 1, "GPU": 1}) # for the vLLM actors
pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors

# We use the "STRICT_PACK" strategy below to ensure all vLLM actors are placed on
# the same Ray node.
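The `STRICT_PACK` strategy mentioned in the comment above is a Ray placement-group strategy that requires every bundle to fit on a single node. A minimal, self-contained illustration of the strategy follows; the resource shapes are hypothetical and this is not the example's own deployment code:

```python
# Illustration of STRICT_PACK: all bundles must be co-located on one node,
# so the deployment replica and the worker actors share that node's accelerators.
# Resource shapes here are hypothetical.
import ray
from ray.util.placement_group import placement_group

ray.init()
pg = placement_group(
    bundles=[{"CPU": 1}, {"CPU": 1, "GPU": 1}, {"CPU": 1, "GPU": 1}],
    strategy="STRICT_PACK",
)
ray.get(pg.ready())  # blocks until a single node can satisfy all bundles
```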
21 changes: 17 additions & 4 deletions doc/source/serve/tutorials/vllm-example.md
@@ -5,12 +5,19 @@ orphan: true
(serve-vllm-tutorial)=

# Serve a Large Language Model with vLLM
This example runs a large language model with Ray Serve using [vLLM](https://docs.vllm.ai/en/latest/), a popular open-source library for serving LLMs. It uses the [OpenAI Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api), which easily integrates with other LLM tools. The example also sets up multi-GPU serving with Ray Serve using placement groups. For more advanced features like multi-lora support with serve multiplexing, JSON mode function calling and further performance improvements, try LLM deployment solutions on [Anyscale](https://www.anyscale.com/).
This example runs a large language model with Ray Serve using [vLLM](https://docs.vllm.ai/en/latest/), a popular open-source library for serving LLMs. It uses the [OpenAI Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api), which easily integrates with other LLM tools. The example also sets up multi-GPU or multi-HPU serving with Ray Serve using placement groups. For more advanced features like multi-lora support with serve multiplexing, JSON mode function calling and further performance improvements, try LLM deployment solutions on [Anyscale](https://www.anyscale.com/).

To run this example, install the following:

```bash
pip install "ray[serve]" requests vllm
pip install "ray[serve]" requests
```
Install vLLM according to the device type:
```bash
# on GPU
pip install vllm
# on HPU
pip install -v git+https://github.com/HabanaAI/vllm-fork.git@habana_main
```
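To confirm which accelerator the example will detect at startup, you can run the same probe that its `build_app` function uses on its own. A minimal sketch mirroring that logic:

```python
# Mirrors the device probe in the example: try to initialize HPU through the
# Habana stack; fall back to GPU if the import or the tensor transfer fails.
import torch

try:
    from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu

    initialize_distributed_hpu()
    torch.zeros(1).to("hpu")
    device = "HPU"
except Exception:
    device = "GPU"

print(f"Detected device type: {device}")
```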

This example uses the [NousResearch/Meta-Llama-3-8B-Instruct](https://huggingface.co/NousResearch/Meta-Llama-3-8B-Instruct) model. Save the following code to a file named `llm.py`.
@@ -23,10 +30,16 @@ The Serve code is as follows:
:end-before: __serve_example_end__
```

Use `serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2` to start the Serve app.
Use the following command to start the Serve app:
```bash
# on GPU
serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="GPU"
# on HPU
serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="HPU"
```
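After the app starts, you can exercise the OpenAI-compatible endpoint directly. A minimal sketch, assuming the default Serve HTTP address (`http://localhost:8000`) and the OpenAI-style `/v1/chat/completions` route exposed by the example deployment:

```python
# Send one chat completion request to the running Serve app.
# Assumes the default Serve HTTP address and the /v1/chat/completions route.
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "NousResearch/Meta-Llama-3-8B-Instruct",
        "messages": [{"role": "user", "content": "Write a one-line haiku about Ray."}],
        "temperature": 0.0,
    },
    timeout=120,
)
print(response.json()["choices"][0]["message"]["content"])
```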

:::{note}
This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs using placement groups.
This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the device type) using placement groups.
:::
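Concretely, for `tensor-parallel-size=2` the example assembles one bundle for the deployment replica plus one bundle per vLLM worker actor, and the accelerator key follows the requested or detected device. A small sketch of the resulting bundles:

```python
# Resource bundles the example builds for tensor-parallel-size=2.
# With device="HPU" the accelerator key is "HPU"; with device="GPU" it is "GPU".
tp = 2
device = "HPU"
pg_resources = [{"CPU": 1}]  # for the deployment replica
for _ in range(tp):
    pg_resources.append({"CPU": 1, device: 1})  # one bundle per vLLM actor
assert pg_resources == [{"CPU": 1}, {"CPU": 1, "HPU": 1}, {"CPU": 1, "HPU": 1}]
```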

