From a4b502bb2f972e957f20c73c7d4bb50c87a16590 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 12 Jun 2024 16:25:14 +0800 Subject: [PATCH 01/11] add hpu support for vllm example Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 6 +++++- doc/source/serve/tutorials/vllm-example.md | 13 ++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index d2ecc7ba4120..9c29d017cfa3 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -18,6 +18,7 @@ ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_engine import LoRAModulePath +import importlib logger = logging.getLogger("ray.serve") @@ -108,8 +109,11 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: logger.info(f"Tensor parallelism = {tp}") pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica + + is_hpu = importlib.util.find_spec('habana_frameworks') is not None + device = "HPU" if is_hpu else "GPU" for i in range(tp): - pg_resources.append({"CPU": 1, "GPU": 1}) # for the vLLM actors + pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors # We use the "STRICT_PACK" strategy below to ensure all vLLM actors are placed on # the same Ray node. diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index ef93bab5336e..b03b7055a257 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -5,12 +5,19 @@ orphan: true (serve-vllm-tutorial)= # Serve a Large Language Model with vLLM -This example runs a large language model with Ray Serve using [vLLM](https://docs.vllm.ai/en/latest/), a popular open-source library for serving LLMs. It uses the [OpenAI Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api), which easily integrates with other LLM tools. The example also sets up multi-GPU serving with Ray Serve using placement groups. For more advanced features like multi-lora support with serve multiplexing, JSON mode function calling and further performance improvements, try LLM deployment solutions on [Anyscale](https://www.anyscale.com/). +This example runs a large language model with Ray Serve using [vLLM](https://docs.vllm.ai/en/latest/), a popular open-source library for serving LLMs. It uses the [OpenAI Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api), which easily integrates with other LLM tools. The example also sets up multi-GPU or multi-HPU serving with Ray Serve using placement groups. For more advanced features like multi-lora support with serve multiplexing, JSON mode function calling and further performance improvements, try LLM deployment solutions on [Anyscale](https://www.anyscale.com/). To run this example, install the following: ```bash -pip install "ray[serve]" requests vllm +pip install "ray[serve]" requests +``` +vllm needs to be installed according to the device: +```bash +# on GPU +pip install vllm +# on HPU +pip install -v git+https://github.com/HabanaAI/vllm-fork.git@habana_main ``` This example uses the [NousResearch/Meta-Llama-3-8B-Instruct](https://huggingface.co/NousResearch/Meta-Llama-3-8B-Instruct) model. Save the following code to a file named `llm.py`. 
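For context on what this first patch wires together, the accelerator probe and the placement-group bundle list it adds reduce to the short sketch below. This is an illustrative reduction, not part of `llm.py`: the `VLLMDeployment.options(...)` usage is assumed from the surrounding example (which binds the bundles with the `STRICT_PACK` strategy) rather than shown in these hunks.

```python
import importlib.util


def build_placement_bundles(tensor_parallel_size: int):
    # Patch 01 probes for the Habana stack to pick the resource label;
    # later patches in this series replace the probe with an explicit
    # `accelerator` CLI argument.
    is_hpu = importlib.util.find_spec("habana_frameworks") is not None
    device = "HPU" if is_hpu else "GPU"
    bundles = [{"CPU": 1}]  # first bundle hosts the Serve deployment replica
    for _ in range(tensor_parallel_size):
        bundles.append({"CPU": 1, device: 1})  # one bundle per vLLM worker actor
    return bundles


# Assumed usage, mirroring the example's strategy of packing all vLLM workers
# onto the same Ray node as the deployment replica:
# VLLMDeployment.options(
#     placement_group_bundles=build_placement_bundles(2),
#     placement_group_strategy="STRICT_PACK",
# ).bind(engine_args, ...)
```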
@@ -26,7 +33,7 @@ The Serve code is as follows: Use `serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2` to start the Serve app. :::{note} -This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs using placement groups. +This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs(based on the device) using placement groups. ::: From 2e806ce09105f85ac4e55318a88ac51e5c624f71 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 12 Jun 2024 16:30:59 +0800 Subject: [PATCH 02/11] upd doc Signed-off-by: KepingYan --- doc/source/serve/tutorials/vllm-example.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index b03b7055a257..caa0508bb222 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -33,7 +33,7 @@ The Serve code is as follows: Use `serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2` to start the Serve app. :::{note} -This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs(based on the device) using placement groups. +This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the device type) using placement groups. ::: From 2ac6fb81ce5b777fb8635129de86e19ac22e3dd0 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Thu, 13 Jun 2024 09:38:32 +0800 Subject: [PATCH 03/11] fix lint check Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 9c29d017cfa3..6c240cbfaa7f 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -110,7 +110,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica - is_hpu = importlib.util.find_spec('habana_frameworks') is not None + is_hpu = importlib.util.find_spec("habana_frameworks") is not None device = "HPU" if is_hpu else "GPU" for i in range(tp): pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors From 37bfddb644b059cfac288d87a7da785f253b5ed6 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Fri, 14 Jun 2024 16:00:07 +0800 Subject: [PATCH 04/11] add device param Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 15 +++++++++++---- doc/source/serve/tutorials/vllm-example.md | 8 +++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 6c240cbfaa7f..20271a911dc7 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -18,7 +18,7 @@ ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_engine import LoRAModulePath -import importlib +import torch logger = logging.getLogger("ray.serve") @@ -101,6 +101,16 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: Supported engine arguments: https://docs.vllm.ai/en/latest/models/engine_args.html. 
""" # noqa: E501 + if "device" in cli_args.keys(): + device = cli_args.pop("device") + else: + try: + from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + initialize_distributed_hpu() + torch.zeros(1).to("hpu") + device = "HPU" + except: + device = "GPU" parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) engine_args.worker_use_ray = True @@ -109,9 +119,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: logger.info(f"Tensor parallelism = {tp}") pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica - - is_hpu = importlib.util.find_spec("habana_frameworks") is not None - device = "HPU" if is_hpu else "GPU" for i in range(tp): pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index caa0508bb222..746415921738 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -30,7 +30,13 @@ The Serve code is as follows: :end-before: __serve_example_end__ ``` -Use `serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2` to start the Serve app. +Use the following code to start the Serve app: +```bash +# on GPU +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="GPU" +# on HPU +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="HPU" +``` :::{note} This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the device type) using placement groups. From b3abb07ccf9497281d6cf7807b22ff178f4678b3 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Fri, 14 Jun 2024 17:22:56 +0800 Subject: [PATCH 05/11] fix ci Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 20271a911dc7..92d13eef4f16 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -105,11 +105,14 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: device = cli_args.pop("device") else: try: - from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + from habana_frameworks.torch.distributed.hccl import ( + initialize_distributed_hpu, + ) + initialize_distributed_hpu() torch.zeros(1).to("hpu") device = "HPU" - except: + except Exception: device = "GPU" parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) From 2f5bfa5cc9e85d119cd61f2d02820ff05988fe18 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Thu, 4 Jul 2024 17:18:11 +0800 Subject: [PATCH 06/11] address comments Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 92d13eef4f16..87b7be1a7b38 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -101,9 +101,11 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: Supported engine arguments: https://docs.vllm.ai/en/latest/models/engine_args.html. 
""" # noqa: E501 - if "device" in cli_args.keys(): - device = cli_args.pop("device") + if "accelerator" in cli_args.keys(): + accelerator = cli_args.pop("accelerator") else: + accelerator = "GPU" + if accelerator == "HPU": try: from habana_frameworks.torch.distributed.hccl import ( initialize_distributed_hpu, @@ -111,9 +113,8 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: initialize_distributed_hpu() torch.zeros(1).to("hpu") - device = "HPU" except Exception: - device = "GPU" + raise Exception("Please check the environment: HPU devices not available.") parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) engine_args.worker_use_ray = True @@ -123,7 +124,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica for i in range(tp): - pg_resources.append({"CPU": 1, device: 1}) # for the vLLM actors + pg_resources.append({"CPU": 1, accelerator: 1}) # for the vLLM actors # We use the "STRICT_PACK" strategy below to ensure all vLLM actors are placed on # the same Ray node. @@ -145,6 +146,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: { "model": "NousResearch/Meta-Llama-3-8B-Instruct", "tensor-parallel-size": "1", + "accelerator": "HPU" } ) ) From 48f2a321b09727ea663001f189ea37bda49020ee Mon Sep 17 00:00:00 2001 From: KepingYan Date: Fri, 5 Jul 2024 00:00:05 +0800 Subject: [PATCH 07/11] fix ci Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 4d60d5f03b2a..2449e2198271 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -157,7 +157,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: { "model": "NousResearch/Meta-Llama-3-8B-Instruct", "tensor-parallel-size": "1", - "accelerator": "HPU" } ) ) From cd391ed19c75653bd5dc02c014a5a9b33e160039 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Tue, 13 Aug 2024 18:49:46 +0800 Subject: [PATCH 08/11] compatible with the latest vLLM version Signed-off-by: KepingYan --- .../serve/doc_code/vllm_openai_example.py | 17 ++++++++++++++--- doc/source/serve/tutorials/vllm-example.md | 6 +++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index d8b45e432295..e0659f47fb02 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -17,8 +17,10 @@ ErrorResponse, ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, + PromptAdapterPath) from vllm.utils import FlexibleArgumentParser +from vllm.entrypoints.logger import RequestLogger import torch logger = logging.getLogger("ray.serve") @@ -41,6 +43,8 @@ def __init__( engine_args: AsyncEngineArgs, response_role: str, lora_modules: Optional[List[LoRAModulePath]] = None, + prompt_adapters: Optional[List[PromptAdapterPath]] = None, + request_logger: Optional[RequestLogger] = None, chat_template: Optional[str] = None, ): logger.info(f"Starting with engine args: {engine_args}") @@ -48,6 +52,8 @@ def __init__( self.engine_args = engine_args self.response_role = response_role 
self.lora_modules = lora_modules + self.prompt_adapters = prompt_adapters + self.request_logger = request_logger self.chat_template = chat_template self.engine = AsyncLLMEngine.from_engine_args(engine_args) @@ -72,8 +78,10 @@ async def create_chat_completion( model_config, served_model_names, self.response_role, - self.lora_modules, - self.chat_template, + lora_modules=self.lora_modules, + prompt_adapters=self.prompt_adapters, + request_logger=self.request_logger, + chat_template=self.chat_template, ) logger.info(f"Request: {request}") generator = await self.openai_serving_chat.create_chat_completion( @@ -150,6 +158,8 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: engine_args, parsed_args.response_role, parsed_args.lora_modules, + parsed_args.prompt_adapters, + cli_args.get("request_logger"), parsed_args.chat_template, ) @@ -186,6 +196,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: ], temperature=0.01, stream=True, + max_tokens=100, ) for chat in chat_completion: diff --git a/doc/source/serve/tutorials/vllm-example.md b/doc/source/serve/tutorials/vllm-example.md index 746415921738..2dd503952bb7 100644 --- a/doc/source/serve/tutorials/vllm-example.md +++ b/doc/source/serve/tutorials/vllm-example.md @@ -33,13 +33,13 @@ The Serve code is as follows: Use the following code to start the Serve app: ```bash # on GPU -serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="GPU" +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 accelerator="GPU" # on HPU -serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 device="HPU" +serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 accelerator="HPU" ``` :::{note} -This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the device type) using placement groups. +This example uses Tensor Parallel size of 2, which means Ray Serve deploys the model to Ray Actors across 2 GPUs or HPUs (based on the accelerator type) using placement groups. 
::: From 7395f9e9ace15538e49d440f6f7bf3cb8b4b17dd Mon Sep 17 00:00:00 2001 From: KepingYan Date: Tue, 13 Aug 2024 20:30:43 +0800 Subject: [PATCH 09/11] fix lint Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index e0659f47fb02..4f5aa7ff5851 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -17,8 +17,7 @@ ErrorResponse, ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath from vllm.utils import FlexibleArgumentParser from vllm.entrypoints.logger import RequestLogger import torch From 4af9ca1276924837471a7b7d7a3435a647b87bb1 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 14 Aug 2024 09:50:19 +0800 Subject: [PATCH 10/11] remove hpu env check Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index 4f5aa7ff5851..da026af80de6 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -128,16 +128,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: accelerator = cli_args.pop("accelerator") else: accelerator = "GPU" - if accelerator == "HPU": - try: - from habana_frameworks.torch.distributed.hccl import ( - initialize_distributed_hpu, - ) - - initialize_distributed_hpu() - torch.zeros(1).to("hpu") - except Exception: - raise Exception("Please check the environment: HPU devices not available.") parsed_args = parse_vllm_args(cli_args) engine_args = AsyncEngineArgs.from_cli_args(parsed_args) engine_args.worker_use_ray = True From de9eefdb912e8997ba34085dcb79f5765cb184b0 Mon Sep 17 00:00:00 2001 From: KepingYan Date: Wed, 14 Aug 2024 09:58:12 +0800 Subject: [PATCH 11/11] remove unused package Signed-off-by: KepingYan --- doc/source/serve/doc_code/vllm_openai_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/serve/doc_code/vllm_openai_example.py b/doc/source/serve/doc_code/vllm_openai_example.py index da026af80de6..4e5ba0c463c3 100644 --- a/doc/source/serve/doc_code/vllm_openai_example.py +++ b/doc/source/serve/doc_code/vllm_openai_example.py @@ -20,7 +20,6 @@ from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath from vllm.utils import FlexibleArgumentParser from vllm.entrypoints.logger import RequestLogger -import torch logger = logging.getLogger("ray.serve")
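With the full series applied, starting the app (`serve run llm:build_app model="NousResearch/Meta-Llama-3-8B-Instruct" tensor-parallel-size=2 accelerator="HPU"`) exposes an OpenAI-compatible endpoint, and the streaming query that patch 08 adjusts (`temperature=0.01`, `stream=True`, the new `max_tokens=100`) looks roughly like the sketch below. The `base_url`, `api_key`, and prompt text are illustrative assumptions, not values taken from these hunks.

```python
from openai import OpenAI

# Assumed endpoint: Ray Serve's default HTTP address; adjust if yours differs.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="NOT A REAL KEY")

chat_completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What can Ray Serve do?"},
    ],
    temperature=0.01,
    stream=True,
    max_tokens=100,  # cap added by patch 08
)

# Print the streamed deltas as they arrive.
for chat in chat_completion:
    content = chat.choices[0].delta.content
    if content is not None:
        print(content, end="", flush=True)
```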