From 008c9866bbf329ad11a6d78f3def546c4adbb800 Mon Sep 17 00:00:00 2001
From: letonghan
Date: Fri, 31 May 2024 15:31:12 +0800
Subject: [PATCH 1/5] fix stream=false doesn't work issue

Signed-off-by: letonghan
---
 comps/cores/mega/gateway.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/comps/cores/mega/gateway.py b/comps/cores/mega/gateway.py
index a59fdcea6..f2d6f99df 100644
--- a/comps/cores/mega/gateway.py
+++ b/comps/cores/mega/gateway.py
@@ -118,6 +118,7 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
 
     async def handle_request(self, request: Request):
         data = await request.json()
+        stream_opt = data.get("stream", True)
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
@@ -126,7 +127,7 @@ async def handle_request(self, request: Request):
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
             repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-            streaming=chat_request.stream if chat_request.stream else True,
+            streaming=stream_opt,
         )
         await self.megaservice.schedule(initial_inputs={"text": prompt}, llm_parameters=parameters)
         for node, response in self.megaservice.result_dict.items():
@@ -159,6 +160,7 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
 
     async def handle_request(self, request: Request):
         data = await request.json()
+        stream_opt = data.get("stream", True)
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
@@ -167,7 +169,7 @@ async def handle_request(self, request: Request):
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
             repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-            streaming=chat_request.stream if chat_request.stream else True,
+            streaming=stream_opt,
         )
         await self.megaservice.schedule(initial_inputs={"query": prompt}, llm_parameters=parameters)
         for node, response in self.megaservice.result_dict.items():
@@ -247,6 +249,7 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
 
     async def handle_request(self, request: Request):
         data = await request.json()
+        stream_opt = data.get("stream", True)
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
@@ -255,7 +258,7 @@ async def handle_request(self, request: Request):
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
             repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-            streaming=chat_request.stream if chat_request.stream else True,
+            streaming=stream_opt,
         )
         await self.megaservice.schedule(initial_inputs={"query": prompt}, llm_parameters=parameters)
         for node, response in self.megaservice.result_dict.items():

From 0bb2c0cc9787cc8b6b2352f594f2416bf6b70392 Mon Sep 17 00:00:00 2001
From: letonghan
Date: Tue, 4 Jun 2024 19:28:32 +0800
Subject: [PATCH 2/5] support qwen2 in llm microservice

Signed-off-by: letonghan
---
 comps/llms/text-generation/qwen2/Dockerfile   |  40 +++++
 comps/llms/text-generation/qwen2/llm.py       |  99 +++++++++++
 .../text-generation/qwen2/requirements.txt    |   8 +
 comps/llms/text-generation/qwen2/utils.py     | 154 ++++++++++++++++++
 4 files changed, 301 insertions(+)
 create mode 100644 comps/llms/text-generation/qwen2/Dockerfile
 create mode 100644 comps/llms/text-generation/qwen2/llm.py
 create mode 100644 comps/llms/text-generation/qwen2/requirements.txt
 create mode 100644 comps/llms/text-generation/qwen2/utils.py

diff --git a/comps/llms/text-generation/qwen2/Dockerfile b/comps/llms/text-generation/qwen2/Dockerfile
new file mode 100644
index 000000000..760d80f2c
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/Dockerfile
@@ -0,0 +1,40 @@
+
+
+# HABANA environment
+FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest as hpu
+
+ENV LANG=en_US.UTF-8
+ARG REPO=https://github.com/huggingface/optimum-habana.git
+ARG REPO_VER=v1.11.1
+
+RUN apt-get update && \
+    apt-get install git-lfs && \
+    git-lfs install && \
+    apt-get install -y --no-install-recommends --fix-missing \
+        libgl1-mesa-glx \
+        libjemalloc-dev \
+        vim
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+COPY comps/llms/text-generation/qwen2/qwen2.patch /home/user/qwen2.patch
+
+SHELL ["/bin/bash", "--login", "-c"]
+RUN git clone --single-branch -b ${REPO_VER} ${REPO} /optimum-habana
+
+ENV PYTHONPATH=/root:/home/user
+
+RUN cd /optimum-habana && git apply /qwen2.patch && \
+    cd /optimum-habana/examples/text-generation && pip install -r requirements.txt && \
+    cd /optimum-habana && python setup.py install
+
+WORKDIR /home/user/comps/llms/text-generation/qwen2
+
+# ENTRYPOINT ["python", "llm.py"]
+
+ENTRYPOINT ["/usr/bin/sleep", "infinity"]
diff --git a/comps/llms/text-generation/qwen2/llm.py b/comps/llms/text-generation/qwen2/llm.py
new file mode 100644
index 000000000..ccc6d2a4f
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/llm.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import torch
+from datetime import datetime
+from fastapi.responses import StreamingResponse
+from langsmith import traceable
+from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+from utils import initialize_model
+
+
+def warmup():
+    input_sentences = [
+        "DeepSpeed is a machine learning framework",
+        "He is working on",
+        "He has a",
+        "He got all"
+    ]
+    input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True)
+    for t in input_tokens:
+        if torch.is_tensor(input_tokens[t]):
+            input_tokens[t] = input_tokens[t].to("hpu")
+    for i in range(3):
+        print(f"Current time: {datetime.now()}")
+        print(f"Warming up {i+1}...")
+        outputs = model.generate(
+            **input_tokens,
+            generation_config=generation_config,
+            lazy_mode=True,
+            hpu_graphs=True,
+            profiling_steps=0,
+            profiling_warmup_steps=0,
+        ).cpu()
+        res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        print(f"res: {res}")
+
+
+@register_microservice(
+    name="opea_service@llm_qwen",
+    service_type=ServiceType.LLM,
+    endpoint="/v1/chat/completions",
+    host="0.0.0.0",
+    port=8000,
+)
+@traceable(run_type="llm")
+def llm_generate(input: LLMParamsDoc):
+    input_query = input.query
+    input_tokens = tokenizer.batch_encode_plus([input_query], return_tensors="pt", padding=True)
+    for t in input_tokens:
+        if torch.is_tensor(input_tokens[t]):
+            input_tokens[t] = input_tokens[t].to("hpu")
+
+    print(f"[llm - qwen] Current time: {datetime.now()}")
+    output = model.generate(
+        **input_tokens,
+        generation_config=generation_config,
+        lazy_mode=True,
+        hpu_graphs=True,
+        profiling_steps=0,
+        profiling_warmup_steps=0,
+    ).cpu()
+    res = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+    print(f"[llm - qwen] res: {res}")
+    return res
+
+
+if __name__ == "__main__":
+    model, tokenizer, generation_config = initialize_model(
+        model_name_or_path="Qwen/Qwen1.5-7B-Chat",
+        max_new_tokens=128
+    )
+    import habana_frameworks.torch.hpu as torch_hpu
+    print(f"[llm - qwen] model and tokenizer initialized.")
+
+    from optimum.habana.utils import HabanaProfile
+    # compilation stage disable profiling
+    HabanaProfile.disable()
+    # Compilation
+    print("Graph compilation...")
+    warmup()
+    print(f"[llm - qwen] model warm up finished.")
+
+    torch_hpu.synchronize()
+    HabanaProfile.enable()
+    print("[llm - qwen] Ready to inference")
+
+    opea_microservices["opea_service@llm_qwen"].start()
diff --git a/comps/llms/text-generation/qwen2/requirements.txt b/comps/llms/text-generation/qwen2/requirements.txt
new file mode 100644
index 000000000..0a6509596
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/requirements.txt
@@ -0,0 +1,8 @@
+docarray[full]
+fastapi
+langsmith
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+shortuuid
+transformers
diff --git a/comps/llms/text-generation/qwen2/utils.py b/comps/llms/text-generation/qwen2/utils.py
new file mode 100644
index 000000000..14fc3349d
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/utils.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import copy
+import time
+import torch
+import shutil
+from transformers.utils import check_min_version
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from optimum.habana.checkpoint_utils import (
+    get_ds_injection_policy,
+    get_repo_root,
+    model_is_optimized,
+    model_on_meta,
+    write_checkpoints_json,
+)
+from optimum.habana.utils import (
+    check_habana_frameworks_version,
+    check_optimum_habana_min_version,
+    set_seed
+)
+
+
+def setup_env():
+    # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+    check_min_version("4.34.0")
+    check_optimum_habana_min_version("1.9.0.dev0")
+    # TODO: SW-167588 - WA for memory issue in hqt prep_model
+    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
+
+    # Tweak generation so that it runs faster on Gaudi
+    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+    adapt_transformers_to_gaudi()
+
+
+def setup_device():
+    import habana_frameworks.torch.core as htcore
+
+    return torch.device("hpu")
+
+
+def get_torch_compiled_model(model):
+    model.model = torch.compile(model.model, backend="hpu_backend")
+    return model
+
+
+def setup_model(model_name_or_path, model_dtype, model_kwargs):
+    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype, **model_kwargs)
+    model = model.eval().to("hpu")
+
+    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+    if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon":
+        model = wrap_in_hpu_graph(model, hash_with_views=False)
+    else:
+        model = wrap_in_hpu_graph(model)
+
+    if model.config.model_type == "llama":
+        model = get_torch_compiled_model(model)
+
+    return model
+
+
+def setup_tokenizer(model_name_or_path, model):
+    tokenizer_kwargs = {
+        "revision": "main",
+        "token": None,
+    }
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs)
+    if not model.config.is_encoder_decoder:
+        tokenizer.padding_side = "left"
+    # Some models like GPT2 do not have a PAD token so we have to set it if necessary
+    if model.config.model_type == "llama":
+        # unwind broken decapoda-research config
+        model.generation_config.pad_token_id = 0
+        model.generation_config.bos_token_id = 1
+        model.generation_config.eos_token_id = 2
+        tokenizer.bos_token_id = model.generation_config.bos_token_id
+        tokenizer.eos_token_id = model.generation_config.eos_token_id
+        tokenizer.pad_token_id = model.generation_config.pad_token_id
+        tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id)
+        tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
+        tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)
+
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        model.generation_config.pad_token_id = model.generation_config.eos_token_id
+    return tokenizer, model
+
+
+def setup_generation_config(model, tokenizer, max_new_tokens):
+    bad_words_ids = None
+    force_words_ids = None
+
+    is_optimized = model_is_optimized(model.config)
+    # Generation configuration
+    generation_config = copy.deepcopy(model.generation_config)
+    generation_config.max_new_tokens = max_new_tokens
+    generation_config.use_cache = True
+    generation_config.static_shapes = is_optimized
+    generation_config.bucket_size = -1
+    generation_config.bucket_internal = True
+    generation_config.do_sample = True
+    generation_config.num_beams = 1
+    generation_config.bad_words_ids = bad_words_ids
+    generation_config.force_words_ids = force_words_ids
+    generation_config.num_return_sequences = 1
+    generation_config.trim_logits = True
+    generation_config.attn_softmax_bf16 = True
+    generation_config.limit_hpu_graphs = True
+    generation_config.reuse_cache = False
+    generation_config.reduce_recompile = False
+    generation_config.use_flash_attention = False
+    generation_config.flash_attention_recompute = True
+    generation_config.flash_attention_causal_mask = True
+    return generation_config
+
+
+def initialize_model(model_name_or_path, max_new_tokens=128):
+    init_start = time.perf_counter()
+    setup_env()
+    setup_device()
+    set_seed(17)
+    get_repo_root(model_name_or_path, local_rank=0, token=None)
+    model_dtype = torch.bfloat16
+
+    model_kwargs = {
+        "revision": "main",
+        "token": None,
+        "device_map": "auto",
+        "offload_folder": "/tmp/offload_folder/"
+    }
+
+    model = setup_model(model_name_or_path, model_dtype, model_kwargs)
+    tokenizer, model = setup_tokenizer(model_name_or_path, model)
+    generation_config = setup_generation_config(model, tokenizer, max_new_tokens)
+
+    init_end = time.perf_counter()
+    print(f"Model initialization took {(init_end - init_start):.3f}s")
+    return model, tokenizer, generation_config

From 5a1cef4f97493f51e2e61acb0e0367243389c0a1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 4 Jun 2024 11:30:01 +0000
Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 comps/llms/text-generation/qwen2/Dockerfile |  3 +++
 comps/llms/text-generation/qwen2/llm.py     | 22 ++++++++++------------
 comps/llms/text-generation/qwen2/utils.py   | 22 +++++++---------------
 3 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/comps/llms/text-generation/qwen2/Dockerfile b/comps/llms/text-generation/qwen2/Dockerfile
index 760d80f2c..52cf3da3b 100644
--- a/comps/llms/text-generation/qwen2/Dockerfile
+++ b/comps/llms/text-generation/qwen2/Dockerfile
@@ -1,5 +1,8 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 
 
 # HABANA environment
 FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest as hpu
 
diff --git a/comps/llms/text-generation/qwen2/llm.py b/comps/llms/text-generation/qwen2/llm.py
index ccc6d2a4f..4f407ccd6 100644
--- a/comps/llms/text-generation/qwen2/llm.py
+++ b/comps/llms/text-generation/qwen2/llm.py
@@ -13,21 +13,18 @@
 # limitations under the License.
 
 import os
-import torch
 from datetime import datetime
+
+import torch
 from fastapi.responses import StreamingResponse
 from langsmith import traceable
-from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
 from utils import initialize_model
 
+from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+
 
 def warmup():
-    input_sentences = [
-        "DeepSpeed is a machine learning framework",
-        "He is working on",
-        "He has a",
-        "He got all"
-    ]
+    input_sentences = ["DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all"]
     input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True)
     for t in input_tokens:
         if torch.is_tensor(input_tokens[t]):
@@ -78,19 +75,20 @@ def llm_generate(input: LLMParamsDoc):
 
 if __name__ == "__main__":
     model, tokenizer, generation_config = initialize_model(
-        model_name_or_path="Qwen/Qwen1.5-7B-Chat",
-        max_new_tokens=128
+        model_name_or_path="Qwen/Qwen1.5-7B-Chat", max_new_tokens=128
     )
     import habana_frameworks.torch.hpu as torch_hpu
-    print(f"[llm - qwen] model and tokenizer initialized.")
+
+    print("[llm - qwen] model and tokenizer initialized.")
 
     from optimum.habana.utils import HabanaProfile
+
     # compilation stage disable profiling
     HabanaProfile.disable()
     # Compilation
     print("Graph compilation...")
     warmup()
-    print(f"[llm - qwen] model warm up finished.")
+    print("[llm - qwen] model warm up finished.")
 
     torch_hpu.synchronize()
     HabanaProfile.enable()
diff --git a/comps/llms/text-generation/qwen2/utils.py b/comps/llms/text-generation/qwen2/utils.py
index 14fc3349d..3eef7a6e2 100644
--- a/comps/llms/text-generation/qwen2/utils.py
+++ b/comps/llms/text-generation/qwen2/utils.py
@@ -13,13 +13,12 @@
 # limitations under the License.
 
 
-import os
 import copy
+import os
+import shutil
 import time
+
 import torch
-import shutil
-from transformers.utils import check_min_version
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from optimum.habana.checkpoint_utils import (
     get_ds_injection_policy,
     get_repo_root,
@@ -27,11 +26,9 @@
     model_on_meta,
     write_checkpoints_json,
 )
-from optimum.habana.utils import (
-    check_habana_frameworks_version,
-    check_optimum_habana_min_version,
-    set_seed
-)
+from optimum.habana.utils import check_habana_frameworks_version, check_optimum_habana_min_version, set_seed
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.utils import check_min_version
 
 
 def setup_env():
@@ -138,12 +135,7 @@ def initialize_model(model_name_or_path, max_new_tokens=128):
     get_repo_root(model_name_or_path, local_rank=0, token=None)
     model_dtype = torch.bfloat16
 
-    model_kwargs = {
-        "revision": "main",
-        "token": None,
-        "device_map": "auto",
-        "offload_folder": "/tmp/offload_folder/"
-    }
+    model_kwargs = {"revision": "main", "token": None, "device_map": "auto", "offload_folder": "/tmp/offload_folder/"}
 
     model = setup_model(model_name_or_path, model_dtype, model_kwargs)
     tokenizer, model = setup_tokenizer(model_name_or_path, model)

From 4cd37272a356ae76e4cbac64f98e5e751df5c9ed Mon Sep 17 00:00:00 2001
From: letonghan
Date: Tue, 4 Jun 2024 19:32:01 +0800
Subject: [PATCH 4/5] add patch file

Signed-off-by: letonghan
---
 comps/llms/text-generation/qwen2/qwen2.patch | 127 +++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 comps/llms/text-generation/qwen2/qwen2.patch

diff --git a/comps/llms/text-generation/qwen2/qwen2.patch b/comps/llms/text-generation/qwen2/qwen2.patch
new file mode 100644
index 000000000..9b5d93567
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/qwen2.patch
@@ -0,0 +1,127 @@
+diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py
+index b086c80..e0e5a9f 100644
+--- a/examples/text-generation/run_lm_eval.py
++++ b/examples/text-generation/run_lm_eval.py
+@@ -75,13 +75,13 @@ class HabanaModelAdapter(lm_eval.base.BaseLM):
+         self.options = options
+         self._device = args.device
+         self.model_inputs = {"use_cache": self.options.use_cache}
+-        if self.model.config.model_type in ["llama", "falcon"]:
++        if self.model.config.model_type in ["llama", "falcon", "qwen2"]:
+             self.model_inputs.update(
+                 {
+                     "reuse_cache": self.options.reuse_cache,
+                 }
+             )
+-        if self.model.config.model_type == "llama":
++        if self.model.config.model_type in ["llama","mistral","qwen2"]:
+             self.model_inputs.update(
+                 {
+                     "attn_softmax_bf16": self.options.attn_softmax_bf16,
+diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
+index 8bce0ae..c29f458 100644
+--- a/examples/text-generation/utils.py
++++ b/examples/text-generation/utils.py
+@@ -234,7 +234,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
+ 
+     model = deepspeed.init_inference(model, **ds_inference_kwargs)
+     model = model.module
+-    if model.config.model_type in ["llama", "falcon"]:
++    if model.config.model_type in ["llama", "falcon","qwen2"]:
+         patch_scoped_linear_all_reduce(model)
+ 
+     if args.quant_config:
+diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py
+index 0d50470..94cc7eb 100755
+--- a/optimum/habana/transformers/generation/utils.py
++++ b/optimum/habana/transformers/generation/utils.py
+@@ -740,7 +740,7 @@ class GaudiGenerationMixin(GenerationMixin):
+             )
+             model_kwargs["kv_cache_len"] = calculated_max_length
+ 
+-        if self.config.model_type in ["llama", "falcon"]:
++        if self.config.model_type in ["llama", "falcon","qwen2"]:
+             if self.config.max_position_embeddings < calculated_max_length:
+                 unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length)
+ 
+diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py
+index 6dc40a7..b5044af 100644
+--- a/optimum/habana/transformers/modeling_utils.py
++++ b/optimum/habana/transformers/modeling_utils.py
+@@ -55,6 +55,9 @@ from .models import (
+     GaudiOPTForCausalLM,
+     GaudiOPTLearnedPositionalEmbedding,
+     GaudiPhiForCausalLM,
++    GaudiQwen2Model,
++    GaudiQwen2Attention,
++    GaudiQwen2MLP,
+     _gaudi_wav2vec2_compute_mask_indices,
+     _gaudi_wav2vec2_mask_hidden_states,
+     gaudi_albert_forward,
+@@ -118,6 +121,7 @@ from .models import (
+     gaudi_phi_attention_forward,
+     gaudi_phi_decoder_layer_forward,
+     gaudi_phi_model_forward,
++    gaudi_qwen2_rmsnorm_forward,
+     gaudi_rot_matmul,
+     gaudi_rot_vec_mul,
+     gaudi_SpeechT5Attention_forward,
+@@ -367,3 +371,11 @@ def adapt_transformers_to_gaudi():
+     transformers.models.speecht5.modeling_speecht5.SpeechT5SpeechDecoderPrenet.forward = (
+         gaudi_SpeechT5SpeechDecoderPrenet_forward
+     )
++
++    # Optimization for qwen2 on Gaudi
++    transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM = GaudiQwen2ForCausalLM
++    transformers.models.qwen2.modeling_qwen2.Qwen2Model = GaudiQwen2Model
++    transformers.models.qwen2.modeling_qwen2.Qwen2Attention = GaudiQwen2Attention
++    transformers.models.qwen2.modeling_qwen2.Qwen2MLP = GaudiQwen2MLP
++    transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer = GaudiQwen2DecoderLayer
++    transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm.forward = gaudi_qwen2_rmsnorm_forward
+diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py
+index 1582d3f..41fdfdc 100644
+--- a/optimum/habana/transformers/models/__init__.py
++++ b/optimum/habana/transformers/models/__init__.py
+@@ -122,6 +122,14 @@ from .phi import (
+     gaudi_phi_decoder_layer_forward,
+     gaudi_phi_model_forward,
+ )
++from .qwen2 import (
++    GaudiQwen2Attention,
++    GaudiQwen2DecoderLayer,
++    GaudiQwen2ForCausalLM,
++    GaudiQwen2MLP,
++    GaudiQwen2Model,
++    gaudi_qwen2_rmsnorm_forward,
++)
+ from .speecht5 import (
+     gaudi_generate_speech,
+     gaudi_SpeechT5Attention_forward,
+diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
+index dc6e136..7dfebaa 100644
+--- a/optimum/habana/transformers/trainer.py
++++ b/optimum/habana/transformers/trainer.py
+@@ -916,9 +916,9 @@ class GaudiTrainer(Trainer):
+                 if step % args.gradient_accumulation_steps == 0:
+                     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
+ 
+-                # attn_softmax_bf16 and use_flash_attention is enabled only for llama
++                # attn_softmax_bf16 and use_flash_attention is enabled only for llama and qwen2
+                 if hasattr(self.model, "generation_config") and self.model.generation_config is not None:
+-                    if self.model.config.model_type == "llama":
++                    if self.model.config.model_type in ["llama", "qwen2"]:
+                         if self.model.generation_config.attn_softmax_bf16:
+                             inputs["attn_softmax_bf16"] = True
+                         if self.model.generation_config.use_flash_attention:
+@@ -1799,9 +1799,9 @@ class GaudiTrainer(Trainer):
+         if batch_size is None:
+             batch_size = observed_batch_size
+ 
+-        # attn_softmax_bf16 and use_flash_attention are enabled only for llama
++        # attn_softmax_bf16 and use_flash_attention are enabled only for llama and qwen2
+         if hasattr(self.model, "generation_config") and self.model.generation_config is not None:
+-            if self.model.config.model_type == "llama":
++            if self.model.config.model_type in ["llama", "qwen2"]:
+                 if self.model.generation_config.attn_softmax_bf16:
+                     inputs["attn_softmax_bf16"] = True
+                 if self.model.generation_config.use_flash_attention:

From a6b0d997428d7f53cb3894b604c171d8b3fbbc9f Mon Sep 17 00:00:00 2001
From: letonghan
Date: Tue, 4 Jun 2024 20:40:17 +0800
Subject: [PATCH 5/5] update dockerfile

Signed-off-by: letonghan
---
 comps/llms/text-generation/qwen2/Dockerfile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/comps/llms/text-generation/qwen2/Dockerfile b/comps/llms/text-generation/qwen2/Dockerfile
index 52cf3da3b..9d7d1e094 100644
--- a/comps/llms/text-generation/qwen2/Dockerfile
+++ b/comps/llms/text-generation/qwen2/Dockerfile
@@ -38,6 +38,4 @@ RUN cd /optimum-habana && git apply /qwen2.patch && \
 
 WORKDIR /home/user/comps/llms/text-generation/qwen2
 
-# ENTRYPOINT ["python", "llm.py"]
-
-ENTRYPOINT ["/usr/bin/sleep", "infinity"]
+ENTRYPOINT ["python", "llm.py"]
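
Note: once these patches are applied and the qwen2 image is built and started, the microservice registered in llm.py (host 0.0.0.0, port 8000, endpoint /v1/chat/completions, LLMParamsDoc input) can be exercised with a small client. The sketch below is only illustrative; the localhost address, port mapping, prompt text, and timeout are assumptions and not part of the patches.

# Hypothetical smoke test for the Qwen2 LLM microservice added in PATCH 2/5.
# Assumes the container's port 8000 is reachable on localhost; adjust the URL to your deployment.
import requests

payload = {"query": "What is Deep Learning?"}  # LLMParamsDoc: only the query text is supplied here
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json=payload,
    timeout=300,
)
response.raise_for_status()
print(response.text)  # generated text produced by llm_generate()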