From 008c9866bbf329ad11a6d78f3def546c4adbb800 Mon Sep 17 00:00:00 2001
From: letonghan
Date: Fri, 31 May 2024 15:31:12 +0800
Subject: [PATCH 1/5] fix stream=false doesn't work issue

Signed-off-by: letonghan
---
 comps/cores/mega/gateway.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/comps/cores/mega/gateway.py b/comps/cores/mega/gateway.py
index a59fdcea6..f2d6f99df 100644
--- a/comps/cores/mega/gateway.py
+++ b/comps/cores/mega/gateway.py
@@ -118,6 +118,7 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
 
     async def handle_request(self, request: Request):
         data = await request.json()
+        stream_opt = data.get("stream", True)
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
@@ -126,7 +127,7 @@ async def handle_request(self, request: Request):
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
             repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-            streaming=chat_request.stream if chat_request.stream else True,
+            streaming=stream_opt,
         )
         await self.megaservice.schedule(initial_inputs={"text": prompt}, llm_parameters=parameters)
         for node, response in self.megaservice.result_dict.items():
@@ -159,6 +160,7 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
 
     async def handle_request(self, request: Request):
         data = await request.json()
+        stream_opt = data.get("stream", True)
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
@@ -167,7 +169,7 @@ async def handle_request(self, request: Request):
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
             repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-            streaming=chat_request.stream if chat_request.stream else True,
+            streaming=stream_opt,
         )
         await self.megaservice.schedule(initial_inputs={"query": prompt}, llm_parameters=parameters)
         for node, response in self.megaservice.result_dict.items():
@@ -247,6 +249,7 @@ def __init__(self, megaservice, host="0.0.0.0", port=8888):
 
     async def handle_request(self, request: Request):
         data = await request.json()
+        stream_opt = data.get("stream", True)
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
@@ -255,7 +258,7 @@ async def handle_request(self, request: Request):
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
             repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-            streaming=chat_request.stream if chat_request.stream else True,
+            streaming=stream_opt,
         )
         await self.megaservice.schedule(initial_inputs={"query": prompt}, llm_parameters=parameters)
         for node, response in self.megaservice.result_dict.items():

From 0bb2c0cc9787cc8b6b2352f594f2416bf6b70392 Mon Sep 17 00:00:00 2001
From: letonghan
Date: Tue, 4 Jun 2024 19:28:32 +0800
Subject: [PATCH 2/5] support qwen2 in llm microservice

Signed-off-by: letonghan
---
 comps/llms/text-generation/qwen2/Dockerfile   |  40 +++++
 comps/llms/text-generation/qwen2/llm.py       |  99 +++++++++++
 .../text-generation/qwen2/requirements.txt    |   8 +
 comps/llms/text-generation/qwen2/utils.py     | 154 ++++++++++++++++++
 4 files changed, 301 insertions(+)
 create mode 100644 comps/llms/text-generation/qwen2/Dockerfile
 create mode 100644 comps/llms/text-generation/qwen2/llm.py
 create mode 100644 comps/llms/text-generation/qwen2/requirements.txt
 create mode 100644 comps/llms/text-generation/qwen2/utils.py

diff --git a/comps/llms/text-generation/qwen2/Dockerfile b/comps/llms/text-generation/qwen2/Dockerfile
new file mode 100644
index 000000000..760d80f2c
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/Dockerfile
@@ -0,0 +1,40 @@
+
+
+# HABANA environment
+FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest as hpu
+
+ENV LANG=en_US.UTF-8
+ARG REPO=https://github.com/huggingface/optimum-habana.git
+ARG REPO_VER=v1.11.1
+
+RUN apt-get update && \
+    apt-get install git-lfs && \
+    git-lfs install && \
+    apt-get install -y --no-install-recommends --fix-missing \
+        libgl1-mesa-glx \
+        libjemalloc-dev \
+        vim
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+COPY comps/llms/text-generation/qwen2/qwen2.patch /home/user/qwen2.patch
+
+SHELL ["/bin/bash", "--login", "-c"]
+RUN git clone --single-branch -b ${REPO_VER} ${REPO} /optimum-habana
+
+ENV PYTHONPATH=/root:/home/user
+
+RUN cd /optimum-habana && git apply /qwen2.patch && \
+    cd /optimum-habana/examples/text-generation && pip install -r requirements.txt && \
+    cd /optimum-habana && python setup.py install
+
+WORKDIR /home/user/comps/llms/text-generation/qwen2
+
+# ENTRYPOINT ["python", "llm.py"]
+
+ENTRYPOINT ["/usr/bin/sleep", "infinity"]
diff --git a/comps/llms/text-generation/qwen2/llm.py b/comps/llms/text-generation/qwen2/llm.py
new file mode 100644
index 000000000..ccc6d2a4f
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/llm.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import torch
+from datetime import datetime
+from fastapi.responses import StreamingResponse
+from langsmith import traceable
+from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+from utils import initialize_model
+
+
+def warmup():
+    input_sentences = [
+        "DeepSpeed is a machine learning framework",
+        "He is working on",
+        "He has a",
+        "He got all"
+    ]
+    input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True)
+    for t in input_tokens:
+        if torch.is_tensor(input_tokens[t]):
+            input_tokens[t] = input_tokens[t].to("hpu")
+    for i in range(3):
+        print(f"Current time: {datetime.now()}")
+        print(f"Warming up {i+1}...")
+        outputs = model.generate(
+            **input_tokens,
+            generation_config=generation_config,
+            lazy_mode=True,
+            hpu_graphs=True,
+            profiling_steps=0,
+            profiling_warmup_steps=0,
+        ).cpu()
+        res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        print(f"res: {res}")
+
+
+@register_microservice(
+    name="opea_service@llm_qwen",
+    service_type=ServiceType.LLM,
+    endpoint="/v1/chat/completions",
+    host="0.0.0.0",
+    port=8000,
+)
+@traceable(run_type="llm")
+def llm_generate(input: LLMParamsDoc):
+    input_query = input.query
+    input_tokens = tokenizer.batch_encode_plus([input_query], return_tensors="pt", padding=True)
+    for t in input_tokens:
+        if torch.is_tensor(input_tokens[t]):
+            input_tokens[t] = input_tokens[t].to("hpu")
+
+    print(f"[llm - qwen] Current time: {datetime.now()}")
+    output = model.generate(
+        **input_tokens,
+        generation_config=generation_config,
+        lazy_mode=True,
+        hpu_graphs=True,
+        profiling_steps=0,
+        profiling_warmup_steps=0,
+    ).cpu()
+    res = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+    print(f"[llm - qwen] res: {res}")
+    return res
+
+
+if __name__ == "__main__":
+    model, tokenizer, generation_config = initialize_model(
+        model_name_or_path="Qwen/Qwen1.5-7B-Chat",
+        max_new_tokens=128
+    )
+    import habana_frameworks.torch.hpu as torch_hpu
+    print(f"[llm - qwen] model and tokenizer initialized.")
+
+    from optimum.habana.utils import HabanaProfile
+    # compilation stage disable profiling
+    HabanaProfile.disable()
+    # Compilation
+    print("Graph compilation...")
+    warmup()
+    print(f"[llm - qwen] model warm up finished.")
+
+    torch_hpu.synchronize()
+    HabanaProfile.enable()
+    print("[llm - qwen] Ready to inference")
+
+    opea_microservices["opea_service@llm_qwen"].start()
diff --git a/comps/llms/text-generation/qwen2/requirements.txt b/comps/llms/text-generation/qwen2/requirements.txt
new file mode 100644
index 000000000..0a6509596
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/requirements.txt
@@ -0,0 +1,8 @@
+docarray[full]
+fastapi
+langsmith
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+shortuuid
+transformers
diff --git a/comps/llms/text-generation/qwen2/utils.py b/comps/llms/text-generation/qwen2/utils.py
new file mode 100644
index 000000000..14fc3349d
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/utils.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import copy
+import time
+import torch
+import shutil
+from transformers.utils import check_min_version
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from optimum.habana.checkpoint_utils import (
+    get_ds_injection_policy,
+    get_repo_root,
+    model_is_optimized,
+    model_on_meta,
+    write_checkpoints_json,
+)
+from optimum.habana.utils import (
+    check_habana_frameworks_version,
+    check_optimum_habana_min_version,
+    set_seed
+)
+
+
+def setup_env():
+    # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+    check_min_version("4.34.0")
+    check_optimum_habana_min_version("1.9.0.dev0")
+    # TODO: SW-167588 - WA for memory issue in hqt prep_model
+    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
+
+    # Tweak generation so that it runs faster on Gaudi
+    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+    adapt_transformers_to_gaudi()
+
+
+def setup_device():
+    import habana_frameworks.torch.core as htcore
+
+    return torch.device("hpu")
+
+
+def get_torch_compiled_model(model):
+    model.model = torch.compile(model.model, backend="hpu_backend")
+    return model
+
+
+def setup_model(model_name_or_path, model_dtype, model_kwargs):
+    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype, **model_kwargs)
+    model = model.eval().to("hpu")
+
+    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+    if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon":
+        model = wrap_in_hpu_graph(model, hash_with_views=False)
+    else:
+        model = wrap_in_hpu_graph(model)
+
+    if model.config.model_type == "llama":
+        model = get_torch_compiled_model(model)
+
+    return model
+
+
+def setup_tokenizer(model_name_or_path, model):
+    tokenizer_kwargs = {
+        "revision": "main",
+        "token": None,
+    }
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs)
+    if not model.config.is_encoder_decoder:
+        tokenizer.padding_side = "left"
+    # Some models like GPT2 do not have a PAD token so we have to set it if necessary
+    if model.config.model_type == "llama":
+        # unwind broken decapoda-research config
+        model.generation_config.pad_token_id = 0
+        model.generation_config.bos_token_id = 1
+        model.generation_config.eos_token_id = 2
+        tokenizer.bos_token_id = model.generation_config.bos_token_id
+        tokenizer.eos_token_id = model.generation_config.eos_token_id
+        tokenizer.pad_token_id = model.generation_config.pad_token_id
+        tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id)
+        tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
+        tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)
+
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        model.generation_config.pad_token_id = model.generation_config.eos_token_id
+    return tokenizer, model
+
+
+def setup_generation_config(model, tokenizer, max_new_tokens):
+    bad_words_ids = None
+    force_words_ids = None
+
+    is_optimized = model_is_optimized(model.config)
+    # Generation configuration
+    generation_config = copy.deepcopy(model.generation_config)
+    generation_config.max_new_tokens = max_new_tokens
+    generation_config.use_cache = True
+    generation_config.static_shapes = is_optimized
+    generation_config.bucket_size = -1
+    generation_config.bucket_internal = True
+    generation_config.do_sample = True
+    generation_config.num_beams = 1
+    generation_config.bad_words_ids = bad_words_ids
+    generation_config.force_words_ids = force_words_ids
+    generation_config.num_return_sequences = 1
+    generation_config.trim_logits = True
+    generation_config.attn_softmax_bf16 = True
+    generation_config.limit_hpu_graphs = True
+    generation_config.reuse_cache = False
+    generation_config.reduce_recompile = False
+    generation_config.use_flash_attention = False
+    generation_config.flash_attention_recompute = True
+    generation_config.flash_attention_causal_mask = True
+    return generation_config
+
+
+def initialize_model(model_name_or_path, max_new_tokens=128):
+    init_start = time.perf_counter()
+    setup_env()
+    setup_device()
+    set_seed(17)
+    get_repo_root(model_name_or_path, local_rank=0, token=None)
+    model_dtype = torch.bfloat16
+
+    model_kwargs = {
+        "revision": "main",
+        "token": None,
+        "device_map": "auto",
+        "offload_folder": "/tmp/offload_folder/"
+    }
+
+    model = setup_model(model_name_or_path, model_dtype, model_kwargs)
+    tokenizer, model = setup_tokenizer(model_name_or_path, model)
+    generation_config = setup_generation_config(model, tokenizer, max_new_tokens)
+
+    init_end = time.perf_counter()
+    print(f"Model initialization took {(init_end - init_start):.3f}s")
+    return model, tokenizer, generation_config

From 5a1cef4f97493f51e2e61acb0e0367243389c0a1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 4 Jun 2024 11:30:01 +0000
Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 comps/llms/text-generation/qwen2/Dockerfile |  3 +++
 comps/llms/text-generation/qwen2/llm.py     | 22 ++++++++++------------
 comps/llms/text-generation/qwen2/utils.py   | 22 +++++++---------------
 3 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/comps/llms/text-generation/qwen2/Dockerfile b/comps/llms/text-generation/qwen2/Dockerfile
index 760d80f2c..52cf3da3b 100644
--- a/comps/llms/text-generation/qwen2/Dockerfile
+++ b/comps/llms/text-generation/qwen2/Dockerfile
@@ -1,5 +1,8 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 
 
 # HABANA environment
 FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest as hpu
 
diff --git a/comps/llms/text-generation/qwen2/llm.py b/comps/llms/text-generation/qwen2/llm.py
index ccc6d2a4f..4f407ccd6 100644
--- a/comps/llms/text-generation/qwen2/llm.py
+++ b/comps/llms/text-generation/qwen2/llm.py
@@ -13,21 +13,18 @@
 # limitations under the License.
 
 import os
-import torch
 from datetime import datetime
+
+import torch
 from fastapi.responses import StreamingResponse
 from langsmith import traceable
-from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
 from utils import initialize_model
 
+from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+
 
 def warmup():
-    input_sentences = [
-        "DeepSpeed is a machine learning framework",
-        "He is working on",
-        "He has a",
-        "He got all"
-    ]
+    input_sentences = ["DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all"]
     input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True)
     for t in input_tokens:
         if torch.is_tensor(input_tokens[t]):
@@ -78,19 +75,20 @@ def llm_generate(input: LLMParamsDoc):
 
 if __name__ == "__main__":
     model, tokenizer, generation_config = initialize_model(
-        model_name_or_path="Qwen/Qwen1.5-7B-Chat",
-        max_new_tokens=128
+        model_name_or_path="Qwen/Qwen1.5-7B-Chat", max_new_tokens=128
     )
     import habana_frameworks.torch.hpu as torch_hpu
-    print(f"[llm - qwen] model and tokenizer initialized.")
+
+    print("[llm - qwen] model and tokenizer initialized.")
 
     from optimum.habana.utils import HabanaProfile
+
     # compilation stage disable profiling
     HabanaProfile.disable()
     # Compilation
     print("Graph compilation...")
     warmup()
-    print(f"[llm - qwen] model warm up finished.")
+    print("[llm - qwen] model warm up finished.")
 
     torch_hpu.synchronize()
     HabanaProfile.enable()
diff --git a/comps/llms/text-generation/qwen2/utils.py b/comps/llms/text-generation/qwen2/utils.py
index 14fc3349d..3eef7a6e2 100644
--- a/comps/llms/text-generation/qwen2/utils.py
+++ b/comps/llms/text-generation/qwen2/utils.py
@@ -13,13 +13,12 @@
 # limitations under the License.
 
 
-import os
 import copy
+import os
+import shutil
 import time
+
 import torch
-import shutil
-from transformers.utils import check_min_version
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from optimum.habana.checkpoint_utils import (
     get_ds_injection_policy,
     get_repo_root,
@@ -27,11 +26,9 @@
     model_on_meta,
     write_checkpoints_json,
 )
-from optimum.habana.utils import (
-    check_habana_frameworks_version,
-    check_optimum_habana_min_version,
-    set_seed
-)
+from optimum.habana.utils import check_habana_frameworks_version, check_optimum_habana_min_version, set_seed
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.utils import check_min_version
 
 
 def setup_env():
@@ -138,12 +135,7 @@ def initialize_model(model_name_or_path, max_new_tokens=128):
     get_repo_root(model_name_or_path, local_rank=0, token=None)
     model_dtype = torch.bfloat16
 
-    model_kwargs = {
-        "revision": "main",
-        "token": None,
-        "device_map": "auto",
-        "offload_folder": "/tmp/offload_folder/"
-    }
+    model_kwargs = {"revision": "main", "token": None, "device_map": "auto", "offload_folder": "/tmp/offload_folder/"}
 
     model = setup_model(model_name_or_path, model_dtype, model_kwargs)
     tokenizer, model = setup_tokenizer(model_name_or_path, model)

From 4cd37272a356ae76e4cbac64f98e5e751df5c9ed Mon Sep 17 00:00:00 2001
From: letonghan
Date: Tue, 4 Jun 2024 19:32:01 +0800
Subject: [PATCH 4/5] add patch file

Signed-off-by: letonghan
---
 comps/llms/text-generation/qwen2/qwen2.patch | 127 +++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 comps/llms/text-generation/qwen2/qwen2.patch

diff --git a/comps/llms/text-generation/qwen2/qwen2.patch b/comps/llms/text-generation/qwen2/qwen2.patch
new file mode 100644
index 000000000..9b5d93567
--- /dev/null
+++ b/comps/llms/text-generation/qwen2/qwen2.patch
@@ -0,0 +1,127 @@
+diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py
+index b086c80..e0e5a9f 100644
+--- a/examples/text-generation/run_lm_eval.py
++++ b/examples/text-generation/run_lm_eval.py
+@@ -75,13 +75,13 @@ class HabanaModelAdapter(lm_eval.base.BaseLM):
+         self.options = options
+         self._device = args.device
+         self.model_inputs = {"use_cache": self.options.use_cache}
+-        if self.model.config.model_type in ["llama", "falcon"]:
++        if self.model.config.model_type in ["llama", "falcon", "qwen2"]:
+             self.model_inputs.update(
+                 {
+                     "reuse_cache": self.options.reuse_cache,
+                 }
+             )
+-        if self.model.config.model_type == "llama":
++        if self.model.config.model_type in ["llama","mistral","qwen2"]:
+             self.model_inputs.update(
+                 {
+                     "attn_softmax_bf16": self.options.attn_softmax_bf16,
+diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
+index 8bce0ae..c29f458 100644
+--- a/examples/text-generation/utils.py
++++ b/examples/text-generation/utils.py
+@@ -234,7 +234,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
+ 
+     model = deepspeed.init_inference(model, **ds_inference_kwargs)
+     model = model.module
+-    if model.config.model_type in ["llama", "falcon"]:
++    if model.config.model_type in ["llama", "falcon","qwen2"]:
+         patch_scoped_linear_all_reduce(model)
+ 
+     if args.quant_config:
+diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py
+index 0d50470..94cc7eb 100755
+--- a/optimum/habana/transformers/generation/utils.py
++++ b/optimum/habana/transformers/generation/utils.py
+@@ -740,7 +740,7 @@ class GaudiGenerationMixin(GenerationMixin):
+             )
+             model_kwargs["kv_cache_len"] = calculated_max_length
+ 
+-        if self.config.model_type in ["llama", "falcon"]:
++        if self.config.model_type in ["llama", "falcon","qwen2"]:
+             if self.config.max_position_embeddings < calculated_max_length:
+                 unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length)
+ 
+diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py
+index 6dc40a7..b5044af 100644
+--- a/optimum/habana/transformers/modeling_utils.py
++++ b/optimum/habana/transformers/modeling_utils.py
+@@ -55,6 +55,9 @@ from .models import (
+     GaudiOPTForCausalLM,
+     GaudiOPTLearnedPositionalEmbedding,
+     GaudiPhiForCausalLM,
++    GaudiQwen2Model,
++    GaudiQwen2Attention,
++    GaudiQwen2MLP,
+     _gaudi_wav2vec2_compute_mask_indices,
+     _gaudi_wav2vec2_mask_hidden_states,
+     gaudi_albert_forward,
+@@ -118,6 +121,7 @@ from .models import (
+     gaudi_phi_attention_forward,
+     gaudi_phi_decoder_layer_forward,
+     gaudi_phi_model_forward,
++    gaudi_qwen2_rmsnorm_forward,
+     gaudi_rot_matmul,
+     gaudi_rot_vec_mul,
+     gaudi_SpeechT5Attention_forward,
+@@ -367,3 +371,11 @@ def adapt_transformers_to_gaudi():
+     transformers.models.speecht5.modeling_speecht5.SpeechT5SpeechDecoderPrenet.forward = (
+         gaudi_SpeechT5SpeechDecoderPrenet_forward
+     )
++
++    # Optimization for qwen2 on Gaudi
++    transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM = GaudiQwen2ForCausalLM
++    transformers.models.qwen2.modeling_qwen2.Qwen2Model = GaudiQwen2Model
++    transformers.models.qwen2.modeling_qwen2.Qwen2Attention = GaudiQwen2Attention
++    transformers.models.qwen2.modeling_qwen2.Qwen2MLP = GaudiQwen2MLP
++    transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer = GaudiQwen2DecoderLayer
++    transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm.forward = gaudi_qwen2_rmsnorm_forward
+diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py
+index 1582d3f..41fdfdc 100644
+--- a/optimum/habana/transformers/models/__init__.py
++++ b/optimum/habana/transformers/models/__init__.py
+@@ -122,6 +122,14 @@ from .phi import (
+     gaudi_phi_decoder_layer_forward,
+     gaudi_phi_model_forward,
+ )
++from .qwen2 import (
++    GaudiQwen2Attention,
++    GaudiQwen2DecoderLayer,
++    GaudiQwen2ForCausalLM,
++    GaudiQwen2MLP,
++    GaudiQwen2Model,
++    gaudi_qwen2_rmsnorm_forward,
++)
+ from .speecht5 import (
+     gaudi_generate_speech,
+     gaudi_SpeechT5Attention_forward,
+diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
+index dc6e136..7dfebaa 100644
+--- a/optimum/habana/transformers/trainer.py
++++ b/optimum/habana/transformers/trainer.py
+@@ -916,9 +916,9 @@ class GaudiTrainer(Trainer):
+                 if step % args.gradient_accumulation_steps == 0:
+                     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
+ 
+-                # attn_softmax_bf16 and use_flash_attention is enabled only for llama
++                # attn_softmax_bf16 and use_flash_attention is enabled only for llama and qwen2
+                 if hasattr(self.model, "generation_config") and self.model.generation_config is not None:
+-                    if self.model.config.model_type == "llama":
++                    if self.model.config.model_type in ["llama", "qwen2"]:
+                         if self.model.generation_config.attn_softmax_bf16:
+                             inputs["attn_softmax_bf16"] = True
+                         if self.model.generation_config.use_flash_attention:
+@@ -1799,9 +1799,9 @@ class GaudiTrainer(Trainer):
+         if batch_size is None:
+             batch_size = observed_batch_size
+ 
+-        # attn_softmax_bf16 and use_flash_attention are enabled only for llama
++        # attn_softmax_bf16 and use_flash_attention are enabled only for llama and qwen2
+         if hasattr(self.model, "generation_config") and self.model.generation_config is not None:
+-            if self.model.config.model_type == "llama":
++            if self.model.config.model_type in ["llama", "qwen2"]:
+                 if self.model.generation_config.attn_softmax_bf16:
+                     inputs["attn_softmax_bf16"] = True
+                 if self.model.generation_config.use_flash_attention:

From a6b0d997428d7f53cb3894b604c171d8b3fbbc9f Mon Sep 17 00:00:00 2001
From: letonghan
Date: Tue, 4 Jun 2024 20:40:17 +0800
Subject: [PATCH 5/5] update dockerfile

Signed-off-by: letonghan
---
 comps/llms/text-generation/qwen2/Dockerfile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/comps/llms/text-generation/qwen2/Dockerfile b/comps/llms/text-generation/qwen2/Dockerfile
index 52cf3da3b..9d7d1e094 100644
--- a/comps/llms/text-generation/qwen2/Dockerfile
+++ b/comps/llms/text-generation/qwen2/Dockerfile
@@ -38,6 +38,4 @@ RUN cd /optimum-habana && git apply /qwen2.patch && \
 
 WORKDIR /home/user/comps/llms/text-generation/qwen2
 
-# ENTRYPOINT ["python", "llm.py"]
-
-ENTRYPOINT ["/usr/bin/sleep", "infinity"]
+ENTRYPOINT ["python", "llm.py"]
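
Note: once these patches are applied and the qwen2 image is built and started, the microservice registered in llm.py (host 0.0.0.0, port 8000, endpoint /v1/chat/completions, LLMParamsDoc input) can be exercised with a small client. The sketch below is only illustrative; the localhost address, port mapping, prompt text, and timeout are assumptions and not part of the patches.

# Hypothetical smoke test for the Qwen2 LLM microservice added in PATCH 2/5.
# Assumes the container's port 8000 is reachable on localhost; adjust the URL to your deployment.
import requests

payload = {"query": "What is Deep Learning?"}  # LLMParamsDoc: only the query text is supplied here
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json=payload,
    timeout=300,
)
response.raise_for_status()
print(response.text)  # generated text produced by llm_generate()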