diff --git a/comps/cores/mega/gateway.py b/comps/cores/mega/gateway.py
index 2636a0f83..115658e31 100644
--- a/comps/cores/mega/gateway.py
+++ b/comps/cores/mega/gateway.py
@@ -160,11 +160,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
@@ -214,11 +216,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -350,11 +354,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -399,11 +405,13 @@ async def handle_request(self, request: Request):
         chat_request = AudioChatCompletionRequest.parse_obj(data)
         parameters = LLMParams(
             # relatively lower max_tokens for audio conversation
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 128,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 128,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=False,  # TODO add streaming LLM output as input to TTS
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -428,11 +436,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -472,11 +482,13 @@ async def handle_request(self, request: Request):
         chat_request = ChatCompletionRequest.parse_obj(data)
         prompt = self._handle_message(chat_request.messages)
         parameters = LLMParams(
-            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -520,7 +532,9 @@ async def handle_request(self, request: Request):
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -569,7 +583,9 @@ async def handle_request(self, request: Request):
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
@@ -758,7 +774,9 @@ async def handle_request(self, request: Request):
             top_k=chat_request.top_k if chat_request.top_k else 10,
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
-            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
+            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
+            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             streaming=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py
index 2596fb17a..d2fb0adb1 100644
--- a/comps/cores/proto/api_protocol.py
+++ b/comps/cores/proto/api_protocol.py
@@ -285,8 +285,9 @@ class AudioChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = 1024
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
-    presence_penalty: Optional[float] = 1.03
+    presence_penalty: Optional[float] = 0.0
     frequency_penalty: Optional[float] = 0.0
+    repetition_penalty: Optional[float] = 1.03
     user: Optional[str] = None
 
@@ -345,6 +346,7 @@ class CompletionRequest(BaseModel):
     echo: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0
     frequency_penalty: Optional[float] = 0.0
+    repetition_penalty: Optional[float] = 1.03
     user: Optional[str] = None
     use_beam_search: Optional[bool] = False
     best_of: Optional[int] = None
diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py
index af62f5104..94edba694 100644
--- a/comps/cores/proto/docarray.py
+++ b/comps/cores/proto/docarray.py
@@ -145,11 +145,14 @@ class RerankedDoc(BaseDoc):
 class LLMParamsDoc(BaseDoc):
     model: Optional[str] = None  # for openai and ollama
     query: str
+    max_tokens: int = 1024
     max_new_tokens: int = 1024
     top_k: int = 10
     top_p: float = 0.95
     typical_p: float = 0.95
     temperature: float = 0.01
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
     repetition_penalty: float = 1.03
     streaming: bool = True
 
@@ -179,11 +182,14 @@ def chat_template_must_contain_variables(cls, v):
 
 class LLMParams(BaseDoc):
+    max_tokens: int = 1024
     max_new_tokens: int = 1024
     top_k: int = 10
     top_p: float = 0.95
     typical_p: float = 0.95
     temperature: float = 0.01
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
     repetition_penalty: float = 1.03
     streaming: bool = True
 
diff --git a/comps/llms/faq-generation/tgi/langchain/llm.py b/comps/llms/faq-generation/tgi/langchain/llm.py
index 0b4d70e85..2b6a96060 100644
--- a/comps/llms/faq-generation/tgi/langchain/llm.py
+++ b/comps/llms/faq-generation/tgi/langchain/llm.py
@@ -40,7 +40,7 @@ def llm_generate(input: LLMParamsDoc):
     llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
     llm = HuggingFaceEndpoint(
         endpoint_url=llm_endpoint,
-        max_new_tokens=input.max_new_tokens,
+        max_new_tokens=input.max_tokens,
         top_k=input.top_k,
         top_p=input.top_p,
         typical_p=input.typical_p,
diff --git a/comps/llms/faq-generation/tgi/langchain/requirements.txt b/comps/llms/faq-generation/tgi/langchain/requirements.txt
index fa1548d7c..36257d393 100644
--- a/comps/llms/faq-generation/tgi/langchain/requirements.txt
+++ b/comps/llms/faq-generation/tgi/langchain/requirements.txt
@@ -2,7 +2,10 @@ docarray[full]
 fastapi
 huggingface_hub
 langchain
+langchain-huggingface
+langchain-openai
 langchain_community
+langchainhub
 opentelemetry-api
 opentelemetry-exporter-otlp
 opentelemetry-sdk
diff --git a/comps/llms/summarization/tgi/langchain/llm.py b/comps/llms/summarization/tgi/langchain/llm.py
index 80c5d3924..40150ff81 100644
--- a/comps/llms/summarization/tgi/langchain/llm.py
+++ b/comps/llms/summarization/tgi/langchain/llm.py
@@ -39,7 +39,7 @@ def llm_generate(input: LLMParamsDoc):
     llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
     llm = HuggingFaceEndpoint(
         endpoint_url=llm_endpoint,
-        max_new_tokens=input.max_new_tokens,
+        max_new_tokens=input.max_tokens,
         top_k=input.top_k,
         top_p=input.top_p,
         typical_p=input.typical_p,
diff --git a/comps/llms/text-generation/README.md b/comps/llms/text-generation/README.md
index 9c4af98c1..b31c571a5 100644
--- a/comps/llms/text-generation/README.md
+++ b/comps/llms/text-generation/README.md
@@ -374,7 +374,7 @@ curl http://${your_ip}:8008/v1/chat/completions \
 
 ### 3.3 Consume LLM Service
 
-You can set the following model parameters according to your actual needs, such as `max_new_tokens`, `streaming`.
+You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
 
 The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`.
 
@@ -385,7 +385,7 @@ curl http://${your_ip}:9000/v1/chat/completions \
   -H 'Content-Type: application/json' \
   -d '{
     "query":"What is Deep Learning?",
-    "max_new_tokens":17,
+    "max_tokens":17,
     "top_k":10,
     "top_p":0.95,
     "typical_p":0.95,
@@ -401,7 +401,7 @@ curl http://${your_ip}:9000/v1/chat/completions \
   -H 'Content-Type: application/json' \
   -d '{
     "query":"What is Deep Learning?",
-    "max_new_tokens":17,
+    "max_tokens":17,
     "top_k":10,
     "top_p":0.95,
     "typical_p":0.95,
diff --git a/comps/llms/text-generation/ollama/langchain/README.md b/comps/llms/text-generation/ollama/langchain/README.md
index ec9a293eb..65285bb11 100644
--- a/comps/llms/text-generation/ollama/langchain/README.md
+++ b/comps/llms/text-generation/ollama/langchain/README.md
@@ -70,5 +70,5 @@ docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy
 ## Consume the Ollama Microservice
 
 ```bash
-curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_new_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' -H 'Content-Type: application/json'
+curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' -H 'Content-Type: application/json'
 ```
diff --git a/comps/llms/text-generation/ollama/langchain/llm.py b/comps/llms/text-generation/ollama/langchain/llm.py
index 06d02461c..9830cca15 100644
--- a/comps/llms/text-generation/ollama/langchain/llm.py
+++ b/comps/llms/text-generation/ollama/langchain/llm.py
@@ -25,7 +25,7 @@ def llm_generate(input: LLMParamsDoc):
     ollama = Ollama(
         base_url=ollama_endpoint,
         model=input.model if input.model else model_name,
-        num_predict=input.max_new_tokens,
+        num_predict=input.max_tokens,
         top_k=input.top_k,
         top_p=input.top_p,
         temperature=input.temperature,
diff --git a/comps/llms/text-generation/predictionguard/README.md b/comps/llms/text-generation/predictionguard/README.md
index e506793d9..1045d361d 100644
--- a/comps/llms/text-generation/predictionguard/README.md
+++ b/comps/llms/text-generation/predictionguard/README.md
@@ -29,7 +29,7 @@ curl -X POST http://localhost:9000/v1/chat/completions \
   -d '{
     "model": "Hermes-2-Pro-Llama-3-8B",
     "query": "Tell me a joke.",
-    "max_new_tokens": 100,
+    "max_tokens": 100,
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 50,
@@ -45,7 +45,7 @@ curl -N -X POST http://localhost:9000/v1/chat/completions \
   -d '{
     "model": "Hermes-2-Pro-Llama-3-8B",
     "query": "Tell me a joke.",
-    "max_new_tokens": 100,
+    "max_tokens": 100,
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 50,
diff --git a/comps/llms/text-generation/predictionguard/llm_predictionguard.py b/comps/llms/text-generation/predictionguard/llm_predictionguard.py
index ea70c11bc..d6c9398ce 100644
--- a/comps/llms/text-generation/predictionguard/llm_predictionguard.py
+++ b/comps/llms/text-generation/predictionguard/llm_predictionguard.py
@@ -49,7 +49,7 @@ async def stream_generator():
             for res in client.chat.completions.create(
                 model=input.model,
                 messages=messages,
-                max_tokens=input.max_new_tokens,
+                max_tokens=input.max_tokens,
                 temperature=input.temperature,
                 top_p=input.top_p,
                 top_k=input.top_k,
@@ -69,7 +69,7 @@ async def stream_generator():
         response = client.chat.completions.create(
             model=input.model,
             messages=messages,
-            max_tokens=input.max_new_tokens,
+            max_tokens=input.max_tokens,
             temperature=input.temperature,
             top_p=input.top_p,
             top_k=input.top_k,
diff --git a/comps/llms/text-generation/ray_serve/llm.py b/comps/llms/text-generation/ray_serve/llm.py
index c86025625..1203794cd 100644
--- a/comps/llms/text-generation/ray_serve/llm.py
+++ b/comps/llms/text-generation/ray_serve/llm.py
@@ -47,7 +47,7 @@ def llm_generate(input: LLMParamsDoc):
         openai_api_base=llm_endpoint + "/v1",
         model_name=llm_model,
         openai_api_key=os.getenv("OPENAI_API_KEY", "not_needed"),
-        max_tokens=input.max_new_tokens,
+        max_tokens=input.max_tokens,
         temperature=input.temperature,
         streaming=input.streaming,
         request_timeout=600,
diff --git a/comps/llms/text-generation/tgi/README.md b/comps/llms/text-generation/tgi/README.md
index c6843df4e..37428f3f1 100644
--- a/comps/llms/text-generation/tgi/README.md
+++ b/comps/llms/text-generation/tgi/README.md
@@ -88,7 +88,7 @@ curl http://${your_ip}:9000/v1/health_check\
 
 ### 3.2 Consume LLM Service
 
-You can set the following model parameters according to your actual needs, such as `max_new_tokens`, `streaming`.
+You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
 
 The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`.
 
@@ -96,28 +96,34 @@ The `streaming` parameter determines the format of the data returned by the API.
 # non-streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
   -H 'Content-Type: application/json'
 
 # streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 
-# custom chat template
+# consume with SearchedDoc
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
+  -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
   -H 'Content-Type: application/json'
+```
 
-# consume with SearchedDoc
+For parameters in the above modes, please refer to the [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (except that 'max_new_tokens' is renamed to 'max_tokens').
+
+```bash
+# custom chat template
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"presence_penalty":1.03,"frequency_penalty":0.0,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
   -H 'Content-Type: application/json'
 ```
 
+For parameters in Chat mode, please refer to the [OpenAI API](https://platform.openai.com/docs/api-reference/chat/create).
+
 ### 4. Validated Model
 
 | Model | TGI |
diff --git a/comps/llms/text-generation/tgi/llm.py b/comps/llms/text-generation/tgi/llm.py
index d0ad2dbf1..d96518296 100644
--- a/comps/llms/text-generation/tgi/llm.py
+++ b/comps/llms/text-generation/tgi/llm.py
@@ -69,7 +69,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc]):
             text_generation = await llm.text_generation(
                 prompt=prompt,
                 stream=new_input.streaming,
-                max_new_tokens=new_input.max_new_tokens,
+                max_new_tokens=new_input.max_tokens,
                 repetition_penalty=new_input.repetition_penalty,
                 temperature=new_input.temperature,
                 top_k=new_input.top_k,
@@ -119,7 +119,7 @@ async def stream_generator():
         text_generation = await llm.text_generation(
             prompt=prompt,
             stream=input.streaming,
-            max_new_tokens=input.max_new_tokens,
+            max_new_tokens=input.max_tokens,
             repetition_penalty=input.repetition_penalty,
             temperature=input.temperature,
             top_k=input.top_k,
diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md
index 6db006535..6f41b9fe0 100644
--- a/comps/llms/text-generation/vllm/langchain/README.md
+++ b/comps/llms/text-generation/vllm/langchain/README.md
@@ -196,26 +196,26 @@ curl http://${your_ip}:9000/v1/health_check\
 
 User can set the following model parameters according to needs:
 
-- max_new_tokens: Total output token
+- max_tokens: Total output token
 - streaming(true/false): return text response in streaming mode or non-streaming mode
 
 ```bash
 # 1. Non-streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
   -H 'Content-Type: application/json'
 
 # 2. Streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 
 # 3. Custom chat template with streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
   -H 'Content-Type: application/json'
 
 4. # Chat with SearchedDoc (Retrieval context)
@@ -224,3 +224,5 @@ curl http://${your_ip}:9000/v1/chat/completions \
   -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
   -H 'Content-Type: application/json'
 ```
+
+For parameters, please refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
diff --git a/comps/llms/text-generation/vllm/langchain/llm.py b/comps/llms/text-generation/vllm/langchain/llm.py
index 9c6f1a047..fdb245320 100644
--- a/comps/llms/text-generation/vllm/langchain/llm.py
+++ b/comps/llms/text-generation/vllm/langchain/llm.py
@@ -79,10 +79,12 @@ def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc]):
         llm = VLLMOpenAI(
             openai_api_key="EMPTY",
             openai_api_base=llm_endpoint + "/v1",
-            max_tokens=new_input.max_new_tokens,
+            max_tokens=new_input.max_tokens,
             model_name=model_name,
             top_p=new_input.top_p,
             temperature=new_input.temperature,
+            frequency_penalty=new_input.frequency_penalty,
+            presence_penalty=new_input.presence_penalty,
             streaming=new_input.streaming,
         )
 
@@ -132,10 +134,12 @@ def stream_generator():
         llm = VLLMOpenAI(
             openai_api_key="EMPTY",
             openai_api_base=llm_endpoint + "/v1",
-            max_tokens=input.max_new_tokens,
+            max_tokens=input.max_tokens,
             model_name=model_name,
             top_p=input.top_p,
             temperature=input.temperature,
+            frequency_penalty=input.frequency_penalty,
+            presence_penalty=input.presence_penalty,
             streaming=input.streaming,
         )
diff --git a/comps/llms/text-generation/vllm/langchain/query.sh b/comps/llms/text-generation/vllm/langchain/query.sh
index 5784b13a6..13b63511b 100644
--- a/comps/llms/text-generation/vllm/langchain/query.sh
+++ b/comps/llms/text-generation/vllm/langchain/query.sh
@@ -15,5 +15,5 @@ curl http://${your_ip}:8008/v1/completions \
 ##query microservice
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
   -H 'Content-Type: application/json'
diff --git a/comps/llms/text-generation/vllm/langchain/requirements.txt b/comps/llms/text-generation/vllm/langchain/requirements.txt
index d096a69ca..9bea1261f 100644
--- a/comps/llms/text-generation/vllm/langchain/requirements.txt
+++ b/comps/llms/text-generation/vllm/langchain/requirements.txt
@@ -1,7 +1,11 @@
 docarray[full]
 fastapi
 huggingface_hub
-langchain==0.1.16
+langchain #==0.1.12
+langchain-huggingface
+langchain-openai
+langchain_community
+langchainhub
 opentelemetry-api
 opentelemetry-exporter-otlp
 opentelemetry-sdk
diff --git a/comps/llms/text-generation/vllm/llama_index/README.md b/comps/llms/text-generation/vllm/llama_index/README.md
index 4bd51c812..bf30abdf7 100644
--- a/comps/llms/text-generation/vllm/llama_index/README.md
+++ b/comps/llms/text-generation/vllm/llama_index/README.md
@@ -184,6 +184,6 @@ bash launch_microservice.sh
 ```bash
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
   -H 'Content-Type: application/json'
 ```
diff --git a/comps/llms/text-generation/vllm/llama_index/llm.py b/comps/llms/text-generation/vllm/llama_index/llm.py
index b66348bf3..4c3957bae 100644
--- a/comps/llms/text-generation/vllm/llama_index/llm.py
+++ b/comps/llms/text-generation/vllm/llama_index/llm.py
@@ -47,7 +47,7 @@ def llm_generate(input: LLMParamsDoc):
     llm = OpenAILike(
         api_key="fake",
         api_base=llm_endpoint + "/v1",
-        max_tokens=input.max_new_tokens,
+        max_tokens=input.max_tokens,
         model=model_name,
         top_p=input.top_p,
         temperature=input.temperature,
diff --git a/comps/llms/text-generation/vllm/llama_index/query.sh b/comps/llms/text-generation/vllm/llama_index/query.sh
index 5784b13a6..68beefc4d 100644
--- a/comps/llms/text-generation/vllm/llama_index/query.sh
+++ b/comps/llms/text-generation/vllm/llama_index/query.sh
@@ -15,5 +15,5 @@ curl http://${your_ip}:8008/v1/completions \
 ##query microservice
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
   -H 'Content-Type: application/json'
diff --git a/comps/llms/text-generation/vllm/ray/README.md b/comps/llms/text-generation/vllm/ray/README.md
index 0b9386d4f..f08aa8d24 100644
--- a/comps/llms/text-generation/vllm/ray/README.md
+++ b/comps/llms/text-generation/vllm/ray/README.md
@@ -82,6 +82,8 @@ bash ./launch_microservice.sh
 ```bash
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
   -H 'Content-Type: application/json'
 ```
+
+For parameters, please refer to the [LangChain ChatOpenAI API](https://python.langchain.com/v0.2/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html).
diff --git a/comps/llms/text-generation/vllm/ray/llm.py b/comps/llms/text-generation/vllm/ray/llm.py
index e7efe6527..b11b45fb7 100644
--- a/comps/llms/text-generation/vllm/ray/llm.py
+++ b/comps/llms/text-generation/vllm/ray/llm.py
@@ -39,8 +39,11 @@ def llm_generate(input: LLMParamsDoc):
         openai_api_base=llm_endpoint + "/v1",
         model_name=llm_model,
         openai_api_key=os.getenv("OPENAI_API_KEY", "not_needed"),
-        max_tokens=input.max_new_tokens,
+        max_tokens=input.max_tokens,
+        top_p=input.top_p,
         temperature=input.temperature,
+        frequency_penalty=input.frequency_penalty,
+        presence_penalty=input.presence_penalty,
         streaming=input.streaming,
         request_timeout=600,
     )
diff --git a/comps/llms/text-generation/vllm/ray/query.sh b/comps/llms/text-generation/vllm/ray/query.sh
index 3555751d1..87c3ce4f6 100644
--- a/comps/llms/text-generation/vllm/ray/query.sh
+++ b/comps/llms/text-generation/vllm/ray/query.sh
@@ -11,5 +11,5 @@ curl http://${your_ip}:8006/v1/chat/completions \
 ##query microservice
 curl http://${your_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
   -H 'Content-Type: application/json'
diff --git a/comps/llms/utils/lm-eval/self_hosted_hf.py b/comps/llms/utils/lm-eval/self_hosted_hf.py
index 441605be0..5ea4accc0 100644
--- a/comps/llms/utils/lm-eval/self_hosted_hf.py
+++ b/comps/llms/utils/lm-eval/self_hosted_hf.py
@@ -22,7 +22,7 @@ class LLMCompletionDoc(BaseDoc):
     batched_inputs: List
     logprobs: int = 10
-    max_tokens: int = 0
+    max_new_tokens: int = 0
     temperature: float = 0.0
 
diff --git a/tests/llms/test_llms_text-generation_predictionguard.sh b/tests/llms/test_llms_text-generation_predictionguard.sh
index 39a66bcf4..0faad3ae4 100644
--- a/tests/llms/test_llms_text-generation_predictionguard.sh
+++ b/tests/llms/test_llms_text-generation_predictionguard.sh
@@ -36,7 +36,7 @@ function validate_microservice() {
     llm_service_port=9000
     result=$(http_proxy="" curl http://${ip_address}:${llm_service_port}/v1/chat/completions \
         -X POST \
-        -d '{"model": "Hermes-2-Pro-Llama-3-8B", "query": "What is AI?", "streaming": false, "max_new_tokens": 100, "temperature": 0.7, "top_p": 1.0, "top_k": 50}' \
+        -d '{"model": "Hermes-2-Pro-Llama-3-8B", "query": "What is AI?", "streaming": false, "max_tokens": 100, "temperature": 0.7, "top_p": 1.0, "top_k": 50}' \
         -H 'Content-Type: application/json')
 
     if [[ $result == *"text"* ]]; then
diff --git a/tests/llms/test_llms_text-generation_tgi.sh b/tests/llms/test_llms_text-generation_tgi.sh
index db01b60e2..383535efc 100644
--- a/tests/llms/test_llms_text-generation_tgi.sh
+++ b/tests/llms/test_llms_text-generation_tgi.sh
@@ -48,7 +48,7 @@ function validate_microservice() {
     llm_port=5005
     result=$(http_proxy="" curl http://${ip_address}:${llm_port}/v1/chat/completions \
         -X POST \
-        -d '{"query":"What is Deep Learning?", "max_new_tokens": 128}' \
+        -d '{"query":"What is Deep Learning?", "max_tokens": 128}' \
         -H 'Content-Type: application/json')
     if [[ $result == *"DONE"* ]]; then
         echo "Result correct."
diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
index 291e729a5..6ecf5d2d6 100644
--- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
@@ -92,7 +92,7 @@ function validate_microservice() {
     fi
     result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
         -X POST \
-        -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
+        -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
         -H 'Content-Type: application/json')
     if [[ $result == *"text"* ]]; then
         echo "Result correct."
diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
index 43fa4b8dc..ca67a00f4 100644
--- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
@@ -92,7 +92,7 @@ function validate_microservice() {
     fi
     result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
         -X POST \
-        -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
+        -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \
         -H 'Content-Type: application/json')
     if [[ $result == *"text"* ]]; then
         echo "Result correct."
diff --git a/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh
index e20c1e537..8f9dbec64 100644
--- a/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh
@@ -91,7 +91,7 @@ function validate_microservice() {
     service_port=5032
     result=$(http_proxy="" curl http://${ip_address}:$service_port/v1/chat/completions \
         -X POST \
-        -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+        -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
         -H 'Content-Type: application/json')
     if [[ $result == *"text"* ]]; then
         echo "Result correct."
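As a quick sanity check of the renamed request field, the sketch below posts one request to a text-generation microservice using the new `max_tokens` name together with the added `frequency_penalty`/`presence_penalty` fields. It is illustrative only: the host and port (`localhost:9000`) and the chosen parameter values are assumptions taken from the README examples in this patch, not part of the patch itself.

```python
# Minimal sketch: exercise the renamed/added sampling parameters against a
# running LLM microservice. Endpoint and values are assumptions borrowed from
# the README curl examples above.
import requests

payload = {
    "query": "What is Deep Learning?",
    "max_tokens": 17,            # formerly "max_new_tokens"
    "top_k": 10,
    "top_p": 0.95,
    "temperature": 0.01,
    "frequency_penalty": 0.0,    # new field, default 0.0
    "presence_penalty": 0.0,     # new field, default 0.0
    "repetition_penalty": 1.03,  # no longer read from presence_penalty
    "streaming": False,
}

response = requests.post(
    "http://localhost:9000/v1/chat/completions",  # assumed local deployment
    json=payload,
    timeout=60,
)
print(response.status_code)
print(response.text)
```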