diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py
index 3d5d7effc9de..d0010d68b889 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py
@@ -28,7 +28,6 @@
 )
 
 IGNORE_INDEX = -100
-SYSTEM_TOKEN = "System"
 
 TYPE_INSTRUCTION = {
     'TEXT_TO_VALUE': "",
@@ -51,7 +50,8 @@ def _get_header_conversation_type_mask_role(source, special_tokens):
         if TYPE_INSTRUCTION[data_type] != '':
             conversation = conversation + '\n' + TYPE_INSTRUCTION[data_type]
     mask_role = source.get('mask', 'User')
-    header = f"{special_tokens['system_turn_start']}{SYSTEM_TOKEN}{END_NAME_SIGNAL}{conversation}{END_SIGNAL}"
+    system_token = source.get("system_token", "System")
+    header = f"{special_tokens['system_turn_start']}{system_token}{END_NAME_SIGNAL}{conversation}{END_SIGNAL}"
     conversation = _add_speaker_and_signal(header, source['conversations'], mask_role, data_type, special_tokens)
     return header, conversation, data_type, mask_role