diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index 3d5d7effc9de..d0010d68b889 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -28,7 +28,6 @@ ) IGNORE_INDEX = -100 -SYSTEM_TOKEN = "System" TYPE_INSTRUCTION = { 'TEXT_TO_VALUE': "", @@ -51,7 +50,8 @@ def _get_header_conversation_type_mask_role(source, special_tokens): if TYPE_INSTRUCTION[data_type] != '': conversation = conversation + '\n' + TYPE_INSTRUCTION[data_type] mask_role = source.get('mask', 'User') - header = f"{special_tokens['system_turn_start']}{SYSTEM_TOKEN}{END_NAME_SIGNAL}{conversation}{END_SIGNAL}" + system_token = source.get("system_token", "System") + header = f"{special_tokens['system_turn_start']}{system_token}{END_NAME_SIGNAL}{conversation}{END_SIGNAL}" conversation = _add_speaker_and_signal(header, source['conversations'], mask_role, data_type, special_tokens) return header, conversation, data_type, mask_role