Use stop sequences when building Context.

zh-plus · Jun 26, 2024 · commit 4f247fe
1 parent 5c5f86e
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 9 deletions.
diff --git a/openlrc/agents.py b/openlrc/agents.py
@@ -143,7 +143,9 @@ def build_context(self, texts, title='', glossary: Optional[dict] = None) -> str
             {'role': 'system', 'content': self.prompter.system()},
             {'role': 'user', 'content': self.prompter.user(text_content, title=title, given_glossary=glossary)},
         ]
-        resp = self.chatbot.message(messages_list, output_checker=self.prompter.check_format)[0]
+        resp = self.chatbot.message(
+            messages_list, stop_sequences=[self.prompter.stop_sequence], output_checker=self.prompter.check_format
+        )[0]
         context = self.chatbot.get_content(resp)
 
         context_pool = [context]

diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py
@@ -188,7 +188,7 @@ def consumer_worker(self, transcription_queue, target_lang, skip_trans, bilingua
             subtitle_path = getattr(final_subtitle, f'to_{subtitle_format}')()
             result_path = subtitle_path.parents[1] / subtitle_path.name.replace(f'_preprocessed.{subtitle_format}',
                                                                                 f'.{subtitle_format}')
-            shutil.copy(subtitle_path, result_path)
+            shutil.move(subtitle_path, result_path)
 
             if not skip_trans and bilingual_sub:
                 bilingual_subtitle = BilingualSubtitle.from_preprocessed(
@@ -199,14 +199,14 @@ def consumer_worker(self, transcription_queue, target_lang, skip_trans, bilingua
                 # TODO: consider the edge case (audio file name contains _preprocessed)
                 getattr(bilingual_subtitle, f'to_{subtitle_format}')()
                 bilingual_lrc_path = bilingual_subtitle.filename.with_suffix(bilingual_subtitle.suffix)
-                shutil.copy(bilingual_lrc_path, result_path.parent / bilingual_lrc_path.name)
+                shutil.move(bilingual_lrc_path, result_path.parent / bilingual_lrc_path.name)
 
                 non_translated_subtitle = transcribed_opt_sub
                 optimizer = SubtitleOptimizer(non_translated_subtitle)
                 optimizer.extend_time()  # Extend 0.5s like what translated do
                 getattr(non_translated_subtitle, f'to_{subtitle_format}')()
                 non_translated_lrc_path = non_translated_subtitle.filename.with_suffix(non_translated_subtitle.suffix)
-                shutil.copy(
+                shutil.move(
                     non_translated_lrc_path,
                     result_path.parent / subtitle_path.name.replace(
                         f'_preprocessed.{subtitle_format}',

diff --git a/openlrc/prompter.py b/openlrc/prompter.py
@@ -190,6 +190,8 @@ def __init__(self, src_lang, target_lang):
         self.src_lang_display = Language.get(src_lang).display_name('en')
         self.target_lang_display = Language.get(target_lang).display_name('en')
 
+        self.stop_sequence = '<*--END-OF-CONTEXT--*>'
+
     def system(self):
         return f'''You are a context reviewer responsible for ensuring the consistency and accuracy of translations between two languages. Your task involves reviewing and providing necessary contextual information for translations.
 
@@ -221,7 +223,6 @@ def system(self):
 Then, they prepare to start their investigation.
 
 Example Output:
-
 ### Glossary:
 - suspect: 嫌疑人
 - uptown: 市中心
@@ -238,6 +239,8 @@ def system(self):
 
 ### Target Audience:
 The target audience is adult viewers with an interest in crime dramas. They are likely to be familiar with police procedurals and enjoy suspenseful storytelling.
+{self.stop_sequence}
+
 </example>
 
 Note:
@@ -247,7 +250,8 @@ def system(self):
 DO NOT include any translation segment.
 Sample Translation is NOT required for this task.
 You should adhere to the same format as the previous response, add or delete section is not allowed.
-Remember to include the glossary, characters, summary, tone and style, and target audience sections in your response.'''
+Remember to include the glossary, characters, summary, tone and style, and target audience sections in your response.
+Remember to add {self.stop_sequence} after the generated contexts.'''
 
     def user(self, text, title='', given_glossary: Optional[dict] = None):
         glossary_text = f'Given glossary: {given_glossary}' if given_glossary else ''

diff --git a/tests/test_validators.py b/tests/test_validators.py
@@ -53,14 +53,14 @@ class TestAtomicTranslateValidator(unittest.TestCase):
 
     def test_validate_returns_true_when_generated_content_matches_target_language(self):
         validator = AtomicTranslateValidator(target_lang='en')
-        user_input = "Hello"
-        generated_content = "Hello"
+        user_input = "你有什么问题？"
+        generated_content = "What's your problem?"
 
         result = validator.validate(user_input, generated_content)
         self.assertTrue(result)
 
     def test_validate_returns_false_when_generated_content_not_matches_target_language(self):
-        validator = AtomicTranslateValidator(target_lang='en')
+        validator = AtomicTranslateValidator(target_lang='cn-zh')
         user_input = "Hello"
         generated_content = "你好"