Update punctuator model to more powerful `1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase`.
zh-plus · Jun 26, 2023 · 8edaa04 · 8edaa04
1 parent f25de37
commit 8edaa04
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 4 deletions.
diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py
@@ -157,6 +157,7 @@ def run(self, paths, target_lang='zh-cn', prompter='base_trans', audio_type='Ani
         Firstly, transcribe the audios one-by-one. At the same time, translation threads are created and waiting for
         the transcription results. After all the transcriptions are done, the translation threads will start to
         translate the transcribed texts.
+        TODO: Abstract audio_type, synopsis_path, background to Context class.
         :param paths: Audio/Video paths, can be a list or a single path.
         :param target_lang: Target language, default to Mandarin Chinese.
         :param prompter: Currently, only `base_trans` is supported.

diff --git a/openlrc/transcribe.py b/openlrc/transcribe.py
@@ -56,18 +56,19 @@ def sentence_align(transcribe_result):
 
         :return A dict with key 'sentences' and value a list of dict with key 'text', 'start_word', 'end_word'.
         """
-        pcs_model = PunctCapSegModelONNX.from_pretrained('pcs_47lang')
+        pcs_model: PunctCapSegModelONNX = PunctCapSegModelONNX.from_pretrained(
+            "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
+        )
         punctuations = '.,?？，。、・।؟;።፣፧،'
 
-        sentences_list = pcs_model.infer([segment['text'] for segment in transcribe_result['segments']])
+        sentences_list = pcs_model.infer([segment['text'] for segment in transcribe_result['segments']], apply_sbd=True)
 
         pcs_result = {'sentences': []}
         for segment, sentences in zip(transcribe_result['segments'], sentences_list):
             last_end_idx = 0
             for sentence in sentences:
                 sentence = sentence.lower()
-                stc_split = re.split(f'[{punctuations}]|<unk>', sentence)
-                # TODO: Recover <unk> from segment['text']
+                stc_split = re.split(f'[{punctuations}]', sentence)
 
                 # Remove empty string
                 stc_split = [split for split in stc_split if split]