Minor fixes and improvements #32

Merged · 6 commits · Feb 21, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -116,7 +116,7 @@ out = model.transcribe_with_vad(files,
initial_prompts=initial_prompts,
batch_size=32)

-print(out[0][0])
+print(out[0][0]) # Print first utterance for first file
"""
[Console Output]

@@ -152,7 +152,7 @@ out = model.transcribe_with_vad(files,
initial_prompts=initial_prompts,
batch_size=24)

-print(out[0][0])
+print(out[0][0]) # Print first utterance for first file
"""
[Console Output]

115 changes: 115 additions & 0 deletions docs.md
@@ -1,5 +1,12 @@
# Detailed Usage and Documentation

1. [Basic Usage](#basic-usage)
1. [Using Custom VAD Model](#using-custom-vad-model)
1. [Run Without VAD Model](#run-without-vad-model)
1. [Passing Custom Model Configuration](#passing-custom-model-configuration)
1. [Return Word-Alignments](#return-word-alignments)
1. [Write Transcripts To a File](#write-transcripts-to-a-file)

## Basic Usage

Load WhisperS2T with the CTranslate2 backend and default parameters:
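The model setup is collapsed in this diff; a minimal sketch of the elided part, following the pattern shown later in this PR (the file name is an assumed example):

```py
import whisper_s2t

model = whisper_s2t.load_model(model_identifier="large-v2", backend='CTranslate2')

files = ['sample_1.wav'] # assumed example input file
lang_codes = ['en']
tasks = ['transcribe']
initial_prompts = [None]
```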
@@ -20,6 +27,17 @@ out = model.transcribe_with_vad(files,
initial_prompts=initial_prompts, # optional prompting (currently only supported by the CTranslate2 backend)
batch_size=16)

print(out[0][0]) # Print first utterance for first file
"""
[Console Output]

{'text': "Let's bring in Phil Mackie who is there at the palace. We're looking at Teresa and Philip May. Philip, can you see how he's being transferred from the helicopters? It looks like, as you said, the beast. It's got its headlights on because the sun is beginning to set now, certainly sinking behind some clouds. It's about a quarter of a mile away down the Grand Drive",
'avg_logprob': -0.25426941679184695,
'no_speech_prob': 8.147954940795898e-05,
'start_time': 0.0,
'end_time': 24.8}
"""

```

Switch to the HuggingFace backend (by default it uses FlashAttention2). Note: FlashAttention2 only works on Ampere/Hopper NVIDIA GPUs.
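The exact call is collapsed in this diff; a one-line sketch of the switch, where the backend string `'HuggingFace'` is an assumption not shown in this hunk:

```py
model = whisper_s2t.load_model(model_identifier="large-v2", backend='HuggingFace')
```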
@@ -37,6 +55,38 @@ vad_model = CustomVAD()
model = whisper_s2t.load_model(model_identifier="large-v2", backend='CTranslate2', vad_model=vad_model)
```

## Run Without VAD Model

For some languages the VAD model can give poor performance. In those cases, it's better to disable VAD.

```py
out = model.transcribe(files,
lang_codes=lang_codes, # pass lang_codes for each file
tasks=tasks, # pass transcribe/translate
initial_prompts=initial_prompts, # optional prompting (currently only supported by the CTranslate2 backend)
batch_size=24)

print(out[0][0])
"""
{'text': "Let's bring in Phil Mackie who is there at the palace. We're looking at Theresa and Philip May. Philip, can you see how he's being transferred from the helicopters? It looks like, as you said, the beast. It's got its headlights on because the sun is beginning to set now, certainly sinking behind some clouds. It's about a quarter of a mile away down the Grand Drive leading up into the courtyard. So you've seen the pictures there of the Prime Minister",
'avg_logprob': -0.25300603330135346,
'no_speech_prob': 1.9311904907226562e-05,
'start_time': 0,
'end_time': 29.0}
"""
```

VAD parameters can also be tweaked using:

```py
speech_segmenter_options = {
'eos_thresh': 0.1,
'bos_thresh': 0.1,
}

model = whisper_s2t.load_model(speech_segmenter_options=speech_segmenter_options)
```
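Going by their names, `eos_thresh` and `bos_thresh` are presumably the beginning-of-speech and end-of-speech probability thresholds used by the speech segmenter; treat the values above as illustrative rather than recommended settings.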

## Passing Custom Model Configuration

Custom model configs can be passed as keyword arguments when loading the model:
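The full example is collapsed in this diff; a minimal sketch of the idea, with the `model_kwargs` contents assumed for illustration (`asr_options` is the only key confirmed elsewhere in this PR):

```py
model_kwargs = {
    'asr_options': {'word_timestamps': True}, # assumed example configuration
}

model = whisper_s2t.load_model(model_identifier="large-v2", **model_kwargs)
```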
@@ -57,4 +107,69 @@ OR to update the configs after loading the model:

```py
model.update_params(model_kwargs)
```

## Return Word-Alignments

Word alignments are supported only by the CTranslate2 and TensorRT backends.

```py
import whisper_s2t

model = whisper_s2t.load_model(model_identifier="large-v2", asr_options={'word_timestamps': True})

files = ['sample_1.wav']
lang_codes = ['en']
tasks = ['transcribe']
initial_prompts = [None]

out = model.transcribe_with_vad(files,
lang_codes=lang_codes, # pass lang_codes for each file
tasks=tasks, # pass transcribe/translate
initial_prompts=initial_prompts, # optional prompting (currently only supported by the CTranslate2 backend)
batch_size=24)

print(out[0][0]) # Print first utterance for first file
"""
[Console Output]

{'text': "Let's bring in Phil Mackie who is there at the palace. We're looking at Teresa and Philip May. Philip, can you see how he's being transferred from the helicopters? It looks like, as you said, the beast. It's got its headlights on because the sun is beginning to set now, certainly sinking behind some clouds. It's about a quarter of a mile away down the Grand Drive",
'avg_logprob': -0.2544597674565143,
'no_speech_prob': 8.213520050048828e-05,
'word_timestamps': [{'word': "Let's",
'start': 0.0,
'end': 0.24,
'prob': 0.63},
{'word': 'bring', 'start': 0.24, 'end': 0.4, 'prob': 0.96},
{'word': 'in', 'start': 0.4, 'end': 0.52, 'prob': 0.71},
{'word': 'Phil', 'start': 0.52, 'end': 0.66, 'prob': 0.46},
{'word': 'Mackie', 'start': 0.66, 'end': 1.02, 'prob': 0.27},
{'word': 'who', 'start': 1.02, 'end': 1.2, 'prob': 0.01},
  ...
]}
"""
```

## Write Transcripts To a File

Predicted transcripts can be exported to the following output formats: `vtt`, `srt`, `json`, `tsv`.

```py
files = ['file.wav']
lang_codes = ['en']
tasks = ['transcribe']
initial_prompts = [None]

out = model.transcribe_with_vad(files,
lang_codes=lang_codes,
tasks=tasks,
initial_prompts=initial_prompts,
batch_size=24)

whisper_s2t.write_outputs(out, format='vtt', ip_files=files, save_dir="./save_dir") # Save outputs

op_files = ['./save_dir/file.vtt'] # hypothetical output paths, one per input file
whisper_s2t.write_outputs(out, format='vtt', op_files=op_files) # custom output file names
```
1 change: 1 addition & 0 deletions install_tensorrt.sh
@@ -18,4 +18,5 @@ rm -rf /tmp/mpi4py*

echo ""
echo "###########################[ Installing TensorRT-LLM ]###########################"
+pip3 install -U torch==2.1.2
pip3 install tensorrt_llm==0.8.0.dev2024012301 --extra-index-url https://pypi.nvidia.com
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,6 +1,8 @@
tqdm
rich
torch
+numpy
+platformdirs
ctranslate2
tokenizers
huggingface-hub
35 changes: 18 additions & 17 deletions whisper_s2t/backends/__init__.py
@@ -22,6 +22,17 @@ def encode(self, text):
return [0]


+def fix_batch_param(param, default_value, N):
+    if param is None:
+        param = N*[default_value]
+    elif type(param) == type(default_value):
+        param = N*[param]
+    elif len(param) != N:
+        param = N*[param[0]]
+
+    return param
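# A quick illustration of the broadcast behavior (hypothetical calls, not part of the commit):
#   fix_batch_param(None, 'en', 3)          -> ['en', 'en', 'en']
#   fix_batch_param('fr', 'en', 3)          -> ['fr', 'fr', 'fr']
#   fix_batch_param(['en', 'fr'], 'en', 3)  -> ['en', 'en', 'en']  (length mismatch falls back to param[0])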


class WhisperModel(ABC):
def __init__(self,
tokenizer=None,
@@ -119,14 +130,9 @@ def transcribe(self, audio_files, lang_codes=None, tasks=None, initial_prompts=N

# return responses

-if lang_codes == None:
-    lang_codes = len(audio_files)*['en']
-
-if tasks == None:
-    tasks = len(audio_files)*['transcribe']
-
-if initial_prompts == None:
-    initial_prompts = len(audio_files)*[None]
+lang_codes = fix_batch_param(lang_codes, 'en', len(audio_files))
+tasks = fix_batch_param(tasks, 'transcribe', len(audio_files))
+initial_prompts = fix_batch_param(initial_prompts, None, len(audio_files))

responses = [[] for _ in audio_files]

@@ -151,15 +157,10 @@ def transcribe(self, audio_files, lang_codes=None, tasks=None, initial_prompts=N

@torch.no_grad()
def transcribe_with_vad(self, audio_files, lang_codes=None, tasks=None, initial_prompts=None, batch_size=8):

-if lang_codes == None:
-    lang_codes = len(audio_files)*['en']
-
-if tasks == None:
-    tasks = len(audio_files)*['transcribe']
-
-if initial_prompts == None:
-    initial_prompts = len(audio_files)*[None]
+lang_codes = fix_batch_param(lang_codes, 'en', len(audio_files))
+tasks = fix_batch_param(tasks, 'transcribe', len(audio_files))
+initial_prompts = fix_batch_param(initial_prompts, None, len(audio_files))

responses = [[] for _ in audio_files]

2 changes: 1 addition & 1 deletion whisper_s2t/backends/tensorrt/model.py
@@ -243,7 +243,7 @@ def generate_segment_batched(self, features, prompts, seq_lens, seg_metadata):
response.append({'text': texts[idx].strip()})

if self.asr_options['word_timestamps']:
-text_tokens = [x.sequences_ids[0]+[self.tokenizer.eot] for x in result]
+text_tokens = [[_t for _t in x[0] if _t < self.tokenizer.eot]+[self.tokenizer.eot] for x in result]
sot_seqs = [tuple(_[-4:]) for _ in prompts]
word_timings = self.align_words(features, texts, text_tokens, sot_seqs, seq_lens, seg_metadata)
