Add Flush to VAD so that the last segment can be detected. (#1099)

k2-fsa · Jul 9, 2024 · c2cc9de
1 parent 3e4307e
commit c2cc9de
Show file tree

Hide file tree

Showing 35 changed files with 237 additions and 29 deletions.
diff --git a/.github/workflows/dot-net.yaml b/.github/workflows/dot-net.yaml
@@ -52,11 +52,6 @@ jobs:
           cmake --build . --target install --config Release
           rm -rf install/pkgconfig
 
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-${{ matrix.arch }}
-          path: ./build/install/lib/
-
       - name: Create tar file
         shell: bash
         run: |
@@ -72,6 +67,11 @@ jobs:
           ls -lh *.tar.bz2
           mv *.tar.bz2 ../
 
+      - uses: actions/upload-artifact@v4
+        with:
+          name: windows-${{ matrix.arch }}
+          path: ./*.tar.bz2
+
       # https://huggingface.co/docs/hub/spaces-github-actions
       - name: Publish to huggingface
         if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
@@ -88,7 +88,9 @@ jobs:
 
             rm -rf huggingface
             export GIT_CLONE_PROTECTION_ACTIVE=false
-            GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+
+            git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
 
             cd huggingface
             mkdir -p windows-for-dotnet

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 1.10.12
+
+* Add Flush to VAD so that the last speech segment can be detected. See also
+  https://github.com/k2-fsa/sherpa-onnx/discussions/1077#discussioncomment-9979740
+
 ## 1.10.11
 
 * Support the iOS platform for iOS.

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -10,8 +10,8 @@ project(sherpa-onnx)
 # Remember to update
 # ./nodejs-addon-examples
 # ./dart-api-examples/
-# ./sherpa-onnx/flutter/CHANGELOG.md
-set(SHERPA_ONNX_VERSION "1.10.11")
+# ./CHANGELOG.md
+set(SHERPA_ONNX_VERSION "1.10.12")
 
 # Disable warning about
 #

diff --git a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart
@@ -93,6 +93,28 @@ void main(List<String> arguments) async {
     }
   }
 
+  vad.flush();
+  while (!vad.isEmpty()) {
+    final stream = recognizer.createStream();
+    final segment = vad.front();
+    stream.acceptWaveform(
+        samples: segment.samples, sampleRate: waveData.sampleRate);
+    recognizer.decode(stream);
+
+    final result = recognizer.getResult(stream);
+
+    final startTime = segment.start * 1.0 / waveData.sampleRate;
+    final duration = segment.samples.length * 1.0 / waveData.sampleRate;
+    final stopTime = startTime + duration;
+    if (result.text != '') {
+      print(
+          '${startTime.toStringAsPrecision(4)} -- ${stopTime.toStringAsPrecision(4)}: ${result.text}');
+    }
+
+    stream.free();
+    vad.pop();
+  }
+
   vad.free();
   recognizer.free();
 }
diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml
@@ -10,7 +10,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml
@@ -11,7 +11,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml
@@ -8,7 +8,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/vad/bin/vad.dart b/dart-api-examples/vad/bin/vad.dart
@@ -65,6 +65,12 @@ void main(List<String> arguments) async {
     }
   }
 
+  vad.flush();
+  while (!vad.isEmpty()) {
+    allSamples.add(vad.front().samples);
+    vad.pop();
+  }
+
   vad.free();
 
   final s = Float32List.fromList(allSamples.expand((x) => x).toList());

diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs b/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs
@@ -57,6 +57,26 @@ static void Main(string[] args)
         }
       }
     }
+
+    vad.Flush();
+
+    while (!vad.IsEmpty()) {
+      SpeechSegment segment = vad.Front();
+      float startTime = segment.Start / (float)sampleRate;
+      float duration = segment.Samples.Length / (float)sampleRate;
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sampleRate, segment.Samples);
+      recognizer.Decode(stream);
+      String text = stream.Result.Text;
+
+      if (!String.IsNullOrEmpty(text)) {
+        Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
+            String.Format("{0:0.00}", startTime+duration), text);
+      }
+
+      vad.Pop();
+    }
   }
 }
 
diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none'
 
-version: 1.10.11
+version: 1.10.12
 
 topics:
   - speech-recognition
@@ -30,7 +30,7 @@ dependencies:
   record: ^5.1.0
   url_launcher: ^6.2.6
 
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   # sherpa_onnx:
     # path: ../../flutter/sherpa_onnx
 

diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml
@@ -17,7 +17,7 @@ dependencies:
   cupertino_icons: ^1.0.6
   path_provider: ^2.1.3
   path: ^1.9.0
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   url_launcher: ^6.2.6
   audioplayers: ^5.0.0
 

diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
@@ -491,6 +491,12 @@ typedef SherpaOnnxVoiceActivityDetectorResetNative = Void Function(
 typedef SherpaOnnxVoiceActivityDetectorReset = void Function(
     Pointer<SherpaOnnxVoiceActivityDetector>);
 
+typedef SherpaOnnxVoiceActivityDetectorFlushNative = Void Function(
+    Pointer<SherpaOnnxVoiceActivityDetector>);
+
+typedef SherpaOnnxVoiceActivityDetectorFlush = void Function(
+    Pointer<SherpaOnnxVoiceActivityDetector>);
+
 typedef SherpaOnnxVoiceActivityDetectorFrontNative
     = Pointer<SherpaOnnxSpeechSegment> Function(
         Pointer<SherpaOnnxVoiceActivityDetector>);
@@ -779,6 +785,8 @@ class SherpaOnnxBindings {
 
   static SherpaOnnxVoiceActivityDetectorReset? voiceActivityDetectorReset;
 
+  static SherpaOnnxVoiceActivityDetectorFlush? voiceActivityDetectorFlush;
+
   static SherpaOnnxCreateCircularBuffer? createCircularBuffer;
 
   static SherpaOnnxDestroyCircularBuffer? destroyCircularBuffer;
@@ -1036,6 +1044,11 @@ class SherpaOnnxBindings {
             'SherpaOnnxVoiceActivityDetectorReset')
         .asFunction();
 
+    voiceActivityDetectorFlush ??= dynamicLibrary
+        .lookup<NativeFunction<SherpaOnnxVoiceActivityDetectorFlushNative>>(
+            'SherpaOnnxVoiceActivityDetectorFlush')
+        .asFunction();
+
     createCircularBuffer ??= dynamicLibrary
         .lookup<NativeFunction<SherpaOnnxCreateCircularBufferNative>>(
             'SherpaOnnxCreateCircularBuffer')

diff --git a/flutter/sherpa_onnx/lib/src/vad.dart b/flutter/sherpa_onnx/lib/src/vad.dart
@@ -207,6 +207,10 @@ class VoiceActivityDetector {
     SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr);
   }
 
+  void flush() { // Forwards to the native VAD flush, added so the last speech segment can be detected.
+    SherpaOnnxBindings.voiceActivityDetectorFlush?.call(ptr); // `?.` skips the call if the binding is still null
+  }
+
   Pointer<SherpaOnnxVoiceActivityDetector> ptr;
   final VadModelConfig config;
 }
diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
-version: 1.10.11
+version: 1.10.12
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 
@@ -30,19 +30,19 @@ dependencies:
   flutter:
     sdk: flutter
 
-  sherpa_onnx_android: ^1.10.11
+  sherpa_onnx_android: ^1.10.12
     # path: ../sherpa_onnx_android
 
-  sherpa_onnx_macos: ^1.10.11
+  sherpa_onnx_macos: ^1.10.12
     # path: ../sherpa_onnx_macos
 
-  sherpa_onnx_linux: ^1.10.11
+  sherpa_onnx_linux: ^1.10.12
     # path: ../sherpa_onnx_linux
     #
-  sherpa_onnx_windows: ^1.10.11
+  sherpa_onnx_windows: ^1.10.12
     # path: ../sherpa_onnx_windows
 
-  sherpa_onnx_ios: ^1.10.11
+  sherpa_onnx_ios: ^1.10.12
   # sherpa_onnx_ios:
     # path: ../sherpa_onnx_ios
 

diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
@@ -7,7 +7,7 @@
 # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_ios'
-  s.version          = '1.10.11'
+  s.version          = '1.10.12'
   s.summary          = 'A new Flutter FFI plugin project.'
   s.description      = <<-DESC
 A new Flutter FFI plugin project.

diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_macos'
-  s.version          = '1.10.11'
+  s.version          = '1.10.12'
   s.summary          = 'sherpa-onnx Flutter FFI plugin project.'
   s.description      = <<-DESC
 sherpa-onnx Flutter FFI plugin project.

diff --git a/java-api-examples/VadNonStreamingParaformer.java b/java-api-examples/VadNonStreamingParaformer.java
@@ -98,6 +98,25 @@ public static void main(String[] args) {
       }
     }
 
+    vad.flush();
+    while (!vad.empty()) {
+      SpeechSegment segment = vad.front();
+      float startTime = segment.getStart() / 16000.0f;
+      float duration = segment.getSamples().length / 16000.0f;
+
+      OfflineStream stream = recognizer.createStream();
+      stream.acceptWaveform(segment.getSamples(), 16000);
+      recognizer.decode(stream);
+      String text = recognizer.getResult(stream).getText();
+      stream.release();
+
+      if (!text.isEmpty()) {
+        System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
+      }
+
+      vad.pop();
+    }
+
     vad.release();
     recognizer.release();
   }

diff --git a/java-api-examples/VadRemoveSilence.java b/java-api-examples/VadRemoveSilence.java
@@ -59,6 +59,16 @@ public static void main(String[] args) {
       }
     }
 
+    vad.flush();
+    while (!vad.empty()) {
+
+      // if you want to get the starting time of this segment, you can use
+      /* float startTime = vad.front().getStart() / 16000.0f; */
+
+      segments.add(vad.front().getSamples());
+      vad.pop();
+    }
+
     // get total number of samples
     int n = 0;
     for (float[] s : segments) {

diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
   "dependencies": {
-    "sherpa-onnx-node": "^1.10.6"
+    "sherpa-onnx-node": "^1.10.12"
   }
 }
diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py
@@ -105,6 +105,12 @@ def main():
             speech_samples.extend(vad.front.samples)
             vad.pop()
 
+    vad.flush()
+
+    while not vad.empty():
+        speech_samples.extend(vad.front.samples)
+        vad.pop()
+
     speech_samples = np.array(speech_samples, dtype=np.float32)
 
     sf.write(args.output, speech_samples, samplerate=sample_rate)

diff --git a/scripts/dart/sherpa-onnx-pubspec.yaml b/scripts/dart/sherpa-onnx-pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
-version: 1.10.6
+version: 1.10.12
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 

diff --git a/scripts/dotnet/VoiceActivityDetector.cs b/scripts/dotnet/VoiceActivityDetector.cs
@@ -53,6 +53,11 @@ public void Reset()
             SherpaOnnxVoiceActivityDetectorReset(_handle.Handle);
         }
 
+        public void Flush()
+        {
+            SherpaOnnxVoiceActivityDetectorFlush(_handle.Handle); // native flush, added so the last speech segment can be detected
+        }
+
         public void Dispose()
         {
             Cleanup();
@@ -106,5 +111,7 @@ private void Cleanup()
         [DllImport(Dll.Filename)]
         private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle);
 
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxVoiceActivityDetectorFlush(IntPtr handle);
     }
 }
diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go
@@ -856,6 +856,10 @@ func (vad *VoiceActivityDetector) Reset() {
 	C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
 }
 
+func (vad *VoiceActivityDetector) Flush() { // Flush forwards to the C API flush, added so the last speech segment can be detected.
+	C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl)
+}
+
 // Spoken language identification
 
 type SpokenLanguageIdentificationWhisperConfig struct {