Add Flush to VAD so that the last segment can be detected. (#1099)

csukuangfj · web-flow · commit c2cc9dec5866 · 2024-07-09T16:15:56.000+08:00
diff --git a/.github/workflows/dot-net.yaml b/.github/workflows/dot-net.yaml
@@ -52,11 +52,6 @@ jobs:
           cmake --build . --target install --config Release
           rm -rf install/pkgconfig
 
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-${{ matrix.arch }}
-          path: ./build/install/lib/
-
       - name: Create tar file
         shell: bash
         run: |
@@ -72,6 +67,11 @@ jobs:
           ls -lh *.tar.bz2
           mv *.tar.bz2 ../
 
+      - uses: actions/upload-artifact@v4
+        with:
+          name: windows-${{ matrix.arch }}
+          path: ./*.tar.bz2
+
       # https://huggingface.co/docs/hub/spaces-github-actions
       - name: Publish to huggingface
         if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
@@ -88,7 +88,9 @@ jobs:
 
             rm -rf huggingface
             export GIT_CLONE_PROTECTION_ACTIVE=false
-            GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+
+            git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
 
             cd huggingface
             mkdir -p windows-for-dotnet
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 1.10.12
+
+* Add Flush to VAD so that the last speech segment can be detected. See also
+  https://github.com/k2-fsa/sherpa-onnx/discussions/1077#discussioncomment-9979740
+
 ## 1.10.11
 
 * Support the iOS platform for Flutter.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -10,8 +10,8 @@ project(sherpa-onnx)
 # Remember to update
 # ./nodejs-addon-examples
 # ./dart-api-examples/
-# ./sherpa-onnx/flutter/CHANGELOG.md
-set(SHERPA_ONNX_VERSION "1.10.11")
+# ./CHANGELOG.md
+set(SHERPA_ONNX_VERSION "1.10.12")
 
 # Disable warning about
 #
diff --git a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart
@@ -93,6 +93,28 @@ void main(List<String> arguments) async {
     }
   }
 
+  vad.flush();
+  while (!vad.isEmpty()) {
+    final stream = recognizer.createStream();
+    final segment = vad.front();
+    stream.acceptWaveform(
+        samples: segment.samples, sampleRate: waveData.sampleRate);
+    recognizer.decode(stream);
+
+    final result = recognizer.getResult(stream);
+
+    final startTime = segment.start * 1.0 / waveData.sampleRate;
+    final duration = segment.samples.length * 1.0 / waveData.sampleRate;
+    final stopTime = startTime + duration;
+    if (result.text != '') {
+      print(
+          '${startTime.toStringAsPrecision(4)} -- ${stopTime.toStringAsPrecision(4)}: ${result.text}');
+    }
+
+    stream.free();
+    vad.pop();
+  }
+
   vad.free();
   recognizer.free();
 }
diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml
@@ -10,7 +10,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 
diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml
@@ -11,7 +11,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 
diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml
@@ -8,7 +8,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 
diff --git a/dart-api-examples/vad/bin/vad.dart b/dart-api-examples/vad/bin/vad.dart
@@ -65,6 +65,12 @@ void main(List<String> arguments) async {
     }
   }
 
+  vad.flush();
+  while (!vad.isEmpty()) {
+    allSamples.add(vad.front().samples);
+    vad.pop();
+  }
+
   vad.free();
 
   final s = Float32List.fromList(allSamples.expand((x) => x).toList());
diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   path: ^1.9.0
   args: ^2.5.0
 
diff --git a/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs b/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs
@@ -57,6 +57,26 @@ static void Main(string[] args)
         }
       }
     }
+
+    vad.Flush();
+
+    while (!vad.IsEmpty()) {
+      SpeechSegment segment = vad.Front();
+      float startTime = segment.Start / (float)sampleRate;
+      float duration = segment.Samples.Length / (float)sampleRate;
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sampleRate, segment.Samples);
+      recognizer.Decode(stream);
+      String text = stream.Result.Text;
+
+      if (!String.IsNullOrEmpty(text)) {
+        Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
+            String.Format("{0:0.00}", startTime+duration), text);
+      }
+
+      vad.Pop();
+    }
   }
 }
 
diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none'
 
-version: 1.10.11
+version: 1.10.12
 
 topics:
   - speech-recognition
@@ -30,7 +30,7 @@ dependencies:
   record: ^5.1.0
   url_launcher: ^6.2.6
 
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   # sherpa_onnx:
     # path: ../../flutter/sherpa_onnx
 
diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml
@@ -17,7 +17,7 @@ dependencies:
   cupertino_icons: ^1.0.6
   path_provider: ^2.1.3
   path: ^1.9.0
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   url_launcher: ^6.2.6
   audioplayers: ^5.0.0
 
diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
@@ -491,6 +491,12 @@ typedef SherpaOnnxVoiceActivityDetectorResetNative = Void Function(
 typedef SherpaOnnxVoiceActivityDetectorReset = void Function(
     Pointer<SherpaOnnxVoiceActivityDetector>);
 
+typedef SherpaOnnxVoiceActivityDetectorFlushNative = Void Function(
+    Pointer<SherpaOnnxVoiceActivityDetector>);
+
+typedef SherpaOnnxVoiceActivityDetectorFlush = void Function(
+    Pointer<SherpaOnnxVoiceActivityDetector>);
+
 typedef SherpaOnnxVoiceActivityDetectorFrontNative
     = Pointer<SherpaOnnxSpeechSegment> Function(
         Pointer<SherpaOnnxVoiceActivityDetector>);
@@ -779,6 +785,8 @@ class SherpaOnnxBindings {
 
   static SherpaOnnxVoiceActivityDetectorReset? voiceActivityDetectorReset;
 
+  static SherpaOnnxVoiceActivityDetectorFlush? voiceActivityDetectorFlush;
+
   static SherpaOnnxCreateCircularBuffer? createCircularBuffer;
 
   static SherpaOnnxDestroyCircularBuffer? destroyCircularBuffer;
@@ -1036,6 +1044,11 @@ class SherpaOnnxBindings {
             'SherpaOnnxVoiceActivityDetectorReset')
         .asFunction();
 
+    voiceActivityDetectorFlush ??= dynamicLibrary
+        .lookup<NativeFunction<SherpaOnnxVoiceActivityDetectorFlushNative>>(
+            'SherpaOnnxVoiceActivityDetectorFlush')
+        .asFunction();
+
     createCircularBuffer ??= dynamicLibrary
         .lookup<NativeFunction<SherpaOnnxCreateCircularBufferNative>>(
             'SherpaOnnxCreateCircularBuffer')
diff --git a/flutter/sherpa_onnx/lib/src/vad.dart b/flutter/sherpa_onnx/lib/src/vad.dart
@@ -207,6 +207,10 @@ class VoiceActivityDetector {
     SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr);
   }
 
+  void flush() {
+    SherpaOnnxBindings.voiceActivityDetectorFlush?.call(ptr);
+  }
+
   Pointer<SherpaOnnxVoiceActivityDetector> ptr;
   final VadModelConfig config;
 }
diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
-version: 1.10.11
+version: 1.10.12
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 
@@ -30,19 +30,19 @@ dependencies:
   flutter:
     sdk: flutter
 
-  sherpa_onnx_android: ^1.10.11
+  sherpa_onnx_android: ^1.10.12
     # path: ../sherpa_onnx_android
 
-  sherpa_onnx_macos: ^1.10.11
+  sherpa_onnx_macos: ^1.10.12
     # path: ../sherpa_onnx_macos
 
-  sherpa_onnx_linux: ^1.10.11
+  sherpa_onnx_linux: ^1.10.12
     # path: ../sherpa_onnx_linux
     #
-  sherpa_onnx_windows: ^1.10.11
+  sherpa_onnx_windows: ^1.10.12
     # path: ../sherpa_onnx_windows
 
-  sherpa_onnx_ios: ^1.10.11
+  sherpa_onnx_ios: ^1.10.12
   # sherpa_onnx_ios:
     # path: ../sherpa_onnx_ios
 
diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
@@ -7,7 +7,7 @@
 # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_ios'
-  s.version          = '1.10.11'
+  s.version          = '1.10.12'
   s.summary          = 'A new Flutter FFI plugin project.'
   s.description      = <<-DESC
 A new Flutter FFI plugin project.
diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_macos'
-  s.version          = '1.10.11'
+  s.version          = '1.10.12'
   s.summary          = 'sherpa-onnx Flutter FFI plugin project.'
   s.description      = <<-DESC
 sherpa-onnx Flutter FFI plugin project.
diff --git a/java-api-examples/VadNonStreamingParaformer.java b/java-api-examples/VadNonStreamingParaformer.java
@@ -98,6 +98,25 @@ public static void main(String[] args) {
       }
     }
 
+    vad.flush();
+    while (!vad.empty()) {
+      SpeechSegment segment = vad.front();
+      float startTime = segment.getStart() / 16000.0f;
+      float duration = segment.getSamples().length / 16000.0f;
+
+      OfflineStream stream = recognizer.createStream();
+      stream.acceptWaveform(segment.getSamples(), 16000);
+      recognizer.decode(stream);
+      String text = recognizer.getResult(stream).getText();
+      stream.release();
+
+      if (!text.isEmpty()) {
+        System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
+      }
+
+      vad.pop();
+    }
+
     vad.release();
     recognizer.release();
   }
diff --git a/java-api-examples/VadRemoveSilence.java b/java-api-examples/VadRemoveSilence.java
@@ -59,6 +59,16 @@ public static void main(String[] args) {
       }
     }
 
+    vad.flush();
+    while (!vad.empty()) {
+
+      // if you want to get the starting time of this segment, you can use
+      /* float startTime = vad.front().getStart() / 16000.0f; */
+
+      segments.add(vad.front().getSamples());
+      vad.pop();
+    }
+
     // get total number of samples
     int n = 0;
     for (float[] s : segments) {
diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
   "dependencies": {
-    "sherpa-onnx-node": "^1.10.6"
+    "sherpa-onnx-node": "^1.10.12"
   }
 }
diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py
@@ -105,6 +105,12 @@ def main():
             speech_samples.extend(vad.front.samples)
             vad.pop()
 
+    vad.flush()
+
+    while not vad.empty():
+        speech_samples.extend(vad.front.samples)
+        vad.pop()
+
     speech_samples = np.array(speech_samples, dtype=np.float32)
 
     sf.write(args.output, speech_samples, samplerate=sample_rate)
diff --git a/scripts/dart/sherpa-onnx-pubspec.yaml b/scripts/dart/sherpa-onnx-pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
-version: 1.10.6
+version: 1.10.12
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 
diff --git a/scripts/dotnet/VoiceActivityDetector.cs b/scripts/dotnet/VoiceActivityDetector.cs
@@ -53,6 +53,11 @@ public void Reset()
             SherpaOnnxVoiceActivityDetectorReset(_handle.Handle);
         }
 
+        public void Flush()
+        {
+            SherpaOnnxVoiceActivityDetectorFlush(_handle.Handle);
+        }
+
         public void Dispose()
         {
             Cleanup();
@@ -106,5 +111,7 @@ private void Cleanup()
         [DllImport(Dll.Filename)]
         private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle);
 
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxVoiceActivityDetectorFlush(IntPtr handle);
     }
 }
diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go
@@ -856,6 +856,10 @@ func (vad *VoiceActivityDetector) Reset() {
 	C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
 }
 
+func (vad *VoiceActivityDetector) Flush() {
+	C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl)
+}
+
 // Spoken language identification
 
 type SpokenLanguageIdentificationWhisperConfig struct {
diff --git a/scripts/node-addon-api/lib/vad.js b/scripts/node-addon-api/lib/vad.js
diff --git a/scripts/node-addon-api/src/vad.cc b/scripts/node-addon-api/src/vad.cc
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
diff --git a/sherpa-onnx/csrc/voice-activity-detector.cc b/sherpa-onnx/csrc/voice-activity-detector.cc
diff --git a/sherpa-onnx/csrc/voice-activity-detector.h b/sherpa-onnx/csrc/voice-activity-detector.h
diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java
diff --git a/sherpa-onnx/jni/voice-activity-detector.cc b/sherpa-onnx/jni/voice-activity-detector.cc
diff --git a/sherpa-onnx/kotlin-api/Vad.kt b/sherpa-onnx/kotlin-api/Vad.kt
diff --git a/sherpa-onnx/python/csrc/voice-activity-detector.cc b/sherpa-onnx/python/csrc/voice-activity-detector.cc
diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift

Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,12 @@ void main(List<String> arguments) async {`
`65`	`65`	`}`
`66`	`66`	`}`
`67`	`67`
	`68`	`+ vad.flush();`
	`69`	`+ while (!vad.isEmpty()) {`
	`70`	`+ allSamples.add(vad.front().samples);`
	`71`	`+ vad.pop();`
	`72`	`+ }`
	`73`	`+`
`68`	`74`	`vad.free();`
`69`	`75`
`70`	`76`	`final s = Float32List.fromList(allSamples.expand((x) => x).toList());`
Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,26 @@ static void Main(string[] args)`
`57`	`57`	`}`
`58`	`58`	`}`
`59`	`59`	`}`
	`60`	`+`
	`61`	`+ vad.Flush();`
	`62`	`+`
	`63`	`+ while (!vad.IsEmpty()) {`
	`64`	`+ SpeechSegment segment = vad.Front();`
	`65`	`+ float startTime = segment.Start / (float)sampleRate;`
	`66`	`+ float duration = segment.Samples.Length / (float)sampleRate;`
	`67`	`+`
	`68`	`+ OfflineStream stream = recognizer.CreateStream();`
	`69`	`+ stream.AcceptWaveform(sampleRate, segment.Samples);`
	`70`	`+ recognizer.Decode(stream);`
	`71`	`+ String text = stream.Result.Text;`
	`72`	`+`
	`73`	`+ if (!String.IsNullOrEmpty(text)) {`
	`74`	`+ Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),`
	`75`	`+ String.Format("{0:0.00}", startTime+duration), text);`
	`76`	`+ }`
	`77`	`+`
	`78`	`+ vad.Pop();`
	`79`	`+ }`
`60`	`80`	`}`
`61`	`81`	`}`
`62`	`82`
Original file line number	Diff line number	Diff line change
`@@ -207,6 +207,10 @@ class VoiceActivityDetector {`
`207`	`207`	`SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr);`
`208`	`208`	`}`
`209`	`209`
	`210`	`+ void flush() {`
	`211`	`+ SherpaOnnxBindings.voiceActivityDetectorFlush?.call(ptr);`
	`212`	`+ }`
	`213`	`+`
`210`	`214`	`Pointer<SherpaOnnxVoiceActivityDetector> ptr;`
`211`	`215`	`final VadModelConfig config;`
`212`	`216`	`}`
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@`
`4`	`4`	`#`
`5`	`5`	`Pod::Spec.new do \|s\|`
`6`	`6`	`s.name = 'sherpa_onnx_macos'`
`7`		`- s.version = '1.10.11'`
	`7`	`+ s.version = '1.10.12'`
`8`	`8`	`s.summary = 'sherpa-onnx Flutter FFI plugin project.'`
`9`	`9`	`s.description = <<-DESC`
`10`	`10`	`sherpa-onnx Flutter FFI plugin project.`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"dependencies": {`
`3`		`- "sherpa-onnx-node": "^1.10.6"`
	`3`	`+ "sherpa-onnx-node": "^1.10.12"`
`4`	`4`	`}`
`5`	`5`	`}`
Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,11 @@ public void Reset()`
`53`	`53`	`SherpaOnnxVoiceActivityDetectorReset(_handle.Handle);`
`54`	`54`	`}`
`55`	`55`
	`56`	`+ public void Flush()`
	`57`	`+ {`
	`58`	`+ SherpaOnnxVoiceActivityDetectorFlush(_handle.Handle);`
	`59`	`+ }`
	`60`	`+`
`56`	`61`	`public void Dispose()`
`57`	`62`	`{`
`58`	`63`	`Cleanup();`
`@@ -106,5 +111,7 @@ private void Cleanup()`
`106`	`111`	`[DllImport(Dll.Filename)]`
`107`	`112`	`private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle);`
`108`	`113`
	`114`	`+ [DllImport(Dll.Filename)]`
	`115`	`+ private static extern void SherpaOnnxVoiceActivityDetectorFlush(IntPtr handle);`
`109`	`116`	`}`
`110`	`117`	`}`
Original file line number	Diff line number	Diff line change
`@@ -856,6 +856,10 @@ func (vad *VoiceActivityDetector) Reset() {`
`856`	`856`	`C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)`
`857`	`857`	`}`
`858`	`858`
	`859`	`+func (vad *VoiceActivityDetector) Flush() {`
	`860`	`+ C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl)`
	`861`	`+}`
	`862`	`+`
`859`	`863`	`// Spoken language identification`
`860`	`864`
`861`	`865`	`type SpokenLanguageIdentificationWhisperConfig struct {`