Skip to content

Commit 6cb0181

Browse files
authored
Fix for silero vad v5. (#1065)
For 16kHz audio, the silero vad v5 network takes 64 + 512 = 576 samples per window as input, not 512 samples.
1 parent 61c7eb3 commit 6cb0181

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

sherpa-onnx/csrc/silero-vad-model.cc

+9-8
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,8 @@ class SileroVadModel::Impl {
7474
}
7575

7676
bool IsSpeech(const float *samples, int32_t n) {
77-
if (n != config_.silero_vad.window_size) {
78-
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n,
79-
config_.silero_vad.window_size);
77+
if (n != WindowSize()) {
78+
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize());
8079
exit(-1);
8180
}
8281

@@ -146,9 +145,11 @@ class SileroVadModel::Impl {
146145
return false;
147146
}
148147

149-
int32_t WindowSize() const { return config_.silero_vad.window_size; }
148+
int32_t WindowShift() const { return config_.silero_vad.window_size; }
150149

151-
int32_t WindowShift() const { return WindowSize() - window_shift_; }
150+
int32_t WindowSize() const {
151+
return config_.silero_vad.window_size + window_overlap_;
152+
}
152153

153154
int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }
154155

@@ -177,9 +178,9 @@ class SileroVadModel::Impl {
177178

178179
// 64 for 16kHz
179180
// 32 for 8kHz
180-
window_shift_ = 64;
181+
window_overlap_ = 64;
181182

182-
if (WindowSize() != 512) {
183+
if (config_.silero_vad.window_size != 512) {
183184
SHERPA_ONNX_LOGE(
184185
"For silero_vad v5, we require window_size to be 512 for 16kHz");
185186
exit(-1);
@@ -423,7 +424,7 @@ class SileroVadModel::Impl {
423424
int32_t temp_start_ = 0;
424425
int32_t temp_end_ = 0;
425426

426-
int32_t window_shift_ = 0;
427+
int32_t window_overlap_ = 0;
427428

428429
bool is_v5_ = false;
429430
};

sherpa-onnx/csrc/silero-vad-model.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,12 @@ class SileroVadModel : public VadModel {
3737
*/
3838
bool IsSpeech(const float *samples, int32_t n) override;
3939

40+
// For silero vad V4, it is WindowShift().
41+
// For silero vad V5, it is WindowShift()+64 for 16kHz and
42+
// WindowShift()+32 for 8kHz
4043
int32_t WindowSize() const override;
4144

42-
// For silero vad V4, it is WindowSize().
43-
// For silero vad V5, it is WindowSize()-64 for 16kHz and
44-
// WindowSize()-32 for 8kHz
45+
// 512
4546
int32_t WindowShift() const override;
4647

4748
int32_t MinSilenceDurationSamples() const override;

sherpa-onnx/csrc/voice-activity-detector.cc

+5-1
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl {
4444
// an extra buffer here
4545
last_.insert(last_.end(), samples, samples + n);
4646

47+
if (last_.size() < window_size) {
48+
return;
49+
}
50+
4751
// Note: For v4, window_shift == window_size
4852
int32_t k =
4953
(static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1;
5054
const float *p = last_.data();
5155
bool is_speech = false;
5256

53-
for (int32_t i = 0; i != k; ++i, p += window_shift) {
57+
for (int32_t i = 0; i < k; ++i, p += window_shift) {
5458
buffer_.Push(p, window_shift);
5559
// NOTE(fangjun): Please don't use a very large n.
5660
bool this_window_is_speech = model_->IsSpeech(p, window_size);

0 commit comments

Comments
 (0)