Skip to content

Commit 17cd3a5

Browse files
authored
Add C++ runtime for non-streaming faster conformer transducer from NeMo. (#854)
1 parent 5d8c35e commit 17cd3a5

31 files changed

+1093
-153
lines changed

.github/scripts/test-offline-transducer.sh

+99
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,105 @@ echo "PATH: $PATH"
1313

1414
which $EXE
1515

16+
log "------------------------------------------------------------------------"
17+
log "Run Nemo fast conformer hybrid transducer ctc models (transducer branch)"
18+
log "------------------------------------------------------------------------"
19+
20+
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
21+
name=$(basename $url)
22+
curl -SL -O $url
23+
tar xvf $name
24+
rm $name
25+
repo=$(basename -s .tar.bz2 $name)
26+
ls -lh $repo
27+
28+
log "test $repo"
29+
test_wavs=(
30+
de-german.wav
31+
es-spanish.wav
32+
hr-croatian.wav
33+
po-polish.wav
34+
uk-ukrainian.wav
35+
en-english.wav
36+
fr-french.wav
37+
it-italian.wav
38+
ru-russian.wav
39+
)
40+
for w in ${test_wavs[@]}; do
41+
time $EXE \
42+
--tokens=$repo/tokens.txt \
43+
--encoder=$repo/encoder.onnx \
44+
--decoder=$repo/decoder.onnx \
45+
--joiner=$repo/joiner.onnx \
46+
--debug=1 \
47+
$repo/test_wavs/$w
48+
done
49+
50+
rm -rf $repo
51+
52+
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-transducer-en-24500.tar.bz2
53+
name=$(basename $url)
54+
curl -SL -O $url
55+
tar xvf $name
56+
rm $name
57+
repo=$(basename -s .tar.bz2 $name)
58+
ls -lh $repo
59+
60+
log "Test $repo"
61+
62+
time $EXE \
63+
--tokens=$repo/tokens.txt \
64+
--encoder=$repo/encoder.onnx \
65+
--decoder=$repo/decoder.onnx \
66+
--joiner=$repo/joiner.onnx \
67+
--debug=1 \
68+
$repo/test_wavs/en-english.wav
69+
70+
rm -rf $repo
71+
72+
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-transducer-es-1424.tar.bz2
73+
name=$(basename $url)
74+
curl -SL -O $url
75+
tar xvf $name
76+
rm $name
77+
repo=$(basename -s .tar.bz2 $name)
78+
ls -lh $repo
79+
80+
log "test $repo"
81+
82+
time $EXE \
83+
--tokens=$repo/tokens.txt \
84+
--encoder=$repo/encoder.onnx \
85+
--decoder=$repo/decoder.onnx \
86+
--joiner=$repo/joiner.onnx \
87+
--debug=1 \
88+
$repo/test_wavs/es-spanish.wav
89+
90+
rm -rf $repo
91+
92+
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288.tar.bz2
93+
name=$(basename $url)
94+
curl -SL -O $url
95+
tar xvf $name
96+
rm $name
97+
repo=$(basename -s .tar.bz2 $name)
98+
ls -lh $repo
99+
100+
log "Test $repo"
101+
102+
time $EXE \
103+
--tokens=$repo/tokens.txt \
104+
--encoder=$repo/encoder.onnx \
105+
--decoder=$repo/decoder.onnx \
106+
--joiner=$repo/joiner.onnx \
107+
--debug=1 \
108+
$repo/test_wavs/en-english.wav \
109+
$repo/test_wavs/de-german.wav \
110+
$repo/test_wavs/fr-french.wav \
111+
$repo/test_wavs/es-spanish.wav
112+
113+
rm -rf $repo
114+
16115
log "------------------------------------------------------------"
17116
log "Run Conformer transducer (English)"
18117
log "------------------------------------------------------------"

.github/workflows/linux.yaml

+8-8
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,14 @@ jobs:
128128
name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
129129
path: install/*
130130

131+
- name: Test offline transducer
132+
shell: bash
133+
run: |
134+
export PATH=$PWD/build/bin:$PATH
135+
export EXE=sherpa-onnx-offline
136+
137+
.github/scripts/test-offline-transducer.sh
138+
131139
- name: Test spoken language identification (C++ API)
132140
shell: bash
133141
run: |
@@ -215,14 +223,6 @@ jobs:
215223
216224
.github/scripts/test-online-paraformer.sh
217225
218-
- name: Test offline transducer
219-
shell: bash
220-
run: |
221-
export PATH=$PWD/build/bin:$PATH
222-
export EXE=sherpa-onnx-offline
223-
224-
.github/scripts/test-offline-transducer.sh
225-
226226
- name: Test online transducer
227227
shell: bash
228228
run: |

.github/workflows/macos.yaml

+8-8
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,14 @@ jobs:
107107
otool -L build/bin/sherpa-onnx
108108
otool -l build/bin/sherpa-onnx
109109
110+
- name: Test offline transducer
111+
shell: bash
112+
run: |
113+
export PATH=$PWD/build/bin:$PATH
114+
export EXE=sherpa-onnx-offline
115+
116+
.github/scripts/test-offline-transducer.sh
117+
110118
- name: Test online CTC
111119
shell: bash
112120
run: |
@@ -192,14 +200,6 @@ jobs:
192200
193201
.github/scripts/test-offline-ctc.sh
194202
195-
- name: Test offline transducer
196-
shell: bash
197-
run: |
198-
export PATH=$PWD/build/bin:$PATH
199-
export EXE=sherpa-onnx-offline
200-
201-
.github/scripts/test-offline-transducer.sh
202-
203203
- name: Test online transducer
204204
shell: bash
205205
run: |

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,4 @@ sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
104104
sherpa-onnx-ced-*
105105
node_modules
106106
package-lock.json
107+
sherpa-onnx-nemo-*
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
This file shows how to use a non-streaming CTC model from NeMo
5+
to decode files.
6+
7+
Please download model files from
8+
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
9+
10+
11+
The example model supports 10 languages and it is converted from
12+
https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
13+
"""
14+
15+
from pathlib import Path
16+
17+
import sherpa_onnx
18+
import soundfile as sf
19+
20+
21+
def create_recognizer():
22+
model = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx"
23+
tokens = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt"
24+
25+
test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav"
26+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/en-english.wav"
27+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/es-spanish.wav"
28+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/fr-french.wav"
29+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/hr-croatian.wav"
30+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/it-italian.wav"
31+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/po-polish.wav"
32+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/ru-russian.wav"
33+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/uk-ukrainian.wav"
34+
35+
if not Path(model).is_file() or not Path(test_wav).is_file():
36+
raise ValueError(
37+
"""Please download model files from
38+
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
39+
"""
40+
)
41+
return (
42+
sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
43+
model=model,
44+
tokens=tokens,
45+
debug=True,
46+
),
47+
test_wav,
48+
)
49+
50+
51+
def main():
52+
recognizer, wave_filename = create_recognizer()
53+
54+
audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
55+
audio = audio[:, 0] # only use the first channel
56+
57+
# audio is a 1-D float32 numpy array normalized to the range [-1, 1]
58+
# sample_rate does not need to be 16000 Hz
59+
60+
stream = recognizer.create_stream()
61+
stream.accept_waveform(sample_rate, audio)
62+
recognizer.decode_stream(stream)
63+
print(wave_filename)
64+
print(stream.result)
65+
66+
67+
if __name__ == "__main__":
68+
main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
This file shows how to use a non-streaming transducer model from NeMo
5+
to decode files.
6+
7+
Please download model files from
8+
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
9+
10+
11+
The example model supports 10 languages and it is converted from
12+
https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
13+
"""
14+
15+
from pathlib import Path
16+
17+
import sherpa_onnx
18+
import soundfile as sf
19+
20+
21+
def create_recognizer():
22+
encoder = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx"
23+
decoder = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx"
24+
joiner = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx"
25+
tokens = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt"
26+
27+
test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav"
28+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/en-english.wav"
29+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/es-spanish.wav"
30+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/fr-french.wav"
31+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/hr-croatian.wav"
32+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/it-italian.wav"
33+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/po-polish.wav"
34+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/ru-russian.wav"
35+
# test_wav = "./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/uk-ukrainian.wav"
36+
37+
if not Path(encoder).is_file() or not Path(test_wav).is_file():
38+
raise ValueError(
39+
"""Please download model files from
40+
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
41+
"""
42+
)
43+
return (
44+
sherpa_onnx.OfflineRecognizer.from_transducer(
45+
encoder=encoder,
46+
decoder=decoder,
47+
joiner=joiner,
48+
tokens=tokens,
49+
model_type="nemo_transducer",
50+
debug=True,
51+
),
52+
test_wav,
53+
)
54+
55+
56+
def main():
57+
recognizer, wave_filename = create_recognizer()
58+
59+
audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
60+
audio = audio[:, 0] # only use the first channel
61+
62+
# audio is a 1-D float32 numpy array normalized to the range [-1, 1]
63+
# sample_rate does not need to be 16000 Hz
64+
65+
stream = recognizer.create_stream()
66+
stream.accept_waveform(sample_rate, audio)
67+
recognizer.decode_stream(stream)
68+
print(wave_filename)
69+
print(stream.result)
70+
71+
72+
if __name__ == "__main__":
73+
main()

sherpa-onnx/csrc/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,11 @@ set(sources
4040
offline-tdnn-ctc-model.cc
4141
offline-tdnn-model-config.cc
4242
offline-transducer-greedy-search-decoder.cc
43+
offline-transducer-greedy-search-nemo-decoder.cc
4344
offline-transducer-model-config.cc
4445
offline-transducer-model.cc
4546
offline-transducer-modified-beam-search-decoder.cc
47+
offline-transducer-nemo-model.cc
4648
offline-wenet-ctc-model-config.cc
4749
offline-wenet-ctc-model.cc
4850
offline-whisper-greedy-search-decoder.cc

sherpa-onnx/csrc/features.h

+13
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,19 @@ struct FeatureExtractorConfig {
5656
bool remove_dc_offset = true; // Subtract mean of wave before FFT.
5757
std::string window_type = "povey"; // e.g. Hamming window
5858

59+
// For models from NeMo
60+
// This option is not exposed and is set internally when loading models.
61+
// Possible values:
62+
// - per_feature
63+
// - all_features (not implemented yet)
64+
// - fixed_mean (not implemented)
65+
// - fixed_std (not implemented)
66+
// - or just leave it empty
67+
// See
68+
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/preprocessing/features.py#L59
69+
// for details
70+
std::string nemo_normalize_type;
71+
5972
std::string ToString() const;
6073

6174
void Register(ParseOptions *po);

sherpa-onnx/csrc/keyword-spotter-transducer-impl.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class KeywordSpotterTransducerImpl : public KeywordSpotterImpl {
6868
: config_(config),
6969
model_(OnlineTransducerModel::Create(config.model_config)),
7070
sym_(config.model_config.tokens) {
71-
if (sym_.contains("<unk>")) {
71+
if (sym_.Contains("<unk>")) {
7272
unk_id_ = sym_["<unk>"];
7373
}
7474

@@ -87,7 +87,7 @@ class KeywordSpotterTransducerImpl : public KeywordSpotterImpl {
8787
: config_(config),
8888
model_(OnlineTransducerModel::Create(mgr, config.model_config)),
8989
sym_(mgr, config.model_config.tokens) {
90-
if (sym_.contains("<unk>")) {
90+
if (sym_.Contains("<unk>")) {
9191
unk_id_ = sym_["<unk>"];
9292
}
9393

sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.cc
22
//
3-
// Copyright (c) 2023 Xiaomi Corporation
3+
// Copyright (c) 2023-2024 Xiaomi Corporation
44

55
#include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h"
66

0 commit comments

Comments
 (0)