Skip to content

Commit c84a833

Browse files
authored
Add C++ and Python API for Kokoro 1.0 multilingual TTS model (#1795)
1 parent 08cefe8 commit c84a833

20 files changed

+819
-39
lines changed

.github/scripts/test-python.sh

+21-3
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,27 @@ log "Offline TTS test"
267267
# test waves are saved in ./tts
268268
mkdir ./tts
269269

270+
log "kokoro-multi-lang-v1_0 test"
271+
272+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
273+
tar xf kokoro-multi-lang-v1_0.tar.bz2
274+
rm kokoro-multi-lang-v1_0.tar.bz2
275+
276+
python3 ./python-api-examples/offline-tts.py \
277+
--debug=1 \
278+
--kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
279+
--kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
280+
--kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
281+
--kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
282+
--kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
283+
--kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
284+
--num-threads=2 \
285+
--sid=18 \
286+
--output-filename="./tts/kokoro-18-zh-en.wav" \
287+
"中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
288+
289+
rm -rf kokoro-multi-lang-v1_0
290+
270291
log "kokoro-en-v0_19 test"
271292

272293
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
@@ -580,13 +601,10 @@ if [[ x$OS != x'windows-latest' ]]; then
580601
repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
581602
log "Start testing ${repo}"
582603

583-
pushd $dir
584604
curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
585605
tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
586606
rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
587-
popd
588607

589-
repo=$dir/$repo
590608
ls -lh $repo
591609

592610
python3 ./python-api-examples/keyword-spotter.py

.github/workflows/export-kokoro.yaml

+15-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ on:
44
push:
55
branches:
66
- export-kokoro
7-
- kokoro-1.0-2
87

98
workflow_dispatch:
109

@@ -76,6 +75,14 @@ jobs:
7675
if: matrix.version == '1.0'
7776
shell: bash
7877
run: |
78+
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
79+
tar xvf dict.tar.bz2
80+
rm dict.tar.bz2
81+
82+
curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
83+
curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
84+
curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
85+
7986
src=scripts/kokoro/v1.0
8087
8188
d=kokoro-multi-lang-v1_0
@@ -87,7 +94,12 @@ jobs:
8794
cp -v $src/tokens.txt $d/
8895
cp -v $src/lexicon*.txt $d/
8996
cp -v $src/README.md $d/README.md
97+
cp -av dict $d/
98+
cp -v ./*.fst $d/
9099
ls -lh $d/
100+
echo "---"
101+
ls -lh $d/dict
102+
91103
tar cjfv $d.tar.bz2 $d
92104
rm -rf $d
93105
@@ -180,6 +192,8 @@ jobs:
180192
cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
181193
cp -v ../scripts/kokoro/v1.0/README.md ./README.md
182194
cp -v ../LICENSE ./
195+
cp -av ../dict ./
196+
cp -v ../*.fst $d/
183197
184198
git lfs track "*.onnx"
185199
git add .

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -132,3 +132,4 @@ kokoro-en-v0_19
132132
lexicon.txt
133133
us_gold.json
134134
us_silver.json
135+
kokoro-multi-lang-v1_0

c-api-examples/kws-c-api.c

+7-6
Original file line numberDiff line numberDiff line change
@@ -25,27 +25,28 @@ int32_t main() {
2525

2626
memset(&config, 0, sizeof(config));
2727
config.model_config.transducer.encoder =
28-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
28+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
2929
"encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
3030

3131
config.model_config.transducer.decoder =
32-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
32+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
3333
"decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
3434

3535
config.model_config.transducer.joiner =
36-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
36+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
3737
"joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
3838

3939
config.model_config.tokens =
40-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
40+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
41+
"tokens.txt";
4142

4243
config.model_config.provider = "cpu";
4344
config.model_config.num_threads = 1;
4445
config.model_config.debug = 1;
4546

4647
config.keywords_file =
47-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
48-
"test_keywords.txt";
48+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
49+
"test_wavs/test_keywords.txt";
4950

5051
const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config);
5152
if (!kws) {

cxx-api-examples/kws-cxx-api.cc

+7-6
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,28 @@ int32_t main() {
2424

2525
KeywordSpotterConfig config;
2626
config.model_config.transducer.encoder =
27-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
27+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
2828
"encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
2929

3030
config.model_config.transducer.decoder =
31-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
31+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
3232
"decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
3333

3434
config.model_config.transducer.joiner =
35-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
35+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
3636
"joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
3737

3838
config.model_config.tokens =
39-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
39+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
40+
"tokens.txt";
4041

4142
config.model_config.provider = "cpu";
4243
config.model_config.num_threads = 1;
4344
config.model_config.debug = 1;
4445

4546
config.keywords_file =
46-
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
47-
"test_keywords.txt";
47+
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
48+
"test_wavs/test_keywords.txt";
4849

4950
KeywordSpotter kws = KeywordSpotter::Create(config);
5051
if (!kws.Get()) {

python-api-examples/offline-tts-play.py

+45-6
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
1212
Usage:
1313
14-
Example (1/6)
14+
Example (1/7)
1515
1616
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
1717
tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -23,7 +23,7 @@
2323
--output-filename=./generated.wav \
2424
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
2525
26-
Example (2/6)
26+
Example (2/7)
2727
2828
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
2929
tar xvf vits-zh-aishell3.tar.bz2
@@ -37,7 +37,7 @@
3737
--output-filename=./liubei-21.wav \
3838
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
3939
40-
Example (3/6)
40+
Example (3/7)
4141
4242
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
4343
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -53,7 +53,7 @@
5353
--output-filename=./test-2.wav \
5454
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
5555
56-
Example (4/6)
56+
Example (4/7)
5757
5858
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
5959
tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -71,7 +71,7 @@
7171
--output-filename=./test-matcha.wav \
7272
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
7373
74-
Example (5/6)
74+
Example (5/7)
7575
7676
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
7777
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -88,7 +88,9 @@
8888
--num-threads=2 \
8989
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
9090
91-
Example (6/6)
91+
Example (6/7)
92+
93+
(This version of kokoro supports only English)
9294
9395
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
9496
tar xf kokoro-en-v0_19.tar.bz2
@@ -105,6 +107,27 @@
105107
--output-filename="./kokoro-10.wav" \
106108
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
107109
110+
Example (7/7)
111+
112+
(This version of kokoro supports English, Chinese, etc.)
113+
114+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
115+
tar xf kokoro-multi-lang-v1_0.tar.bz2
116+
rm kokoro-multi-lang-v1_0.tar.bz2
117+
118+
python3 ./python-api-examples/offline-tts-play.py \
119+
--debug=1 \
120+
--kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
121+
--kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
122+
--kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
123+
--kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
124+
--kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
125+
--kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
126+
--num-threads=2 \
127+
--sid=18 \
128+
--output-filename="./kokoro-18-zh-en.wav" \
129+
"中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
130+
108131
You can find more models at
109132
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
110133
@@ -247,6 +270,20 @@ def add_kokoro_args(parser):
247270
help="Path to the dict directory of espeak-ng.",
248271
)
249272

273+
parser.add_argument(
274+
"--kokoro-dict-dir",
275+
type=str,
276+
default="",
277+
help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
278+
)
279+
280+
parser.add_argument(
281+
"--kokoro-lexicon",
282+
type=str,
283+
default="",
284+
help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
285+
)
286+
250287

251288
def get_args():
252289
parser = argparse.ArgumentParser(
@@ -459,6 +496,8 @@ def main():
459496
voices=args.kokoro_voices,
460497
tokens=args.kokoro_tokens,
461498
data_dir=args.kokoro_data_dir,
499+
dict_dir=args.kokoro_dict_dir,
500+
lexicon=args.kokoro_lexicon,
462501
),
463502
provider=args.provider,
464503
debug=args.debug,

python-api-examples/offline-tts.py

+45-6
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
1313
Usage:
1414
15-
Example (1/6)
15+
Example (1/7)
1616
1717
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
1818
tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -24,7 +24,7 @@
2424
--output-filename=./generated.wav \
2525
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
2626
27-
Example (2/6)
27+
Example (2/7)
2828
2929
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
3030
tar xvf vits-icefall-zh-aishell3.tar.bz2
@@ -38,7 +38,7 @@
3838
--output-filename=./liubei-21.wav \
3939
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
4040
41-
Example (3/6)
41+
Example (3/7)
4242
4343
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
4444
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -54,7 +54,7 @@
5454
--output-filename=./test-2.wav \
5555
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
5656
57-
Example (4/6)
57+
Example (4/7)
5858
5959
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
6060
tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -72,7 +72,7 @@
7272
--output-filename=./test-matcha.wav \
7373
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
7474
75-
Example (5/6)
75+
Example (5/7)
7676
7777
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
7878
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -89,7 +89,9 @@
8989
--num-threads=2 \
9090
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
9191
92-
Example (6/6)
92+
Example (6/7)
93+
94+
(This version of kokoro supports only English)
9395
9496
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
9597
tar xf kokoro-en-v0_19.tar.bz2
@@ -106,6 +108,27 @@
106108
--output-filename="./kokoro-10.wav" \
107109
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
108110
111+
Example (7/7)
112+
113+
(This version of kokoro supports English, Chinese, etc.)
114+
115+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
116+
tar xf kokoro-multi-lang-v1_0.tar.bz2
117+
rm kokoro-multi-lang-v1_0.tar.bz2
118+
119+
python3 ./python-api-examples/offline-tts.py \
120+
--debug=1 \
121+
--kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
122+
--kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
123+
--kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
124+
--kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
125+
--kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
126+
--kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
127+
--num-threads=2 \
128+
--sid=18 \
129+
--output-filename="./kokoro-18-zh-en.wav" \
130+
"中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
131+
109132
You can find more models at
110133
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
111134
@@ -234,6 +257,20 @@ def add_kokoro_args(parser):
234257
help="Path to the dict directory of espeak-ng.",
235258
)
236259

260+
parser.add_argument(
261+
"--kokoro-dict-dir",
262+
type=str,
263+
default="",
264+
help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
265+
)
266+
267+
parser.add_argument(
268+
"--kokoro-lexicon",
269+
type=str,
270+
default="",
271+
help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
272+
)
273+
237274

238275
def get_args():
239276
parser = argparse.ArgumentParser(
@@ -342,6 +379,8 @@ def main():
342379
voices=args.kokoro_voices,
343380
tokens=args.kokoro_tokens,
344381
data_dir=args.kokoro_data_dir,
382+
dict_dir=args.kokoro_dict_dir,
383+
lexicon=args.kokoro_lexicon,
345384
),
346385
provider=args.provider,
347386
debug=args.debug,

scripts/kokoro/v1.0/generate_voices_bin.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def main():
7171
with open("voices.bin", "wb") as f:
7272
for _, speaker in id2speaker.items():
7373
m = torch.load(
74-
f"{speaker}.pt",
74+
f"voices/{speaker}.pt",
7575
weights_only=True,
7676
map_location="cpu",
7777
).numpy()

0 commit comments

Comments
 (0)