Skip to content

Commit ffc6b48

Browse files
authored
Add C++ and Python API for Kokoro TTS models. (#1715)
1 parent 9efe26a commit ffc6b48

27 files changed

+1193
-29
lines changed

.github/scripts/test-offline-tts.sh

+25
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,31 @@ which $EXE
1818
# test waves are saved in ./tts
1919
mkdir ./tts
2020

21+
log "------------------------------------------------------------"
22+
log "kokoro-en-v0_19"
23+
log "------------------------------------------------------------"
24+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
25+
tar xf kokoro-en-v0_19.tar.bz2
26+
rm kokoro-en-v0_19.tar.bz2
27+
28+
# mapping of sid to voice name
29+
# 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
30+
# 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis
31+
32+
for sid in $(seq 0 10); do
33+
$EXE \
34+
--debug=1 \
35+
--kokoro-model=./kokoro-en-v0_19/model.onnx \
36+
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
37+
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
38+
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
39+
--num-threads=2 \
40+
--sid=$sid \
41+
--output-filename="./tts/kokoro-$sid.wav" \
42+
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
43+
done
44+
rm -rf kokoro-en-v0_19
45+
2146
log "------------------------------------------------------------"
2247
log "matcha-icefall-en_US-ljspeech"
2348
log "------------------------------------------------------------"

.github/scripts/test-python.sh

+19
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,25 @@ log "Offline TTS test"
267267
# test waves are saved in ./tts
268268
mkdir ./tts
269269

270+
log "kokoro-en-v0_19 test"
271+
272+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
273+
tar xf kokoro-en-v0_19.tar.bz2
274+
rm kokoro-en-v0_19.tar.bz2
275+
276+
python3 ./python-api-examples/offline-tts.py \
277+
--debug=1 \
278+
--kokoro-model=./kokoro-en-v0_19/model.onnx \
279+
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
280+
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
281+
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
282+
--num-threads=2 \
283+
--sid=10 \
284+
--output-filename="./tts/kokoro-10.wav" \
285+
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
286+
287+
rm -rf kokoro-en-v0_19
288+
270289
log "matcha-ljspeech-en test"
271290

272291
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2

python-api-examples/offline-tts-play.py

+58-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
1212
Usage:
1313
14-
Example (1/5)
14+
Example (1/6)
1515
1616
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
1717
tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -23,7 +23,7 @@
2323
--output-filename=./generated.wav \
2424
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
2525
26-
Example (2/5)
26+
Example (2/6)
2727
2828
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
2929
tar xvf vits-zh-aishell3.tar.bz2
@@ -37,7 +37,7 @@
3737
--output-filename=./liubei-21.wav \
3838
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
3939
40-
Example (3/5)
40+
Example (3/6)
4141
4242
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
4343
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -53,7 +53,7 @@
5353
--output-filename=./test-2.wav \
5454
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
5555
56-
Example (4/5)
56+
Example (4/6)
5757
5858
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
5959
tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -71,7 +71,7 @@
7171
--output-filename=./test-matcha.wav \
7272
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
7373
74-
Example (5/5)
74+
Example (5/6)
7575
7676
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
7777
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -88,6 +88,22 @@
8888
--num-threads=2 \
8989
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
9090
91+
Example (6/6)
92+
93+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
94+
tar xf kokoro-en-v0_19.tar.bz2
95+
rm kokoro-en-v0_19.tar.bz2
96+
97+
python3 ./python-api-examples/offline-tts-play.py \
98+
--debug=1 \
99+
--kokoro-model=./kokoro-en-v0_19/model.onnx \
100+
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
101+
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
102+
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
103+
--num-threads=2 \
104+
--sid=10 \
105+
--output-filename="./kokoro-10.wav" \
106+
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
91107
92108
You can find more models at
93109
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
@@ -202,13 +218,44 @@ def add_matcha_args(parser):
202218
)
203219

204220

221+
def add_kokoro_args(parser):
222+
parser.add_argument(
223+
"--kokoro-model",
224+
type=str,
225+
default="",
226+
help="Path to model.onnx for kokoro",
227+
)
228+
229+
parser.add_argument(
230+
"--kokoro-voices",
231+
type=str,
232+
default="",
233+
help="Path to voices.bin for kokoro",
234+
)
235+
236+
parser.add_argument(
237+
"--kokoro-tokens",
238+
type=str,
239+
default="",
240+
help="Path to tokens.txt for kokoro",
241+
)
242+
243+
parser.add_argument(
244+
"--kokoro-data-dir",
245+
type=str,
246+
default="",
247+
help="Path to the dict directory of espeak-ng.",
248+
)
249+
250+
205251
def get_args():
206252
parser = argparse.ArgumentParser(
207253
formatter_class=argparse.ArgumentDefaultsHelpFormatter
208254
)
209255

210256
add_vits_args(parser)
211257
add_matcha_args(parser)
258+
add_kokoro_args(parser)
212259

213260
parser.add_argument(
214261
"--tts-rule-fsts",
@@ -407,6 +454,12 @@ def main():
407454
data_dir=args.matcha_data_dir,
408455
dict_dir=args.matcha_dict_dir,
409456
),
457+
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
458+
model=args.kokoro_model,
459+
voices=args.kokoro_voices,
460+
tokens=args.kokoro_tokens,
461+
data_dir=args.kokoro_data_dir,
462+
),
410463
provider=args.provider,
411464
debug=args.debug,
412465
num_threads=args.num_threads,

python-api-examples/offline-tts.py

+60-6
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
1313
Usage:
1414
15-
Example (1/5)
15+
Example (1/6)
1616
1717
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
1818
tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -24,7 +24,7 @@
2424
--output-filename=./generated.wav \
2525
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
2626
27-
Example (2/5)
27+
Example (2/6)
2828
2929
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
3030
tar xvf vits-icefall-zh-aishell3.tar.bz2
@@ -38,7 +38,7 @@
3838
--output-filename=./liubei-21.wav \
3939
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
4040
41-
Example (3/5)
41+
Example (3/6)
4242
4343
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
4444
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -54,7 +54,7 @@
5454
--output-filename=./test-2.wav \
5555
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
5656
57-
Example (4/5)
57+
Example (4/6)
5858
5959
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
6060
tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -72,7 +72,7 @@
7272
--output-filename=./test-matcha.wav \
7373
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
7474
75-
Example (5/5)
75+
Example (5/6)
7676
7777
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
7878
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -89,6 +89,23 @@
8989
--num-threads=2 \
9090
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
9191
92+
Example (6/6)
93+
94+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
95+
tar xf kokoro-en-v0_19.tar.bz2
96+
rm kokoro-en-v0_19.tar.bz2
97+
98+
python3 ./python-api-examples/offline-tts.py \
99+
--debug=1 \
100+
--kokoro-model=./kokoro-en-v0_19/model.onnx \
101+
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
102+
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
103+
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
104+
--num-threads=2 \
105+
--sid=10 \
106+
--output-filename="./kokoro-10.wav" \
107+
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
108+
92109
You can find more models at
93110
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
94111
@@ -188,13 +205,44 @@ def add_matcha_args(parser):
188205
)
189206

190207

208+
def add_kokoro_args(parser):
209+
parser.add_argument(
210+
"--kokoro-model",
211+
type=str,
212+
default="",
213+
help="Path to model.onnx for kokoro",
214+
)
215+
216+
parser.add_argument(
217+
"--kokoro-voices",
218+
type=str,
219+
default="",
220+
help="Path to voices.bin for kokoro",
221+
)
222+
223+
parser.add_argument(
224+
"--kokoro-tokens",
225+
type=str,
226+
default="",
227+
help="Path to tokens.txt for kokoro",
228+
)
229+
230+
parser.add_argument(
231+
"--kokoro-data-dir",
232+
type=str,
233+
default="",
234+
help="Path to the dict directory of espeak-ng.",
235+
)
236+
237+
191238
def get_args():
192239
parser = argparse.ArgumentParser(
193240
formatter_class=argparse.ArgumentDefaultsHelpFormatter
194241
)
195242

196243
add_vits_args(parser)
197244
add_matcha_args(parser)
245+
add_kokoro_args(parser)
198246

199247
parser.add_argument(
200248
"--tts-rule-fsts",
@@ -206,7 +254,7 @@ def get_args():
206254
parser.add_argument(
207255
"--max-num-sentences",
208256
type=int,
209-
default=2,
257+
default=1,
210258
help="""Max number of sentences in a batch to avoid OOM if the input
211259
text is very long. Set it to -1 to process all the sentences in a
212260
single batch. A smaller value does not mean it is slower compared
@@ -289,6 +337,12 @@ def main():
289337
data_dir=args.matcha_data_dir,
290338
dict_dir=args.matcha_dict_dir,
291339
),
340+
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
341+
model=args.kokoro_model,
342+
voices=args.kokoro_voices,
343+
tokens=args.kokoro_tokens,
344+
data_dir=args.kokoro_data_dir,
345+
),
292346
provider=args.provider,
293347
debug=args.debug,
294348
num_threads=args.num_threads,

sherpa-onnx/csrc/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS)
158158
offline-tts-character-frontend.cc
159159
offline-tts-frontend.cc
160160
offline-tts-impl.cc
161+
offline-tts-kokoro-model-config.cc
162+
offline-tts-kokoro-model.cc
161163
offline-tts-matcha-model-config.cc
162164
offline-tts-matcha-model.cc
163165
offline-tts-model-config.cc

sherpa-onnx/csrc/melo-tts-lexicon.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#include <vector>
1212

1313
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
14-
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
14+
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
1515

1616
namespace sherpa_onnx {
1717

sherpa-onnx/csrc/offline-tts-character-frontend.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#include <vector>
1111

1212
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
13-
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
13+
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
1414

1515
namespace sherpa_onnx {
1616

sherpa-onnx/csrc/offline-tts-impl.cc

+8-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "rawfile/raw_file_manager.h"
1717
#endif
1818

19+
#include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h"
1920
#include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
2021
#include "sherpa-onnx/csrc/offline-tts-vits-impl.h"
2122

@@ -37,18 +38,23 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
3738
const OfflineTtsConfig &config) {
3839
if (!config.model.vits.model.empty()) {
3940
return std::make_unique<OfflineTtsVitsImpl>(config);
41+
} else if (!config.model.matcha.acoustic_model.empty()) {
42+
return std::make_unique<OfflineTtsMatchaImpl>(config);
4043
}
41-
return std::make_unique<OfflineTtsMatchaImpl>(config);
44+
45+
return std::make_unique<OfflineTtsKokoroImpl>(config);
4246
}
4347

4448
template <typename Manager>
4549
std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
4650
Manager *mgr, const OfflineTtsConfig &config) {
4751
if (!config.model.vits.model.empty()) {
4852
return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
53+
} else if (!config.model.matcha.acoustic_model.empty()) {
54+
return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
4955
}
5056

51-
return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
57+
return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
5258
}
5359

5460
#if __ANDROID_API__ >= 9

0 commit comments

Comments
 (0)