Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support specifying voice in espeak-ng for kokoro tts models. #1836

Merged
merged 1 commit into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ class KokoroMultiLangLexicon::Impl {
// https://en.cppreference.com/w/cpp/regex
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
std::string expr =
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
")";

auto ws = ToWideString(text);
std::wstring wexpr = ToWideString(expr);
Expand All @@ -127,7 +128,7 @@ class KokoroMultiLangLexicon::Impl {
if (debug_) {
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
}
ids_vec = ConvertEnglishToTokenIDs(ms);
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
} else {
if (debug_) {
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
Expand Down Expand Up @@ -257,7 +258,7 @@ class KokoroMultiLangLexicon::Impl {
}

std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
const std::string &text) const {
const std::string &text, const std::string &voice) const {
std::vector<std::string> words = SplitUtf8(text);
if (debug_) {
std::ostringstream os;
Expand Down Expand Up @@ -315,7 +316,7 @@ class KokoroMultiLangLexicon::Impl {

piper::eSpeakPhonemeConfig config;

config.voice = "en-us";
config.voice = voice;

std::vector<std::vector<piper::Phoneme>> phonemes;

Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/offline-tts-kokoro-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
}

std::vector<TokenIDs> token_ids =
frontend_->ConvertTextToTokenIds(text, "en-us");
frontend_->ConvertTextToTokenIds(text, meta_data.voice);

if (token_ids.empty() ||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
Expand Down
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ struct OfflineTtsKokoroModelMetaData {
int32_t version = 1;
int32_t has_espeak = 1;
int32_t max_token_len = 0;

std::string voice;
};

} // namespace sherpa_onnx
Expand Down
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/offline-tts-kokoro-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ class OfflineTtsKokoroModel::Impl {
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
"en-us");

if (config_.debug) {
std::vector<std::string> speaker_names;
Expand Down
Loading