|
| 1 | +// Copyright (c) 2024 Xiaomi Corporation |
| 2 | +// |
| 3 | +// This file shows how to use a non-streaming TTS model for text-to-speech |
| 4 | +// Please refer to |
| 5 | +// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html |
| 6 | +// and |
| 7 | +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 8 | +// to download pre-trained models |
| 9 | +// |
| 10 | +// Note that you need a speaker to run this file since it will play |
| 11 | +// the generated audio as it is being generated. |
| 12 | + |
| 13 | +using CommandLine.Text; |
| 14 | +using CommandLine; |
| 15 | +using PortAudioSharp; |
| 16 | +using SherpaOnnx; |
| 17 | +using System.Collections.Concurrent; |
| 18 | +using System.Collections.Generic; |
| 19 | +using System.Runtime.InteropServices; |
| 20 | +using System.Threading; |
| 21 | +using System; |
| 22 | + |
/// <summary>
/// Command-line demo: synthesizes speech with a non-streaming VITS model,
/// plays the samples through the default output device while they are being
/// generated, and finally saves the complete audio to a wave file.
/// </summary>
class OfflineTtsPlayDemo
{
    /// <summary>
    /// Command-line options. Each property is mapped to a long option by the
    /// CommandLine parser.
    /// </summary>
    class Options
    {
        [Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
        public string RuleFsts { get; set; }

        [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
        public string DataDir { get; set; }

        // The default is spelled as a float literal so that it matches the property type.
        [Option("vits-length-scale", Required = false, Default = 1.0f, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
        public float LengthScale { get; set; }

        [Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
        public float NoiseScale { get; set; }

        [Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")]
        public float NoiseScaleW { get; set; }

        [Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
        public string Lexicon { get; set; }

        [Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
        public string Tokens { get; set; }

        [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
        public int MaxNumSentences { get; set; }

        // Explicit long name; it is the same name ("debug") that the usage text below passes.
        [Option("debug", Required = false, Default = 0, HelpText = "1 to show debug messages.")]
        public int Debug { get; set; }

        [Option("vits-model", Required = true, HelpText = "Path to VITS model")]
        public string Model { get; set; }

        [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
        public int SpeakerId { get; set; }

        [Option("text", Required = true, HelpText = "Text to synthesize")]
        public string Text { get; set; }

        // Optional: a required option combined with a default value is contradictory,
        // so the default is what applies when the option is omitted.
        [Option("output-filename", Required = false, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
        public string OutputFilename { get; set; }
    }

    /// <summary>
    /// Parses the command line and either runs the demo or prints the help text.
    /// </summary>
    static void Main(string[] args)
    {
        // HelpWriter is disabled because DisplayHelp builds the help text itself.
        var parser = new CommandLine.Parser(with => with.HelpWriter = null);
        var parserResult = parser.ParseArguments<Options>(args);

        parserResult
            .WithParsed<Options>(options => Run(options))
            .WithNotParsed(errs => DisplayHelp(parserResult, errs));
    }

    /// <summary>
    /// Prints usage examples followed by the auto-generated option list and
    /// the parsing errors.
    /// </summary>
    private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
    {
        string usage = @"
# vits-aishell3

wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xf vits-zh-aishell3.tar.bz2

dotnet run \
  --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
  --vits-tokens=./vits-zh-aishell3/tokens.txt \
  --vits-lexicon=./vits-zh-aishell3/lexicon.txt \
  --tts-rule-fsts=./vits-zh-aishell3/rule.fst \
  --sid=66 \
  --debug=1 \
  --output-filename=./aishell3-66.wav \
  --text=这是一个语音合成测试

# Piper models

wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2

dotnet run \
  --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
  --debug=1 \
  --output-filename=./amy.wav \
  --text='This is a text to speech application in dotnet with Next Generation Kaldi'

Please refer to
https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
to download more models.
";

        var helpText = HelpText.AutoBuild(result, h =>
        {
            h.AdditionalNewLineAfterOption = false;
            h.Heading = usage;
            h.Copyright = "Copyright (c) 2024 Xiaomi Corporation";
            return HelpText.DefaultParsingErrorsHandler(result, h);
        }, e => e);
        Console.WriteLine(helpText);
    }

    /// <summary>
    /// Generates audio for <c>options.Text</c>, streams it to the default
    /// output device while it is being generated, and saves the complete
    /// result to <c>options.OutputFilename</c>.
    /// </summary>
    /// <param name="options">Parsed command-line options.</param>
    private static void Run(Options options)
    {
        // speed is computed as 1 / LengthScale below, so zero or negative values are meaningless.
        if (options.LengthScale <= 0)
        {
            Console.WriteLine($"--vits-length-scale must be positive. Given: {options.LengthScale}");
            Environment.Exit(1);
        }

        OfflineTtsConfig config = new OfflineTtsConfig();
        config.Model.Vits.Model = options.Model;
        config.Model.Vits.Lexicon = options.Lexicon;
        config.Model.Vits.Tokens = options.Tokens;
        config.Model.Vits.DataDir = options.DataDir;
        config.Model.Vits.NoiseScale = options.NoiseScale;
        config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
        config.Model.Vits.LengthScale = options.LengthScale;
        config.Model.NumThreads = 1;
        config.Model.Debug = options.Debug;
        config.Model.Provider = "cpu";
        config.RuleFsts = options.RuleFsts;
        config.MaxNumSentences = options.MaxNumSentences;

        OfflineTts tts = new OfflineTts(config);
        float speed = 1.0f / options.LengthScale;
        int sid = options.SpeakerId;

        Console.WriteLine(PortAudio.VersionInfo.versionText);
        PortAudio.Initialize();
        Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");

        for (int i = 0; i != PortAudio.DeviceCount; ++i)
        {
            Console.WriteLine($" Device {i}");
            DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
            Console.WriteLine($"   Name: {deviceInfo.name}");
            Console.WriteLine($"   Max output channels: {deviceInfo.maxOutputChannels}");
            Console.WriteLine($"   Default sample rate: {deviceInfo.defaultSampleRate}");
        }

        int deviceIndex = PortAudio.DefaultOutputDevice;
        if (deviceIndex == PortAudio.NoDevice)
        {
            Console.WriteLine("No default output device found. Please use ../offline-tts instead");
            // Environment.Exit does not run any later cleanup, so release PortAudio here.
            PortAudio.Terminate();
            Environment.Exit(1);
        }

        DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
        Console.WriteLine();
        Console.WriteLine($"Use output default device {deviceIndex} ({info.name})");

        StreamParameters param = new StreamParameters();
        param.device = deviceIndex;
        param.channelCount = 1;
        param.sampleFormat = SampleFormat.Float32;
        param.suggestedLatency = info.defaultLowOutputLatency;
        param.hostApiSpecificStreamInfo = IntPtr.Zero;

        // Queue of generated sample chunks: filled by the TTS callback and
        // drained by the audio callback.
        // https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview
        using BlockingCollection<float[]> dataItems = new BlockingCollection<float[]>();

        // Invoked during generation with a pointer to n newly generated samples.
        // The samples are copied into a managed array before being queued.
        var onSamplesGenerated = (IntPtr samples, int n) =>
        {
            float[] data = new float[n];

            Marshal.Copy(samples, data, 0, n);

            dataItems.Add(data);
        };

        // Set by the audio callback once every queued sample has been handed to the device.
        using ManualResetEventSlim playbackDone = new ManualResetEventSlim(false);

        float[] lastSampleArray = null; // chunk currently being played, or null
        int lastIndex = 0;              // index of the first unplayed sample in lastSampleArray

        // Reusable block of zeros so that padding does not allocate on every callback.
        float[] silence = Array.Empty<float>();

        // Writes `count` zero samples starting at `destination`.
        void WriteSilence(IntPtr destination, int count)
        {
            if (count <= 0)
            {
                return;
            }

            if (silence.Length < count)
            {
                silence = new float[count];
            }

            Marshal.Copy(silence, 0, destination, count);
        }

        PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output,
            UInt32 frameCount,
            ref StreamCallbackTimeInfo timeInfo,
            StreamCallbackFlags statusFlags,
            IntPtr userData
            ) =>
        {
            int expected = Convert.ToInt32(frameCount);

            if (dataItems.IsCompleted && lastSampleArray == null && lastIndex == 0)
            {
                // The buffer of the final callback is still played, so it must
                // not be left uninitialised.
                WriteSilence(output, expected);
                Console.WriteLine($"Finished playing");
                playbackDone.Set();
                return StreamCallbackResult.Complete;
            }

            int written = 0;
            while (written < expected)
            {
                if (lastSampleArray == null)
                {
                    // Never block inside the audio callback: stop as soon as the
                    // queue is empty and pad the rest with silence below.
                    if (!dataItems.TryTake(out lastSampleArray))
                    {
                        break;
                    }

                    lastIndex = 0;
                }

                int count = Math.Min(lastSampleArray.Length - lastIndex, expected - written);

                // Copy straight from the chunk at its current offset; no temporary array.
                Marshal.Copy(lastSampleArray, lastIndex, IntPtr.Add(output, written * sizeof(float)), count);
                written += count;
                lastIndex += count;

                if (lastIndex == lastSampleArray.Length)
                {
                    lastSampleArray = null;
                    lastIndex = 0;
                }
            }

            // Generation is slower than playback at this moment: pad with silence.
            WriteSilence(IntPtr.Add(output, written * sizeof(float)), expected - written);

            return StreamCallbackResult.Continue;
        };

        PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: null, outParams: param, sampleRate: tts.SampleRate,
            framesPerBuffer: 0,
            streamFlags: StreamFlags.ClipOff,
            callback: playCallback,
            userData: IntPtr.Zero
            );

        stream.Start();

        OfflineTtsCallback callback = new OfflineTtsCallback(onSamplesGenerated);

        OfflineTtsGeneratedAudio audio = tts.GenerateWithCallback(options.Text, speed, sid, callback);
        bool ok = audio.SaveToWaveFile(options.OutputFilename);

        if (ok)
        {
            Console.WriteLine($"Wrote to {options.OutputFilename} succeeded!");
        }
        else
        {
            Console.WriteLine($"Failed to write {options.OutputFilename}");
        }

        // No more chunks will arrive; the audio callback finishes once the queue is drained.
        dataItems.CompleteAdding();

        playbackDone.Wait();

        // Stop() lets the buffers that are already queued in the device finish
        // playing before the stream and PortAudio are released.
        stream.Stop();
        stream.Dispose();
        PortAudio.Terminate();
    }
}
0 commit comments