Skip to content

Commit 6571fc9

Browse files
authored
Add tts play example for .Net. (#676)
It plays the generated audio through a speaker while it is being generated.
1 parent ce60100 commit 6571fc9

12 files changed

+371
-14
lines changed

.github/workflows/build-wheels-aarch64.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
matrix:
2525
os: [ubuntu-latest]
2626
python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312"]
27-
manylinux: [manylinux2014, manylinux_2_28]
27+
manylinux: [manylinux2014] #, manylinux_2_28]
2828

2929
steps:
3030
- uses: actions/checkout@v4

.github/workflows/build-wheels-linux.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
matrix:
2525
os: [ubuntu-latest]
2626
python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312"]
27-
manylinux: [manylinux2014, manylinux_2_28]
27+
manylinux: [manylinux2014] #, manylinux_2_28]
2828

2929

3030
steps:

.github/workflows/build-wheels-win32.yaml

-2
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,6 @@ jobs:
4343
run: |
4444
ls -lh ./wheelhouse/
4545
46-
ls -lh ./wheelhouse/*.whl
47-
4846
- uses: actions/upload-artifact@v4
4947
with:
5048
name: wheel-${{ matrix.python-version }}

dotnet-examples/README.md

+10
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,14 @@ Please refer to the documentation
66
https://k2-fsa.github.io/sherpa/onnx/csharp-api/index.html
77
for details.
88

9+
```bash
10+
dotnet new console -n offline-tts-play
11+
dotnet sln ./sherpa-onnx.sln add ./offline-tts-play
12+
```
13+
14+
```bash
15+
dotnet nuget locals all --list
16+
dotnet nuget locals all --clear
17+
```
18+
919
[sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
run-piper.sh
+285
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
// Copyright (c) 2024 Xiaomi Corporation
2+
//
3+
// This file shows how to use a non-streaming TTS model for text-to-speech
4+
// Please refer to
5+
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
6+
// and
7+
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
8+
// to download pre-trained models
9+
//
10+
// Note that you need a speaker to run this file since it will play
11+
// the generated audio while it is being generated.
12+
13+
using CommandLine.Text;
14+
using CommandLine;
15+
using PortAudioSharp;
16+
using SherpaOnnx;
17+
using System.Collections.Concurrent;
18+
using System.Collections.Generic;
19+
using System.Runtime.InteropServices;
20+
using System.Threading;
21+
using System;
22+
23+
class OfflineTtsPlayDemo
24+
{
25+
class Options
26+
{
27+
28+
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
29+
public string RuleFsts { get; set; }
30+
31+
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
32+
public string DataDir { get; set; }
33+
34+
[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
35+
public float LengthScale { get; set; }
36+
37+
[Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
38+
public float NoiseScale { get; set; }
39+
40+
[Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")]
41+
public float NoiseScaleW { get; set; }
42+
43+
[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
44+
public string Lexicon { get; set; }
45+
46+
[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
47+
public string Tokens { get; set; }
48+
49+
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
50+
public int MaxNumSentences { get; set; }
51+
52+
[Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
53+
public int Debug { get; set; }
54+
55+
[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
56+
public string Model { get; set; }
57+
58+
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
59+
public int SpeakerId { get; set; }
60+
61+
[Option("text", Required = true, HelpText = "Text to synthesize")]
62+
public string Text { get; set; }
63+
64+
[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
65+
public string OutputFilename { get; set; }
66+
}
67+
68+
static void Main(string[] args)
69+
{
70+
var parser = new CommandLine.Parser(with => with.HelpWriter = null);
71+
var parserResult = parser.ParseArguments<Options>(args);
72+
73+
parserResult
74+
.WithParsed<Options>(options => Run(options))
75+
.WithNotParsed(errs => DisplayHelp(parserResult, errs));
76+
}
77+
78+
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
79+
{
80+
string usage = @"
81+
# vits-aishell3
82+
83+
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
84+
tar xf vits-zh-aishell3.tar.bz2
85+
86+
dotnet run \
87+
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
88+
--vits-tokens=./vits-zh-aishell3/tokens.txt \
89+
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
90+
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
91+
--sid=66 \
92+
--debug=1 \
93+
--output-filename=./aishell3-66.wav \
94+
--text=这是一个语音合成测试
95+
96+
# Piper models
97+
98+
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
99+
tar xf vits-piper-en_US-amy-low.tar.bz2
100+
101+
dotnet run \
102+
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
103+
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
104+
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
105+
--debug=1 \
106+
--output-filename=./amy.wav \
107+
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
108+
109+
Please refer to
110+
https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
111+
to download more models.
112+
";
113+
114+
var helpText = HelpText.AutoBuild(result, h =>
115+
{
116+
h.AdditionalNewLineAfterOption = false;
117+
h.Heading = usage;
118+
h.Copyright = "Copyright (c) 2024 Xiaomi Corporation";
119+
return HelpText.DefaultParsingErrorsHandler(result, h);
120+
}, e => e);
121+
Console.WriteLine(helpText);
122+
}
123+
124+
125+
private static void Run(Options options)
126+
{
127+
OfflineTtsConfig config = new OfflineTtsConfig();
128+
config.Model.Vits.Model = options.Model;
129+
config.Model.Vits.Lexicon = options.Lexicon;
130+
config.Model.Vits.Tokens = options.Tokens;
131+
config.Model.Vits.DataDir = options.DataDir;
132+
config.Model.Vits.NoiseScale = options.NoiseScale;
133+
config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
134+
config.Model.Vits.LengthScale = options.LengthScale;
135+
config.Model.NumThreads = 1;
136+
config.Model.Debug = options.Debug;
137+
config.Model.Provider = "cpu";
138+
config.RuleFsts = options.RuleFsts;
139+
config.MaxNumSentences = options.MaxNumSentences;
140+
141+
OfflineTts tts = new OfflineTts(config);
142+
float speed = 1.0f / options.LengthScale;
143+
int sid = options.SpeakerId;
144+
145+
146+
Console.WriteLine(PortAudio.VersionInfo.versionText);
147+
PortAudio.Initialize();
148+
Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");
149+
150+
for (int i = 0; i != PortAudio.DeviceCount; ++i)
151+
{
152+
Console.WriteLine($" Device {i}");
153+
DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
154+
Console.WriteLine($" Name: {deviceInfo.name}");
155+
Console.WriteLine($" Max output channels: {deviceInfo.maxOutputChannels}");
156+
Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}");
157+
}
158+
int deviceIndex = PortAudio.DefaultOutputDevice;
159+
if (deviceIndex == PortAudio.NoDevice)
160+
{
161+
Console.WriteLine("No default output device found. Please use ../offline-tts instead");
162+
Environment.Exit(1);
163+
}
164+
165+
DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
166+
Console.WriteLine();
167+
Console.WriteLine($"Use output default device {deviceIndex} ({info.name})");
168+
169+
StreamParameters param = new StreamParameters();
170+
param.device = deviceIndex;
171+
param.channelCount = 1;
172+
param.sampleFormat = SampleFormat.Float32;
173+
param.suggestedLatency = info.defaultLowOutputLatency;
174+
param.hostApiSpecificStreamInfo = IntPtr.Zero;
175+
176+
// https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview
177+
BlockingCollection<float[]> dataItems = new BlockingCollection<float[]>();
178+
179+
var MyCallback = (IntPtr samples, int n) =>
180+
{
181+
float[] data = new float[n];
182+
183+
Marshal.Copy(samples, data, 0, n);
184+
185+
dataItems.Add(data);
186+
};
187+
188+
bool playFinished = false;
189+
190+
float[] lastSampleArray = null;
191+
int lastIndex = 0; // not played
192+
193+
PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output,
194+
UInt32 frameCount,
195+
ref StreamCallbackTimeInfo timeInfo,
196+
StreamCallbackFlags statusFlags,
197+
IntPtr userData
198+
) =>
199+
{
200+
if (dataItems.IsCompleted && lastSampleArray == null && lastIndex == 0)
201+
{
202+
Console.WriteLine($"Finished playing");
203+
playFinished = true;
204+
return StreamCallbackResult.Complete;
205+
}
206+
207+
int expected = Convert.ToInt32(frameCount);
208+
int i = 0;
209+
210+
while ((lastSampleArray != null || dataItems.Count != 0) && (i < expected))
211+
{
212+
int needed = expected - i;
213+
214+
if (lastSampleArray != null)
215+
{
216+
int remaining = lastSampleArray.Length - lastIndex;
217+
if (remaining >= needed)
218+
{
219+
float[] this_block = lastSampleArray.Skip(lastIndex).Take(needed).ToArray();
220+
lastIndex += needed;
221+
if (lastIndex == lastSampleArray.Length)
222+
{
223+
lastSampleArray = null;
224+
lastIndex = 0;
225+
}
226+
227+
Marshal.Copy(this_block, 0, IntPtr.Add(output, i * sizeof(float)), needed);
228+
return StreamCallbackResult.Continue;
229+
}
230+
231+
float[] this_block2 = lastSampleArray.Skip(lastIndex).Take(remaining).ToArray();
232+
lastIndex = 0;
233+
lastSampleArray = null;
234+
235+
Marshal.Copy(this_block2, 0, IntPtr.Add(output, i * sizeof(float)), remaining);
236+
i += remaining;
237+
continue;
238+
}
239+
240+
if (dataItems.Count != 0)
241+
{
242+
lastSampleArray = dataItems.Take();
243+
lastIndex = 0;
244+
}
245+
}
246+
247+
if (i < expected)
248+
{
249+
int sizeInBytes = (expected - i) * 4;
250+
Marshal.Copy(new byte[sizeInBytes], 0, IntPtr.Add(output, i * sizeof(float)), sizeInBytes);
251+
}
252+
253+
return StreamCallbackResult.Continue;
254+
};
255+
256+
PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: null, outParams: param, sampleRate: tts.SampleRate,
257+
framesPerBuffer: 0,
258+
streamFlags: StreamFlags.ClipOff,
259+
callback: playCallback,
260+
userData: IntPtr.Zero
261+
);
262+
263+
stream.Start();
264+
265+
OfflineTtsCallback callback = new OfflineTtsCallback(MyCallback);
266+
267+
OfflineTtsGeneratedAudio audio = tts.GenerateWithCallback(options.Text, speed, sid, callback);
268+
bool ok = audio.SaveToWaveFile(options.OutputFilename);
269+
270+
if (ok)
271+
{
272+
Console.WriteLine($"Wrote to {options.OutputFilename} succeeded!");
273+
}
274+
else
275+
{
276+
Console.WriteLine($"Failed to write {options.OutputFilename}");
277+
}
278+
dataItems.CompleteAdding();
279+
280+
while (!playFinished)
281+
{
282+
Thread.Sleep(100); // 100ms
283+
}
284+
}
285+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net6.0</TargetFramework>
6+
<RootNamespace>offline_tts_play</RootNamespace>
7+
<ImplicitUsings>enable</ImplicitUsings>
8+
<Nullable>enable</Nullable>
9+
</PropertyGroup>
10+
11+
<PropertyGroup>
12+
<RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
13+
</PropertyGroup>
14+
15+
<ItemGroup>
16+
<PackageReference Include="CommandLineParser" Version="2.9.1" />
17+
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
18+
<PackageReference Include="PortAudioSharp2" Version="*" />
19+
</ItemGroup>
20+
21+
</Project>

dotnet-examples/sherpa-onnx.sln

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speech-recognition-from-mic
1111
EndProject
1212
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts", "offline-tts\offline-tts.csproj", "{72196886-7143-4043-96E2-BCACEC6C79EB}"
1313
EndProject
14+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline-tts-play\offline-tts-play.csproj", "{40781464-5948-462B-BA4B-98932711513F}"
15+
EndProject
1416
Global
1517
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1618
Debug|Any CPU = Debug|Any CPU
@@ -36,5 +38,9 @@ Global
3638
{72196886-7143-4043-96E2-BCACEC6C79EB}.Debug|Any CPU.Build.0 = Debug|Any CPU
3739
{72196886-7143-4043-96E2-BCACEC6C79EB}.Release|Any CPU.ActiveCfg = Release|Any CPU
3840
{72196886-7143-4043-96E2-BCACEC6C79EB}.Release|Any CPU.Build.0 = Release|Any CPU
41+
{40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
42+
{40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.Build.0 = Debug|Any CPU
43+
{40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.ActiveCfg = Release|Any CPU
44+
{40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.Build.0 = Release|Any CPU
3945
EndGlobalSection
4046
EndGlobal
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net6.0</TargetFramework>
6+
<RootNamespace>offline_tts_play</RootNamespace>
7+
<ImplicitUsings>enable</ImplicitUsings>
8+
<Nullable>enable</Nullable>
9+
</PropertyGroup>
10+
11+
<PropertyGroup>
12+
<RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
13+
</PropertyGroup>
14+
15+
<ItemGroup>
16+
<PackageReference Include="CommandLineParser" Version="2.9.1" />
17+
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
18+
<PackageReference Include="PortAudioSharp2" Version="*" />
19+
</ItemGroup>
20+
21+
</Project>

0 commit comments

Comments
 (0)