Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional language parameter to synthesize methods. #34

Merged
merged 1 commit into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion demo/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async def writer_task(conn, prompt):

async def main(args):
speech = Speech(os.getenv('LMNT_API_KEY'))
conn = await speech.synthesize_streaming(VOICE_ID, return_extras=False)
conn = await speech.synthesize_streaming(VOICE_ID, return_extras=False, language=args.language)

t1 = asyncio.create_task(reader_task(conn))
t2 = asyncio.create_task(writer_task(conn, args.prompt))
Expand All @@ -51,4 +51,5 @@ async def main(args):
if __name__ == '__main__':
    # Command-line entry point: an optional positional prompt plus an optional
    # language code, both handed to main() via the parsed namespace.
    cli = ArgumentParser()
    cli.add_argument('prompt', nargs='?', default=DEFAULT_PROMPT)
    cli.add_argument('-l', '--language', default='en', required=False, help='Language code')
    cli_args = cli.parse_args()
    asyncio.run(main(cli_args))
3 changes: 2 additions & 1 deletion demo/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ async def main(args):
print(account)

# Synthesize text to speech.
synthesize = await s.synthesize(text=args.text, voice=args.voice)
synthesize = await s.synthesize(text=args.text, voice=args.voice, language=args.language)
with open('output.mp3', 'wb') as f:
f.write(synthesize['audio'])
print('Done.')
Expand All @@ -27,5 +27,6 @@ async def main(args):
parser = argparse.ArgumentParser(description='Synthesize text to speech using LMNT API')
parser.add_argument('-t', '--text', required=False, default='This is a test of the LMNT API.', help='Text to synthesize')
parser.add_argument('-v', '--voice', required=False, default='lily', help='Voice to use')
parser.add_argument('-l', '--language', required=False, default='en', help='Language code')
args = parser.parse_args()
asyncio.run(main(args))
7 changes: 6 additions & 1 deletion src/lmnt/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ async def synthesize(self, text: str, voice: str, **kwargs):
- `speed` (float): Floating point value between 0.25 (slow) and 2.0 (fast); Defaults to 1.0
- `return_durations` (bool): If `True`, the response will include word durations detail. Defaults to `False`.
- `return_seed` (bool): If `True`, the response will include the seed used for synthesis. Defaults to `False`.
- `language` (str): The desired language of the synthesized speech. Two-letter ISO 639-1 code. Defaults to `en`.
- `length` (int): The desired target length of the output speech in seconds. Maximum 300.0 (5 minutes)

Deprecated parameters:
Expand Down Expand Up @@ -362,7 +363,8 @@ async def synthesize(self, text: str, voice: str, **kwargs):
if return_durations is True:
form_data.add_field('return_durations', 'true')
return_seed = kwargs.get('return_seed', False)

if 'language' in kwargs:
form_data.add_field('language', kwargs.get('language'))
async with self._session.post(url, data=form_data, headers=self._build_headers()) as resp:
await self._handle_response_errors(resp, 'Speech.synthesize')
response_data = await resp.json()
Expand All @@ -384,6 +386,7 @@ async def synthesize_streaming(self, voice: str, return_extras: bool = False, **
- `voice` (str): The voice id to use for this connection.
- `speed` (float): The speed to use for synthesis. Defaults to 1.0.
- `return_extras` (bool): If `True`, the response will include word durations detail. Defaults to `False`.
- `language` (str): The desired language of the synthesized speech. Two-letter ISO 639-1 code. Defaults to `en`.

Returns:
- `StreamingSynthesisConnection`: The streaming connection object.
Expand All @@ -406,6 +409,8 @@ async def synthesize_streaming(self, voice: str, return_extras: bool = False, **
if 'expressive' in kwargs:
init_msg['expressive'] = kwargs['expressive']
init_msg['send_extras'] = return_extras
if 'language' in kwargs:
init_msg['language'] = kwargs['language']
ws = await self._session.ws_connect(f'{self._base_url}{_SYNTHESIZE_STREAMING_ENDPOINT}')
await ws.send_str(json.dumps(init_msg))
return StreamingSynthesisConnection(ws, return_extras)
Expand Down
14 changes: 14 additions & 0 deletions test/integration/smoke_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,20 @@ async def test_synthesize(api: Speech):
assert isinstance(result['audio'], bytes)


@pytest.mark.asyncio
async def test_synthesize__non_en_language(api: Speech):
    """Synthesize a short text with a non-English language code.

    Calls `api.synthesize` with `language='pt'` and checks that the response
    carries non-empty audio bytes and none of the optional extras.
    """
    response = await api.synthesize(text='Example Text', voice='lily', language='pt')

    assert response is not None
    assert 'audio' in response
    # Durations and seed were not requested, so they must be absent.
    for extra_key in ('durations', 'seed'):
        assert extra_key not in response

    audio = response['audio']
    assert len(audio) > 0
    assert isinstance(audio, bytes)


@pytest.mark.asyncio
async def test_synthesize_with_empty_voice(api: Speech):
voice = ''
Expand Down
19 changes: 17 additions & 2 deletions test/unit/test_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,19 @@ async def test_synthesize_return_durations_and_seed(api):
assert synthesis_result == {'audio': base64.b64decode(mock_response['audio']), 'durations': mock_response['durations'], 'seed': mock_response['seed']}


@pytest.mark.asyncio
async def test_synthesize__non_en_language(api):
    """Check `synthesize` returns only decoded audio when a language is passed.

    The mocked HTTP response also contains `durations` and `seed`; since the
    call does not request them, the result is expected to hold only `audio`.

    NOTE(review): this verifies the shape of the return value only; it does
    not assert that `language` was included in the outgoing request.
    """
    mock_response = {'audio': MOCK_AUDIO, 'durations': [], 'seed': 'random_seed'}

    # Response object yielded by `async with session.post(...) as resp`.
    mocked_resp = api._session.post.return_value.__aenter__.return_value
    mocked_resp.json = AsyncMock(return_value=mock_response)
    mocked_resp.status = 200

    synthesis_result = await api.synthesize('Hello, world!', 'Voice1', language='pt')

    expected_audio = base64.b64decode(mock_response['audio'])
    assert synthesis_result == {'audio': expected_audio}


@pytest.mark.asyncio
async def test_synthesize_no_text(api):
with pytest.raises(AssertionError):
Expand Down Expand Up @@ -113,12 +126,13 @@ async def test_synthesize_streaming(api):
speed = 1.5
expressive = 0.8
return_extras = True
language = 'pt'

mock_ws = AsyncMock()
api._session = AsyncMock()
api._session.ws_connect.return_value = mock_ws

connection = await api.synthesize_streaming(voice, return_extras=return_extras, speed=speed, expressive=expressive)
connection = await api.synthesize_streaming(voice, return_extras=return_extras, speed=speed, expressive=expressive, language=language)

assert isinstance(connection, StreamingSynthesisConnection)
api._session.ws_connect.assert_called_once_with(f'{api._base_url}{_SYNTHESIZE_STREAMING_ENDPOINT}')
Expand All @@ -127,7 +141,8 @@ async def test_synthesize_streaming(api):
'voice': voice,
'speed': speed,
'expressive': expressive,
'send_extras': return_extras
'send_extras': return_extras,
'language': language
}))


Expand Down