generated from royshil/obs-plugintemplate
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from locaal-ai/roy.revai_provider
Rev AI provider
- Loading branch information
Showing
13 changed files
with
951 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#include "deepgram-provider.h" | ||
#include <nlohmann/json.hpp> | ||
|
||
#include "language-codes/language-codes.h" | ||
|
||
using json = nlohmann::json; | ||
|
||
namespace http = beast::http; | ||
|
||
// Construct the Deepgram provider: initialize the io_context, a TLS-1.2
// client context, the resolver, and the websocket stream layered over SSL.
// No network activity happens here — the connection is opened in init().
DeepgramProvider::DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_)
	: CloudProvider(callback, gf_),
	  ioc(),
	  ssl_ctx(ssl::context::tlsv12_client),
	  resolver(ioc),
	  ws(ioc, ssl_ctx)
{
	// Deepgram pushes results asynchronously over the socket, so the base
	// class must run readResultsFromTranscription() on a dedicated thread.
	needs_results_thread = true; // We need a separate thread for reading results
}
|
||
bool DeepgramProvider::init() | ||
{ | ||
try { | ||
// Setup SSL context | ||
ssl_ctx.set_verify_mode(ssl::verify_peer); | ||
ssl_ctx.set_default_verify_paths(); | ||
|
||
// Resolve the Deepgram endpoint | ||
auto const results = resolver.resolve("api.deepgram.com", "443"); | ||
|
||
// Connect to Deepgram | ||
net::connect(get_lowest_layer(ws), results); | ||
|
||
// Set SNI hostname (required for TLS) | ||
if (!SSL_set_tlsext_host_name(ws.next_layer().native_handle(), | ||
"api.deepgram.com")) { | ||
throw beast::system_error( | ||
beast::error_code(static_cast<int>(::ERR_get_error()), | ||
net::error::get_ssl_category()), | ||
"Failed to set SNI hostname"); | ||
} | ||
|
||
// Perform SSL handshake | ||
ws.next_layer().handshake(ssl::stream_base::client); | ||
|
||
// Set up WebSocket handshake with API key | ||
ws.set_option( | ||
websocket::stream_base::decorator([this](websocket::request_type &req) { | ||
req.set(http::field::sec_websocket_protocol, | ||
"token, " + std::string(gf->cloud_provider_api_key)); | ||
})); | ||
|
||
std::string query = std::string("/v1/listen?encoding=linear16&sample_rate=16000") + | ||
"&language=" + language_codes_from_underscore[gf->language]; | ||
// Perform WebSocket handshake | ||
ws.handshake("api.deepgram.com", query); | ||
|
||
obs_log(LOG_INFO, "Connected to Deepgram WebSocket successfully"); | ||
return true; | ||
} catch (std::exception const &e) { | ||
obs_log(LOG_ERROR, "Error initializing Deepgram connection: %s", e.what()); | ||
return false; | ||
} | ||
} | ||
|
||
void DeepgramProvider::sendAudioBufferToTranscription(const std::deque<float> &audio_buffer) | ||
{ | ||
if (audio_buffer.empty()) | ||
return; | ||
|
||
try { | ||
// Convert float audio to int16_t (linear16 format) | ||
std::vector<int16_t> pcm_data; | ||
pcm_data.reserve(audio_buffer.size()); | ||
|
||
for (float sample : audio_buffer) { | ||
// Clamp and convert to int16 | ||
float clamped = std::max(-1.0f, std::min(1.0f, sample)); | ||
pcm_data.push_back(static_cast<int16_t>(clamped * 32767.0f)); | ||
} | ||
|
||
// Send binary message | ||
ws.write(net::buffer(pcm_data.data(), pcm_data.size() * sizeof(int16_t))); | ||
|
||
} catch (std::exception const &e) { | ||
obs_log(LOG_ERROR, "Error sending audio to Deepgram: %s", e.what()); | ||
running = false; | ||
} | ||
} | ||
|
||
void DeepgramProvider::readResultsFromTranscription() | ||
{ | ||
try { | ||
// Read message into buffer | ||
beast::flat_buffer buffer; | ||
ws.read(buffer); | ||
|
||
// Convert to string and parse JSON | ||
std::string msg = beast::buffers_to_string(buffer.data()); | ||
json result = json::parse(msg); | ||
|
||
// Check if this is a transcription result | ||
if (result["type"] == "Results" && !result["channel"]["alternatives"].empty()) { | ||
DetectionResultWithText detection_result; | ||
|
||
// Fill the detection result structure | ||
detection_result.text = result["channel"]["alternatives"][0]["transcript"]; | ||
detection_result.result = result["is_final"] ? DETECTION_RESULT_SPEECH | ||
: DETECTION_RESULT_PARTIAL; | ||
|
||
// If there are words with timestamps | ||
if (!result["channel"]["alternatives"][0]["words"].empty()) { | ||
auto &words = result["channel"]["alternatives"][0]["words"]; | ||
detection_result.start_timestamp_ms = words[0]["start"]; | ||
detection_result.end_timestamp_ms = words[words.size() - 1]["end"]; | ||
} | ||
|
||
// Send result through callback | ||
transcription_callback(detection_result); | ||
} | ||
} catch (std::exception const &e) { | ||
obs_log(LOG_ERROR, "Error reading from Deepgram: %s", e.what()); | ||
} | ||
} | ||
|
||
void DeepgramProvider::shutdown() | ||
{ | ||
try { | ||
// Send close message | ||
ws.write(net::buffer(R"({"type":"CloseStream"})")); | ||
|
||
// Close WebSocket connection | ||
ws.close(websocket::close_code::normal); | ||
|
||
obs_log(LOG_INFO, "Deepgram connection closed successfully"); | ||
} catch (std::exception const &e) { | ||
obs_log(LOG_ERROR, "Error during Deepgram shutdown: %s", e.what()); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#pragma once | ||
|
||
#include <boost/beast/core.hpp> | ||
#include <boost/beast/websocket.hpp> | ||
#include <boost/asio/strand.hpp> | ||
#include <boost/beast/core/tcp_stream.hpp> | ||
#include <boost/beast/ssl.hpp> | ||
#include "cloud-providers/cloud-provider.h" | ||
|
||
namespace beast = boost::beast; | ||
namespace websocket = beast::websocket; | ||
namespace net = boost::asio; | ||
namespace ssl = boost::asio::ssl; | ||
using tcp = boost::asio::ip::tcp; | ||
|
||
// Cloud transcription provider backed by Deepgram's real-time streaming API.
// Audio is sent as linear16 PCM over a TLS websocket to
// api.deepgram.com/v1/listen; results are read on a separate results thread.
class DeepgramProvider : public CloudProvider {
public:
	DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_);
	// Resolve, connect and perform the TLS + websocket handshakes.
	bool init() override;

protected:
	// Convert float samples to int16 PCM and send one binary frame.
	void sendAudioBufferToTranscription(const std::deque<float> &audio_buffer) override;
	// Blocking read of one server message; forwards transcripts via callback.
	void readResultsFromTranscription() override;
	// Send a CloseStream message, then close the websocket.
	void shutdown() override;

private:
	net::io_context ioc;    // I/O context owning the socket
	ssl::context ssl_ctx;   // TLS 1.2 client context
	tcp::resolver resolver; // DNS resolver for the API host
	websocket::stream<beast::ssl_stream<tcp::socket>> ws; // wss stream
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
Transcribe - Live audio | ||
Use Deepgram's speech-to-text API to transcribe live-streaming audio. | ||
|
||
Deepgram provides its customers with real-time, streaming transcription via its streaming endpoints. These endpoints are high-performance, full-duplex services running over the WebSocket protocol. | ||
|
||
To learn more about working with real-time streaming data and results, see Get Started with Streaming Audio. | ||
|
||
Endpoint | ||
Production WebSocket server for Deepgram's real-time transcription with streaming audio. TLS encryption will protect your connection and data. We support a minimum of TLS 1.2. | ||
|
||
Detail Description | ||
Path wss://api.deepgram.com/v1/listen | ||
Accepts | ||
Type Description | ||
Raw Audio File Data Unprocessed or uncompressed binary audio data (such as PCM) | ||
Messages JSON formatted operations. | ||
Headers | ||
Header Value Description | ||
Sec-WebSocket-Protocol token, <DEEPGRAM_API_KEY> Used to establish a WebSocket connection with a specific protocol, include your Deepgram API key for authentication. | ||
Body Params | ||
Parameter Type Description | ||
callback string Callback URL to provide if you would like your submitted audio to be processed asynchronously. Learn More. | ||
callback_method string Enable a callback method. Use put or post. Learn More. | ||
channels int32 Number of independent audio channels contained in submitted streaming audio. Only read when a value is provided for encoding. Learn More. | ||
dictation boolean Dictation automatically formats spoken commands for punctuation into their respective punctuation marks. Learn More. | ||
diarize boolean Indicates whether to recognize speaker changes. When set to true, each word in the transcript will be assigned a speaker number starting at 0. Learn More. | ||
diarize_version string Indicates the version of the diarization feature to use. Only available when the diarization feature is enabled. Learn More. | ||
encoding string Expected encoding of the submitted streaming audio. If this parameter is set, sample_rate must also be specified. Learn More. | ||
endpointing boolean Indicates how long Deepgram will wait to detect whether a speaker has finished speaking or pauses for a significant period of time. When set to true, the streaming endpoint immediately finalizes the transcription for the processed time range and returns the transcript with a speech_final parameter set to true. Learn More. | ||
extra string Add any extra key-value pairs to the query string to customize the response. Learn More. | ||
filler_words boolean Indicates whether to include filler words like "uh" and "um" in transcript output. When set to true, these words will be included. Learn More. | ||
interim_results boolean Specifies whether the streaming endpoint should provide ongoing transcription updates as more audio is received. When set to true, the endpoint sends continuous updates, meaning transcription results may evolve over time. Learn More. | ||
keywords string Unique proper nouns or specialized terms you want the model to include in its predictions, which aren't part of the model's default vocabulary. Learn More. | ||
language string The BCP-47 language tag that hints at the primary spoken language. Learn More. | ||
model string The AI model used to process submitted audio. Learn More. | ||
multichannel boolean Indicates whether to transcribe each audio channel independently. Learn More. | ||
numerals boolean Indicates whether to convert numbers from written format (e.g., one) to numerical format (e.g., 1). Learn More. | ||
profanity_filter boolean Indicates whether to remove profanity from the transcript. Learn More. | ||
punctuate boolean Indicates whether to add punctuation and capitalization to the transcript Learn More. | ||
redact string Indicates whether to redact sensitive information, replacing redacted content with asterisks *. Learn More. | ||
replace string Terms or phrases to search for in the submitted audio and replace. Learn More. | ||
sample_rate int32 Sample rate of submitted streaming audio. Required (and only read) when a value is provided for encoding. Learn More. | ||
search string Terms or phrases to search for in the submitted audio. Learn More. | ||
smart_format boolean Indicates whether to apply formatting to transcript output. When set to true, additional formatting will be applied to transcripts to improve readability. Learn More. | ||
tag string Set a tag to associate with the request. Learn More. | ||
utterance_end_ms string Indicates how long Deepgram will wait to send a {"type": "UtteranceEnd"} message after a word has been transcribed. Learn More. | ||
vad_events boolean Indicates that speech has started. {"type": "SpeechStarted"} You'll begin receiving messages upon speech starting. Learn More. | ||
version string Version of the model to use. Learn More. | ||
Sending Audio Data | ||
All audio data is transmitted to the streaming endpoint as binary WebSocket messages, with payloads containing the raw audio data. The full-duplex protocol allows for real-time streaming, enabling you to receive transcription responses simultaneously as you upload data. For optimal performance, each streaming buffer should represent between 20 and 250 milliseconds of audio. | ||
|
||
Messages | ||
Keep Alive | ||
Optional | ||
|
||
Periodically sending KeepAlive messages while streaming can ensure uninterrupted communication and minimize costs. Learn More.
|
||
JSON | ||
|
||
{ | ||
"type": "KeepAlive" | ||
} | ||
Finalize | ||
Optional | ||
|
||
Finalize message can be used to handle specific scenarios where you need to force the server to process all unprocessed audio data and immediately return the final results. Learn More. | ||
|
||
JSON | ||
|
||
{ | ||
"type": "Finalize" | ||
} | ||
Close Stream | ||
Optional | ||
|
||
The CloseStream message can be sent to the Deepgram server, instructing it to close the connection. Learn More.
|
||
JSON | ||
|
||
{ | ||
"type": "CloseStream" | ||
} | ||
Responses | ||
Refer to API Errors for more information. | ||
|
||
Status Description | ||
200 Audio submitted for transcription. | ||
400 Bad Request. | ||
401 Invalid Authorization. | ||
402 Payment Required, insufficient credits | ||
403 Insufficient permissions. | ||
503 Internal server error if the server is temporarily unable to serve requests. | ||
Response Schema | ||
JSON | ||
|
||
{ | ||
"metadata": { | ||
"transaction_key": "string", | ||
"request_id": "uuid", | ||
"sha256": "string", | ||
"created": "string", | ||
"duration": 0, | ||
"channels": 0, | ||
"models": [ | ||
"string" | ||
], | ||
}, | ||
"type": "Results", | ||
"channel_index": [ | ||
0, | ||
0 | ||
], | ||
"duration": 0.0, | ||
"start": 0.0, | ||
"is_final": boolean, | ||
"speech_final": boolean, | ||
"channel": { | ||
"alternatives": [ | ||
{ | ||
"transcript": "string", | ||
"confidence": 0, | ||
"words": [ | ||
{ | ||
"word": "string", | ||
"start": 0, | ||
"end": 0, | ||
"confidence": 0 | ||
} | ||
] | ||
} | ||
], | ||
"search": [ | ||
{ | ||
"query": "string", | ||
"hits": [ | ||
{ | ||
"confidence": 0, | ||
"start": 0, | ||
"end": 0, | ||
"snippet": "string" | ||
} | ||
] | ||
} | ||
] | ||
} | ||
} | ||
Errors & Warnings | ||
If Deepgram encounters an error during real-time streaming, we will return a WebSocket Close frame. The body of the Close frame will indicate the reason for closing using one of the specification’s pre-defined status codes followed by a UTF-8-encoded payload that represents the reason for the error. | ||
|
||
Current codes and payloads in use include: | ||
|
||
Code Payload Description | ||
1000 N/A Normal Closure | ||
1008 DATA-0000 The payload cannot be decoded as audio. Either the encoding is incorrectly specified, the payload is not audio data, or the audio is in a format unsupported by Deepgram. | ||
1011 NET-0000 The service has not transmitted a Text frame to the client within the timeout window. This may indicate an issue internally in Deepgram's systems or could be due to Deepgram not receiving enough audio data to transcribe a frame. | ||
1011 NET-0001 The service has not received a Binary or Text frame from the client within the timeout window. This may indicate an internal issue in Deepgram's systems, the client's systems, or the network connecting them. |
Oops, something went wrong.