Commit 993f2ef

Merge pull request #3 from locaal-ai/roy.revai_provider

Rev AI provider

royshil authored Dec 19, 2024
2 parents fd27e65 + 3b1b163 commit 993f2ef

Showing 13 changed files with 951 additions and 2 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/build-project.yaml
@@ -279,6 +279,14 @@ jobs:
"pluginName=${ProductName}" >> $env:GITHUB_OUTPUT
"pluginVersion=${ProductVersion}" >> $env:GITHUB_OUTPUT
- uses: actions/cache@v4
id: conan-cache
with:
path: C:\Users\runneradmin\.conan2\
key: ${{ runner.os }}-conan-cache-${{ needs.check-event.outputs.config }}
restore-keys: |
${{ runner.os }}-conan-cache-
- name: Build Plugin 🧱
uses: ./.github/actions/build-plugin
with:
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -64,7 +64,9 @@ target_sources(
src/language-codes/language-codes.cpp
src/cloud-providers/cloud-provider.cpp
src/cloud-providers/clova/clova-provider.cpp
src/cloud-providers/deepgram/deepgram-provider.cpp
src/cloud-providers/google/google-provider.cpp
src/cloud-providers/revai/revai-provider.cpp
src/utils/ssl-utils.cpp
src/utils/curl-helper.cpp
src/timed-metadata/timed-metadata-utils.cpp)
1 change: 0 additions & 1 deletion cmake/common/compiler_common.cmake
@@ -22,7 +22,6 @@ set(_obs_clang_c_options
# cmake-format: sortable
-fno-strict-aliasing
-Wbool-conversion
-Wcomma
-Wconstant-conversion
-Wdeprecated-declarations
-Wempty-body
2 changes: 1 addition & 1 deletion cmake/macos/xcode.cmake
@@ -122,7 +122,7 @@ set(CMAKE_XCODE_ATTRIBUTE_ENABLE_STRICT_OBJC_MSGSEND YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING YES_ERROR)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_BOOL_CONVERSION YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_COMMA YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_COMMA NO)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_CONSTANT_CONVERSION YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_EMPTY_BODY YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_ENUM_CONVERSION YES)
6 changes: 6 additions & 0 deletions src/cloud-providers/cloud-provider.cpp
@@ -3,6 +3,8 @@
#include "clova/clova-provider.h"
#include "google/google-provider.h"
#include "aws/aws_provider.h"
#include "revai/revai-provider.h"
#include "deepgram/deepgram-provider.h"

std::shared_ptr<CloudProvider> createCloudProvider(const std::string &providerType,
CloudProvider::TranscriptionCallback callback,
@@ -14,6 +16,10 @@ std::shared_ptr<CloudProvider> createCloudProvider(const std::string &providerTy
return std::make_unique<GoogleProvider>(callback, gf);
} else if (providerType == "aws") {
return std::make_unique<AWSProvider>(callback, gf);
} else if (providerType == "revai") {
return std::make_unique<RevAIProvider>(callback, gf);
} else if (providerType == "deepgram") {
return std::make_unique<DeepgramProvider>(callback, gf);
}

return nullptr; // Return nullptr if no matching provider is found
138 changes: 138 additions & 0 deletions src/cloud-providers/deepgram/deepgram-provider.cpp
@@ -0,0 +1,138 @@
#include "deepgram-provider.h"
#include <nlohmann/json.hpp>

#include "language-codes/language-codes.h"

using json = nlohmann::json;

namespace http = beast::http;

DeepgramProvider::DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_)
: CloudProvider(callback, gf_),
ioc(),
ssl_ctx(ssl::context::tlsv12_client),
resolver(ioc),
ws(ioc, ssl_ctx)
{
needs_results_thread = true; // We need a separate thread for reading results
}

bool DeepgramProvider::init()
{
try {
// Setup SSL context
ssl_ctx.set_verify_mode(ssl::verify_peer);
ssl_ctx.set_default_verify_paths();

// Resolve the Deepgram endpoint
auto const results = resolver.resolve("api.deepgram.com", "443");

// Connect to Deepgram
net::connect(get_lowest_layer(ws), results);

// Set SNI hostname (required for TLS)
if (!SSL_set_tlsext_host_name(ws.next_layer().native_handle(),
"api.deepgram.com")) {
throw beast::system_error(
beast::error_code(static_cast<int>(::ERR_get_error()),
net::error::get_ssl_category()),
"Failed to set SNI hostname");
}

// Perform SSL handshake
ws.next_layer().handshake(ssl::stream_base::client);

// Set up WebSocket handshake with API key
ws.set_option(
websocket::stream_base::decorator([this](websocket::request_type &req) {
req.set(http::field::sec_websocket_protocol,
"token, " + std::string(gf->cloud_provider_api_key));
}));

std::string query = std::string("/v1/listen?encoding=linear16&sample_rate=16000") +
"&language=" + language_codes_from_underscore[gf->language];
// Perform WebSocket handshake
ws.handshake("api.deepgram.com", query);

obs_log(LOG_INFO, "Connected to Deepgram WebSocket successfully");
return true;
} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error initializing Deepgram connection: %s", e.what());
return false;
}
}

void DeepgramProvider::sendAudioBufferToTranscription(const std::deque<float> &audio_buffer)
{
if (audio_buffer.empty())
return;

try {
// Convert float audio to int16_t (linear16 format)
std::vector<int16_t> pcm_data;
pcm_data.reserve(audio_buffer.size());

for (float sample : audio_buffer) {
// Clamp and convert to int16
float clamped = std::max(-1.0f, std::min(1.0f, sample));
pcm_data.push_back(static_cast<int16_t>(clamped * 32767.0f));
}

// Audio must go out as a binary WebSocket frame (the Deepgram docs below require binary payloads)
ws.binary(true);
ws.write(net::buffer(pcm_data.data(), pcm_data.size() * sizeof(int16_t)));

} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error sending audio to Deepgram: %s", e.what());
running = false;
}
}

void DeepgramProvider::readResultsFromTranscription()
{
try {
// Read message into buffer
beast::flat_buffer buffer;
ws.read(buffer);

// Convert to string and parse JSON
std::string msg = beast::buffers_to_string(buffer.data());
json result = json::parse(msg);

// Check if this is a transcription result
if (result["type"] == "Results" && !result["channel"]["alternatives"].empty()) {
DetectionResultWithText detection_result;

// Fill the detection result structure
detection_result.text = result["channel"]["alternatives"][0]["transcript"];
detection_result.result = result["is_final"] ? DETECTION_RESULT_SPEECH
: DETECTION_RESULT_PARTIAL;

// If there are words with timestamps
if (!result["channel"]["alternatives"][0]["words"].empty()) {
auto &words = result["channel"]["alternatives"][0]["words"];
detection_result.start_timestamp_ms = words[0]["start"];
detection_result.end_timestamp_ms = words[words.size() - 1]["end"];
}

// Send result through callback
transcription_callback(detection_result);
}
} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error reading from Deepgram: %s", e.what());
}
}

void DeepgramProvider::shutdown()
{
try {
// Send the CloseStream control message as a text frame
ws.text(true);
ws.write(net::buffer(R"({"type":"CloseStream"})"));

// Close WebSocket connection
ws.close(websocket::close_code::normal);

obs_log(LOG_INFO, "Deepgram connection closed successfully");
} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error during Deepgram shutdown: %s", e.what());
}
}
31 changes: 31 additions & 0 deletions src/cloud-providers/deepgram/deepgram-provider.h
@@ -0,0 +1,31 @@
#pragma once

#include <boost/beast/core.hpp>
#include <boost/beast/websocket.hpp>
#include <boost/asio/strand.hpp>
#include <boost/beast/core/tcp_stream.hpp>
#include <boost/beast/ssl.hpp>
#include "cloud-providers/cloud-provider.h"

namespace beast = boost::beast;
namespace websocket = beast::websocket;
namespace net = boost::asio;
namespace ssl = boost::asio::ssl;
using tcp = boost::asio::ip::tcp;

class DeepgramProvider : public CloudProvider {
public:
DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_);
bool init() override;

protected:
void sendAudioBufferToTranscription(const std::deque<float> &audio_buffer) override;
void readResultsFromTranscription() override;
void shutdown() override;

private:
net::io_context ioc;
ssl::context ssl_ctx;
tcp::resolver resolver;
websocket::stream<beast::ssl_stream<tcp::socket>> ws;
};
156 changes: 156 additions & 0 deletions src/cloud-providers/deepgram/live_audio_api.md
@@ -0,0 +1,156 @@
Transcribe - Live audio
Use Deepgram's speech-to-text API to transcribe live-streaming audio.

Deepgram provides its customers with real-time, streaming transcription via its streaming endpoints. These endpoints are high-performance, full-duplex services running over the WebSocket protocol.

To learn more about working with real-time streaming data and results, see Get Started with Streaming Audio.

Endpoint
Production WebSocket server for Deepgram's real-time transcription with streaming audio. TLS encryption will protect your connection and data. We support a minimum of TLS 1.2.

| Detail | Description |
| --- | --- |
| Path | wss://api.deepgram.com/v1/listen |
Accepts

| Type | Description |
| --- | --- |
| Raw Audio File Data | Unprocessed or uncompressed binary audio data (such as PCM) |
| Messages | JSON formatted operations. |
Headers

| Header | Value | Description |
| --- | --- | --- |
| Sec-WebSocket-Protocol | token, <DEEPGRAM_API_KEY> | Used to establish a WebSocket connection with a specific protocol; include your Deepgram API key for authentication. |
Body Params

| Parameter | Type | Description |
| --- | --- | --- |
| callback | string | Callback URL to provide if you would like your submitted audio to be processed asynchronously. Learn More. |
| callback_method | string | Enable a callback method. Use put or post. Learn More. |
| channels | int32 | Number of independent audio channels contained in submitted streaming audio. Only read when a value is provided for encoding. Learn More. |
| dictation | boolean | Dictation automatically formats spoken commands for punctuation into their respective punctuation marks. Learn More. |
| diarize | boolean | Indicates whether to recognize speaker changes. When set to true, each word in the transcript will be assigned a speaker number starting at 0. Learn More. |
| diarize_version | string | Indicates the version of the diarization feature to use. Only available when the diarization feature is enabled. Learn More. |
| encoding | string | Expected encoding of the submitted streaming audio. If this parameter is set, sample_rate must also be specified. Learn More. |
| endpointing | boolean | Indicates how long Deepgram will wait to detect whether a speaker has finished speaking or pauses for a significant period of time. When set to true, the streaming endpoint immediately finalizes the transcription for the processed time range and returns the transcript with a speech_final parameter set to true. Learn More. |
| extra | string | Add any extra key-value pairs to the query string to customize the response. Learn More. |
| filler_words | boolean | Indicates whether to include filler words like "uh" and "um" in transcript output. When set to true, these words will be included. Learn More. |
| interim_results | boolean | Specifies whether the streaming endpoint should provide ongoing transcription updates as more audio is received. When set to true, the endpoint sends continuous updates, meaning transcription results may evolve over time. Learn More. |
| keywords | string | Unique proper nouns or specialized terms you want the model to include in its predictions, which aren't part of the model's default vocabulary. Learn More. |
| language | string | The BCP-47 language tag that hints at the primary spoken language. Learn More. |
| model | string | The AI model used to process submitted audio. Learn More. |
| multichannel | boolean | Indicates whether to transcribe each audio channel independently. Learn More. |
| numerals | boolean | Indicates whether to convert numbers from written format (e.g., one) to numerical format (e.g., 1). Learn More. |
| profanity_filter | boolean | Indicates whether to remove profanity from the transcript. Learn More. |
| punctuate | boolean | Indicates whether to add punctuation and capitalization to the transcript. Learn More. |
| redact | string | Indicates whether to redact sensitive information, replacing redacted content with asterisks (*). Learn More. |
| replace | string | Terms or phrases to search for in the submitted audio and replace. Learn More. |
| sample_rate | int32 | Sample rate of submitted streaming audio. Required (and only read) when a value is provided for encoding. Learn More. |
| search | string | Terms or phrases to search for in the submitted audio. Learn More. |
| smart_format | boolean | Indicates whether to apply formatting to transcript output. When set to true, additional formatting will be applied to transcripts to improve readability. Learn More. |
| tag | string | Set a tag to associate with the request. Learn More. |
| utterance_end_ms | string | Indicates how long Deepgram will wait to send a {"type": "UtteranceEnd"} message after a word has been transcribed. Learn More. |
| vad_events | boolean | Indicates that speech has started; you'll begin receiving {"type": "SpeechStarted"} messages upon speech starting. Learn More. |
| version | string | Version of the model to use. Learn More. |
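
The provider added in this commit only sets encoding, sample_rate, and language on the request target, and passes the API key through the Sec-WebSocket-Protocol header via the handshake decorator shown in deepgram-provider.cpp above. Any of the parameters in this table can be appended to the same query string. Below is a minimal sketch of assembling such a target; build_listen_target and the parameter values are illustrative, not part of the commit, and no URL-escaping is performed (matching the provider's current behavior).

```cpp
#include <map>
#include <string>

// Hypothetical helper: builds the /v1/listen request target from a parameter map.
// Values are assumed to be URL-safe already; the provider does no escaping either.
std::string build_listen_target(const std::map<std::string, std::string> &params)
{
	std::string target = "/v1/listen";
	char sep = '?';
	for (const auto &kv : params) {
		target += sep + kv.first + "=" + kv.second;
		sep = '&';
	}
	return target;
}

// Example (parameter values are illustrative; std::map emits keys alphabetically):
//   build_listen_target({{"encoding", "linear16"},
//                        {"sample_rate", "16000"},
//                        {"language", "en"},
//                        {"interim_results", "true"}});
// -> "/v1/listen?encoding=linear16&interim_results=true&language=en&sample_rate=16000"
```
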
Sending Audio Data
All audio data is transmitted to the streaming endpoint as binary WebSocket messages, with payloads containing the raw audio data. The full-duplex protocol allows for real-time streaming, enabling you to receive transcription responses simultaneously as you upload data. For optimal performance, each streaming buffer should represent between 20 and 250 milliseconds of audio.
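
At the 16 kHz linear16 encoding this provider requests, the recommended 20-250 ms window works out to 320-4000 samples (640-8000 bytes) per binary frame. The sketch below shows that sizing and a way to chunk converted PCM before sending; the helper names and the 100 ms default are assumptions, not plugin code.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// For 16 kHz mono linear16 (the encoding this provider requests):
//   20 ms  -> 16000 * 0.020 = 320 samples  = 640 bytes
//   250 ms -> 16000 * 0.250 = 4000 samples = 8000 bytes
constexpr std::size_t kSampleRate = 16000;

constexpr std::size_t samples_for_ms(std::size_t ms)
{
	return kSampleRate * ms / 1000;
}

// Hypothetical helper: split converted PCM into ~100 ms chunks so each binary
// WebSocket frame stays inside the recommended 20-250 ms window.
std::vector<std::vector<int16_t>> chunk_pcm(const std::vector<int16_t> &pcm,
					    std::size_t chunk_ms = 100)
{
	const std::size_t chunk_samples = samples_for_ms(chunk_ms);
	std::vector<std::vector<int16_t>> chunks;
	for (std::size_t i = 0; i < pcm.size(); i += chunk_samples) {
		const std::size_t end = std::min(pcm.size(), i + chunk_samples);
		chunks.emplace_back(pcm.begin() + i, pcm.begin() + end);
	}
	return chunks;
}
```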

Messages
Keep Alive
Optional

Periodically sending KeepAlive messages while streaming ensures uninterrupted communication and minimizes costs. Learn More.

JSON

{
"type": "KeepAlive"
}
Finalize
Optional

The Finalize message can be used to handle specific scenarios where you need to force the server to process all unprocessed audio data and immediately return the final results. Learn More.

JSON

{
"type": "Finalize"
}
Close Stream
Optional

The CloseStream message can be sent to the Deepgram server, instructing it to close the connection. Learn More.

JSON

{
"type": "CloseStream"
}
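
shutdown() in deepgram-provider.cpp above already sends CloseStream; the provider does not currently send KeepAlive or Finalize. If audio capture can pause for long stretches, a keep-alive sender along the following lines could be added. This is a sketch under assumptions: it reuses the websocket::stream type declared in deepgram-provider.h, sendControlMessage/keepAliveLoop and the 5-second interval are illustrative, and writes would still need to be serialized with the audio-sending thread.

```cpp
#include <atomic>
#include <chrono>
#include <string>
#include <thread>

#include <boost/asio/ip/tcp.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/beast/websocket.hpp>

namespace beast = boost::beast;
namespace websocket = beast::websocket;
namespace net = boost::asio;
using tcp = net::ip::tcp;
using deepgram_ws = websocket::stream<beast::ssl_stream<tcp::socket>>;

// Control messages (KeepAlive, Finalize, CloseStream) are JSON and go out as
// text frames, unlike the binary audio payloads.
inline void sendControlMessage(deepgram_ws &ws, const std::string &json_body)
{
	ws.text(true);
	ws.write(net::buffer(json_body));
}

// Illustrative keep-alive loop: send {"type":"KeepAlive"} every 5 seconds while
// `running` stays true so Deepgram does not drop an idle connection. Calls into
// `ws` must not interleave with the audio-sending thread; guard with a mutex.
inline void keepAliveLoop(deepgram_ws &ws, const std::atomic<bool> &running)
{
	while (running) {
		std::this_thread::sleep_for(std::chrono::seconds(5));
		if (!running)
			break;
		sendControlMessage(ws, R"({"type":"KeepAlive"})");
	}
}
```
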
Responses
Refer to API Errors for more information.

| Status | Description |
| --- | --- |
| 200 | Audio submitted for transcription. |
| 400 | Bad Request. |
| 401 | Invalid Authorization. |
| 402 | Payment Required, insufficient credits. |
| 403 | Insufficient permissions. |
| 503 | Internal server error if the server is temporarily unable to serve requests. |
Response Schema
JSON

{
"metadata": {
"transaction_key": "string",
"request_id": "uuid",
"sha256": "string",
"created": "string",
"duration": 0,
"channels": 0,
"models": [
"string"
]
},
"type": "Results",
"channel_index": [
0,
0
],
"duration": 0.0,
"start": 0.0,
"is_final": boolean,
"speech_final": boolean,
"channel": {
"alternatives": [
{
"transcript": "string",
"confidence": 0,
"words": [
{
"word": "string",
"start": 0,
"end": 0,
"confidence": 0
}
]
}
],
"search": [
{
"query": "string",
"hits": [
{
"confidence": 0,
"start": 0,
"end": 0,
"snippet": "string"
}
]
}
]
}
}
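
readResultsFromTranscription() above indexes straight into this structure with operator[]. For reference, here is a more defensive parse of the same fields using the nlohmann::json library the provider already depends on; ParsedResult and parseResults are illustrative names, not plugin types.

```cpp
#include <nlohmann/json.hpp>
#include <optional>
#include <string>

// Illustrative result holder (not the plugin's DetectionResultWithText).
struct ParsedResult {
	std::string transcript;
	bool is_final = false;
	double start_sec = 0.0; // Deepgram reports word timings in seconds
	double end_sec = 0.0;
};

// Pull the fields this plugin uses out of a "Results" message; missing keys
// yield std::nullopt instead of throwing, unlike direct operator[] access.
inline std::optional<ParsedResult> parseResults(const std::string &msg)
{
	const auto j = nlohmann::json::parse(msg, nullptr, /*allow_exceptions=*/false);
	if (j.is_discarded() || !j.is_object() || j.value("type", "") != "Results")
		return std::nullopt;
	if (!j.contains("channel") || !j["channel"].contains("alternatives"))
		return std::nullopt;

	const auto &alts = j["channel"]["alternatives"];
	if (!alts.is_array() || alts.empty() || !alts[0].is_object())
		return std::nullopt;

	ParsedResult out;
	out.transcript = alts[0].value("transcript", "");
	out.is_final = j.value("is_final", false);

	if (alts[0].contains("words") && alts[0]["words"].is_array() &&
	    !alts[0]["words"].empty()) {
		const auto &words = alts[0]["words"];
		out.start_sec = words.front().value("start", 0.0);
		out.end_sec = words.back().value("end", 0.0);
	}
	return out;
}
```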
Errors & Warnings
If Deepgram encounters an error during real-time streaming, we will return a WebSocket Close frame. The body of the Close frame will indicate the reason for closing using one of the specification’s pre-defined status codes followed by a UTF-8-encoded payload that represents the reason for the error.

Current codes and payloads in use include:

| Code | Payload | Description |
| --- | --- | --- |
| 1000 | N/A | Normal Closure |
| 1008 | DATA-0000 | The payload cannot be decoded as audio. Either the encoding is incorrectly specified, the payload is not audio data, or the audio is in a format unsupported by Deepgram. |
| 1011 | NET-0000 | The service has not transmitted a Text frame to the client within the timeout window. This may indicate an issue internally in Deepgram's systems or could be due to Deepgram not receiving enough audio data to transcribe a frame. |
| 1011 | NET-0001 | The service has not received a Binary or Text frame from the client within the timeout window. This may indicate an internal issue in Deepgram's systems, the client's systems, or the network connecting them. |
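
With Boost.Beast, a read on a connection the server has closed fails with websocket::error::closed, and the close code and payload from this table are then available via ws.reason(). Below is a sketch of surfacing them; readOrReportClose is a hypothetical helper (the provider currently just logs e.what()), and it writes to stderr rather than obs_log to stay self-contained.

```cpp
#include <cstdio>
#include <string>

#include <boost/asio/ip/tcp.hpp>
#include <boost/beast/core.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/beast/websocket.hpp>

namespace beast = boost::beast;
namespace websocket = beast::websocket;
using tcp = boost::asio::ip::tcp;
using deepgram_ws = websocket::stream<beast::ssl_stream<tcp::socket>>;

// Hypothetical helper: read one message without throwing; if the server sent a
// Close frame, report its status code and payload (e.g. 1008 / DATA-0000).
// Returns true only if a message was read into `buffer`.
inline bool readOrReportClose(deepgram_ws &ws, beast::flat_buffer &buffer)
{
	beast::error_code ec;
	ws.read(buffer, ec);
	if (!ec)
		return true;

	if (ec == websocket::error::closed) {
		const websocket::close_reason &cr = ws.reason();
		std::fprintf(stderr, "Deepgram closed the connection: code=%u payload=%s\n",
			     static_cast<unsigned>(cr.code),
			     std::string(cr.reason.data(), cr.reason.size()).c_str());
	} else {
		std::fprintf(stderr, "Deepgram read failed: %s\n", ec.message().c_str());
	}
	return false;
}
```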