diff --git a/.github/workflows/build-project.yaml b/.github/workflows/build-project.yaml
index 03af915..001562c 100644
--- a/.github/workflows/build-project.yaml
+++ b/.github/workflows/build-project.yaml
@@ -279,6 +279,14 @@ jobs:
           "pluginName=${ProductName}" >> $env:GITHUB_OUTPUT
           "pluginVersion=${ProductVersion}" >> $env:GITHUB_OUTPUT

+      - uses: actions/cache@v4
+        id: conan-cache
+        with:
+          path: C:\Users\runneradmin\.conan2\
+          key: ${{ runner.os }}-conan-cache-${{ needs.check-event.outputs.config }}
+          restore-keys: |
+            ${{ runner.os }}-conan-cache-
+
       - name: Build Plugin 🧱
         uses: ./.github/actions/build-plugin
         with:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b5d383d..2b74e53 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,7 +64,9 @@ target_sources(
   src/language-codes/language-codes.cpp
   src/cloud-providers/cloud-provider.cpp
   src/cloud-providers/clova/clova-provider.cpp
+  src/cloud-providers/deepgram/deepgram-provider.cpp
   src/cloud-providers/google/google-provider.cpp
+  src/cloud-providers/revai/revai-provider.cpp
   src/utils/ssl-utils.cpp
   src/utils/curl-helper.cpp
   src/timed-metadata/timed-metadata-utils.cpp)
diff --git a/cmake/common/compiler_common.cmake b/cmake/common/compiler_common.cmake
index de86046..b6c017b 100644
--- a/cmake/common/compiler_common.cmake
+++ b/cmake/common/compiler_common.cmake
@@ -22,7 +22,6 @@ set(_obs_clang_c_options
     # cmake-format: sortable
     -fno-strict-aliasing
     -Wbool-conversion
-    -Wcomma
     -Wconstant-conversion
     -Wdeprecated-declarations
     -Wempty-body
diff --git a/cmake/macos/xcode.cmake b/cmake/macos/xcode.cmake
index f5a3e83..0720863 100644
--- a/cmake/macos/xcode.cmake
+++ b/cmake/macos/xcode.cmake
@@ -122,7 +122,7 @@ set(CMAKE_XCODE_ATTRIBUTE_ENABLE_STRICT_OBJC_MSGSEND YES)
 set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING YES_ERROR)
 set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_BOOL_CONVERSION YES)
 set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS YES)
-set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_COMMA YES)
+set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_COMMA NO)
 set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_CONSTANT_CONVERSION YES)
 set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_EMPTY_BODY YES)
 set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_ENUM_CONVERSION YES)
diff --git a/src/cloud-providers/cloud-provider.cpp b/src/cloud-providers/cloud-provider.cpp
index 4947dfd..20fcd01 100644
--- a/src/cloud-providers/cloud-provider.cpp
+++ b/src/cloud-providers/cloud-provider.cpp
@@ -3,6 +3,8 @@
 #include "clova/clova-provider.h"
 #include "google/google-provider.h"
 #include "aws/aws_provider.h"
+#include "revai/revai-provider.h"
+#include "deepgram/deepgram-provider.h"

 std::shared_ptr<CloudProvider> createCloudProvider(const std::string &providerType,
						    CloudProvider::TranscriptionCallback callback,
@@ -14,6 +16,10 @@ std::shared_ptr<CloudProvider> createCloudProvider(const std::string &providerTy
 		return std::make_unique<GoogleProvider>(callback, gf);
 	} else if (providerType == "aws") {
 		return std::make_unique<AWSProvider>(callback, gf);
+	} else if (providerType == "revai") {
+		return std::make_unique<RevAIProvider>(callback, gf);
+	} else if (providerType == "deepgram") {
+		return std::make_unique<DeepgramProvider>(callback, gf);
 	}

 	return nullptr; // Return nullptr if no matching provider is found
diff --git a/src/cloud-providers/deepgram/deepgram-provider.cpp b/src/cloud-providers/deepgram/deepgram-provider.cpp
new file mode 100644
index 0000000..c6cde2e
--- /dev/null
+++ b/src/cloud-providers/deepgram/deepgram-provider.cpp
@@ -0,0 +1,138 @@
+#include "deepgram-provider.h"
+#include <nlohmann/json.hpp>
+
+#include "language-codes/language-codes.h"
+
+using json = nlohmann::json;
+
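+// DeepgramProvider streams live audio to Deepgram's realtime endpoint
+// (wss://api.deepgram.com/v1/listen) over a TLS WebSocket:
+//  - init() resolves the host, performs the TLS and WebSocket handshakes, and
+//    authenticates via the "Sec-WebSocket-Protocol: token, <api key>" header;
+//  - sendAudioBufferToTranscription() converts float samples to 16 kHz
+//    linear16 PCM and sends them as binary frames;
+//  - readResultsFromTranscription() parses JSON "Results" messages and
+//    forwards them through the transcription callback;
+//  - shutdown() sends {"type":"CloseStream"} and closes the socket.
+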
+namespace http = beast::http; + +DeepgramProvider::DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_) + : CloudProvider(callback, gf_), + ioc(), + ssl_ctx(ssl::context::tlsv12_client), + resolver(ioc), + ws(ioc, ssl_ctx) +{ + needs_results_thread = true; // We need a separate thread for reading results +} + +bool DeepgramProvider::init() +{ + try { + // Setup SSL context + ssl_ctx.set_verify_mode(ssl::verify_peer); + ssl_ctx.set_default_verify_paths(); + + // Resolve the Deepgram endpoint + auto const results = resolver.resolve("api.deepgram.com", "443"); + + // Connect to Deepgram + net::connect(get_lowest_layer(ws), results); + + // Set SNI hostname (required for TLS) + if (!SSL_set_tlsext_host_name(ws.next_layer().native_handle(), + "api.deepgram.com")) { + throw beast::system_error( + beast::error_code(static_cast(::ERR_get_error()), + net::error::get_ssl_category()), + "Failed to set SNI hostname"); + } + + // Perform SSL handshake + ws.next_layer().handshake(ssl::stream_base::client); + + // Set up WebSocket handshake with API key + ws.set_option( + websocket::stream_base::decorator([this](websocket::request_type &req) { + req.set(http::field::sec_websocket_protocol, + "token, " + std::string(gf->cloud_provider_api_key)); + })); + + std::string query = std::string("/v1/listen?encoding=linear16&sample_rate=16000") + + "&language=" + language_codes_from_underscore[gf->language]; + // Perform WebSocket handshake + ws.handshake("api.deepgram.com", query); + + obs_log(LOG_INFO, "Connected to Deepgram WebSocket successfully"); + return true; + } catch (std::exception const &e) { + obs_log(LOG_ERROR, "Error initializing Deepgram connection: %s", e.what()); + return false; + } +} + +void DeepgramProvider::sendAudioBufferToTranscription(const std::deque &audio_buffer) +{ + if (audio_buffer.empty()) + return; + + try { + // Convert float audio to int16_t (linear16 format) + std::vector pcm_data; + pcm_data.reserve(audio_buffer.size()); + + for (float sample : audio_buffer) { + // Clamp and convert to int16 + float clamped = std::max(-1.0f, std::min(1.0f, sample)); + pcm_data.push_back(static_cast(clamped * 32767.0f)); + } + + // Send binary message + ws.write(net::buffer(pcm_data.data(), pcm_data.size() * sizeof(int16_t))); + + } catch (std::exception const &e) { + obs_log(LOG_ERROR, "Error sending audio to Deepgram: %s", e.what()); + running = false; + } +} + +void DeepgramProvider::readResultsFromTranscription() +{ + try { + // Read message into buffer + beast::flat_buffer buffer; + ws.read(buffer); + + // Convert to string and parse JSON + std::string msg = beast::buffers_to_string(buffer.data()); + json result = json::parse(msg); + + // Check if this is a transcription result + if (result["type"] == "Results" && !result["channel"]["alternatives"].empty()) { + DetectionResultWithText detection_result; + + // Fill the detection result structure + detection_result.text = result["channel"]["alternatives"][0]["transcript"]; + detection_result.result = result["is_final"] ? 
DETECTION_RESULT_SPEECH + : DETECTION_RESULT_PARTIAL; + + // If there are words with timestamps + if (!result["channel"]["alternatives"][0]["words"].empty()) { + auto &words = result["channel"]["alternatives"][0]["words"]; + detection_result.start_timestamp_ms = words[0]["start"]; + detection_result.end_timestamp_ms = words[words.size() - 1]["end"]; + } + + // Send result through callback + transcription_callback(detection_result); + } + } catch (std::exception const &e) { + obs_log(LOG_ERROR, "Error reading from Deepgram: %s", e.what()); + } +} + +void DeepgramProvider::shutdown() +{ + try { + // Send close message + ws.write(net::buffer(R"({"type":"CloseStream"})")); + + // Close WebSocket connection + ws.close(websocket::close_code::normal); + + obs_log(LOG_INFO, "Deepgram connection closed successfully"); + } catch (std::exception const &e) { + obs_log(LOG_ERROR, "Error during Deepgram shutdown: %s", e.what()); + } +} diff --git a/src/cloud-providers/deepgram/deepgram-provider.h b/src/cloud-providers/deepgram/deepgram-provider.h new file mode 100644 index 0000000..777ac0e --- /dev/null +++ b/src/cloud-providers/deepgram/deepgram-provider.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "cloud-providers/cloud-provider.h" + +namespace beast = boost::beast; +namespace websocket = beast::websocket; +namespace net = boost::asio; +namespace ssl = boost::asio::ssl; +using tcp = boost::asio::ip::tcp; + +class DeepgramProvider : public CloudProvider { +public: + DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_); + bool init() override; + +protected: + void sendAudioBufferToTranscription(const std::deque &audio_buffer) override; + void readResultsFromTranscription() override; + void shutdown() override; + +private: + net::io_context ioc; + ssl::context ssl_ctx; + tcp::resolver resolver; + websocket::stream> ws; +}; diff --git a/src/cloud-providers/deepgram/live_audio_api.md b/src/cloud-providers/deepgram/live_audio_api.md new file mode 100644 index 0000000..87d36ba --- /dev/null +++ b/src/cloud-providers/deepgram/live_audio_api.md @@ -0,0 +1,156 @@ +Transcribe - Live audio +Use Deepgram's speech-to-text API to transcribe live-streaming audio. + +Deepgram provides its customers with real-time, streaming transcription via its streaming endpoints. These endpoints are high-performance, full-duplex services running over the WebSocket protocol. + +To learn more about working with real-time streaming data and results, see Get Started with Streaming Audio. + +Endpoint +Production WebSocket server for Deepgram's real-time transcription with streaming audio. TLS encryption will protect your connection and data. We support a minimum of TLS 1.2. + +Detail Description +Path wss://api.deepgram.com/v1/listen +Accepts +Type Description +Raw Audio File Data Unprocessed or uncompressed binary audio data (such as PCM) +Messages JSON formatted operations. +Headers +Header Value Description +Sec-WebSocket-Protocol token, Used to establish a WebSocket connection with a specific protocol, include your Deepgram API key for authentication. +Body Params +Parameter Type Description +callback string Callback URL to provide if you would like your submitted audio to be processed asynchronously. Learn More. +callback_method string Enable a callback method. Use put or post. Learn More. +channels int32 Number of independent audio channels contained in submitted streaming audio. Only read when a value is provided for encoding. Learn More. 
+dictation boolean Dictation automatically formats spoken commands for punctuation into their respective punctuation marks. Learn More. +diarize boolean Indicates whether to recognize speaker changes. When set to true, each word in the transcript will be assigned a speaker number starting at 0. Learn More. +diarize_version string Indicates the version of the diarization feature to use. Only available when the diarization feature is enabled. Learn More. +encoding string Expected encoding of the submitted streaming audio. If this parameter is set, sample_rate must also be specified. Learn More. +endpointing boolean Indicates how long Deepgram will wait to detect whether a speaker has finished speaking or pauses for a significant period of time. When set to true, the streaming endpoint immediately finalizes the transcription for the processed time range and returns the transcript with a speech_final parameter set to true. Learn More. +extra string Add any extra key-value pairs to the query string to customize the response. Learn More. +filler_words boolean Indicates whether to include filler words like "uh" and "um" in transcript output. When set to true, these words will be included. Learn More. +interim_results boolean Specifies whether the streaming endpoint should provide ongoing transcription updates as more audio is received. When set to true, the endpoint sends continuous updates, meaning transcription results may evolve over time. Learn More. +keywords string Unique proper nouns or specialized terms you want the model to include in its predictions, which aren't part of the model's default vocabulary. Learn More. +language string The BCP-47 language tag that hints at the primary spoken language. Learn More. +model string The AI model used to process submitted audio. Learn More. +multichannel boolean Indicates whether to transcribe each audio channel independently. Learn More. +numerals boolean Indicates whether to convert numbers from written format (e.g., one) to numerical format (e.g., 1). Learn More. +profanity_filter boolean Indicates whether to remove profanity from the transcript. Learn More. +punctuate boolean Indicates whether to add punctuation and capitalization to the transcript Learn More. +redact string Indicates whether to redact sensitive information, replacing redacted content with asterisks *. Learn More. +replace string Terms or phrases to search for in the submitted audio and replace. Learn More. +sample_rate int32 Sample rate of submitted streaming audio. Required (and only read) when a value is provided for encoding. Learn More. +search string Terms or phrases to search for in the submitted audio. Learn More. +smart_format boolean Indicates whether to apply formatting to transcript output. When set to true, additional formatting will be applied to transcripts to improve readability. Learn More. +tag string Set a tag to associate with the request. Learn More. +utterance_end_ms string Indicates how long Deepgram will wait to send a {"type": "UtteranceEnd"} message after a word has been transcribed. Learn More. +vad_events boolean Indicates that speech has started. {"type": "SpeechStarted"} You'll begin receiving messages upon speech starting. Learn More. +version string Version of the model to use. Learn More. +Sending Audio Data +All audio data is transmitted to the streaming endpoint as binary WebSocket messages, with payloads containing the raw audio data. 
The full-duplex protocol allows for real-time streaming, enabling you to receive transcription responses simultaneously as you upload data. For optimal performance, each streaming buffer should represent between 20 and 250 milliseconds of audio. + +Messages +Keep Alive +Optional + +Periodically send KeepAlive messages while streaming can ensure uninterrupted communication and minimizing costs. Learn More . + +JSON + +{ + "type": "KeepAlive" +} +Finalize +Optional + +Finalize message can be used to handle specific scenarios where you need to force the server to process all unprocessed audio data and immediately return the final results. Learn More. + +JSON + +{ + "type": "Finalize" +} +Close Stream +Optional + +The CloseStream message can be sent to the Deepgram server, instructing it close the connection. Learn More. + +JSON + +{ + "type": "CloseStream" +} +Responses +Refer to API Errors for more information. + +Status Description +200 Audio submitted for transcription. +400 Bad Request. +401 Invalid Authorization. +402 Payment Required, insufficient credits +403 Insufficient permissions. +503 Internal server error if the server is temporarily unable to serve requests. +Response Schema +JSON + +{ + "metadata": { + "transaction_key": "string", + "request_id": "uuid", + "sha256": "string", + "created": "string", + "duration": 0, + "channels": 0, + "models": [ + "string" + ], + }, + "type": "Results", + "channel_index": [ + 0, + 0 + ], + "duration": 0.0, + "start": 0.0, + "is_final": boolean, + "speech_final": boolean, + "channel": { + "alternatives": [ + { + "transcript": "string", + "confidence": 0, + "words": [ + { + "word": "string", + "start": 0, + "end": 0, + "confidence": 0 + } + ] + } + ], + "search": [ + { + "query": "string", + "hits": [ + { + "confidence": 0, + "start": 0, + "end": 0, + "snippet": "string" + } + ] + } + ] + } +} +Errors & Warnings +If Deepgram encounters an error during real-time streaming, we will return a WebSocket Close frame. The body of the Close frame will indicate the reason for closing using one of the specification’s pre-defined status codes followed by a UTF-8-encoded payload that represents the reason for the error. + +Current codes and payloads in use include: + +Code Payload Description +1000 N/A Normal Closure +1008 DATA-0000 The payload cannot be decoded as audio. Either the encoding is incorrectly specified, the payload is not audio data, or the audio is in a format unsupported by Deepgram. +1011 NET-0000 The service has not transmitted a Text frame to the client within the timeout window. This may indicate an issue internally in Deepgram's systems or could be due to Deepgram not receiving enough audio data to transcribe a frame. +1011 NET-0001 The service has not received a Binary or Text frame from the client within the timeout window. This may indicate an internal issue in Deepgram's systems, the client's systems, or the network connecting them. diff --git a/src/cloud-providers/deepgram/websocket_api.md b/src/cloud-providers/deepgram/websocket_api.md new file mode 100644 index 0000000..c0d8a23 --- /dev/null +++ b/src/cloud-providers/deepgram/websocket_api.md @@ -0,0 +1,202 @@ +Deepgram Streaming API +The goal of your SDK should minimally be: + +Manage the Connection Lifecycle: Implement robust connection management to handle opening, error handling, message sending, receiving, and closing of the WebSocket connection. 
+Concurrency and Threading: Depending on the SDK's target language, manage concurrency appropriately to handle the asynchronous nature of WebSocket communication without blocking the main thread. +Error Handling and Reconnection: Implement error handling and automatic reconnection logic. Transient network issues should not result in lost data or service interruptions. +Implement KeepAlives: Deepgram's API may require keepalive messages to maintain the connection. Implement a mechanism to send periodic pings or other suitable messages to prevent timeouts. +High-Level Pseudo-Code for Deepgram Streaming API +It's essential that you encapsulate your WebSocket connection in a class or similar representation. This will reduce undesired, highly coupled WebSocket code with your application's code. In the industry, this has often been referred to as minimizing "Spaghetti code". If you have WebSocket code or you need to import the above WebSocket libraries into your func main(), this is undesirable unless your application is trivially small. + +To implement the WebSocket Client correctly, you must implement based on the WebSocket protocol defined in RFC-6455. Please refer to section 4.1 Client Requirements in RFC-6455. + +You want first to declare a WebSocket class of some sort specific to your implementation language: + +Text + +// This class could simply be called WebSocketClient +// However, since this is specifically for Deepgram, it could be called DeepgramClient +class WebSocketClient { + private url: String + private apiKey: String + private websocket: WebSocket + + // other class properties + + // other class methods +} +NOTE: Depending on the programming language of choice, you might either need to implement async/await and threaded classes to support both threading models. These concepts occur in languages like Javascript, Python, and others. You can implement one or both based on your user's needs. + +You will then need to implement the following class methods. + +Function: Connect + +class WebSocketClient { + ... + function Connect() { + // Implement the websocket connection here + } + ... +} +This function should: + +Initialize the WebSocket connection using the URL and API Key. +Set up event listener threads for connection events (message, metadata, error). +Start the keep alive timer based on the Keepalive Interval. +Thread: Receive and Process Messages + +class WebSocketClient { + ... + function ThreadProcessMessages() { + // Implement the thread handler to process messages + } + ... +} +This thread should: + +When a message arrives, check if it's a transcription result or a system message. +For transcription messages, process or handle the transcription data. +Handle system messages accordingly (may include error messages or status updates). +Function: Send + +class WebSocketClient { + ... + function SendBinary([]bytes) { + // Implements a send function to transport audio to the Deepgram server + } + + function SendMessage([]byte) { + // Implements a send function to transport control messages to the Deepgram server + } + ... +} +The SendBinary() function should: + +Accept audio data as input. +Send the audio data over the WebSocket connection to Deepgram for processing. +The SendMessage() function should: + +Accept JSON data as input. +Send the JSON over the WebSocket connection to Deepgram for handling control or connection management type functions. A KeepAlive or CloseStreammessages are examples of these types of messages. 
+If you need more information on the difference, please refer to RFC-6455. + +(Optional) Thread: KeepAlive + +class WebSocketClient { + ... + function ThreadKeepAlive() { + // Implement the thread handler to process messages + } + ... +} +This thread is optional providing that audio data is constantly streaming to through the WebSocket; otherwise, it should: + +Regularly send a keepalive message (such as a ping) to Deepgram based on the Keepalive Interval to maintain the connection. +Notice this thread is independent of the Receive/Process Message Thread above. + +Function: Close + +class WebSocketClient { + ... + function Close() { + // Implement shutting down the websocket + } + ... +} +This function should: + +Send a command to close the WebSocket connection. +Stop the keepalive timer to clean up resources. +Deepgram API Specifics +Now that you have a basic client, you must handle the Deepgram API specifics. Refer to this documentation for more information . + +Function: Connect +When establishing a connection, you must pass the required parameters defined by the Deepgram Query Parameters. + +Thread: Receive and Process Messages +If successfully connected, you should start receiving transcription messages (albeit empty) in the Response Schema defined below. + +JSON + +{ + "metadata": { + "transaction_key": "string", + "request_id": "uuid", + "sha256": "string", + "created": "string", + "duration": 0, + "channels": 0, + "models": [ + "string" + ], + }, + "type": "Results", + "channel_index": [ + 0, + 0 + ], + "duration": 0.0, + "start": 0.0, + "is_final": boolean, + "speech_final": boolean, + "channel": { + "alternatives": [ + { + "transcript": "string", + "confidence": 0, + "words": [ + { + "word": "string", + "start": 0, + "end": 0, + "confidence": 0 + } + ] + } + ], + "search": [ + { + "query": "string", + "hits": [ + { + "confidence": 0, + "start": 0, + "end": 0, + "snippet": "string" + } + ] + } + ] + } +} +For convenience, you will need to marshal these JSON representations into usable objects/classes to give your users an easier time using your SDK. + +(Optional) Thread: KeepAlive +If you do implement the KeepAlive message, you will need to follow the guidelines defined here. + +Function: Close +When you are ready to close your WebSocket client, you will need to follow the shutdown guidelines defined here. + +Special Considerations: Errors +You must be able to handle any protocol-level defined in RFC-6455 and application-level (i.e., messages from Deepgram) you will need to follow the guidelines defined here. + +Troubleshooting +Here are some common implementation mistakes. + +My WebSocket Connection Immediately Disconnects +There are usually a few reasons why the Deepgram Platform will terminate the connection: + +No audio data is making it through the WebSocket to the Deepgram Platform. The platform will terminate the connection if no audio data is received in roughly 10 seconds. +A variation on the above... you have muted the audio source and are no longer sending an audio stream or data. +The audio encoding is not supported OR the encoding parameter does not match the encoding in the audio stream. +Invalid connection options via the query parameters are being used. This could be things like misspelling an option or using an incorrect value. 
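+
+If the connection drops because no audio reaches Deepgram (the roughly 10 second case above), a KeepAlive loop avoids the idle timeout. Below is a minimal, hypothetical C++ sketch using boost::beast, the same stack as the providers in this repository; the `ws` stream reference, the `running` flag, the `keepAliveLoop` name, and the 5-second interval are illustrative assumptions, not part of the Deepgram specification.
+
+C++
+
+#include <atomic>
+#include <chrono>
+#include <string>
+#include <thread>
+#include <boost/asio/buffer.hpp>
+
+namespace net = boost::asio;
+
+// Periodically send {"type":"KeepAlive"} as a *text* frame. Binary frames are
+// reserved for audio data; sending KeepAlive as binary can stall transcription.
+template <class WebSocketStream>
+void keepAliveLoop(WebSocketStream &ws, std::atomic<bool> &running)
+{
+	const std::string keep_alive = R"({"type":"KeepAlive"})";
+	while (running) {
+		std::this_thread::sleep_for(std::chrono::seconds(5));
+		ws.text(true); // mark the next message as a text (control) message
+		ws.write(net::buffer(keep_alive));
+		ws.binary(true); // switch back to binary for subsequent audio frames
+	}
+}
+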
+My WebSocket Connection Disconnects in the Middle of My Conversation +There are usually a few reasons why the Deepgram Platform will terminate the connection (similar to the above): + +You have muted the audio source and are no longer sending an audio stream or data. +If no audio data is being sent, you must implement the KeepAlive protocol message. +My Transcription Messages Are Getting Delayed +There are usually a few reasons why the Deepgram Platform delays sending transcription messages: + +You inadvertently send the KeepAlive protocol message as a Data or Stream message. This will cause the audio processing to choke or hiccup, thus causing the delay. Please refer to RFC-6455 to learn more about the difference between data and control messages. +Network connectivity issues. Please ensure your connection to the Deepgram domain/IP is good. Use ping and traceroute or tracert to map the network path from source to destination. diff --git a/src/cloud-providers/revai/revai-provider.cpp b/src/cloud-providers/revai/revai-provider.cpp new file mode 100644 index 0000000..67e80bd --- /dev/null +++ b/src/cloud-providers/revai/revai-provider.cpp @@ -0,0 +1,211 @@ +#include "revai-provider.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "nlohmann/json.hpp" +#include "language-codes/language-codes.h" + +namespace http = beast::http; +using json = nlohmann::json; + +// Response structure definitions +struct TranscriptElement { + std::string type; // "text" or "punctuation" + std::string value; // The actual text or punctuation mark + double ts; // Start time in seconds + double end_ts; // End time in seconds + double confidence; // Confidence score +}; + +struct TranscriptResponse { + std::string type; // "connected", "partial", or "final" + std::string id; // Job ID (only in connected message) + double ts; // Start time in seconds + double end_ts; // End time in seconds + std::vector elements; // Elements in the transcript +}; + +// JSON conversion functions +void from_json(const json &j, TranscriptElement &e) +{ + j.at("type").get_to(e.type); + j.at("value").get_to(e.value); + if (j.contains("ts")) + j.at("ts").get_to(e.ts); + if (j.contains("end_ts")) + j.at("end_ts").get_to(e.end_ts); + if (j.contains("confidence")) + j.at("confidence").get_to(e.confidence); +} + +void from_json(const json &j, TranscriptResponse &r) +{ + j.at("type").get_to(r.type); + if (j.contains("id")) + j.at("id").get_to(r.id); + if (j.contains("ts")) + j.at("ts").get_to(r.ts); + if (j.contains("end_ts")) + j.at("end_ts").get_to(r.end_ts); + if (j.contains("elements")) + j.at("elements").get_to(r.elements); +} + +RevAIProvider::RevAIProvider(TranscriptionCallback callback, cloudvocal_data *gf) + : CloudProvider(callback, gf), + is_connected(false), + ctx_(ssl::context::tlsv12_client), + ws_(ioc_, ctx_) +{ + is_connected = false; +} + +bool RevAIProvider::init() +{ + // Initialize SSL context + ctx_.set_verify_mode(ssl::verify_peer); + ctx_.set_default_verify_paths(); + + // These objects perform our I/O + tcp::resolver resolver{ioc_}; + + // Look up the domain name + auto const results = resolver.resolve(host_, "443"); + + // Make the connection on the IP address we get from a lookup + auto ep = net::connect(get_lowest_layer(ws_), results); + + // Set SNI Hostname (many hosts need this to handshake successfully) + if (!SSL_set_tlsext_host_name(ws_.next_layer().native_handle(), host_.c_str())) + throw beast::system_error(beast::error_code(static_cast(::ERR_get_error()), + 
net::error::get_ssl_category()), + "Failed to set SNI Hostname"); + + // Update the host string. This will provide the value of the + // Host HTTP header during the WebSocket handshake. + std::string host = host_ + ':' + std::to_string(ep.port()); + + // Perform the SSL handshake + ws_.next_layer().handshake(ssl::stream_base::client); + + // Perform the websocket handshake + std::string query = + target_ + "?access_token=" + this->gf->cloud_provider_api_key + + "&content_type=audio/x-raw;layout=interleaved;rate=16000;format=S16LE;channels=1;" + + "language=" + language_codes_from_underscore[gf->language]; + + ws_.set_option(websocket::stream_base::decorator([&host](websocket::request_type &req) { + req.set(http::field::host, host); + req.set(http::field::user_agent, "RevAI-CPP-Client"); + })); + + ws_.handshake(host, query); + return true; +} + +void RevAIProvider::sendAudioBufferToTranscription(const std::deque &audio_buffer) +{ + // Convert audio buffer to S16LE + std::vector converted = convertFloatToS16LE(audio_buffer); + + // Send audio buffer to Rev.ai + ws_.write(net::buffer(converted)); +} + +// Receive and handle messages +void RevAIProvider::readResultsFromTranscription() +{ + beast::flat_buffer buffer; + + try { + // Read a message + ws_.read(buffer); + + // Handle the message + std::string msg = beast::buffers_to_string(buffer.data()); + obs_log(LOG_INFO, "Received: %s", msg.c_str()); + + auto j = json::parse(msg); + TranscriptResponse response = j.get(); + + DetectionResultWithText result; + bool send_result = false; + + // Store job ID if this is a connected message + if (response.type == "connected") { + obs_log(LOG_INFO, "Connected to Rev AI. Job ID: %s", response.id.c_str()); + } + // Handle partial transcripts + else if (response.type == "partial") { + result.text = ""; + result.result = DetectionResult::DETECTION_RESULT_PARTIAL; + for (const auto &element : response.elements) { + if (element.type == "text") { + result.text += element.value + " "; + } else { + result.text += element.value; + } + } + send_result = true; + } + // Handle final transcripts + else if (response.type == "final") { + result.text = ""; + result.result = DetectionResult::DETECTION_RESULT_SPEECH; + for (const auto &element : response.elements) { + if (element.type == "text") { + result.text += element.value + " "; + } else { + result.text += element.value; + } + } + send_result = true; + } else { + obs_log(LOG_WARNING, "Unknown message type: %s", response.type.c_str()); + } + + if (send_result) { + result.language = language_codes_from_underscore[gf->language]; + result.start_timestamp_ms = (uint64_t)response.ts; + result.end_timestamp_ms = (uint64_t)response.end_ts; + this->transcription_callback(result); + } + + buffer.consume(buffer.size()); + } catch (beast::system_error const &se) { + // This indicates the connection was closed + if (se.code() != websocket::error::closed) { + obs_log(LOG_ERROR, "Error: %s", se.code().message().c_str()); + } + } catch (std::exception const &e) { + obs_log(LOG_ERROR, "Error: %s", e.what()); + } +} + +void RevAIProvider::shutdown() +{ + // Send EOS to signal end of stream + ws_.write(net::buffer(std::string("EOS"))); + + // Close the WebSocket connection + ws_.close(websocket::close_code::normal); +} + +std::vector RevAIProvider::convertFloatToS16LE(const std::deque &audio_buffer) +{ + std::vector converted; + converted.reserve(audio_buffer.size()); + + for (float sample : audio_buffer) { + // Clamp to [-1.0, 1.0] and convert to S16LE + sample = std::fmaxf(-1.0f, 
std::fminf(1.0f, sample)); + converted.push_back(static_cast(sample * 32767.0f)); + } + return converted; +} diff --git a/src/cloud-providers/revai/revai-provider.h b/src/cloud-providers/revai/revai-provider.h new file mode 100644 index 0000000..eb0650a --- /dev/null +++ b/src/cloud-providers/revai/revai-provider.h @@ -0,0 +1,50 @@ +#pragma once + +#include "cloud-providers/cloud-provider.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace beast = boost::beast; +namespace websocket = beast::websocket; +namespace net = boost::asio; +namespace ssl = boost::asio::ssl; +using tcp = boost::asio::ip::tcp; + +class RevAIProvider : public CloudProvider { +public: + RevAIProvider(TranscriptionCallback callback, cloudvocal_data *gf); + + virtual bool init() override; + +protected: + virtual void sendAudioBufferToTranscription(const std::deque &audio_buffer) override; + virtual void readResultsFromTranscription() override; + virtual void shutdown() override; + +private: + // Utility functions + std::vector convertFloatToS16LE(const std::deque &audio_buffer); + + // Member variables + bool is_connected; + std::string job_id; + + net::io_context ioc_; + ssl::context ctx_; + websocket::stream> ws_; + const std::string host_ = "api.rev.ai"; + const std::string target_ = "/speechtotext/v1/stream"; +}; diff --git a/src/cloud-providers/revai/revai_api_docs.md b/src/cloud-providers/revai/revai_api_docs.md new file mode 100644 index 0000000..a8617e3 --- /dev/null +++ b/src/cloud-providers/revai/revai_api_docs.md @@ -0,0 +1,142 @@ +# WebSocket protocol + +## API endpoint +All connections to Rev AI's Streaming Speech-to-Text API start as a WebSocket handshake HTTP request to wss://api.rev.ai/speechtotext/v1/stream. On successful authorization, the client can start sending binary WebSocket messages containing audio data in one of the supported formats. As speech is detected, Rev AI returns hypotheses of the recognized speech content. + +warning +The base URL is different from the base URL for the Asynchronous Speech-to-Text API. + +Example +wss://api.rev.ai/speechtotext/v1/stream?access_token=&content_type=audio/x-raw;layout=interleaved;rate=16000;format=S16LE;channels=1&metadata= + +## Responses +All transcript responses from the Streaming Speech-to-Text API are text messages and are returned as serialized JSON. The transcript response has two states: partial hypothesis and final hypothesis. + +The JSON will contain a type property which indicates what kind of response the message is. Valid values for this type property are: + +"connected" +"partial" +"final" +The "connected" type is only returned once during the initial handshake when opening a WebSocket connection. All other responses should be of the type "partial" or "final". + +### Response Object +Here is a brief description of the response object and its properties: + +Property Name Type Description +type string Either "partial" or "final" +ts double The start time of the hypothesis in seconds +end_ts double The end time of the hypothesis in seconds +elements array of Elements Only present if final property is true. A list of Rev AI transcript element properties. 
See Transcript object for details that are all the recognized words up to current point in audio + +## Request +A WebSocket request to the Streaming Speech-to-Text API consists of the following parts: + +Request parameter Required Default +Base URL (WebSocket) or read_url URL (RTMP) Yes +Access Token access_token Yes None +Content Type content_type Yes None +Language language No en +Metadata metadata No None +Custom Vocabulary custom_vocabulary_id No None +Profanity Filter filter_profanity No false +Disfluencies remove_disfluencies No false +Delete After Seconds delete_after_seconds No None +Detailed Partials detailed_partials No false +Start Timestamp start_ts No None +Maximum segment duration seconds max_segment_duration_seconds No None +Transcriber transcriber No See transcriber section +Speaker switch detection enable_speaker_switch No false +Skip Post-processing skip_postprocessing No false +Priority priority No speed +Maximum wait time for connection max_connection_wait_seconds No 60 +Access token +Clients must authenticate by including their Rev AI access token as a query parameter in their requests. If access_token is invalid or the query parameter is not present, the WebSocket connection will be closed with code 4001. + +Example +wss://api.rev.ai/speechtotext/v1/stream?access_token=&content_type=audio/x-raw;layout=interleaved;rate=16000;format=S16LE;channels=1 +Content type + +All requests must also contain a content_type query parameter. The content type describes the format of audio data being sent. If you are submitting raw audio, Rev AI requires extra parameters as shown below. If the content type is invalid or not set, the WebSocket connection is closed with a 4002 close code. + +Rev AI officially supports these content types: + +audio/x-raw (has additional requirements ) +audio/x-flac +audio/x-wav +RAW file content type +You are required to provide additional information in content_type when content_type is audio/x-raw. + +Parameter (type) Description Allowed Values Required +layout (string) The layout of channels within a buffer. Possible values are "interleaved" (for LRLRLRLR) and "non-interleaved" (LLLLRRRR). Not case-sensitive interleaved,non-interleaved audio/x-raw only +rate (int) Sample rate of the audio bytes Inclusive Range from 8000-48000Hz audio/x-raw only +format (string) Format of the audio samples. Case-sensitive. See Allowed Values column for valid values List of valid formats audio/x-raw only +channels (int) Number of audio channels that the audio samples contain Inclusive range from 1-10 channels audio/x-raw only +These parameters follow the content_type, delimited by semi-colons (;). Each parameter should be specified in the format parameter_name=parameter_value. + +Example +wss://api.rev.ai/speechtotext/v1/stream?access_token=&content_type=audio/x-raw;layout=interleaved;rate=16000;format=S16LE;channels=1&metadata= +Language +attention +Custom Prices (other than the default) are set independently by language. Please refer to your contract for pricing information. If you are not a contract customer the pricing is found here + +Specify the transcription language with the language query parameter. When the language is not provided, transcription will default to English. The language query parameter cannot be used along with the following options: filter_profanity, remove_disfluencies, and custom_vocabulary_id. 
+ +Language Language Code +English en +French fr +German de +Italian it +Japanese ja +Korean ko +Mandarin cmn +Portuguese pt +Spanish es +Additional requirements for content type: + +content_type must be audio/x-raw or audio/x-flac +when providing raw audio, it must be formatted as S16LE +rate must be included, regardless of content_type + +### Request stages + +#### Initial connection +All requests begin as an HTTP GET request. A WebSocket request is declared by including the header value Upgrade: websocket and Connection: Upgrade. + +Client --> Rev AI +GET /speechtotext/v1/stream HTTP/1.1 +Host: api.rev.ai +Upgrade: websocket +Connection: Upgrade +Sec-WebSocket-Key: Chxzu/uTUCmjkFH9d/8NTg== +Sec-WebSocket-Version: 13 +Origin: http://api.rev.ai +If authorization is successful, the request is upgraded to a WebSocket connection. + +Client <-- Rev AI +HTTP/1.1 101 Switching Protocols +Upgrade: websocket +Connection: Upgrade +Sec-WebSocket-Accept: z0pcAwXZZRVlMcca8lmHCPzvrKU= +After the connection has been upgraded, the servers will return a "connected" message. You must wait for this connected message before sending binary audio data. The response includes an id, which is the corresponding job identifier, as shown in the example below: + +{ + "type": "connected", + "id": s1d24ax2fd21 +} +warning +If Rev AI currently does not have the capacity to handle the request, a WebSocket close message is returned with status code of 4013. A HTTP/1.1 400 Bad Request response indicates that the request is not a WebSocket upgrade request. + +#### Audio submission +WebSocket messages sent to Rev AI must be of one of these two WebSocket message types: + +Message type Message requirements Notes +Binary Audio data is transmitted as binary data and should be sent in chunks of 250ms or more. + +Streams sending audio chunks that are less than 250ms in size may experience increased transcription latency. The format of the audio must match that specified in the content_type parameter. +Text The client should send an End-Of-Stream("EOS") text message to signal the end of audio data, and thus gracefully close the WebSocket connection. + +On an EOS message, Rev AI will return a final hypothesis along with a WebSocket close message. Currently, only this one text message type is supported. + +WebSocket close type messages are explicitly not supported as a message type and will abruptly close the socket connection with a 1007 Invalid Payload error. Clients will not receive their final hypothesis in this case. + +Any other text messages, including incorrectly capitalized messages such as "eos" and "Eos", are invalid and will also close the socket connection with a 1007 Invalid Payload error. diff --git a/src/cloudvocal-properties.cpp b/src/cloudvocal-properties.cpp index 05f0102..b710f92 100644 --- a/src/cloudvocal-properties.cpp +++ b/src/cloudvocal-properties.cpp @@ -283,6 +283,10 @@ void add_general_group_properties(obs_properties_t *ppts) "clova"); obs_property_list_add_string(transcription_cloud_provider_select_list, MT_("Google"), "google"); + obs_property_list_add_string(transcription_cloud_provider_select_list, MT_("RevAI"), + "revai"); + obs_property_list_add_string(transcription_cloud_provider_select_list, MT_("Deepgram"), + "deepgram"); // obs_property_list_add_string(transcription_cloud_provider_select_list, MT_("AWS"), "aws"); obs_property_t *subs_output =